<a href="https://colab.research.google.com/github/maedeamooshahi/datamining-6-7/blob/main/maedeamooshahi_parsbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Best-effort selection of TensorFlow 2.x before TensorFlow is imported below.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    # Any failure of the magic is ignored on purpose so the cell keeps running.
    pass

import tensorflow as tf

In [None]:
# Install the `transformers` package (quiet mode); it is imported in the next cell.
!pip install -q transformers
# Install or upgrade `hazm` (quiet mode); its Normalizer is used by cleanize().
!pip install -qU hazm

[K     |████████████████████████████████| 4.4 MB 8.4 MB/s 
[K     |████████████████████████████████| 101 kB 5.2 MB/s 
[K     |████████████████████████████████| 596 kB 46.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 38.8 MB/s 
[K     |████████████████████████████████| 316 kB 8.5 MB/s 
[K     |████████████████████████████████| 1.4 MB 53.7 MB/s 
[K     |████████████████████████████████| 233 kB 64.3 MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import pandas as pd

import hazm

import transformers 
from transformers import AutoTokenizer, AutoConfig
from transformers import TFAutoModelForTokenClassification

import os
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

# Print the version of each main library, framed by blank lines.
print()
for library_name, library in (
    ('tensorflow', tf),
    ('transformers', transformers),
    ('numpy', np),
    ('pandas', pd),
):
    print(library_name, library.__version__)
print()

# Report whether TensorFlow exposes a GPU under the expected device name.
gpu_device = tf.test.gpu_device_name()
print()
if gpu_device == '/device:GPU:0':
    print('SUCCESS: Found GPU: {}'.format(gpu_device))
else:
    print('WARNING: GPU device not found.')


tensorflow 2.8.2
transformers 4.20.1
numpy 1.21.6
pandas 1.3.5




In [None]:
# Sample Persian sentences that the demo cells pass to parsbert_ner().
texts = [
    "کسانی که در آینده، تاریخ دانشگاه علوم پزشکی اصفهان را می نویسند، بدون اغراق نام دکتر یوسفی را به عنوان سردار سازندگی حوزه سلامت استان و سرلوحه سه دهه سازندگی در این دانشگاه خواهند شناخت.",
    "دکتر جهانپور در ادامه سخنان خود،  تمرکز دکتر یوسفی طی سی سال خدمت در حوزه سازندگی را بسیار تحسین برانگیز برشمرد و آن را به عنوان یک توفیق الهی که نصیب ایشان شده است عنوان کرد..",
    "امروز شاهد تغییر و تحولی هستیم که از جنس فارغ شدن از یک کار و مشغول شدن در عرصه نو می باشد و در این روند، فقط جنس ماموریت تغییر پیدا کرده است.",
]

# Maps each PEYMA tag to the Persian label shown next to a highlighted token.
# The "B_" and "I_" tags of one entity type share the same label; "O" maps to
# None, which parsbert_ner() renders as a plain (non-entity) token.
peyma_translate = {
    "B_DAT": "تاریخ",
    "B_LOC": "موقعیت",
    "B_MON": "پول",
    "B_ORG": "سازمان",  # fixed typo: was "سازمنان", inconsistent with "I_ORG"
    "B_PCT": "درصد",
    "B_PER": "شخص",
    "B_TIM": "زمان",
    "I_DAT": "تاریخ",
    "I_LOC": "موقعیت",
    "I_MON": "پول",
    "I_ORG": "سازمان",
    "I_PCT": "درصد",
    "I_PER": "شخص",
    "I_TIM": "زمان",
    "O": None,
}
# ARMAN tags in display order: the six "B-" tags followed by the six "I-" tags.
_arman_tags = (
    "B-event", "B-fac", "B-loc", "B-org", "B-pers", "B-pro",
    "I-event", "I-fac", "I-loc", "I-org", "I-pers", "I-pro",
)
# Persian label for each entity type, in the same order as one half of the tags.
_arman_names = ("رویداد", "امکانات", "موقعیت", "سازمان", "شخص", "محصول")

# Tag -> Persian label; the names repeat once for "B-" and once for "I-".
arman_translate = dict(zip(_arman_tags, _arman_names * 2))
# The outside tag has no label.
arman_translate["O"] = None

# Tags of the combined model in display order: ten "B-" tags, then ten "I-" tags.
_ner_tags = (
    "B-date", "B-event", "B-facility", "B-location", "B-money",
    "B-organization", "B-person", "B-product", "B-time", "B-percent",
    "I-date", "I-event", "I-facility", "I-location", "I-money",
    "I-organization", "I-person", "I-product", "I-time", "I-percent",
)
# Persian label for each entity type, in the same order as one half of the tags.
_ner_names = (
    "تاریخ", "رویداد", "امکانات", "موقعیت", "پول",
    "سازمان", "شخص", "محصول", "زمان", "درصد",
)

# Tag -> Persian label; the names repeat once for "B-" and once for "I-".
ner_translate = dict(zip(_ner_tags, _ner_names * 2))
# The outside tag has no label.
ner_translate["O"] = None

In [None]:
# Module-level hazm normalizer instance; cleanize() calls its normalize() method.
normalizer = hazm.Normalizer()


def cleanize(text):
    """Return ``text`` after running it through the shared hazm normalizer.

    This is the single pre-processing hook applied to every input sentence;
    at present it performs normalization only.
    """
    normalized_text = normalizer.normalize(text)
    return normalized_text


def parsbert_ner_load_model(model_name):
    """Load a token-classification model together with its tokenizer and labels.

    Args:
        model_name: Model identifier handed to the ``from_pretrained`` loaders.

    Returns:
        ``(model, tokenizer, labels)`` on success. ``labels`` lists the label
        names of ``config.label2id`` ordered by their id, so a predicted class
        index can be used to index it. If loading fails for any reason, the
        error is printed and ``[None, None, None]`` is returned so that callers
        can still unpack three values.
    """
    try:
        config = AutoConfig.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TFAutoModelForTokenClassification.from_pretrained(model_name)
        # Order labels by id explicitly instead of relying on the key order of
        # the label2id mapping.
        labels = sorted(config.label2id, key=config.label2id.get)

        return model, tokenizer, labels
    except Exception as error:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate; the cause is reported instead of being silently dropped.
        print('Could not load model %r: %s' % (model_name, error))
        return [None] * 3

def _render_ner_html(predictions):
    """Build the highlighted HTML paragraph for one sentence's (token, label) pairs."""
    from html import escape  # local import keeps this cell self-contained

    spans = []
    for token, label in predictions:
        # The special tokens added by the tokenizer are not shown.
        if token in ['[CLS]', '[SEP]']:
            continue
        # Token text originates from user input, so escape it before it is
        # interpolated into markup.
        safe_token = escape(token)
        if label:
            spans.append(
                '<span class="token token-ner">%s<span class="ner-label">%s</span></span>'
                % (safe_token, escape(label)))
        else:
            spans.append('<span class="token">%s</span>' % safe_token)

    return '<p class="ner-box">%s</p>' % ' '.join(spans)


def parsbert_ner(texts, model_name, label_translate, visualize=True):
    """Run NER over ``texts`` and either display or return the predictions.

    Args:
        texts: Iterable of raw sentences; each one is passed through
            ``cleanize`` before tokenization.
        model_name: Model identifier forwarded to ``parsbert_ner_load_model``.
        label_translate: Mapping from the model's label names to the label
            shown to the user; a falsy value (e.g. ``None``) marks a token as
            a non-entity.
        visualize: When true, each sentence is rendered as highlighted HTML
            and nothing is collected. When false, the per-sentence lists of
            ``(token, translated_label)`` pairs are collected and returned.

    Returns:
        The list of collected predictions (empty when ``visualize`` is true),
        or the string ``'Something wrong has been happened!'`` if the model,
        tokenizer or labels could not be loaded.

    Raises:
        KeyError: If a predicted label is missing from ``label_translate``.
    """
    global css_is_load

    css = """<style>
    .ner-box {
        direction: rtl;
        font-size: 18px !important;
        line-height: 20px !important;
        margin: 0 0 15px;
        padding: 10px;
        text-align: justify;
        color: #343434 !important;
    }
    .token, .token span {
        display: inline-block !important;
        padding: 2px;
        margin: 2px 0;
    }
    .token.token-ner {
        background-color: #f6cd61;
        font-weight: bold;
        color: #000;
    }
    .token.token-ner .ner-label {
        color: #9a1f40;
        margin: 0px 2px;
    }
    </style>"""

    # The stylesheet is emitted on every call. The previous code reset the
    # flag to False right before testing it, so it never skipped this step;
    # the global is still set for any outside code that reads it.
    display(HTML(css))
    css_is_load = True

    model, tokenizer, labels = parsbert_ner_load_model(model_name)

    if model is None or tokenizer is None or not labels:
        message = 'Something wrong has been happened!'
        if visualize:
            # Callers in visual mode discard the return value, so surface the
            # failure instead of leaving the output empty.
            print(message)
        return message

    output_predictions = []
    for sequence in texts:
        sequence = cleanize(sequence)
        inputs = tokenizer.encode(sequence, return_tensors="tf")
        # Derive the tokens from the very ids fed to the model so tokens and
        # predictions are aligned one-to-one (a decode/re-tokenize round trip
        # is not guaranteed to reproduce the same token sequence).
        tokens = tokenizer.convert_ids_to_tokens(inputs[0].numpy().tolist())
        outputs = model(inputs)[0]
        label_ids = tf.argmax(outputs, axis=2)[0].numpy()
        predictions = [
            (token, label_translate[labels[label_id]])
            for token, label_id in zip(tokens, label_ids)
        ]

        if not visualize:
            output_predictions.append(predictions)
        else:
            display(HTML(_render_ner_html(predictions)))

    return output_predictions

In [None]:
model_name = 'HooshvareLab/bert-base-parsbert-armanner-uncased'
_ = parsbert_ner(texts, model_name, arman_translate, visualize=True)

Downloading:   0%|          | 0.00/937 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForTokenClassification.

All the layers of TFBertForTokenClassification were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-armanner-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [None]:
model_name = 'HooshvareLab/bert-base-parsbert-peymaner-uncased'
_ = parsbert_ner(texts, model_name, peyma_translate, visualize=True)

Downloading:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

Some layers from the model checkpoint at HooshvareLab/bert-base-parsbert-peymaner-uncased were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-peymaner-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [None]:
#@title Live Playground { display-mode: "form" }

# Input controls of the playground form.
config_wd = widgets.RadioButtons(
    options=['ARMAN', 'PEYMA', 'ARMAN+PEYMA'],
    description='Select your model',
    disabled=False,
)
text_wd = widgets.Textarea(
    placeholder='Please enter you text ...',
    rows=5,
    layout=Layout(width='90%'),
)
submit_wd = widgets.Button(
    description='Send',
    disabled=False,
    button_style='success',
    tooltip='Submit',
)

# Area that receives the prediction output of each submission.
output_wd = widgets.Output()

# Heading shown above the form.
playground_header = """
<h2>Persian NER <small>[ARMAN, PEYMA, ARMAN+PEYMA]</small></h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""
display(HTML(playground_header))

# Show the controls top to bottom: model choice, text box, button, output.
for control in (config_wd, text_wd, submit_wd, output_wd):
    display(control)

# Radio-button choice -> [model identifier, label translation table].
configs = {}
configs['ARMAN'] = [
    'HooshvareLab/bert-base-parsbert-armanner-uncased',
    arman_translate,
]
configs['PEYMA'] = [
    'HooshvareLab/bert-base-parsbert-peymaner-uncased',
    peyma_translate,
]
configs['ARMAN+PEYMA'] = [
    'HooshvareLab/bert-base-parsbert-ner-uncased',
    ner_translate,
]

def submit_text(sender):
    """Click handler: run NER on the text box content with the chosen model.

    ``sender`` is the argument supplied by the button's click callback and is
    not used. All output is routed into ``output_wd``, which is cleared first.
    """
    with output_wd:
        clear_output(wait=True)
        model_name, label_map = configs[config_wd.value]
        print('Predicting .... [please wait!]')
        parsbert_ner([text_wd.value], model_name, label_map, visualize=True)


# Register the handler so that pressing the button triggers a prediction.
submit_wd.on_click(submit_text)

RadioButtons(description='Select your model', options=('ARMAN', 'PEYMA', 'ARMAN+PEYMA'), value='ARMAN')

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()