In [1]:
import os
import glob
import string
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from pyarabic.araby import strip_tashkeel,strip_tatweel,normalize_ligature
from transformers import AutoTokenizer, TFAutoModel
from tqdm import tqdm
import numpy as np
import tensorflow as tf

In [2]:
# Dialect classes handled by the classifier; a label's position in this list
# is its integer class id.
labels = ['DIAL_EGY', 'DIAL_GLF', 'DIAL_LEV', 'MSA']
# Lookup tables in both directions: label name -> id and id -> label name.
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

In [3]:
def preprocess_text(sequence):
    """Reduce raw tweets to space-separated Arabic tokens.

    Every item of ``sequence`` is coerced to ``str`` and cleaned in this order:
    leading "RT" marker, hyperlinks, ``#`` signs, colons, ``@mentions`` and a
    small set of ASCII punctuation are removed; only runs of characters in the
    range U+0600-U+06FF are kept; tashkeel and tatweel are stripped; the
    result is tokenized with ``TweetTokenizer`` and tokens without any word
    character are discarded.

    Args:
        sequence: Iterable of tweets (any objects; each is passed to ``str``).

    Returns:
        list[str]: One cleaned string per input item, in input order. An item
        with no Arabic content yields an empty string.
    """
    tokenizer = TweetTokenizer()
    # Compiled once per call rather than once per token.
    has_word_char = re.compile(r'\w')
    outputs = []
    for tweet in sequence:
        tweet = str(tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks; the match stops at the first whitespace so that
        # text following a link is kept (a greedy `.*` would swallow the rest
        # of the line)
        tweet = re.sub(r'https?://\S*', '', tweet)
        # keep hashtag words, drop only the '#' sign
        tweet = re.sub(r'#', '', tweet)
        # drop colons (e.g. the one trailing "RT @user:")
        tweet = re.sub(r':', '', tweet)
        # remove mentions
        tweet = re.sub(r'@[\w]+', '', tweet)
        # replace punctuation with a space
        tweet = re.sub(r"[,.;@#?!&$_]+\ *", " ", tweet)
        # keep only runs of characters from U+0600-U+06FF
        tweet = ' '.join(re.findall(r'[\u0600-\u06FF]+', tweet))
        # remove tashkeel, then tatweel
        tweet = strip_tashkeel(tweet)
        tweet = strip_tatweel(tweet)
        # tokenize and keep only tokens containing at least one word character
        tweet_clean = [
            word for word in tokenizer.tokenize(tweet)
            if has_word_char.search(word)
        ]
        outputs.append(' '.join(tweet_clean))
    return outputs

In [4]:
# Single source of truth for the pretrained checkpoint: the tokenizer and the
# encoder are loaded from the same identifier so they cannot drift apart.
BERT_CHECKPOINT = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at asafaya/bert-base-arabic.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
def tokenize_tweets(tweets, tokenizer, max_seq_len = 128):
    """Encode every tweet with ``tokenizer.encode`` and collect the results.

    Args:
        tweets: Iterable of tweet strings; progress is reported through tqdm.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)``.
        max_seq_len: Passed as ``max_length``; with ``truncation=True`` longer
            inputs are cut down to this many tokens.

    Returns:
        list: One ``encode`` result per tweet, in input order.
    """
    return [
        tokenizer.encode(
            text,
            add_special_tokens=True,   # adds '[CLS]' and '[SEP]'
            max_length=max_seq_len,    # upper bound on the encoded length
            truncation=True,
        )
        for text in tqdm(tweets)
    ]

def create_attention_masks(tokenized_and_padded_tweets):
    """Build a 0/1 mask marking the positions whose token id is positive.

    Args:
        tokenized_and_padded_tweets: Iterable of token-id sequences.

    Returns:
        numpy.ndarray: Same layout as the input, with 1 where the token id is
        greater than 0 and 0 elsewhere (ids <= 0 are treated as padding).
    """
    mask_rows = [
        [int(token_id > 0) for token_id in row]
        for row in tokenized_and_padded_tweets
    ]
    return np.asarray(mask_rows)

In [6]:
class BertModel(tf.keras.Model):
    """Classifier head on top of a BERT encoder: dropout followed by softmax.

    Args:
        bert: Encoder invoked in ``call``; element ``[1]`` of its output is
            fed to the head.
        units: Number of output classes, i.e. the width of the softmax layer.
        rate: Dropout rate applied to the encoder output before the softmax
            layer.
    """
    def __init__(self,bert,units,rate=0.0):
        super().__init__()
        self.bert=bert
        self.dropout=tf.keras.layers.Dropout(rate)
        # Sized from `units`; this width used to be hard-coded to 4, which
        # silently ignored the constructor argument.
        self.dense=tf.keras.layers.Dense(units,activation='softmax')

    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        """Run the encoder and return class probabilities.

        All arguments are forwarded unchanged to the wrapped encoder.

        Returns:
            The softmax output of the dense layer, one row per input example.
        """
        x=self.bert(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,position_ids=position_ids,head_mask=head_mask)
        # x[1] is presumably the pooled per-sequence vector (x[0] being the
        # (batch, seqlength, emb_dim) token states) - confirm against the
        # encoder's output layout.
        outputs=self.dropout(x[1])
        outputs=self.dense(outputs)
        return outputs

In [7]:
# Wrap the pretrained encoder `model` in the classifier, passing len(labels)
# as `units` and 0.5 as the dropout rate.
DI_model=BertModel(model,len(labels),0.5)

In [8]:
def inference(model,tokenizer,input_text,max_seq_len=128):
    """Predict the dialect label of a single piece of text.

    The text is cleaned with ``preprocess_text``, encoded with
    ``tokenize_tweets``, padded to ``max_seq_len``, and passed through
    ``model`` together with its attention mask.

    Args:
        model: Callable classifier accepting ``input_ids``, ``attention_mask``
            and ``training`` keyword arguments and returning one row of class
            scores per example.
        tokenizer: Tokenizer forwarded to ``tokenize_tweets``.
        input_text: The raw text to classify.
        max_seq_len: Sequence length used for both truncation and padding, so
            the two can never disagree. Defaults to 128.

    Returns:
        str: The entry of ``id2label`` for the highest-scoring class.
    """
    cleaned = preprocess_text([input_text])
    token_ids = tokenize_tweets(cleaned, tokenizer, max_seq_len)
    # NOTE(review): pad_sequences pads (and truncates) at the FRONT by default,
    # so position 0 holds padding rather than the first token. Left as is,
    # because the saved weights were presumably trained with the same layout -
    # confirm against the training pipeline before switching to padding='post'.
    padded = tf.keras.preprocessing.sequence.pad_sequences(token_ids, max_seq_len)
    att_mask = create_attention_masks(padded)
    # Batch of one: take the single row of class scores.
    scores = model(input_ids=padded, attention_mask=att_mask, training=False)[0]
    # Cast the NumPy integer to a plain int before the dictionary lookup.
    return id2label[int(scores.numpy().argmax(axis=-1))]
    
# Restore saved weights into the classifier from a path relative to the
# current working directory; this must happen before predicting.
DI_model.load_weights('models/BertBase asafaya/')
# Classify one sample sentence and print the predicted label.
prediction = inference(DI_model,tokenizer,"انا بحب  كده")
print(prediction)

100%|██████████| 1/1 [00:00<?, ?it/s]DIAL_EGY

