In [3]:
import os
import glob
import string
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer
from pyarabic.araby import strip_tashkeel,strip_tatweel,normalize_ligature
from transformers import AutoTokenizer, TFAutoModel
from farasa.segmenter import FarasaSegmenter
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [4]:
train_dataset=pd.read_csv('MultiTrain.Shuffled.csv')
test_dataset=pd.read_csv('MultiTest.csv')
dev_dataset=pd.read_csv('MultiDev.csv')

In [5]:
train_dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,MSA,بالإضافة لقيام معلمو الجيزة للذهاب إلي جريدة ا...
1,1,MSA,بعدين والله حرام تجي تلقى الي واقف عند الاشاره...
2,2,DIAL_LEV,لمسه اليد مرتين واضحة جدا والحكم
3,3,DIAL_LEV,بخصوص الهاتريك عمرها ما راح تصير
4,4,DIAL_GLF,الله يجبر كسرهم ويرجع و لدهم اليوم قبل بكرى ،


In [6]:
train_data=train_dataset.iloc[:,1:].values
test_data=test_dataset.iloc[:,1:].values
dev_data=dev_dataset.iloc[:,1:].values

In [7]:
labels=list(np.unique(train_data[:,0]))
label2id={label:idx for idx,label in enumerate(labels)}
id2label={idx:label for label,idx in label2id.items()}

In [8]:
def preprocess_text(sequence):
    outputs=[]
    tokenizer = TweetTokenizer()
    for tweet in sequence:
        tweet = str(tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
        # remove hashtags
        # only removing the hash # sign from the word
        tweet = re.sub(r'#', '', tweet)
        #removing mentions
        tweet = re.sub(r':', '', tweet)
        tweet = re.sub(r'@[\w]+','',tweet)
        #replace punctuations with space
        tweet = re.sub(r"[,.;@#?!&$_]+\ *", " ", tweet)
        #find arabic letters only
        tweet = ' '.join(re.findall(r'[\u0600-\u06FF]+',tweet))
        #remove tashkeel
        tweet = strip_tashkeel(tweet)
        #remove tatweel
        tweet = strip_tatweel(tweet)
        #apply normalization
        #tweet = normalize_ligature(tweet)
        #tokenize tweets
        tweet_tokens = tokenizer.tokenize(tweet)
        #tweet_tokens = tweet.split(' ')
        tweet_clean=[]
        for word in tweet_tokens: # Go through every word in your tokens list
            #if word not in string.punctuation:  # remove punctuation
            #    tweet_clean.append(word)
            word_reg = re.compile(r'\w')
            if word_reg.search(word):
                tweet_clean.append(word)
        outputs.append((' '.join(tweet_clean)))
    return outputs

In [9]:
train_data[:,1]=preprocess_text(train_data[:,1])
test_data[:,1]=preprocess_text(test_data[:,1])
dev_data[:,1]=preprocess_text(dev_data[:,1])

In [10]:
train_data_labels=[label2id[label] for label in train_data[:,0]]
test_data_labels=[label2id[label] for label in test_data[:,0]]
dev_data_labels=[label2id[label] for label in dev_data[:,0]]

In [11]:
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
model = TFAutoModel.from_pretrained("asafaya/bert-base-arabic")

Some weights of the model checkpoint at asafaya/bert-base-arabic were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at asafaya/bert-base-arabic.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [12]:
def tokenize_tweets(tweets, tokenizer, max_seq_len = 128):
    tokenized_tweets = []
    oov_tokens=set()
    for tweet in tqdm(tweets):
        tokenized_tweet = tokenizer.encode(
                            tweet,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                            truncation=True
                    )
        #oov_tokens.update([w for w in tweet.split(' ') if w not in tokenizer.get_vocab()])
        tokenized_tweets.append(tokenized_tweet)
    #print('num of oov',len(oov_tokens))
    return tokenized_tweets

def create_attention_masks(tokenized_and_padded_tweets):
    attention_masks = []
    for tweet in tokenized_and_padded_tweets:
        att_mask = [int(token_id > 0) for token_id in tweet]
        attention_masks.append(att_mask)
    return np.asarray(attention_masks)

In [13]:
#tokenize tweets
train_tokenized_tweets=tokenize_tweets(train_data[:,1],tokenizer)
test_tokenized_tweets=tokenize_tweets(test_data[:,1],tokenizer)
dev_tokenized_tweets=tokenize_tweets(dev_data[:,1],tokenizer)

100%|██████████| 86541/86541 [00:20<00:00, 4138.99it/s]
100%|██████████| 10812/10812 [00:02<00:00, 3815.59it/s]
100%|██████████| 10820/10820 [00:02<00:00, 3962.51it/s]


In [14]:
#pad tweets
train_tokenized_padded_tweets=tf.keras.preprocessing.sequence.pad_sequences(train_tokenized_tweets,128)
test_tokenized_padded_tweets=tf.keras.preprocessing.sequence.pad_sequences(test_tokenized_tweets,128)
dev_tokenized_padded_tweets=tf.keras.preprocessing.sequence.pad_sequences(dev_tokenized_tweets,128)
#attention_mask
train_att_mask=create_attention_masks(train_tokenized_padded_tweets)
test_att_mask=create_attention_masks(test_tokenized_padded_tweets)
dev_att_mask=create_attention_masks(dev_tokenized_padded_tweets)

In [15]:
BATCH_SIZE=16
train_data = tf.data.Dataset.from_tensor_slices((train_tokenized_padded_tweets,train_att_mask,train_data_labels)).batch(BATCH_SIZE, drop_remainder=True).prefetch(1)
validate_data = tf.data.Dataset.from_tensor_slices((dev_tokenized_padded_tweets,dev_att_mask,dev_data_labels)).batch(BATCH_SIZE, drop_remainder=True).prefetch(1)
test_data = tf.data.Dataset.from_tensor_slices((test_tokenized_padded_tweets,test_att_mask,test_data_labels)).batch(BATCH_SIZE).prefetch(1)

In [16]:
class BertModel(tf.keras.Model):
    def __init__(self,bert,units,rate=0.0):
        super(BertModel,self).__init__()
        self.bert=bert
        self.dropout=tf.keras.layers.Dropout(rate)
        self.dense=tf.keras.layers.Dense(4,activation='softmax')

    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        x=self.bert(input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,position_ids=position_ids,head_mask=head_mask)#(batch,seqlength,emb_dim)
        outputs=self.dropout(x[1])
        outputs=self.dense(outputs)
        return outputs

In [17]:
DI_model=BertModel(model,len(labels),0.8)

In [18]:
#Loss Function and Accuracy measures
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy() 
optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001)

In [19]:
@tf.function()
def train_step(tweet,att_mask,label):
    with tf.GradientTape() as g:
        outputs=DI_model(tweet,att_mask,training=True)  
        loss=loss_object(label,outputs)
    train_loss.update_state(loss)
    train_accuracy.update_state(label,outputs)
    gradients=g.gradient(loss,DI_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients,DI_model.trainable_variables))
    return loss

In [20]:
@tf.function()
def val_step(tweet,att_mask,label):
    outputs=DI_model(tweet,att_mask,training=False)
    loss=loss_object(label,outputs)
    val_loss.update_state(loss)
    val_accuracy.update_state(label,outputs)
    return loss

In [21]:
@tf.function()
def test_step(saved_model,tweet,att_mask,label):
    outputs=saved_model(input_ids=tweet,attention_mask=att_mask,training=False)
    loss=loss_object(label,outputs)
    test_loss.update_state(loss)
    test_accuracy.update_state(label,outputs)
    return loss

In [22]:
def pred_step(saved_model,tweet,att_mask):
    outputs=saved_model(input_ids=tweet,attention_mask=att_mask,training=False)
    return outputs

In [23]:
EPOCHS=100
min_val_loss=99999
patience=1
for epoch in range(EPOCHS):
    if patience > 0 :
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        for batch_idx,(tweet,att_masks,outputs) in enumerate(train_data):
            loss=train_step(tweet,att_masks,outputs)
            if (batch_idx+1)%1000==0:
                print('Epoch {} Batch {} Loss {}'.format(epoch+1,batch_idx+1,loss.numpy()))

        for tweet,att_masks,outputs in validate_data:
            val_step(tweet,att_masks,outputs)

        template='Epoch {}, Train Loss {:.5}, Train Accuracy {:.5}, Val Loss {:.5}, Val Accuracy {:.5}'
        print(template.format(epoch+1,train_loss.result(),train_accuracy.result(),val_loss.result(),val_accuracy.result()))

        if val_loss.result() < min_val_loss:
            DI_model.save_weights('models/BertBase asafaya/')
            patience=1
            print('new model has been saved')
            min_val_loss=val_loss.result()
        else:
            patience-=1
    else:
        break

# Test Results

DR = 0.0
**test_acc = 0.8424**
**test_precision = 0.8198**
**test_recall = 0.7611**
**test_f1 = 0.7872**

DR = 0.3
**test_acc = 0.8391**
**test_precision = 0.8166**
**test_recall = 0.7679**
**test_f1 = 0.7882**

DR = 0.5
**test_acc = 0.8429**
**test_precision = 0.8204**
**test_recall = 0.7638**
**test_f1 = 0.7889**

DR = 0.8
**test_acc = 0.8396**
**test_precision = 0.8173**
**test_recall = 0.7584**
**test_f1 = 0.7844**


In [24]:
#testing
from sklearn.metrics import classification_report
DI_model.load_weights('models/BertBase asafaya/')
predictions=None
for index,(inputs,att_mask,labels) in enumerate(test_data):
    pred = pred_step(DI_model,inputs,att_mask).numpy()
    if index==0:
        predictions=pred
    else:
        predictions = np.concatenate((predictions,pred),axis=0)
print(classification_report(test_data_labels, predictions.argmax(axis=-1),target_names=list(label2id.keys()),digits=4))