In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/nlp_project

/content/drive/MyDrive/nlp_project


In [3]:
!pip install arabic-stopwords
!pip install qalsadi
!pip install pyarabic


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import numpy as np
import pandas as pd
import spacy
import string
import qalsadi.lemmatizer
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import torch
from torch import nn
from tqdm import tqdm

from pre_processing_post import processPost
from extract_features import get_unigram_features, get_word_embedding_features



In [5]:
# needed functions
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

# Read train data

In [6]:
train_data = pd.read_csv('./DataSet/train.csv',sep=',',header=0)
test_data = pd.read_csv('./DataSet/dev.csv',sep=',',header=0)
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [7]:
#remove first row that has the header
train_data['category'] = train_data['category'].astype('category').cat.codes
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,1,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,2,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,2,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,1,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,4,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",2,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,2,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,4,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,9,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,2,1


##Over Sampling

In [8]:
# !pip install imblearn

In [9]:
# from collections import Counter
# from imblearn.over_sampling import RandomOverSampler
# train_data=train_data.drop('category',axis=1)
# y=train_data['stance']
# print(Counter(train_data['stance']))
# train_data=train_data.drop('stance',axis=1)
# # define oversampling strategy
# oversample = RandomOverSampler(random_state=3)
# # fit and apply the transform
# train_data["text"], train_data['stance'] = oversample.fit_resample(train_data, y)
# print(Counter(train_data['stance']))

# Pre-Processing the tweets

In [10]:
print(train_data["text"][9])
train_data["text"] = train_data['text'].apply(lambda x: processPost(x))
test_data['text'] = test_data['text'].apply(lambda x: processPost(x))
print(train_data["text"][9])

دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية لقاح لعد ما اتابع الاخبار هم بكل مجالاتهم متفوقين وراح يطلع اللقاح قريباً؟<LF>#دعبول_دومه_مسحول
دعبول حضر من نت طلب قائد دول إسلام قاح عد تابع اخبار مجال متفوق طلع قاح قريبا دعبول دوم مسحول


## Ara2Vec Embedding

In [11]:
# load AraVec Spacy model
nlp = spacy.load("./spacy.aravec.model/")

In [12]:
# Define the preprocessing Class
class Preprocessor:
    def __init__(self, tokenizer, **cfg):
        self.tokenizer = tokenizer

    def __call__(self, text):
        # preprocessed = processPost(text)
        return self.tokenizer(text)

In [13]:
# Apply the `Preprocessor` Class
nlp.tokenizer = Preprocessor(nlp.tokenizer)

## create vocablary

In [14]:
train_data_tokenized = train_data['text'].apply(tokenize)
test_data_tokenized = test_data['text'].apply(tokenize)
#merge all the sentences in one list
vocab = [item for sublist in train_data_tokenized for item in sublist]
vocab.append('<فراغ>')
vocab.append('<مجهول>')
vocab = set(vocab)
word2index = {word: i for i, word in enumerate(vocab)}

# Feature Extraction

## 1. TD-IDF

In [15]:
unigramdata_features, word_vectorizer = get_unigram_features(train_data)
unigramdata_features.head()

Unnamed: 0,Unnamed: 1,أبيض,أتي,أجيل,أحد,أخبار,أخذ,أخر,أخير,أدو,...,يزر,يش,يف,يل,يمن,ين,يه,يوم,يون,يونت
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


LinearSVC Classifier

In [16]:
clf = LinearSVC()
pipe_tfidf = make_pipeline(word_vectorizer, clf)
pipe_tfidf.fit(train_data['text'], train_data['stance'])
print_report(pipe_tfidf, test_data['text'], test_data['stance'])

              precision    recall  f1-score   support

          -1       0.58      0.16      0.25        70
           0       0.49      0.33      0.39       126
           1       0.85      0.95      0.90       804

    accuracy                           0.82      1000
   macro avg       0.64      0.48      0.51      1000
weighted avg       0.79      0.82      0.79      1000

accuracy: 0.818


RandomForestClassifier

In [17]:
X_train_tfidf = word_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = word_vectorizer.transform(test_data['text'])
rf = RandomForestClassifier()
rf_tfidf = rf.fit(X_train_tfidf, train_data['stance'])
y_pred = rf_tfidf.predict(X_test_tfidf)

print_report(rf_tfidf, X_test_tfidf, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.50      0.13      0.20        70
           0       0.55      0.25      0.35       126
           1       0.84      0.97      0.90       804

    accuracy                           0.82      1000
   macro avg       0.63      0.45      0.48      1000
weighted avg       0.78      0.82      0.78      1000

accuracy: 0.819


## 2.CBOW

In [18]:
train_data_embeddings = np.array([np.array([nlp(i).vector for i in ls if i in vocab]) for ls in train_data["text"]])
test_data_embeddings = np.array([np.array([nlp(i).vector for i in ls if i in vocab]) for ls in test_data["text"]])

  train_data_embeddings = np.array([np.array([nlp(i).vector for i in ls if i in vocab]) for ls in train_data["text"]])
  test_data_embeddings = np.array([np.array([nlp(i).vector for i in ls if i in vocab]) for ls in test_data["text"]])


In [19]:
X_train_vect_avg, X_test_vect_avg = get_word_embedding_features(train_data_embeddings, test_data_embeddings)

LinearSVC Classifier

In [37]:
clf = LinearSVC(class_weight={-1:0.66, 0:0.29, 1:0.05})
clf.fit(X_train_vect_avg, train_data['stance'])
print_report(clf, X_test_vect_avg, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.19      0.13      0.15        70
           0       0.26      0.16      0.20       126
           1       0.83      0.91      0.87       804

    accuracy                           0.76      1000
   macro avg       0.43      0.40      0.41      1000
weighted avg       0.71      0.76      0.73      1000

accuracy: 0.757




RandomForestClassifier

In [40]:
rf = RandomForestClassifier(class_weight="balanced")
rf_vect = rf.fit(X_train_vect_avg, train_data['stance'].values.ravel())
print_report(rf_vect, X_test_vect_avg, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.40      0.06      0.10        70
           0       0.40      0.06      0.11       126
           1       0.82      0.99      0.89       804

    accuracy                           0.80      1000
   macro avg       0.54      0.37      0.37      1000
weighted avg       0.73      0.80      0.74      1000

accuracy: 0.804


In [22]:
weights_train_matrix = []
for word in vocab:
  weights_train_matrix.append(nlp(word).vector)

weights_train_matrix = torch.from_numpy(np.array(weights_train_matrix))
weights_train_matrix.size()

torch.Size([12538, 100])

# NERDataset
The class that impelements the dataset for NER

In [23]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    list_len = [len(i) for i in x]
    MAX_LENGTH = max(list_len) 
    for i in range(len(x)):
      x[i] = np.pad(x[i], (0, MAX_LENGTH-len(x[i])), 'constant', constant_values=(pad))

    self.x = torch.from_numpy(np.array(x)) 
    self.y = torch.from_numpy(np.array(y))
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return self.x.shape[0]
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return (self.x[idx], self.y[idx])
    ##########################################################################################

In [24]:
def create_emb_layer(weights_train_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_train_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_train_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class NER(nn.Module):
  def __init__(self, vocab_size=len(vocab), embedding_dim=100, hidden_size=100, n_classes=3, n_layer=1):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    self.hidden_size = hidden_size
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_train_matrix, True)
    self.hidden_size = hidden_size

    # (2) Create an GRU layer with hidden size = hidden_size and batch_first = True
    self.GRU = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, num_layers=n_layer)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    final_output, _ = self.GRU(self.embedding(sentences))
    final_output = self.linear(final_output)
    final_output = final_output[:, -1, :]
    ###############################################################################################
    return final_output

In [25]:
model = NER()
print(model)

NER(
  (embedding): Embedding(12538, 100)
  (GRU): GRU(100, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=3, bias=True)
)


# Training

In [29]:
def train(model, train_dataset, batch_size=32, epochs=10, learning_rate=0.001):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss(weight=torch.tensor([.5, .4, .1])) 

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    if epoch_num % 5 == 0:
      learning_rate /= 10
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_label = train_label.to(device)

      # (5) move the train label to the device
      train_input = train_input.to(device)

      # (6) do the forward pass
      output = model.forward(sentences=train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      batch_loss = criterion(output, train_label) 

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      argmax = torch.argmax(output, dim=1)
      acc = torch.sum(torch.eq(argmax, train_label))
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()

    num_of_batches = len(train_dataset) / batch_size
    num_of_batches = int(num_of_batches) + 1
    # epoch loss
    epoch_loss = total_loss_train / num_of_batches
    
    # (13) calculate the accuracy
    epoch_acc = total_acc_train / len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [27]:
train_data_tokenized_as_num = train_data_tokenized.apply(lambda x: [word2index[word] for word in x])
# apply the same tokenization to the test set
test_data_tokenized_as_num = test_data_tokenized.apply(lambda x: [word2index[word] for word in x if word in word2index])
train_dataset = NERDataset(list(train_data_tokenized_as_num), train_data['stance'] + 1, word2index['<فراغ>'])
test_dataset = NERDataset(list(test_data_tokenized_as_num), test_data['stance'] + 1, word2index['<فراغ>'])

In [30]:
train(model, train_dataset, epochs=10, learning_rate=0.001)

100%|██████████| 219/219 [00:29<00:00,  7.46it/s]


Epochs: 1 | Train Loss: 0.6746148605869241         | Train Accuracy: 0.7840583920478821



100%|██████████| 219/219 [00:18<00:00, 11.53it/s]


Epochs: 2 | Train Loss: 0.5885888102664251         | Train Accuracy: 0.8106754422187805



100%|██████████| 219/219 [00:19<00:00, 11.37it/s]


Epochs: 3 | Train Loss: 0.5125545530014386         | Train Accuracy: 0.8302804827690125



100%|██████████| 219/219 [00:19<00:00, 11.51it/s]


Epochs: 4 | Train Loss: 0.4384431602203683         | Train Accuracy: 0.8488838076591492



100%|██████████| 219/219 [00:20<00:00, 10.94it/s]


Epochs: 5 | Train Loss: 0.3498884167981474         | Train Accuracy: 0.8739267587661743



100%|██████████| 219/219 [00:19<00:00, 11.31it/s]


Epochs: 6 | Train Loss: 0.30206451340489193         | Train Accuracy: 0.886519730091095



100%|██████████| 219/219 [00:19<00:00, 11.34it/s]


Epochs: 7 | Train Loss: 0.26321067385477565         | Train Accuracy: 0.9034058451652527



100%|██████████| 219/219 [00:19<00:00, 11.05it/s]


Epochs: 8 | Train Loss: 0.21881666896890287         | Train Accuracy: 0.9184315800666809



100%|██████████| 219/219 [00:23<00:00,  9.39it/s]


Epochs: 9 | Train Loss: 0.20933188229254937         | Train Accuracy: 0.9157126545906067



100%|██████████| 219/219 [00:19<00:00, 11.29it/s]

Epochs: 10 | Train Loss: 0.1831825128905305         | Train Accuracy: 0.9327418208122253






# Evaluation

In [31]:
def evaluate(model, test_dataset, batch_size=32):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0.0
  
  y_test = [] 
  y_predected = [] 
  # (2) disable gradients
  with torch.no_grad():
    report = None
    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model.forward(sentences=test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = torch.sum(torch.eq(torch.argmax(output, dim=1), test_label))
      total_acc_test += acc
      
      # f1 score calculation
      y_test +=(list(test_label.view(-1)))
      y_predected +=(list(torch.argmax(output, dim=1).view(-1)))

    # (6) calculate the over all accuracy
    total_acc_test /= len(test_dataset)
  ##################################################################################################
  report = metrics.classification_report(y_test, y_predected)
  print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_predected)))
  print(report)
  
  print(f'\nTest Accuracy: {total_acc_test}')

In [32]:
evaluate(model, test_dataset)

100%|██████████| 32/32 [00:00<00:00, 83.48it/s]


accuracy: 0.784
              precision    recall  f1-score   support

           0       0.30      0.20      0.24        70
           1       0.37      0.48      0.42       126
           2       0.90      0.88      0.89       804

    accuracy                           0.78      1000
   macro avg       0.52      0.52      0.52      1000
weighted avg       0.79      0.78      0.79      1000


Test Accuracy: 0.7839999794960022
