In [2]:
# !pip install arabic-stopwords
import numpy as np
import pandas as pd
import string
import qalsadi.lemmatizer
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import torch
from torch import nn
from tqdm import tqdm

from pre_processing_post import processPost
from extract_features import get_unigram_features, get_word_embedding_features

In [3]:
# needed functions
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

# Read train data

In [4]:
train_data = pd.read_csv('./DataSet/train.csv',sep=',',header=0)
test_data = pd.read_csv('./DataSet/dev.csv',sep=',',header=0)
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [5]:
#remove first row that has the header
train_data['category'] = train_data['category'].astype('category').cat.codes
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,1,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,2,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,2,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,1,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,4,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",2,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,2,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,4,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,9,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,2,1


In [19]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
train_data=train_data.drop('category',axis=1)
y=train_data['stance']
print(Counter(train_data['stance']))
train_data=train_data.drop('stance',axis=1)
# define oversampling strategy
oversample = RandomOverSampler(random_state=3)
# fit and apply the transform
train_data["text"], train_data['stance'] = oversample.fit_resample(train_data, y)
print(Counter(train_data['stance']))

ModuleNotFoundError: No module named 'imblearn'

In [None]:
!pip install imblearn

# Pre-Processing the tweets

In [6]:
print(train_data["text"][9])
train_data["text"] = train_data['text'].apply(lambda x: processPost(x))
test_data['text'] = test_data['text'].apply(lambda x: processPost(x))
print(train_data["text"][9])

دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية لقاح لعد ما اتابع الاخبار هم بكل مجالاتهم متفوقين وراح يطلع اللقاح قريباً؟<LF>#دعبول_دومه_مسحول
دعبول حضر من نت طلب قائد دول إسلام قاح عد تابع اخبار مجال متفوق طلع قاح قريبا دعبول دوم مسحول


# Feature Extraction

## 1. TD-IDF

In [7]:
unigramdata_features, word_vectorizer = get_unigram_features(train_data)
unigramdata_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


LinearSVC Classifier

In [8]:
clf = LinearSVC()
pipe_tfidf = make_pipeline(word_vectorizer, clf)
pipe_tfidf.fit(train_data['text'], train_data['stance'])
print_report(pipe_tfidf, test_data['text'], test_data['stance'])

              precision    recall  f1-score   support

          -1       0.58      0.16      0.25        70
           0       0.49      0.33      0.39       126
           1       0.85      0.95      0.90       804

    accuracy                           0.82      1000
   macro avg       0.64      0.48      0.51      1000
weighted avg       0.79      0.82      0.79      1000

accuracy: 0.818


RandomForestClassifier

In [9]:
X_train_tfidf = word_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = word_vectorizer.transform(test_data['text'])
rf = RandomForestClassifier()
rf_tfidf = rf.fit(X_train_tfidf, train_data['stance'])
y_pred = rf_tfidf.predict(X_test_tfidf)

print_report(rf_tfidf, X_test_tfidf, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.50      0.11      0.19        70
           0       0.55      0.21      0.31       126
           1       0.84      0.97      0.90       804

    accuracy                           0.82      1000
   macro avg       0.63      0.43      0.46      1000
weighted avg       0.78      0.82      0.77      1000

accuracy: 0.816


## 2.CBOW

In [10]:
X_train_vect_avg, X_test_vect_avg = get_word_embedding_features(train_data, test_data)

  for ls in train_data["text"]])
  for ls in test_data['text']])


LinearSVC Classifier

In [11]:
clf = LinearSVC()
clf.fit(X_train_vect_avg, train_data['stance'])
print_report(clf, X_test_vect_avg, test_data['stance'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        70
           0       0.00      0.00      0.00       126
           1       0.80      1.00      0.89       804

    accuracy                           0.80      1000
   macro avg       0.27      0.33      0.30      1000
weighted avg       0.65      0.80      0.72      1000

accuracy: 0.804


RandomForestClassifier

In [12]:
rf = RandomForestClassifier()
rf_vect = rf.fit(X_train_vect_avg, train_data['stance'].values.ravel())
print_report(rf_vect, X_test_vect_avg, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.75      0.04      0.08        70
           0       0.45      0.11      0.18       126
           1       0.82      0.98      0.89       804

    accuracy                           0.81      1000
   macro avg       0.67      0.38      0.38      1000
weighted avg       0.77      0.81      0.74      1000

accuracy: 0.805


creat vocabulary

In [13]:
train_data_tokenized = train_data['text'].apply(tokenize)
test_data_tokenized = test_data['text'].apply(tokenize)
#merge all the sentences in one list
vocab = [item for sublist in train_data_tokenized for item in sublist]
vocab.append('<فراغ>')
vocab.append('<غير معروف>')
vocab = set(vocab)
word2index = {word: i for i, word in enumerate(vocab)}
index2word = {i: word for i, word in enumerate(vocab)}

# NERDataset
The class that impelements the dataset for NER

In [14]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    list_len = [len(i) for i in x]
    MAX_LENGTH = max(list_len) 
    for i in range(len(x)):
      x[i] = np.pad(x[i], (0, MAX_LENGTH-len(x[i])), 'constant', constant_values=(pad))

    self.x = torch.from_numpy(np.array(x)) 
    self.y = torch.from_numpy(np.array(y))
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return self.x.shape[0]
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return (self.x[idx], self.y[idx])
    ##########################################################################################

In [15]:
class NER(nn.Module):
  def __init__(self, vocab_size=len(vocab), embedding_dim=25, hidden_size=25, n_classes=3):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    self.hidden_size = hidden_size
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    final_output, _ = self.lstm(self.embedding(sentences))
    final_output = self.linear(final_output)
    final_output = final_output[:, -1, :]
    ###############################################################################################
    return final_output

TypeError: object of type 'NoneType' has no len()

In [16]:
model = NER()
print(model)

NER(
  (embedding): Embedding(12538, 25)
  (lstm): LSTM(25, 25, batch_first=True)
  (linear): Linear(in_features=25, out_features=3, bias=True)
)


# Training

In [17]:
def train(model, train_dataset, batch_size=32, epochs=10, learning_rate=0.005):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_label = train_label.to(device)

      # (5) move the train label to the device
      train_input = train_input.to(device)

      # (6) do the forward pass
      output = model.forward(sentences=train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      batch_loss = criterion(output, train_label) 

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      argmax = torch.argmax(output, dim=1)
      acc = torch.sum(torch.eq(argmax, train_label))
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()

    num_of_batches = len(train_dataset) / batch_size
    num_of_batches = int(num_of_batches) + 1
    # epoch loss
    epoch_loss = total_loss_train / num_of_batches
    
    # (13) calculate the accuracy
    epoch_acc = total_acc_train / len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [19]:
train_data_tokenized_as_num = train_data_tokenized.apply(lambda x: [word2index[word] for word in x])
# apply the same tokenization to the test set
test_data_tokenized_as_num = test_data_tokenized.apply(lambda x: [word2index[word] for word in x if word in word2index])
train_dataset = NERDataset(list(train_data_tokenized_as_num), train_data['stance'] + 1, word2index['<فراغ>'])
test_dataset = NERDataset(list(test_data_tokenized_as_num), test_data['stance'] + 1, word2index['<فراغ>'])

In [20]:
train(model, train_dataset)

100%|██████████| 219/219 [00:11<00:00, 18.81it/s]


Epochs: 1 | Train Loss: 0.6477955714480518         | Train Accuracy: 0.7896393537521362



100%|██████████| 219/219 [00:10<00:00, 20.10it/s]


Epochs: 2 | Train Loss: 0.6395099582193104         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:12<00:00, 18.10it/s]


Epochs: 3 | Train Loss: 0.6409502015810579         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 18.91it/s]


Epochs: 4 | Train Loss: 0.6394016746788809         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 18.89it/s]


Epochs: 5 | Train Loss: 0.6405949427929098         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 19.08it/s]


Epochs: 6 | Train Loss: 0.6402012162829098         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 18.71it/s]


Epochs: 7 | Train Loss: 0.6395751131724005         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 18.99it/s]


Epochs: 8 | Train Loss: 0.6397092882628854         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 18.83it/s]


Epochs: 9 | Train Loss: 0.6401758711768067         | Train Accuracy: 0.7925014495849609



100%|██████████| 219/219 [00:11<00:00, 19.12it/s]

Epochs: 10 | Train Loss: 0.6390413546671062         | Train Accuracy: 0.7925014495849609






# Evaluation

In [28]:
def evaluate(model, test_dataset, batch_size=32):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0.0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model.forward(sentences=test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = torch.sum(torch.eq(torch.argmax(output, dim=1), test_label))
      total_acc_test += acc

    # (6) calculate the over all accuracy
    print(total_acc_test)
    total_acc_test /= len(test_dataset)
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [29]:
evaluate(model, test_dataset)

100%|██████████| 32/32 [00:00<00:00, 112.08it/s]

tensor(804.)

Test Accuracy: 0.8040000200271606



