In [41]:
from sklearn import metrics
import pandas as pd
from torch import nn
from tqdm import tqdm
from pyarabic.araby import tokenize
import numpy as np
import pickle
import spacy
import torch

# from model_building import Classifier 
from pre_processing_post import processPost
from feature_extraction import get_ngram_features, get_word_embedding_features, avg_word_vector

In [76]:
def evaluate(model, test_dataset, batch_size=32):
  """
  Evaluate a classifier on a test dataset, print its metrics and return its raw outputs.

  The model is run batch by batch with gradients disabled and in eval mode (its
  previous train/eval mode is restored afterwards). The sklearn classification
  report and the accuracy are printed.

  Inputs:
  - model: torch module whose forward accepts a `sentences` keyword argument and
    returns one row of class scores per example
  - test_dataset: dataset yielding (input, label) pairs, e.g. an ArabicDataset
  - batch_size: number of examples per forward pass

  Returns:
  - numpy array with the model outputs of every example, stacked in dataset order
  """

  # (1) create the test data loader (no shuffling so outputs stay in dataset order)
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  # GPU configuration
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = model.to(device)

  # Evaluation must not use training-only behaviour (dropout, batch-norm updates);
  # remember the current mode so the caller gets the model back unchanged.
  was_training = model.training
  model.eval()

  correct = 0
  y_test = []       # true labels as plain Python numbers
  y_predicted = []  # predicted class indices as plain Python numbers
  y_pred = []       # raw model outputs, one numpy row per example

  try:
    # (2) disable gradients
    with torch.no_grad():
      for test_input, test_label in tqdm(test_dataloader):
        # (3) move the test input to the device
        test_input = test_input.to(device)

        # (4) move the test label to the device
        test_label = test_label.to(device)

        # (5) do the forward pass (through __call__ so module hooks run)
        output = model(sentences=test_input)
        predicted = torch.argmax(output, dim=1)

        # accuracy calculation (count the correctly predicted items)
        correct += torch.sum(torch.eq(predicted, test_label)).item()

        # Bring everything back to the CPU before handing it to numpy/sklearn;
        # CUDA tensors cannot be converted to numpy arrays directly.
        y_test.extend(test_label.view(-1).cpu().tolist())
        y_predicted.extend(predicted.view(-1).cpu().tolist())
        y_pred.extend(output.detach().cpu().numpy())
  finally:
    model.train(was_training)

  # (6) calculate the overall accuracy
  total_acc_test = correct / len(test_dataset)

  report = metrics.classification_report(y_test, y_predicted)
  print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_predicted)))
  print(report)

  print(f'\nTest Accuracy: {total_acc_test}')
  return np.array(y_pred)

In [35]:
class ArabicDataset(torch.utils.data.Dataset):
  """Token-id sentences right-padded to a common length, paired with their labels."""

  def __init__(self, x, y, pad):
    """
    This is the constructor of the ArabicDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens of one sentence
    - y: the labels aligned with x (anything np.array can turn into a numeric array)
    - pad: the id of the <PAD> token (appended to shorter sentences so all have the same length)
    """
    max_length = max((len(sentence) for sentence in x), default=0)

    # Fill a fresh integer matrix instead of overwriting the entries of `x`:
    # the caller's list stays untouched, and an empty sentence cannot turn the
    # whole matrix into floats (np.pad on [] yields a float64 row).
    padded = np.full((len(x), max_length), pad, dtype=np.int64)
    for row, sentence in enumerate(x):
      if len(sentence):
        padded[row, :len(sentence)] = sentence

    self.x = torch.from_numpy(padded)
    self.y = torch.from_numpy(np.array(y))

  def __len__(self):
    """
    This function returns the length of the dataset (the number of sentences)
    """
    return self.x.shape[0]

  def __getitem__(self, idx):
    """
    This function returns the (padded token ids, label) pair(s) selected by idx
    """
    return (self.x[idx], self.y[idx])

In [16]:
# needed functions
def print_report(y_pred, y_test):
    """Print the classification report for the predictions, then the overall accuracy.

    Note the argument order: predictions first, ground-truth labels second.
    """
    print(metrics.classification_report(y_test, y_pred))
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("accuracy: {:0.3f}".format(accuracy))

In [17]:
# Load the AraVec spaCy pipeline from the local model directory.
# `nlp(text).vector` is used further down to look up embedding vectors.
nlp = spacy.load("./spacy.aravec.model/")



In [42]:
# Read the train and dev splits and run every post through processPost.
# The dev split is what this notebook calls the test data.
train_data = (
    pd.read_csv('./DataSet/train.csv', sep=',', header=0)
    .assign(text=lambda frame: frame['text'].apply(processPost))
)
test_data = (
    pd.read_csv('./DataSet/dev.csv', sep=',', header=0)
    .assign(text=lambda frame: frame['text'].apply(processPost))
)

Create the vocabulary

In [43]:
train_data_tokenized = train_data['text'].apply(tokenize)
test_data_tokenized = test_data['text'].apply(tokenize)

# Collect every distinct training token in sorted order. Iterating a raw set of
# strings follows hash order, which Python randomizes per process, so an unsorted
# vocabulary gives each word a different index on every kernel restart and no
# longer lines up with the embedding rows stored in a saved checkpoint.
# NOTE(review): the notebook that trained the saved GRU must build its vocabulary
# the same way (or persist word2index next to the weights) - confirm.
vocab = sorted({token for sentence in train_data_tokenized for token in sentence})
vocab.append('<فراغ>')       # padding token, last index
vocab.insert(0, '<مجهول>')   # unknown-word token, index 0
word2index = {word: i for i, word in enumerate(vocab)}

In [48]:
# Look up one AraVec vector per vocabulary entry; row i belongs to vocab[i].
weights_train_matrix = torch.from_numpy(
    np.array([nlp(word).vector for word in vocab])
)
weights_train_matrix.size()

torch.Size([12538, 100])

In [49]:
def create_emb_layer(weights_train_matrix, non_trainable=False):
    """Build an nn.Embedding whose weights are copied from a pretrained matrix.

    Inputs:
    - weights_train_matrix: 2-D tensor with one embedding row per vocabulary entry
    - non_trainable: when truthy, the embedding weights are frozen

    Returns:
    - (embedding layer, number of rows, embedding width)
    """
    vocab_rows, vector_dim = weights_train_matrix.shape
    layer = nn.Embedding(num_embeddings=vocab_rows, embedding_dim=vector_dim)
    layer.load_state_dict({'weight': weights_train_matrix})

    # Parameters require gradients by default; switch them off only on request.
    layer.weight.requires_grad_(not non_trainable)

    return layer, vocab_rows, vector_dim

class Classifier(nn.Module):
  """GRU sentence classifier on top of frozen pretrained word embeddings."""

  def __init__(self, vocab_size=len(vocab), embedding_dim=100, hidden_size=100, n_classes=3, n_layer=1):
    """
    The constructor of the sentence classifier
    Inputs:
    - vocab_size: the number of unique words. Kept for interface compatibility only:
      the embedding table is built from the module-level weights_train_matrix.
    - embedding_dim: the embedding dimension. Also overridden by the width of
      weights_train_matrix.
    - hidden_size: the size of the GRU hidden state
    - n_classes: the number of output classes
    - n_layer: the number of stacked GRU layers
    """
    # nn.Module must be initialised before any attribute is assigned to it.
    super(Classifier, self).__init__()
    self.hidden_size = hidden_size

    # Frozen embedding layer initialised from the pretrained vectors.
    self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_train_matrix, True)

    self.GRU = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, num_layers=n_layer)

    self.linear = nn.Linear(hidden_size, n_classes)

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of token ids of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of class scores of shape (batch_size, n_classes)
    """
    gru_output, _ = self.GRU(self.embedding(sentences))

    # Keep only the GRU output at the last time step of every sentence.
    # NOTE(review): with right-padded batches this step is a padding position for
    # shorter sentences; changing it would require retraining, so it is left as is.
    last_step = gru_output[:, -1, :]
    return self.linear(last_step)

TF-IDF features

In [108]:
# Load the pickled TF-IDF vectorizer and the Naive Bayes model that consumes its output.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open('./models/TFIDFVectorizer.sav', 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)

with open('./models/NaiveBayes_tfidf.sav', 'rb') as model_file:
    naive_bayes_model = pickle.load(model_file)

# Class probabilities of the first ensemble member, one row per test post.
X_test_tfidf = word_vectorizer.transform(test_data['text'])
y_pred = naive_bayes_model.predict_proba(X_test_tfidf)

Ara2Vec embedding

In [109]:
# Embed every element of every post with AraVec.
# NOTE(review): this iterates test_data["text"], not test_data_tokenized. If
# processPost returns a string, `for i in ls` walks it character by character -
# confirm this matches how the classifier's training features were built.
test_data_embeddings = np.array([np.array([nlp(i).vector for i in ls]) for ls in test_data["text"]], dtype=object)
# Only the second return value (computed from the test embeddings) is used here.
_, X_test_vect_avg = avg_word_vector([], test_data_embeddings)

# Load the classifier trained on averaged AraVec vectors. The file is the
# RandomForest model; the variable keeps its historical name `svc_model`.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open('./models/RandomForest_Ara2Vec.sav', 'rb') as f:
    svc_model = pickle.load(f)

# Sum the class probabilities of both classifiers. The sum is rebuilt from its two
# parts rather than accumulated with `y_pred += ...`, so re-running this cell does
# not add the second model's probabilities twice.
y_pred = naive_bayes_model.predict_proba(X_test_tfidf) + svc_model.predict_proba(X_test_vect_avg)

In [113]:
# Load the GRU model: rebuild the architecture, then restore the saved weights.
# map_location keeps the load working on CPU-only machines even when the checkpoint
# was written from a GPU; evaluate() moves the model to the GPU when one is available.
gru_model = Classifier()
with open('./models/GRU_Ara2Vec.sav', 'rb') as f:
    gru_model.load_state_dict(torch.load(f, map_location=torch.device('cpu')))
gru_model.eval()

# Map tokens to vocabulary ids; tokens missing from the training vocabulary are dropped.
# NOTE(review): they are not mapped to the '<مجهول>' id - confirm this matches training.
test_data_tokenized_as_num = test_data_tokenized.apply(lambda x: [word2index[word] for word in x if word in word2index])
# Labels are shifted by +1 so they can be compared with the model's argmax class index.
test_dataset = ArabicDataset(list(test_data_tokenized_as_num), test_data['stance'] + 1, word2index['<فراغ>'])
y_pred_ = evaluate(gru_model,test_dataset)

100%|██████████| 32/32 [00:00<00:00, 33.60it/s]


accuracy: 0.199
              precision    recall  f1-score   support

           0       0.08      0.37      0.13        70
           1       0.13      0.58      0.22       126
           2       0.87      0.12      0.22       804

    accuracy                           0.20      1000
   macro avg       0.36      0.36      0.19      1000
weighted avg       0.72      0.20      0.21      1000


Test Accuracy: 0.19900000095367432


UFuncTypeError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'

In [112]:
# Average the summed probabilities of the two classifiers and pick the most likely
# class. New names are used instead of overwriting `y_pred`, so this cell and the
# ones that build `y_pred` can be re-run in any order without `y_pred` having
# silently turned into an integer label vector.
ensemble_proba = y_pred / 2
# Column index 0..2 -> stance label -1..1
# (assumes the models' classes are ordered -1, 0, 1 - TODO confirm).
ensemble_labels = np.argmax(ensemble_proba, axis=1) - 1
print_report(ensemble_labels, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.43      0.04      0.08        70
           0       0.38      0.13      0.20       126
           1       0.82      0.97      0.89       804

    accuracy                           0.80      1000
   macro avg       0.54      0.38      0.39      1000
weighted avg       0.74      0.80      0.75      1000

accuracy: 0.799
