In [14]:
# !pip install arabic-stopwords
import numpy as np
import pandas as pd
import string
import qalsadi.lemmatizer
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import torch
from torch import nn

from pre_processing_post import processPost
from extract_features import get_unigram_features, get_word_embedding_features

In [15]:
# needed functions
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

# Read train data

In [16]:
train_data = pd.read_csv('./DataSet/train.csv',sep=',',header=0)
test_data = pd.read_csv('./DataSet/dev.csv',sep=',',header=0)
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [17]:
#remove first row that has the header
train_data['category'] = train_data['category'].astype('category').cat.codes
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,1,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,2,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,2,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,1,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,4,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",2,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,2,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,4,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,9,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,2,1


In [19]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
train_data=train_data.drop('category',axis=1)
y=train_data['stance']
print(Counter(train_data['stance']))
train_data=train_data.drop('stance',axis=1)
# define oversampling strategy
oversample = RandomOverSampler(random_state=3)
# fit and apply the transform
train_data["text"], train_data['stance'] = oversample.fit_resample(train_data, y)
print(Counter(train_data['stance']))

ModuleNotFoundError: No module named 'imblearn'

In [None]:
!pip install imblearn

# Pre-Processing the tweets

In [5]:
print(train_data["text"][9])
train_data["text"] = train_data['text'].apply(lambda x: processPost(x))
test_data['text'] = test_data['text'].apply(lambda x: processPost(x))
print(train_data["text"][9])

دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية لقاح لعد ما اتابع الاخبار هم بكل مجالاتهم متفوقين وراح يطلع اللقاح قريباً؟<LF>#دعبول_دومه_مسحول
دعبول حضر من نت طلب قائد دول إسلام قاح عد تابع اخبار مجال متفوق طلع قاح قريبا دعبول دوم مسحول


# Feature Extraction

## 1. TD-IDF

In [6]:
unigramdata_features, word_vectorizer, vocab = get_unigram_features(train_data)
unigramdata_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


LinearSVC Classifier

In [7]:
clf = LinearSVC()
pipe_tfidf = make_pipeline(word_vectorizer, clf)
pipe_tfidf.fit(train_data['text'], train_data['stance'])
print_report(pipe_tfidf, test_data['text'], test_data['stance'])

              precision    recall  f1-score   support

          -1       0.58      0.16      0.25        70
           0       0.49      0.33      0.39       126
           1       0.85      0.95      0.90       804

    accuracy                           0.82      1000
   macro avg       0.64      0.48      0.51      1000
weighted avg       0.79      0.82      0.79      1000

accuracy: 0.818


RandomForestClassifier

In [8]:
X_train_tfidf = word_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = word_vectorizer.transform(test_data['text'])
rf = RandomForestClassifier()
rf_tfidf = rf.fit(X_train_tfidf, train_data['stance'])
y_pred = rf_tfidf.predict(X_test_tfidf)

print_report(rf_tfidf, X_test_tfidf, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.50      0.16      0.24        70
           0       0.57      0.25      0.34       126
           1       0.84      0.97      0.90       804

    accuracy                           0.82      1000
   macro avg       0.64      0.46      0.50      1000
weighted avg       0.79      0.82      0.79      1000

accuracy: 0.821


## 2.CBOW

In [9]:
X_train_vect_avg, X_test_vect_avg = get_word_embedding_features(train_data, test_data)

  for ls in train_data["text"]])
  for ls in test_data['text']])


LinearSVC Classifier

In [10]:
clf = LinearSVC()
clf.fit(X_train_vect_avg, train_data['stance'])
print_report(clf, X_test_vect_avg, test_data['stance'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        70
           0       0.00      0.00      0.00       126
           1       0.80      1.00      0.89       804

    accuracy                           0.80      1000
   macro avg       0.27      0.33      0.30      1000
weighted avg       0.65      0.80      0.72      1000

accuracy: 0.804


RandomForestClassifier

In [11]:
rf = RandomForestClassifier()
rf_vect = rf.fit(X_train_vect_avg, train_data['stance'].values.ravel())
print_report(rf_vect, X_test_vect_avg, test_data['stance'])

              precision    recall  f1-score   support

          -1       0.75      0.04      0.08        70
           0       0.42      0.10      0.17       126
           1       0.82      0.98      0.89       804

    accuracy                           0.80      1000
   macro avg       0.66      0.37      0.38      1000
weighted avg       0.76      0.80      0.74      1000

accuracy: 0.803


# NERDataset
The class that impelements the dataset for NER

In [12]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    list_len = [len(i) for i in x]
    MAX_LENGTH = max(list_len) 
    for i in range(len(x)):
      x[i] = np.pad(x[i], (0, MAX_LENGTH-len(x[i])), 'constant', constant_values=(pad))
      y[i] = np.pad(y[i], (0, MAX_LENGTH-len(y[i])), 'constant', constant_values=(0))

    self.x = torch.from_numpy(np.array(x)) 
    self.y = torch.from_numpy(np.array(y))

    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return self.x.shape[0]
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return (self.x[idx], self.y[idx])
    ##########################################################################################

In [13]:
class NER(nn.Module):
  def __init__(self, vocab_size=len(vocab), embedding_dim=50, hidden_size=50, n_classes=3):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    self.hidden_size = hidden_size
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    final_output, _ = self.lstm(self.embedding(sentences))
    final_output = self.linear(final_output)
    final_output = final_output[:, -1, :]
    ###############################################################################################
    return final_output

TypeError: object of type 'NoneType' has no len()

In [None]:
model = NER()
print(model)

NER(
  (embedding): Embedding(1000, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=3, bias=True)
)


In [None]:
from keras.utils.np_utils import to_categorical
train_data['stance'].value_counts()

# num_of_categories = 450
# shuffled = train_data.reindex(np.random.permutation(train_data.index))
# e = shuffled[shuffled['stance'] == 1][:num_of_categories]
# b = shuffled[shuffled['stance'] == 0][:num_of_categories]
# t = shuffled[shuffled['stance'] == -1][:num_of_categories]
# concated = pd.concat([e,b,t], ignore_index=True)
# #Shuffle the dataset
# concated = concated.reindex(np.random.permutation(concated.index))
# concated['LABEL'] = 0
# labels = to_categorical(concated['LABEL'], num_classes=3)
# print(labels[100:110])
# if 'stance' in concated.keys():
#     concated.drop(['stance'], axis=1)

#convert label from one number to vector  of three
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(train_data['stance'])
encoded_Y = encoder.transform(train_data['stance'])
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

encoder2 = LabelEncoder()
encoder2.fit(test_data['stance'])
encoded_Y2 = encoder2.transform(test_data['stance'])
testt_y = np_utils.to_categorical(encoded_Y2)


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Import the tools needed from keras
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

X_train=train_data["text"]
X_test=test_data["text"]
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Use that tokenizer to transform the text messages in the training and test sets
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_test_seq_padded = pad_sequences(X_test_seq, 50)



In [None]:
# Construct a simple RNN model
model = Sequential()

model.add(Embedding(len(tokenizer.index_word)+1, 32))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(3, activation='softmax'))
model.summary()
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 32)          407424    
                                                                 
 lstm_6 (LSTM)               (None, 64)                24832     
                                                                 
 dense_10 (Dense)            (None, 3)                 195       
                                                                 
Total params: 432,451
Trainable params: 432,451
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit the RNN model
#y_train=train_data["stance"].values
#y_test=test_data["stance"].values
history = model.fit(X_train_seq_padded, dummy_y, 
                    batch_size=32, epochs=10,
                    validation_data=(X_test_seq_padded[:500], testt_y[:500]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#x = model.predict(X_test_seq_padded[500:])
#print(x)
accr = model.evaluate( X_test_seq_padded[500:], testt_y[500:],verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f} \n  f1_scor: {:0.3f}'.format(accr[0],accr[1],accr[2]))

IndexError: list index out of range