In [18]:
# !pip install arabic-stopwords
import numpy as np
import pandas as pd
import arabicstopwords.arabicstopwords as ast
import re
import string
import qalsadi.lemmatizer
from nltk.stem.isri import ISRIStemmer
from tashaphyne.stemming import ArabicLightStemmer
from pyarabic.araby import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Read train data

In [25]:
train_data = pd.read_csv('./DataSet/train.csv',sep=',',header=0)
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",info_news,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,info_news,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,personal,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,unrelated,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,info_news,1


In [82]:
# st = ISRIStemmer()
# w= 'حركات'
# print(train_data["text"][0])
# print(' '.join(st.stem(word) for word in train_data["text"][0].split()))

In [26]:
#remove first row that has the header
#train_data.drop(0, axis=0, inplace=True)
train_data['category'] = train_data['category'].astype('category').cat.codes
train_data.head(10)

Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,1,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,2,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,2,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,1,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,4,0
5,"لقاح #كورونا في أميركا.. قلق متزايد من ""التوزي...",2,0
6,لبنان اشترى مليونان لقاح امريكي اذا شلنا يلي ع...,2,1
7,من عوارض لقاح كورونا<LF>هو تهكير حسابك عتويتر<...,4,0
8,هناك 1780 مليونيراً في لبنان. ماذا لو فُرضت ال...,9,0
9,دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية...,2,1


# Pre-Processing the tweets

In [7]:
#getting a stopwords_list
stop_words = ast.stopwords_list()

In [9]:
punctuations_list = '''^_-`$%&÷×؛<=>()*&^%][،/;:"؟.,'{}~¦+|!”…“'''
#english_punctuations = string.punctuation
#print(english_punctuations)
#punctuations_list = arabic_punctuations

def remove_punctuations(text):
    translator = str.maketrans(punctuations_list, ' '*len(punctuations_list))
    return text.translate(translator)
    
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text) # no emoji

# a small function to remove stop words
def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

# a small function to remove stop words
ArListem = ArabicLightStemmer()
def lemmatiz_word(text):
    # lemmer = qalsadi.lemmatizer.Lemmatizer()
    # return ' '.join(lemmer.lemmatize(word) for word in text.split()) 
    #---------
    #st = ISRIStemmer()
    #return ' '.join(st.stem(word) for word in text.split())
    return ' '.join(ArListem.light_stem(word) for word in text.split())
    
def processPost(tweet): 

    #Remove <LF> from tweet
    tweet = re.sub('<LF>', ' ', tweet)
    
    #Replace @username with empty string
    tweet = re.sub('@[^\s]+', ' ', tweet)
    
    #remove url
    tweet = re.sub('(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',' ',tweet)
    
    #remove hashtage #
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # remove punctuations
    tweet = remove_punctuations(tweet)
    
    # remove emoji  (not sure with this step)
    tweet = remove_emoji(tweet)
    
    # normalize the tweet
    # tweet= normalize_arabic(tweet)
    
    # remove repeated letters
    tweet=re.sub(r'(.)\1+', r'\1', tweet)

    #remove stop words
    tweet = remove_stop_words(tweet)

    tweet=lemmatiz_word(tweet)
    
    return tweet

In [27]:
print(train_data["text"][9])
train_data["text"] = train_data['text'].apply(lambda x: processPost(x))
train_data.head(10)
print(train_data["text"][9])

دعبول حضرتك منو انت وتطلب من قائد دولة إسلامية لقاح لعد ما اتابع الاخبار هم بكل مجالاتهم متفوقين وراح يطلع اللقاح قريباً؟<LF>#دعبول_دومه_مسحول
دعبول حضر من نت طلب قائد دول إسلام قاح عد تابع اخبار مجال متفوق طلع قاح قريبا دعبول دوم مسحول


TD-IDF

In [43]:
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),max_features=1000,token_pattern=r"(?u)\b[أ-ي]*\b")

unigramdataGet= word_vectorizer.fit_transform(train_data['text'].astype('str'))
unigramdataGet = unigramdataGet.toarray()

vocab = word_vectorizer.get_feature_names()
print(len(vocab))
unigramdata_features=pd.DataFrame(np.round(unigramdataGet, 1), columns=vocab)
unigramdata_features[unigramdata_features>0] = 1

unigramdata_features.head()

1000


Unnamed: 0,Unnamed: 1,أبيض,أتي,أجيل,أحد,أخبار,أخذ,أخر,أخير,أدو,...,يزر,يش,يف,يل,يمن,ين,يه,يوم,يون,يونت
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


LinearSVC Classifier

In [None]:
clf = LinearSVC()
pipe_tfidf = make_pipeline(word_vectorizer, clf)
pipe_tfidf.fit(train_data['text'], train_data['stance'])

RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
X_train_tfidf = word_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = word_vectorizer.transform(test_data['text'])
rf = RandomForestClassifier()
rf_tfidf = rf.fit(X_train_tfidf, train_data['stance'])
y_pred = rf_tfidf.predict(X_test_tfidf)

report = metrics.classification_report(test_data['stance'], y_pred)
print(report)
print("accuracy: {:0.3f}".format(metrics.accuracy_score(test_data['stance'], y_pred)))

In [31]:
test_data = pd.read_csv('./DataSet/dev.csv',sep=',',header=0)
test_data["text"] = test_data['text'].apply(lambda x: processPost(x))

In [42]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))
print_report(pipe_tfidf, test_data['text'], test_data['stance'])

              precision    recall  f1-score   support

          -1       0.45      0.14      0.22        70
           0       0.47      0.29      0.36       126
           1       0.85      0.95      0.90       804

    accuracy                           0.81      1000
   macro avg       0.59      0.46      0.49      1000
weighted avg       0.77      0.81      0.78      1000

accuracy: 0.812


In [None]:
accuracy = accuracy_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
train_data["text"]=train_data["text"].apply(tokenize)

# Feature Extraction 
1. TF-IDF 
2. CBoW
3. Bag of Words

CBOW

In [50]:
from gensim.models import Word2Vec

train_data_tokenized=train_data["text"].apply(tokenize)
w2v_model = Word2Vec(train_data_tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in train_data["text"]])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in test_data['text']])

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


RandomForestClassifier

In [51]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [53]:
rf = RandomForestClassifier()
rf_vect = rf.fit(X_train_vect_avg, train_data['stance'].values.ravel())
print_report(rf_vect, X_test_vect_avg, test_data['stance'])
# y_pred = rf_vect.predict(X_test_vect_avg)
# precision = precision_score(test_data['stance'], y_pred)
# recall = recall_score(test_data['stance'], y_pred)
# accuracy = accuracy_score(test_data['stance'], y_pred)
# print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
  if _joblib.__version__ >= LooseVersion('0.12'):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0

              precision    recall  f1-score   support

          -1       0.18      0.03      0.05        70
           0       0.25      0.12      0.16       126
           1       0.82      0.95      0.88       804

    accuracy                           0.78      1000
   macro avg       0.42      0.36      0.36      1000
weighted avg       0.70      0.78      0.73      1000

accuracy: 0.777


In [83]:
from keras.utils.np_utils import to_categorical
train_data['stance'].value_counts()

# num_of_categories = 450
# shuffled = train_data.reindex(np.random.permutation(train_data.index))
# e = shuffled[shuffled['stance'] == 1][:num_of_categories]
# b = shuffled[shuffled['stance'] == 0][:num_of_categories]
# t = shuffled[shuffled['stance'] == -1][:num_of_categories]
# concated = pd.concat([e,b,t], ignore_index=True)
# #Shuffle the dataset
# concated = concated.reindex(np.random.permutation(concated.index))
# concated['LABEL'] = 0
# labels = to_categorical(concated['LABEL'], num_classes=3)
# print(labels[100:110])
# if 'stance' in concated.keys():
#     concated.drop(['stance'], axis=1)

#convert label from one number to vector  of three
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(train_data['stance'])
encoded_Y = encoder.transform(train_data['stance'])
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

encoder2 = LabelEncoder()
encoder2.fit(test_data['stance'])
encoded_Y2 = encoder2.transform(test_data['stance'])
testt_y = np_utils.to_categorical(encoded_Y2)


In [54]:
# Import the tools needed from keras
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

X_train=train_data["text"]
X_test=test_data["text"]
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Use that tokenizer to transform the text messages in the training and test sets
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_test_seq_padded = pad_sequences(X_test_seq, 50)



In [89]:
# Construct a simple RNN model
model = Sequential()

model.add(Embedding(len(tokenizer.index_word)+1, 32))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(3, activation='softmax'))
model.summary()
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 32)          407424    
                                                                 
 lstm_6 (LSTM)               (None, 64)                24832     
                                                                 
 dense_10 (Dense)            (None, 3)                 195       
                                                                 
Total params: 432,451
Trainable params: 432,451
Non-trainable params: 0
_________________________________________________________________


In [90]:
# Fit the RNN model
#y_train=train_data["stance"].values
#y_test=test_data["stance"].values
history = model.fit(X_train_seq_padded, dummy_y, 
                    batch_size=32, epochs=10,
                    validation_data=(X_test_seq_padded[:500], testt_y[:500]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [94]:
#x = model.predict(X_test_seq_padded[500:])
#print(x)
accr = model.evaluate( X_test_seq_padded[500:], testt_y[500:],verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f} \n  f1_scor: {:0.3f}'.format(accr[0],accr[1],accr[2]))

IndexError: list index out of range