In [12]:
#imports 
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

In [2]:
#For google colab we mount google drive 
from google.colab import drive 
drive.mount("/content/drive/")

Mounted at /content/drive/


# Reading the tweets file

In [3]:
tweets_data_path = "/content/drive/MyDrive/preprocessed_tweets.csv"

In [4]:
df = pd.read_csv(tweets_data_path)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    458197 non-null  int64 
 1   Unnamed: 0.1  458197 non-null  int64 
 2   id            458197 non-null  int64 
 3   dialect       458197 non-null  int64 
 4   tweets        457992 non-null  object
dtypes: int64(4), object(1)
memory usage: 17.5+ MB


In [6]:
df.head(15)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,dialect,tweets
0,0,0,1175358310087892992,4,بالنهايه ينتفض يغير
1,1,1,1175416117793349632,4,يعني محسوب البشر حيونه وحشيه وتطلبون الغرب يحت...
2,2,2,1175450108898565888,4,مبين كلامه خليجي
3,3,3,1175471073770573824,4,يسلملي مرورك وروحك الحلوه
4,4,4,1175496913145217024,4,وين الغيبه اخ محمد
5,5,5,1175668034146643968,4,ياخي الارهابي اذا عراقي سعودي فلسطيني وين المش...
6,6,6,1175670153884983296,4,مطلبي يقدم استقالته وفوگاها اعتذار
7,7,7,1175671762580856832,4,خلص واله لعيونكم انا ماعندي شيء معه بالعكس متا...
8,8,8,1175715664398561280,4,يمكن سءال فات الكثير الي يصور شنو موقفه وكانه ...
9,9,9,1176019816072777728,4,اولا اني ردت رجل جنوبي والي ذكر حجابها ثانيا ا...


**Splitting the dataset**

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    458197 non-null  int64 
 1   Unnamed: 0.1  458197 non-null  int64 
 2   id            458197 non-null  int64 
 3   dialect       458197 non-null  int64 
 4   tweets        457992 non-null  object
dtypes: int64(4), object(1)
memory usage: 17.5+ MB


In [8]:
features = df.tweets.values.astype(str)
lables = df.dialect.values.astype('float32')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features, lables, random_state=42, test_size=0.15, shuffle=True)

In [10]:
X_train[:5]

array(['ياربي ياحبيبي قريب باقرب وقت يصدر قانون يجرم العنصرين الي يصنفوا المجتمع مجنس واصلي واله عيب وسط التطور الي تشهده بلادنا لسه ناس تصنفنا ياخي الحكومه ساوت الجميع مين اداكم الحق تصنفونا ودحين يتكاثر الذباب العنصري بالمنشن يقولوا اكيد انتي متجنسه علشان كدا',
       'هوي مش عبث بحبو لشيخ سعد هوالحريه',
       'شو هيدا بالاحلام بنسترجي نشوفها هيك',
       'خصرها دقاق وفزرتها تذبح الروح وغصنها ليان وفيها لطف ورهاوه والصدر زامي والحڳي ضحڳ ومزوح والريق طعمه طعم الحلاوه',
       'انها خطوه متاخره مشكورين عليها'], dtype='<U280')

In [11]:
y_train[:5]

array([13.,  7.,  7., 12., 12.], dtype=float32)

# Feature extraction and Machine learning model pipeling

**Random Forest Classifier**

In [None]:
model_rf = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', RandomForestClassifier(criterion='gini',
    n_estimators=15, 
    min_samples_split=10, 
    min_samples_leaf=1, 
    max_features='auto', 
    oob_score=True, 
    random_state=1,
    n_jobs=-1,
    verbose=2)),
              ])
model_rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 15
building tree 2 of 15
building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 19.2min finished


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 RandomForestClassifier(min_samples_split=10, n_estimators=15,
                                        n_jobs=-1, oob_score=True,
                                        random_state=1, verbose=2))])

In [None]:
# Making a prediction on the test set
prediction = model_rf.predict(X_test)

# Evaluating the model
print(f"Test Set Accuracy: {accuracy_score(y_test, prediction) * 100} %\n\n")
print(
    f"Classification Report : \n\n{classification_report(y_test, prediction)}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    7.5s finished


Test Set Accuracy: 40.28953877491634 %


Classification Report : 

              precision    recall  f1-score   support

         0.0       0.32      0.24      0.28      4120
         1.0       0.26      0.18      0.21      3974
         2.0       0.53      0.32      0.40      2477
         3.0       0.52      0.81      0.63      8581
         4.0       0.51      0.36      0.42      2279
         5.0       0.30      0.19      0.23      4232
         6.0       0.31      0.50      0.39      6275
         7.0       0.51      0.53      0.52      4214
         8.0       0.46      0.53      0.49      5479
         9.0       0.62      0.45      0.52      1691
        10.0       0.29      0.20      0.24      2894
        11.0       0.34      0.45      0.39      6464
        12.0       0.35      0.33      0.34      4640
        13.0       0.30      0.26      0.28      4050
        14.0       0.48      0.35      0.41      2116
        15.0       0.37      0.14      0.20      2421
        16.0  

In [None]:
import pickle 
pickle.dump(model_rf, open("Rf_model.pkl", 'wb'))
print("Model saved")

Model saved


In [None]:
!cp Rf_model.pkl /content/drive/MyDrive/

**Naive Bayes classifier for multinomial models.**

In [21]:
model_MNb = Pipeline([('tfidf', TfidfVectorizer()),
               ('mnb',MultinomialNB())])
model_MNb.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [22]:
# Making a prediction on the test set
prediction = model_MNb.predict(X_test)

# Evaluating the model
print(f"Test Set Accuracy: {accuracy_score(y_test, prediction) * 100} %\n\n")
print(
    f"Classification Report : \n\n{classification_report(y_test, prediction)}")

Test Set Accuracy: 40.16586643387167 %


Classification Report : 

              precision    recall  f1-score   support

         0.0       0.71      0.15      0.24      4120
         1.0       0.64      0.07      0.13      3974
         2.0       0.88      0.18      0.30      2477
         3.0       0.35      0.97      0.52      8581
         4.0       0.97      0.12      0.21      2279
         5.0       0.69      0.06      0.11      4232
         6.0       0.29      0.79      0.42      6275
         7.0       0.82      0.44      0.57      4214
         8.0       0.66      0.58      0.62      5479
         9.0       0.99      0.22      0.36      1691
        10.0       0.94      0.04      0.07      2894
        11.0       0.32      0.66      0.43      6464
        12.0       0.51      0.38      0.44      4640
        13.0       0.61      0.15      0.24      4050
        14.0       0.98      0.08      0.15      2116
        15.0       0.99      0.03      0.05      2421
        16.0  

In [23]:
import pickle 
pickle.dump(model_MNb, open("MNb_model.pkl", 'wb'))
print("Model saved")

Model saved


In [24]:
!cp MNb_model.pkl /content/drive/MyDrive/

**Linear models with stochastic gradient descent (SGD) learning**

In [None]:
model_sgd = Pipeline([('tfidf', TfidfVectorizer()),
               ('sgd',SGDClassifier(max_iter=1000, tol=1e-3))])
model_sgd.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('sgd', SGDClassifier())])

In [None]:
# Making a prediction on the test set
prediction = model_sgd.predict(X_test)

# Evaluating the model
print(f"Test Set Accuracy: {accuracy_score(y_test, prediction) * 100} %\n\n")
print(
    f"Classification Report : \n\n{classification_report(y_test, prediction)}")

Test Set Accuracy: 49.47912119889422 %


Classification Report : 

              precision    recall  f1-score   support

         0.0       0.46      0.39      0.42      4120
         1.0       0.36      0.26      0.30      3974
         2.0       0.57      0.45      0.50      2477
         3.0       0.52      0.89      0.66      8581
         4.0       0.54      0.55      0.55      2279
         5.0       0.43      0.23      0.30      4232
         6.0       0.48      0.55      0.51      6275
         7.0       0.53      0.70      0.60      4214
         8.0       0.58      0.66      0.62      5479
         9.0       0.56      0.60      0.58      1691
        10.0       0.42      0.28      0.34      2894
        11.0       0.48      0.45      0.47      6464
        12.0       0.48      0.46      0.47      4640
        13.0       0.41      0.36      0.38      4050
        14.0       0.53      0.37      0.43      2116
        15.0       0.41      0.19      0.26      2421
        16.0  

In [None]:
import pickle 
pickle.dump(model_sgd, open("SGD_model.pkl", 'wb'))
print("Model saved")

Model saved


In [None]:
!cp SGD_model.pkl /content/drive/MyDrive/