In [75]:
import codecs
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier


In [76]:
!gdown --id 1-4vFs-oT5rL1j9U3BURq2NIfkvN8mgPS

Downloading...
From: https://drive.google.com/uc?id=1-4vFs-oT5rL1j9U3BURq2NIfkvN8mgPS
To: /content/df_clean.json
100% 81.5M/81.5M [00:00<00:00, 232MB/s]


In [77]:
df = pd.read_json(codecs.open('df_clean.json', 'r', 'utf-8'))

# Label Encoding

In [78]:
df.groupby('dialect').nunique().T

dialect,AE,BH,DZ,EG,IQ,JO,KW,LB,LY,MA,OM,PL,QA,SA,SD,SY,TN,YE
id,26296,26292,16183,57636,15497,27921,42109,27617,36499,11539,19116,43742,31069,26832,14434,16242,9246,9927
text,26274,26263,16164,57533,15488,27887,42084,27584,36460,11527,19079,43702,31037,26748,14409,16217,9238,9912


In [79]:
le = preprocessing.LabelEncoder()
le.fit(df.dialect)
df['target'] = le.transform(df.dialect)

In [80]:
le_name_mapping = dict(zip(le.transform(le.classes_),le.classes_))
print(le_name_mapping)

{0: 'AE', 1: 'BH', 2: 'DZ', 3: 'EG', 4: 'IQ', 5: 'JO', 6: 'KW', 7: 'LB', 8: 'LY', 9: 'MA', 10: 'OM', 11: 'PL', 12: 'QA', 13: 'SA', 14: 'SD', 15: 'SY', 16: 'TN', 17: 'YE'}


# Split

In [81]:
train, test = train_test_split(df[['text','target']], test_size=0.33, random_state=42, stratify=df['target'])

# TF-IDf

In [95]:
tfidf = TfidfVectorizer(
    # tokenizer= dummy,
    # preprocessor=dummy,
    stop_words=['و'], 
    ngram_range=(1,2), 
    lowercase=False, 
    max_df = 0.3, 
    min_df= .0001, 
    # max_features = 2000
    )

In [96]:
tfidf.fit(train.text)

TfidfVectorizer(lowercase=False, max_df=0.3, min_df=0.0001, ngram_range=(1, 2),
                stop_words=['و'])

In [97]:
X_train = tfidf.transform(train.text)

In [98]:
X_train.shape

(306991, 17724)

In [None]:
tfidf.get_feature_names()

# Model

In [100]:
sgd =  SGDClassifier()
sgd.fit(X_train, train.target)

SGDClassifier()

In [101]:
X_test = tfidf.transform(test.text)

# Evaluation

In [102]:
y_pred = sgd.predict(X_test)

In [103]:
from sklearn import metrics
print(metrics.classification_report(test.target, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

          AE       0.39      0.30      0.34      8678
          BH       0.24      0.17      0.20      8676
          DZ       0.45      0.37      0.41      5340
          EG       0.53      0.88      0.66     19020
          IQ       0.49      0.50      0.50      5114
          JO       0.31      0.17      0.22      9214
          KW       0.43      0.50      0.46     13896
          LB       0.47      0.66      0.55      9114
          LY       0.51      0.61      0.55     12045
          MA       0.51      0.54      0.53      3808
          OM       0.31      0.23      0.27      6308
          PL       0.43      0.39      0.41     14435
          QA       0.43      0.38      0.40     10253
          SA       0.33      0.28      0.31      8855
          SD       0.50      0.36      0.42      4763
          SY       0.29      0.14      0.19      5360
          TN       0.46      0.30      0.36      3051
          YE       0.16    

# Save model and copy to drive

In [104]:
import joblib
joblib.dump(sgd, "ML.pkl")
joblib.dump(tfidf, "tfidf.pkl")

# joblib.dump(sgd, "DL.pkl")

['tfidf.pkl']

In [105]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [106]:
!cp ML.pkl /content/drive/MyDrive

In [107]:
!cp tfidf.pkl /content/drive/MyDrive