In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, plot_confusion_matrix, classification_report
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB , GaussianNB
from sklearn.ensemble import AdaBoostClassifier , ExtraTreesClassifier, RandomForestClassifier
import xgboost as xgb
import catboost as ctb 
import lightgbm as lbm

import pickle

import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")


from utils.utils import *

  from pandas import MultiIndex, Int64Index


# Reading Data

In [2]:
data = pd.read_csv('full_data.csv')
data.head()

Unnamed: 0,dialect,txt
0,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. حيونه ووحشيه .. وتطلبون من الغرب يحترمكم ويؤمن بدينكم ولاينعتكم بالإرهاب ..
2,IQ,@KanaanRema مبين من كلامه خليجي
3,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺


# Preprocessing

In [3]:
data.txt.sample(3,random_state=42)

217576    #نجاح_حج_هذا_العام١٤٤\n#منتدى_الوطن_السعودي\nعلم اللي مالهم غير الدسايس حيله\nالجيج بخير وإنا كلنا خدامي\n\nمملكتنا دولة كبرى ماهيب دويله\nوالله اللي عزها منارة الاسلامي\n\nالحشود انديرها والحرب ن...
344491                                                                  بنفس هاليوم قبل٢٥سنه تم ايقاف سياره فيها أثنين شباب اعمارهم لا تتجاوز ١٨سنه والتهمه سيارتهم مشغلينها بدون مفتاح(جطل)كلبچوهم ايدهم للخلف1⃣
380119                                                                                                                                            @Mralseyabi @m_namani احسن الظن فيهم يمكن التبن حاجة حلوة عندهم
Name: txt, dtype: object

In [4]:
data.txt = data.txt.apply(preprocessing)

In [5]:
data.txt.sample(3,random_state=5)

109133    رح ناكل خرا بريحه الجماعه  عده بطلع قائد عرص بيحشر طيزه  جولان
10932                                            والله مابيهم اليحجي صدگ
224136                   شكرا  حرف  قلتيه  وياريت الناس  تتعلم منك وتفهم
Name: txt, dtype: object

# splitting to train_valid_test sets

In [6]:
le = LabelEncoder()

data['dialect'] = le.fit_transform(data['dialect'])
data['dialect']

0         4
1         4
2         4
3         4
4         4
         ..
458192    1
458193    1
458194    1
458195    1
458196    1
Name: dialect, Length: 458197, dtype: int32

In [7]:
len(data), 25000/len(data)

(458197, 0.054561684166417504)

In [8]:
data['dialect'].value_counts()

3     57636
11    43742
6     42109
8     36499
12    31069
5     27921
7     27617
13    26832
0     26296
1     26292
10    19116
15    16242
2     16183
4     15497
14    14434
9     11539
17     9927
16     9246
Name: dialect, dtype: int64

In [9]:
# get train and test sets
xtrain,xtest,ytrain,ytest=train_test_split(data['txt'],data['dialect'],test_size=0.05,random_state=45,shuffle=True, stratify = data['dialect'])

In [10]:
len(xtrain),25000/len(xtrain)

(435287, 0.05743337154566987)

In [11]:
xtrain,xvalid,ytrain,yvalid=train_test_split(xtrain,ytrain,test_size=0.05,random_state=45,shuffle=True,stratify = ytrain)

# Creating Pipeline

In [24]:
## Vectorzers
cvec = CountVectorizer()
hvec = HashingVectorizer(ngram_range=(2,5), n_features=50000)
tvec = TfidfVectorizer(max_features=80000,ngram_range=(3,8),analyzer='char')

## classifiers
rf = RandomForestClassifier()
adac = AdaBoostClassifier()
xgbc = xgb.XGBClassifier()
catc = ctb.CatBoostClassifier()
lbmc = lbm.LGBMClassifier()

lrc =LogisticRegression()
svc = SVC()
mnbc = MultinomialNB()
bnbc = BernoulliNB()
gnbc = GaussianNB()

## label encoder 


In [25]:
pipe = Pipeline(steps=[
    ('tfidf', hvec),
    ('clf', gnbc ),
    ])

In [26]:
pipe.fit(xtrain,ytrain)
preds = pipe.predict(xtest)

In [22]:
accuracy_score(ytest, preds)

0.12514185945002182

In [23]:
f1_score(ytest, preds, average='macro')

0.030753352297550163

# Saving Model

In [18]:
with open('assets/mlmodel.pkl', 'wb') as model:
    pickle.dump(pipe,model)

In [19]:
with open('assets/le.pkl', 'wb') as lencoder:
    pickle.dump(le,lencoder)

# Loading and prediction

In [20]:
with open('assets/mlmodel.pkl', 'rb') as f:
    model = pickle.load(f)

In [21]:
preds = model.predict(xvalid)
accuracy_score(yvalid,preds), f1_score(yvalid,preds, average='macro')

(0.45384792097404086, 0.3926517839188842)