In [1]:
import re
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import spacy
import numpy as np
import pickle


In [2]:
country_mapper = {
    0 : "Libya",
    1 : "Morocco",
    2 : "Egypt",
    3 : "Lebanon",
    4 : "Sudan"
}

### **Data Reading**

In [3]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118180 entries, 0 to 118179
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     118145 non-null  object
 1   dialect  118180 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29545 entries, 0 to 29544
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     29537 non-null  object
 1   dialect  29545 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 461.8+ KB


In [6]:
train_data.head()

Unnamed: 0,text,dialect
0,كارثة بعد يقعد راجل دمه ثقيل,0
1,رئيس الجمهورية يفعل ويقول ما يشاء واللي مش عاج...,3
2,صالة المغادرة اللي في المطار هي اللي المفروض ا...,3
3,كيفك يا حاج انا ماحجت بس عملت عمرة قوليلى يا عمرى,3
4,ده غير انى مش هكلمك عن الإيجابية والسلبية بقى ...,2


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118180 entries, 0 to 118179
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     118145 non-null  object
 1   dialect  118180 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [8]:
train_data = train_data.dropna()

In [9]:
test_data = test_data.dropna()

In [10]:
train_data.isnull().sum()

text       0
dialect    0
dtype: int64

In [11]:
x_train, y_train, x_test, y_test = train_data['text'], train_data['dialect'], test_data['text'], test_data['dialect']

In [12]:
x_train[0]

'كارثة بعد يقعد راجل دمه ثقيل'

In [13]:
x_test[0]

'أحبــك بــ مقدار كرهـي لكــ'

In [14]:
y_train[0]

0

In [15]:
y_test[0]

0

### **Data Preprocessing**

In [16]:
class Transformer():

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
   
    def transform(self, X):
        X = X.apply(self.processing)
        return X
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def processing(self, text):

        pattern = re.compile(r'[À-ÿ]')
        text = pattern.sub('', text)

        pattern = re.compile(r'https?://\S+|www\.\S+')
        text = pattern.sub(' ', text)

        pattern = re.compile(r'\b[a-zA-Z0-9]+\b|[@#:()%$؟&*\\u"،\\.!_\\n!?؛/-]')
        text = pattern.sub(' ', text)
        
        pattern = re.compile(r'\b[a-zA-Z0-9]+\b')
        text = pattern.sub(' ', text)

        pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F700-\U0001F77F"
        "\U0001F780-\U0001F7FF"
        "\U0001F800-\U0001F8FF"
        "\U0001F900-\U0001F9FF" 
        "\U0001FA00-\U0001FA6F" 
        "\U0001FA70-\U0001FAFF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)

        text = pattern.sub(' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        return pattern.sub('', text)

In [17]:
y_train.values

array([0, 3, 3, ..., 2, 3, 1], dtype=int64)

In [18]:
x_train.values

array(['كارثة بعد يقعد راجل دمه ثقيل',
       'رئيس الجمهورية يفعل ويقول ما يشاء واللي مش عاجبه قدامه البحر',
       'صالة المغادرة اللي في المطار هي اللي المفروض اسمها صالة أفراح',
       ..., 'اه عندك حق فى دى', 'عفكرة كأنك طلعتي عونية',
       'ما كانش عندك الفران'], dtype=object)

### **Hyperparameter Tuning**

In [None]:
def objective(trial):

    param = {
        # 'tree_method': 'gpu_hist',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.7, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.5, 0.7, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    model = XGBClassifier(**param)

    pipeline = Pipeline(steps=[
        ('preprocessing', Transformer()),
        ('Vectorizing', CountVectorizer()),
        ('model', model),
    ])

    pipeline.fit(x_train, y_train.values)
    preds = pipeline.predict(x_test)

    accuracy = accuracy_score(y_test.values, preds)
    return 1 - accuracy

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print('Best Hyperparameters: ', study.best_params)
print('Best Performance: ', study.best_value)


## **First Model**
- **XGBoost**

**Best Parameters:**
- {'lambda': 8.348812094600195, 'alpha': 3.1219064105228114, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.04952090626522695, 'n_estimators': 939, 'max_depth': 9, 'min_child_weight': 8}

In [19]:
xgboost = XGBClassifier(
    reg_lambda=8.348812094600195,
    alpha=3.1219064105228114,
    colsample_bytree=0.5,
    subsample=0.5,
    learning_rate=0.04952090626522695,
    n_estimators=939,
    max_depth=9,
    min_child_weight=8,
    objective='multi:softmax'
)

In [20]:
xgboost1 = XGBClassifier(
    n_estimators=100, 
    max_depth=10, 
    learning_rate=0.04952090626522695, 
    objective='multi:softmax'
)

### **Pipeline**

In [21]:
pipeline = Pipeline(steps=[
    ('preprocessing', Transformer()),
    ('Vectorizing', CountVectorizer()),
    ('model', xgboost),
])

In [22]:
pipeline1 = Pipeline(steps=[
    ('preprocessing', Transformer()),
    ('Vectorizing', CountVectorizer()),
    ('model', xgboost1),
])

### **Model Training**

In [23]:
pipeline.fit(x_train, y_train)

In [24]:
pipeline1.fit(x_train, y_train)

### **Model Prediction**

**Pipeline1**

In [25]:
predictions = pipeline.predict(x_test)

In [26]:
predictions

array([0, 3, 0, ..., 3, 0, 0])

In [27]:
f1_score(y_test, predictions, average='macro')

0.7055663070921765

**Pipeline2**

In [28]:
predictions = pipeline1.predict(x_test)
predictions

array([2, 3, 0, ..., 3, 3, 2])

In [29]:
f1_score(y_test, predictions, average='macro')

0.5987974986361383

**Another Traditional ML Models**

In [30]:
model = LogisticRegression(max_iter=500)

pipeline = Pipeline(steps=[
    ('preprocessing', Transformer()),
    ('Vectorizing', CountVectorizer()),
    ('model', model),
])

pipeline.fit(x_train, y_train)
pipeline.predict(x_test)
predictions = pipeline.predict(x_test)

f1_score(y_test, predictions, average='macro')

0.7987408018675097

**Note: LogisticRegression without Hyperparamter Tuning Made Better Results Than XGBoost**
- **I will start to make hyperparameter tuning for LogisticRegression**

In [None]:
def objective(trial):

    C = trial.suggest_float('C', 1e-2, 1)
    tol = trial.suggest_float('tol', 1e-6 , 1e-3)
    solver = trial.suggest_categorical('solver' , ['newton-cg', 'lbfgs','liblinear'])


    model =LogisticRegression(C=C, solver=solver, tol=tol, max_iter=500)

    pipeline = Pipeline(steps=[
        ('preprocessing', Transformer()),
        ('Vectorizing', CountVectorizer()),
        ('model', model),
    ])

    pipeline.fit(x_train, y_train.values)
    preds = pipeline.predict(x_test)

    accuracy = accuracy_score(y_test.values, preds)
    return 1 - accuracy

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print('Best Hyperparameters: ', study.best_params)
print('Best Performance: ', study.best_value)

In [31]:
model = LogisticRegression(max_iter=500, C=0.7602384809820221, tol=0.00037154793342214157, solver='liblinear')

pipeline = Pipeline(steps=[
    ('preprocessing', Transformer()),
    ('Vectorizing', CountVectorizer()),
    ('model', model),
])

pipeline.fit(x_train, y_train)
pipeline.predict(x_test)
predictions = pipeline.predict(x_test)

f1_score(y_test, predictions, average='macro')

0.7992753918932327

### **Additional Tuning**
- **Using different N-grams**
- **Using TFIDFVetorizer**
- **Using GLoVe**

**N-grams**

In [32]:
for i in range(2, 5): 
    pipeline = Pipeline(steps=[
        ('preprocessing', Transformer()),
        ('Vectorizing', CountVectorizer(ngram_range=(1, i))),
        ('model', model),
    ])

    pipeline.fit(x_train, y_train)
    predictions = pipeline.predict(x_test)

    score = f1_score(y_test, predictions, average='macro')
    print(f"F1_Score with N-grams {i}: {score}\n")

F1_Score with N-grams 2: 0.8001967474785857

F1_Score with N-grams 3: 0.7935935346319948

F1_Score with N-grams 4: 0.7889066122660118



**TFIDFVetorizer**

In [33]:
for i in range(2, 5): 
    pipeline1 = Pipeline(steps=[
        ('preprocessing', Transformer()),
        ('Vectorizing', TfidfVectorizer(ngram_range=(1, i))),
        ('model', model),
    ])

    pipeline1.fit(x_train, y_train)
    predictions = pipeline1.predict(x_test)

    score = f1_score(y_test, predictions, average='macro')
    print(f"F1_Score with N-grams {i}: {score}\n")

F1_Score with N-grams 2: 0.7583731895003548

F1_Score with N-grams 3: 0.7433432976274764

F1_Score with N-grams 4: 0.7352916046885769



**GLOVE**

In [34]:
nlp = spacy.load('en_core_web_md')

x_train_v = np.zeros((len(x_train), 300))
x_test_v = np.zeros((len(x_test), 300))

for i, doc in enumerate(nlp.pipe(x_train)):
    x_train_v[i, :] = doc.vector

for i, doc in enumerate(nlp.pipe(x_test)):
    x_test_v[i, :] = doc.vector

model.fit(x_train_v, y_train)
predictions = model.predict(x_test_v)

f1_score(y_test, predictions, average='macro')

0.21348393069490657

### **Best ML Model**
- **Best Model untill now is LogisticRegression using CountVectorizer with N-gram 2**

In [35]:
lg = LogisticRegression(max_iter=500, C=0.7602384809820221, tol=0.00037154793342214157, solver='liblinear')

lg_pipeline = Pipeline(steps=[
    ('preprocessing', Transformer()),
    ('Vectorizing', CountVectorizer(ngram_range=(1, 2))),
    ('model', lg),
])

lg_pipeline.fit(x_train, y_train)
lg_pipeline.predict(x_test)
predictions = lg_pipeline.predict(x_test)

f1_score(y_test, predictions, average='macro')

0.8001967474785857

### **Model Saving**

In [36]:
with open("model.pkl", 'wb') as file:
    pickle.dump(lg_pipeline, file)

### **Model Loading**

In [37]:
with open("model.pkl", 'rb') as file:
    lg_model = pickle.load(file)

In [38]:
lg_model.predict(pd.Series(["بدّك تبهدل رجّال، فلِّت عليه مرا"]))[0]

3

In [39]:
country_mapper[lg_model.predict(pd.Series(["بدّك تبهدل رجّال، فلِّت عليه مرا"]))[0]]

'Lebanon'

In [40]:
country_mapper[lg_model.predict(pd.Series(["من جاور السعيد يسعد ومن جاور الحداد ينكوي بناره"]))[0]]

'Egypt'

In [41]:
country_mapper[lg_model.predict(pd.Series(["كل زول بيعرف حقه"]))[0]]

'Sudan'

In [42]:
country_mapper[lg_model.predict(pd.Series(["كنحس بالعيا فاش منبدا نقرا"]))[0]]

'Morocco'

In [43]:
country_mapper[lg_model.predict(pd.Series(["للي تخاصمه ما تقطعش أحبال اوصاله"]))[0]]

'Libya'