In [38]:
# Built in Libraries
import numpy as np
import pandas as pd
import joblib
import re

#Embedding
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import KeyedVectors
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import nltk


# # Modeling and Evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report, accuracy_score





# Prepare Data

In [5]:
df_clean = pd.read_csv('/content/dialects_cleaned.csv', index_col=0)
df_clean.head()

Unnamed: 0,id,text,dialect
0,1009754958479151232,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,LY
1,1009794751548313600,الليبيين متقلبين بالنسبة ليا انا ميليشياوي زما...,LY
2,1019989115490787200,تانيه شاب ليبي بيرتاح لبنت مختلفة ويلاحظ انها ...,LY
3,1035479791758135168,رانيا عقليتك متخلفة اولا الانسان يلي يحتاج اهل...,LY
4,1035481122921164800,شكلك متعقدة علشان الراجل تحبيه ازوج بنت يتيمة ...,LY


In [7]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147725 entries, 0 to 147724
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       147725 non-null  int64 
 1   text     147650 non-null  object
 2   dialect  147725 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.5+ MB


In [8]:
df_clean.dialect.value_counts()

Unnamed: 0_level_0,count
dialect,Unnamed: 1_level_1
EG,57636
LY,36499
LB,27617
SD,14434
MA,11539


In [9]:
df_clean.isnull().sum()

Unnamed: 0,0
id,0
text,75
dialect,0


In [10]:
df_clean = df_clean.dropna()

In [11]:
df_clean.duplicated().sum()

0

# Split The data


In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_clean['text'], df_clean['dialect'], test_size=0.2, random_state=42)

# Modeling

In [6]:
df_results = pd.DataFrame(columns=['Model', 'Embedding', 'Accuracy', 'Macro F1 Score'])

In [13]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [19]:
def eval_ml(model,X_test_model):
    y_pred = model.predict(X_test_model)
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1_score = f1_score(y_test, y_pred, average="macro")
    print(f"Testing ML:\nAccuracy: {accuracy}")
    print(f"Macro F1 score: {macro_f1_score}")
    return accuracy, macro_f1_score

## TFIDF

In [None]:
vectorizee = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizee.fit_transform(X_train)
X_test_tfidf = vectorizee.transform(X_test)

### Logistic Regression


In [None]:
model_logistic = LogisticRegression(class_weight='balanced')
model_logistic.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
acc_logist, macro_f1_logist = eval_ml(model_logistic,X_test_tfidf)

Testing ML:
Accuracy: 0.8007576038920043
Macro F1 score: 0.680216297600986


In [None]:
Logit_row = ['Logistic Regression','TFIDF(10000)', acc_logist,macro_f1_logist]
df_results.loc[len(df_results)] = Logit_row

### Random Forest


In [None]:
model_RLF = RandomForestClassifier(n_estimators=200, class_weight='balanced')
model_RLF.fit(X_train_tfidf, y_train)

In [None]:
acc_RLF, macro_f1_RLF = eval_ml(model_RLF,X_test_tfidf)

Testing ML:
Accuracy: 0.752330374716827
Macro F1 score: 0.6045204587557327


In [None]:
RLF_tf_row = ['Random Forest', 'TFIDF(10000)', acc_RLF, macro_f1_RLF]
df_results.loc[len(df_results)] = RLF_tf_row

### Xgboost

In [None]:
model_Xgboost_tfidf = XGBClassifier(n_estimators =200,class_weight='balanced')
model_Xgboost_tfidf.fit(X_train_tfidf, y_train)

Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
acc_XGB, macro_f1_XGB = eval_ml(model_Xgboost_tfidf,X_test_tfidf)

Testing ML:
Accuracy: 0.764102944999443
Macro F1 score: 0.650870248997141


In [None]:
XGB_tf_row = ['XGBoost', 'TFIDF(10000)', acc_XGB, macro_f1_XGB]
df_results.loc[len(df_results)] = XGB_tf_row

### Naive Bayes

In [None]:
vectorizer_for_naive = TfidfVectorizer()
X_train_naive = vectorizer_for_naive.fit_transform(X_train)
X_test_naive = vectorizer_for_naive.transform(X_test)

In [None]:
model_naive = ComplementNB()
model_naive.fit(X_train_naive, y_train)

In [None]:
y_pred = model_naive.predict(X_test_naive)
accuracy_naive_tf = accuracy_score(y_test, y_pred)
macro_f1_score_naive_tf = f1_score(y_test, y_pred, average="macro")
print(f"Testing ML:\nAccuracy: {accuracy_naive_tf}")
print(f"Macro F1 score: {macro_f1_score_naive_tf}")

Testing ML:
Accuracy: 0.8616630148178408
Macro F1 score: 0.7142180559143614


In [None]:
naive_row_tf = ['NaiveBayes', 'TFIDF', accuracy_naive_tf, macro_f1_score_naive_tf]
df_results.loc[len(df_results)] = naive_row_tf

## Word2Vec

In [None]:
# Tokenize the text data
nltk.download('punkt_tab')
X_train_tokens = [word_tokenize(text) for text in X_train]
X_test_tokens = [word_tokenize(text) for text in X_test]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def document_vector(doc,model):
    words = [word for word in doc if word in model.wv]
    if words:
        return np.mean(model.wv[words], axis=0)
    else:
        return np.zeros(model.vector_size)

In [None]:
# Train the Word2Vec model
model = Word2Vec(
    vector_size=500,
    min_count=1,
    workers=4,
    sg=0
)

model.build_vocab(X_train_tokens)
model.train(X_train_tokens, total_examples=len(X_train_tokens), epochs=model.epochs)

# Transform the documents into vectors
X_train_vectors = np.array([document_vector(doc, model) for doc in X_train_tokens])
X_test_vectors = np.array([document_vector(doc, model) for doc in X_test_tokens])

### Logistic Regression


In [None]:
model_logistic_w2v = LogisticRegression(class_weight='balanced')
model_logistic_w2v.fit(X_train_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
acc_logist_w2v, macro_f1_logist_w2v = eval_ml(model_logistic_w2v,X_test_vectors)

Testing ML:
Accuracy: 0.5842091580941063
Macro F1 score: 0.49632621091551143


In [None]:
logist_row_W2v = ['Logistic Regression', 'W2V(CBOW)', acc_logist_w2v, macro_f1_logist_w2v]
df_results.loc[len(df_results)] = logist_row_W2v

### Random Forest

In [None]:
model_RLF_w2v = RandomForestClassifier(n_estimators=200, class_weight='balanced')
model_RLF_w2v.fit(X_train_vectors, y_train)

In [None]:
acc_RLF_w2v, macro_f1_RLF_w2v = eval_ml(model_RLF_w2v,X_test_vectors)

Testing ML:
Accuracy: 0.7288223715972816
Macro F1 score: 0.520345887914904


In [None]:
RLF_w2v_row = ['Random Forest', 'W2V(CBOW)', acc_RLF_w2v, macro_f1_RLF_w2v]
df_results.loc[len(df_results)] = RLF_w2v_row

### Naive Bayes


In [None]:
model_naive_w2v = GaussianNB()
model_naive_w2v.fit(X_train_vectors, y_train)

In [None]:
acc_naive_w2v, macro_f1_naive_w2v = eval_ml(model_naive_w2v,X_test_vectors)

Testing ML:
Accuracy: 0.4853492776766814
Macro F1 score: 0.39041708715768547


In [None]:
naive_row_W2v = ['NaiveBayes', 'W2V(CBOW)', acc_naive_w2v, macro_f1_naive_w2v]
df_results.loc[len(df_results)] = naive_row_W2v

### XGboost

In [None]:
model_xgb_w2v = XGBClassifier(n_estimators = 200,class_weight='balanced')
model_xgb_w2v.fit(X_train_vectors, y_train)

Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
acc_XGB, macro_f1_XGB = eval_ml(model_xgb_w2v,X_test_vectors)

Testing ML:
Accuracy: 0.7408549039997029
Macro F1 score: 0.5432147157240529


In [None]:
XGB_row_w2v = ['XGboost','W2V(CBOW)',acc_XGB,macro_f1_XGB]
df_results.loc[len(df_results)] = XGB_row_w2v

## Word2Vec Skip-Gram

In [None]:
# Train the Word2Vec model
model_Skip = Word2Vec(
    vector_size=600,
    min_count=1,
    workers=4,
    sg=1
)

model_Skip.build_vocab(X_train_tokens)
model_Skip.train(X_train_tokens, total_examples=len(X_train_tokens), epochs=model_Skip.epochs)

# Transform the documents into vectors
X_train_vectors_Skip = np.array([document_vector(doc, model_Skip) for doc in X_train_tokens])
X_test_vectors_Skip = np.array([document_vector(doc, model_Skip) for doc in X_test_tokens])

### Logistic Regression

In [None]:
model_logistic_w2v_Skip = LogisticRegression(class_weight='balanced')
model_logistic_w2v_Skip.fit(X_train_vectors_Skip, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
acc_logist_w2v_Skip, macro_f1_logist_w2v_Skip = eval_ml(model_logistic_w2v_Skip,X_test_vectors_Skip)

Testing ML:
Accuracy: 0.7374011215508597
Macro F1 score: 0.6238392563144441


In [None]:
logist_row_W2v_Skip = ['Logistic Regression', 'W2V(Skip-Gram)', acc_logist_w2v_Skip, macro_f1_logist_w2v_Skip]
df_results.loc[len(df_results)] = logist_row_W2v_Skip

### Random Forest

In [None]:
model_RLF_w2v_Skip = RandomForestClassifier(n_estimators=200, class_weight='balanced')
model_RLF_w2v_Skip.fit(X_train_vectors_Skip, y_train)

In [None]:
acc_RLF_w2v_Skip, macro_f1_RLF_w2v_Skip = eval_ml(model_RLF_w2v_Skip,X_test_vectors_Skip)

Testing ML:
Accuracy: 0.8077023062353771
Macro F1 score: 0.6152529331630976


In [None]:
RLF_row_w2v_Skip = ['Random Forest', 'W2V(Skip-Gram)', acc_RLF_w2v_Skip, macro_f1_RLF_w2v_Skip]
df_results.loc[len(df_results)] = RLF_row_w2v_Skip

### Naive Bayes

In [None]:
model_naive_w2v_Skip = GaussianNB()
model_naive_w2v_Skip.fit(X_train_vectors_Skip, y_train)

In [None]:
acc_naive_w2v_Skip, macro_f1_naive_w2v_Skip = eval_ml(model_naive_w2v_Skip,X_test_vectors_Skip)

Testing ML:
Accuracy: 0.6637204293088721
Macro F1 score: 0.5587546441362974


In [None]:
naive_row_W2v_Skip = ['NaiveBayes', 'W2V(Skip-Gram)', acc_naive_w2v_Skip, macro_f1_naive_w2v_Skip]
df_results.loc[len(df_results)]= naive_row_W2v_Skip

In [None]:
model_Xgb_Skip = XGBClassifier(n_estimators =200,class_weight='balanced')
model_Xgb_Skip.fit(X_train_vectors_Skip, y_train)

Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
acc_XGB_Skip, macro_f1_XGB_Skip = eval_ml(model_Xgb_Skip,X_test_vectors_Skip)

Testing ML:
Accuracy: 0.817098079994058
Macro F1 score: 0.6449330865844148


In [None]:
XGB_row_w2v_Skip = ['XGboost','W2V(Skip-Gram)',acc_XGB_Skip,macro_f1_XGB_Skip]
df_results.loc[len(df_results)] = XGB_row_w2v_Skip

## Pretained fast-text arabic

In [4]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz

# 2. Unzip it (gunzip)
!gunzip cc.ar.300.vec.gz

--2025-10-07 22:42:15--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.96, 3.163.189.51, 3.163.189.108, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1272365870 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ar.300.vec.gz’


2025-10-07 22:42:21 (198 MB/s) - ‘cc.ar.300.vec.gz’ saved [1272365870/1272365870]



In [14]:
def arabic_tokenize(text):
    # Keep only Arabic letters and spaces
    tokens = text.split()
    return tokens

X_train_tokens = [arabic_tokenize(text) for text in X_train]
X_test_tokens = [arabic_tokenize(text) for text in X_test]

In [15]:
model = KeyedVectors.load_word2vec_format("cc.ar.300.vec", binary=False)

In [16]:
def document_vector(doc,model):
    words = [word for word in doc if word in model]
    if words:
        return np.mean(model[words], axis=0)
    else:
        return np.zeros(model.vector_size)

In [17]:
# Transform the documents into vectors using the pretrained model
X_train_vectors_pretrained = np.array([document_vector(doc, model) for doc in X_train_tokens])
X_test_vectors_pretrained = np.array([document_vector(doc, model) for doc in X_test_tokens])

### Logistic Regression

In [18]:
logistic_reg_fasttext = LogisticRegression(class_weight='balanced')
logistic_reg_fasttext.fit(X_train_vectors_pretrained, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
acc_logist_fasttext, macro_f1_logist_fasttext = eval_ml(logistic_reg_fasttext,X_test_vectors_pretrained)

Testing ML:
Accuracy: 0.747544869624111
Macro F1 score: 0.7088058691208546


In [30]:
logit_fast_row = ['Logistic Regression','FastText', acc_logist_fasttext,macro_f1_logist_fasttext]
df_results.loc[len(df_results)] = logit_fast_row

### Random Forest

In [21]:
RlF_fasttext = RandomForestClassifier(n_estimators=200, class_weight='balanced')
RlF_fasttext.fit(X_train_vectors_pretrained, y_train)

In [22]:
acc_RLF_fasttext, macro_f1_RLF_fasttext = eval_ml(RlF_fasttext,X_test_vectors_pretrained)

Testing ML:
Accuracy: 0.7135116830342025
Macro F1 score: 0.6404659986497426


In [31]:
Rlf_fast_row = ['Random Forest', 'FastText', acc_RLF_fasttext, macro_f1_RLF_fasttext]
df_results.loc[len(df_results)] = Rlf_fast_row

### XGboost

In [23]:
XGboost_fasttext = XGBClassifier(n_estimators =200,class_weight='balanced')
XGboost_fasttext.fit(X_train_vectors_pretrained, y_train)

Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [24]:
acc_Xgb_fasttext, macro_f1_Xgb_fasttext = eval_ml(XGboost_fasttext,X_test_vectors_pretrained)

Testing ML:
Accuracy: 0.7811716898069759
Macro F1 score: 0.738158654749994


In [32]:
Xg_fast_row = ['XGboost','FastText',acc_Xgb_fasttext,macro_f1_Xgb_fasttext]
df_results.loc[len(df_results)] = Xg_fast_row

### Naive Bayes

In [26]:
naive_fasttext = GaussianNB()
naive_fasttext.fit(X_train_vectors_pretrained, y_train)

In [27]:
acc_naive_fasttext, macro_f1_naive_fasttext = eval_ml(naive_fasttext,X_test_vectors_pretrained)

Testing ML:
Accuracy: 0.6296308838469353
Macro F1 score: 0.5911058412339066


In [33]:
naive_fast_row = ['NaiveBayes', 'FastText', acc_naive_fasttext, macro_f1_naive_fasttext]
df_results.loc[len(df_results)] = naive_fast_row

# Save best model and results

In [34]:
df_results

Unnamed: 0,Model,Embedding,Accuracy,Macro F1 Score
0,Logistic Regression,TFIDF(10000),0.800758,0.680216
1,Random Forest,TFIDF(10000),0.75233,0.60452
2,XGBoost,TFIDF(10000),0.764103,0.65087
3,NaiveBayes,TFIDF,0.861663,0.714218
4,Logistic Regression,W2V(CBOW),0.584209,0.496326
5,Random Forest,W2V(CBOW),0.728822,0.520346
6,NaiveBayes,W2V(CBOW),0.485349,0.390417
7,XGboost,W2V(CBOW),0.740855,0.543215
8,Logistic Regression,W2V(Skip-Gram),0.737401,0.623839
9,Random Forest,W2V(Skip-Gram),0.807702,0.615253


In [None]:
best_model = df_results[df_results['Accuracy'] == df_results['Accuracy'].max()]

In [None]:
best_model

Unnamed: 0,Model,Embedding,Accuracy,Macro F1 Score
3,NaiveBayes,TFIDF,0.861663,0.714218


In [None]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', ComplementNB())
])
pipe.fit(X_train, y_train)

In [49]:
ML_MODEL_PATH = '/content/Models/ml_model(Naive)_Pipe.pkl'


In [50]:
def save_ml_model(model, path=ML_MODEL_PATH):
    joblib.dump(model, path)

In [None]:
save_ml_model(pipe, ML_MODEL_PATH)

In [35]:
best_model2 = df_results[df_results['Macro F1 Score'] == df_results['Macro F1 Score'].max()]

In [36]:
best_model2

Unnamed: 0,Model,Embedding,Accuracy,Macro F1 Score
14,XGboost,FastText,0.781172,0.738159


In [42]:
class ArabicTokenizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.arabic_tokenize(text) for text in X]

    def arabic_tokenize(self, text):
        # Keep only Arabic letters and spaces
        text = re.sub(r'[^\u0600-\u06FF\s]', '', text)
        tokens = text.split()
        return tokens

In [43]:
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
        self.vector_size = model.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.document_vector(tokens) for tokens in X])

    def document_vector(self, tokens):
        valid_tokens = [w for w in tokens if w in self.model]
        if not valid_tokens:
            return np.zeros(self.vector_size)
        return np.mean(self.model[valid_tokens], axis=0)

In [44]:
pipe2 = Pipeline([
    ('tokenizer', ArabicTokenizer()),
    ('vectorizer', FastTextVectorizer(model=model)),
    ('classifier', XGBClassifier(n_estimators=200,class_weight='balanced'))
])
pipe2.fit(X_train, y_train)

Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [46]:
ML_MODEL_PATH2 = '/content/Models/ml_model(XG_Fast-text)_Pipe.pkl'


In [52]:
save_ml_model(pipe2, ML_MODEL_PATH2)

In [None]:
df_results.to_csv('/content/ML_Models_embedding_comparison.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')