In [2]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
from tqdm.auto import tqdm





In [3]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz

# 2. Unzip it (gunzip)
!gunzip cc.ar.300.vec.gz

--2025-10-07 22:09:50--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.108, 3.163.189.51, 3.163.189.96, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1272365870 (1.2G) [binary/octet-stream]
Saving to: ‘cc.ar.300.vec.gz’


2025-10-07 22:09:56 (197 MB/s) - ‘cc.ar.300.vec.gz’ saved [1272365870/1272365870]



In [9]:
df = pd.read_csv('/content/dialects_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,text,dialect
0,0,1009754958479151232,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,LY
1,1,1009794751548313600,الليبيين متقلبين بالنسبة ليا انا ميليشياوي زما...,LY
2,2,1019989115490787200,تانيه شاب ليبي بيرتاح لبنت مختلفة ويلاحظ انها ...,LY
3,3,1035479791758135168,رانيا عقليتك متخلفة اولا الانسان يلي يحتاج اهل...,LY
4,4,1035481122921164800,شكلك متعقدة علشان الراجل تحبيه ازوج بنت يتيمة ...,LY


In [10]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
id,0
text,75
dialect,0


In [11]:
df = df.dropna()

# Split the data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['dialect'], test_size=0.2, random_state=42)

In [13]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [14]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_train_encoded = to_categorical(y_train)
y_test = encoder.transform(y_test)
y_test_encoded = to_categorical(y_test)
y_val = encoder.transform(y_val)
y_val_encoded = to_categorical(y_val)

# Embedding

In [15]:
def get_longest_text(texts):
    longest_input = 0
    for text in texts:
        text_len= len(text.split())
        longest_input = max(longest_input, text_len)
    return longest_input

In [16]:
longest_input = get_longest_text(df['text'])
longest_input

61

In [17]:
# Tokenize the text data


def arabic_tokenize(text):
    # Keep only Arabic letters and spaces
    tokens = text.split()
    return tokens

X_train_tokens = [arabic_tokenize(text) for text in X_train]
X_test_tokens = [arabic_tokenize(text) for text in X_test]
X_val_tokens = [arabic_tokenize(text) for text in X_val]

In [18]:
model = KeyedVectors.load_word2vec_format("cc.ar.300.vec", binary=False)

In [19]:
X_train_vec = np.zeros((len(X_train_tokens), longest_input, model.vector_size))

for i, tokens in enumerate(tqdm(X_train_tokens, total=len(X_train_tokens))):
    for j, token in enumerate(tokens[:longest_input]):  # truncate if longer
        if token in model:
            X_train_vec[i, j] = model[token]
        else:
            X_train_vec[i, j] = np.zeros(model.vector_size)  # OOV -> zero vector

print("Embedding shape:", X_train_vec.shape)

  0%|          | 0/118120 [00:00<?, ?it/s]

Embedding shape: (118120, 61, 300)


In [20]:
X_test_vec = np.zeros((len(X_test_tokens), longest_input, model.vector_size))

for i, tokens in enumerate(tqdm(X_test_tokens, total=len(X_test_tokens))):
    for j, token in enumerate(tokens[:longest_input]):  # truncate if longer
        if token in model:
            X_test_vec[i, j] = model[token]
        else:
            X_test_vec[i, j] = np.zeros(model.vector_size)  # OOV -> zero vector

print("Embedding shape:", X_test_vec.shape)

  0%|          | 0/14765 [00:00<?, ?it/s]

Embedding shape: (14765, 61, 300)


In [21]:
X_val_vec = np.zeros((len(X_val_tokens), longest_input, model.vector_size))

for i, tokens in enumerate(tqdm(X_val_tokens, total=len(X_val_tokens))):
    for j, token in enumerate(tokens[:longest_input]):  # truncate if longer
        if token in model:
            X_val_vec[i, j] = model[token]
        else:
            X_val_vec[i, j] = np.zeros(model.vector_size)  # OOV -> zero vector

print("Embedding shape:", X_val_vec.shape)

  0%|          | 0/14765 [00:00<?, ?it/s]

Embedding shape: (14765, 61, 300)


In [22]:
import tensorflow as tf

# define the network
inputs = tf.keras.layers.Input((longest_input, 300))
reshaped = tf.keras.layers.Reshape((longest_input, 300, 1))(inputs)


filters = [2, 3, 4]

# define the conv net
conv_1 = tf.keras.layers.Conv2D(200, (filters[0], 300), activation='relu')(reshaped)
conv_2 = tf.keras.layers.Conv2D(200, (filters[1], 300), activation='relu')(reshaped)
conv_3 = tf.keras.layers.Conv2D(200, (filters[2], 300), activation='relu')(reshaped)

# define max-pooling
pool_1 = tf.keras.layers.MaxPooling2D((longest_input - filters[0] + 1, 1), strides=(1,1))(conv_1)
pool_2 = tf.keras.layers.MaxPooling2D((longest_input - filters[1] + 1, 1), strides=(1,1))(conv_2)
pool_3 = tf.keras.layers.MaxPooling2D((longest_input - filters[2] + 1, 1), strides=(1,1))(conv_3)

# concatenate the convs
merged_tensor = tf.keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)

# now flatten them and add a dense layer
flatten = tf.keras.layers.Flatten()(merged_tensor)

# add a dense layer
clf = tf.keras.layers.Dense(200, activation='relu')(flatten)

# add final output
clf = tf.keras.layers.Dense(len(encoder.classes_), activation='softmax')(clf)

In [23]:
# compile the model
model = tf.keras.models.Model(inputs, clf)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

In [24]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [25]:
model.fit(X_train_vec, y_train_encoded, epochs=17,validation_data=(X_val_vec,y_val_encoded),callbacks=[early_stopping],)

Epoch 1/17
[1m3692/3692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 48ms/step - acc: 0.7538 - loss: 0.6782 - val_acc: 0.8071 - val_loss: 0.5329
Epoch 2/17
[1m3692/3692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 48ms/step - acc: 0.8381 - loss: 0.4604 - val_acc: 0.8136 - val_loss: 0.5221
Epoch 3/17
[1m3692/3692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 47ms/step - acc: 0.8771 - loss: 0.3515 - val_acc: 0.8154 - val_loss: 0.5462
Epoch 4/17
[1m3692/3692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 47ms/step - acc: 0.9163 - loss: 0.2464 - val_acc: 0.8125 - val_loss: 0.6006
Epoch 5/17
[1m3692/3692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 47ms/step - acc: 0.9408 - loss: 0.1745 - val_acc: 0.8015 - val_loss: 0.7510


<keras.src.callbacks.history.History at 0x7c9de5251f10>

In [26]:
test_acc = model.evaluate(X_test_vec, y_test_encoded)
print('Test accuracy:', test_acc)

[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - acc: 0.8220 - loss: 0.4992
Test accuracy: [0.5137262344360352, 0.8176091909408569]


In [27]:
y_pred = model.predict(X_test_vec)
macof1 = f1_score(y_test, np.argmax(y_pred, axis=1), average='macro')
print("Macro F1 Score:", macof1)

[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step
Macro F1 Score: 0.7859224815476487


In [28]:
model.save("CNN_model.h5")

