In [50]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


In [None]:
# Load the cleaned dialect dataset from a hard-coded absolute path.
df = pd.read_csv('/content/dialects_cleaned.csv')

In [None]:
# Show the number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
id,0
text,75
dialect,0


In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [7]:
# Learn the corpus vocabulary from the 'text' column (fit returns the fitted vectorizer).
count_vec = CountVectorizer().fit(df['text'])
# Vocabulary terms, in the vectorizer's feature order.
vocap = count_vec.get_feature_names_out()

In [None]:
_VOCAB_RANK_CACHE = {}


def _vocab_rank(vocab):
  """Return a memoized ``word -> position`` lookup for ``vocab``."""
  cached = _VOCAB_RANK_CACHE.get('entry')
  # Rebuild only when a different vocabulary object is supplied (e.g. after re-fitting).
  if cached is None or cached[0] is not vocab:
    cached = (vocab, {word: pos for pos, word in enumerate(vocab)})
    _VOCAB_RANK_CACHE['entry'] = cached
  return cached[1]


def unique_words(text, vocab=None):
  """Return the distinct in-vocabulary words of ``text``, joined by single spaces.

  Words are emitted in vocabulary order (not in their original order in
  ``text``) and each word appears at most once. Whitespace-separated tokens
  that are not in the vocabulary are dropped.

  Args:
    text: Whitespace-separated string.
    vocab: Optional sequence of unique vocabulary words. Defaults to the
      notebook-level ``vocap``.

  Returns:
    A space-joined string; empty when no token of ``text`` is in the vocabulary.
  """
  rank = _vocab_rank(vocap if vocab is None else vocab)
  # Look up only the row's own words instead of scanning the whole vocabulary
  # for every row; sorting by vocabulary position keeps the original output order.
  kept = [word for word in set(text.split()) if word in rank]
  kept.sort(key=rank.__getitem__)
  return ' '.join(kept)
# Replace each text with its deduplicated, vocabulary-filtered form.
df['text'] = df['text'].map(unique_words)

In [None]:
# Preview the first rows after the 'text' column has been rewritten by unique_words.
df.head()

Unnamed: 0.1,Unnamed: 0,id,text,dialect
0,0,1009754958479151232,اختهم ادب المرأة او تتعاكس تقولي حقوق ردة عليه...,LY
1,1,1009794751548313600,الليبيين انا بالنسبة زمان ليا متقلبين ميليشياو...,LY
2,2,1019989115490787200,البنات الرومانسية اند انها بعدين بيرتاح تانيه ...,LY
3,3,1035479791758135168,اب ادير اصلا الادب الانسان البنت الف اليتيمة ا...,LY
4,4,1035481122921164800,ازوج اعصابك التخلف الراجل امتاعك بنت تحبيه خوت...,LY


In [8]:
# Map every vocabulary word to a 1-based integer id (id 0 is left unused here).
word2idx = {word: token_id for token_id, word in enumerate(vocap, start=1)}
def text_to_sequence(text, vocab_index=None):
    """Convert whitespace-separated ``text`` into a list of integer token ids.

    Tokens missing from the lookup are skipped instead of raising ``KeyError``,
    which matches how ``unique_words`` drops out-of-vocabulary words and lets
    the function be applied to text that has not been pre-filtered.

    Args:
        text: Whitespace-separated string.
        vocab_index: Optional ``word -> id`` mapping. Defaults to the
            notebook-level ``word2idx``.

    Returns:
        List of ids in token order; empty when no token is known.
    """
    lookup = word2idx if vocab_index is None else vocab_index
    return [lookup[word] for word in text.split() if word in lookup]

# Store each row's token-id list in a new 'seq' column.
df['seq'] = df['text'].map(text_to_sequence)



In [9]:
# Length of the longest entry in 'seq'; 0 when the column is empty.
max_sequence_len = max((len(token_ids) for token_ids in df['seq']), default=0)
print(max_sequence_len)

51


In [4]:
# Reload the frame that was saved together with its 'seq' column.
# NOTE(review): this cell sits before the cell that writes
# 'dialects_cleaned_seq.csv', so on a fresh kernel the file must already exist.
df = pd.read_csv('/content/dialects_cleaned_seq.csv')

In [None]:
# Save the processed frame (including the 'seq' column) for later sessions.
# index=False keeps the row index out of the file; without it every
# save -> read_csv round trip adds another 'Unnamed: 0' column.
df.to_csv('dialects_cleaned_seq.csv', index=False)

In [10]:
import ast


def _parse_seq_cell(cell):
    """Turn a stringified list into a real list; return any non-string value unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# 'seq' may hold string representations of lists; normalise every cell to an actual list.
df['seq'] = df['seq'].apply(_parse_seq_cell)

# Pad at the end (padding='post') so every sequence has length max_sequence_len.
X_padded = pad_sequences(df['seq'], maxlen=max_sequence_len, padding='post')

In [11]:
# Turn dialect names into integer class ids.
label_encoder = LabelEncoder().fit(df['dialect'])
y_encoded = label_encoder.transform(df['dialect'])

# First split: 80% train, 20% hold-out.
X_train, X_holdout, y_train_encoded, y_holdout_encoded = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42
)
# Second split of the hold-out: 80% of it becomes the test set and 20% the
# validation set (i.e. 16% and 4% of all rows respectively).
X_test, X_val, y_test_encoded, y_val_encoded = train_test_split(
    X_holdout, y_holdout_encoded, test_size=0.2, random_state=42
)

# One-hot targets for the categorical cross-entropy loss.
num_classes = len(label_encoder.classes_)
y_train_one_hot, y_val_one_hot, y_test_one_hot = (
    to_categorical(class_ids, num_classes=num_classes)
    for class_ids in (y_train_encoded, y_val_encoded, y_test_encoded)
)

In [12]:
# Number of entries in word2idx. Its ids start at 1, so the largest token id
# equals VOCAB_SIZE.
VOCAB_SIZE = len(word2idx)
print(VOCAB_SIZE)

245759


In [14]:
# Baseline: embedding -> single SimpleRNN -> dense head.
model = tf.keras.Sequential()
# One extra embedding row so that index 0 (padding) is valid alongside ids 1..VOCAB_SIZE.
model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64))
model.add(tf.keras.layers.SimpleRNN(64))
model.add(tf.keras.layers.Dense(64, activation='relu'))
# One softmax unit per dialect class.
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

In [15]:
# Adam with default settings; categorical cross-entropy because targets are one-hot.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [32]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [17]:
# Fit the baseline on one-hot targets, validating each epoch and using early stopping.
history = model.fit(
    X_train,
    y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.4370 - loss: 1.3721 - val_accuracy: 0.5970 - val_loss: 1.1182
Epoch 2/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.6849 - loss: 0.9035 - val_accuracy: 0.7010 - val_loss: 0.8896
Epoch 3/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.8141 - loss: 0.5713 - val_accuracy: 0.7091 - val_loss: 0.8895
Epoch 4/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.8865 - loss: 0.3586 - val_accuracy: 0.7161 - val_loss: 0.9591
Epoch 5/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.9255 - loss: 0.2397 - val_accuracy: 0.6954 - val_loss: 1.0598
Epoch 6/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.9360 - loss: 0.2080 - val_accuracy: 0.6752 - val_loss: 1.1150


In [18]:
# evaluate() returns [loss, accuracy] for the metrics configured in compile().
baseline_test_metrics = model.evaluate(X_test, y_test_one_hot)
test_loss, test_acc = baseline_test_metrics
print('Test accuracy:', test_acc)

[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6771 - loss: 1.1268
Test accuracy: 0.6773059964179993


In [19]:
# Two stacked bidirectional SimpleRNN layers on top of 64-d word embeddings.
model_bi = tf.keras.models.Sequential([
    # Token ids run 1..VOCAB_SIZE and 0 is the padding id, so the table needs
    # VOCAB_SIZE + 1 rows; with only VOCAB_SIZE rows the highest id is out of range.
    tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64),
    # return_sequences=True feeds the full sequence to the second recurrent layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One softmax unit per dialect class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])


In [20]:
# Same training configuration as the baseline: default Adam, categorical cross-entropy.
model_bi.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [21]:
# Train the bidirectional SimpleRNN with the shared early-stopping callback.
history_bi = model_bi.fit(
    X_train,
    y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 25ms/step - accuracy: 0.6007 - loss: 1.0275 - val_accuracy: 0.8143 - val_loss: 0.5496
Epoch 2/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 21ms/step - accuracy: 0.9185 - loss: 0.2508 - val_accuracy: 0.8056 - val_loss: 0.5938
Epoch 3/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 21ms/step - accuracy: 0.9757 - loss: 0.0792 - val_accuracy: 0.7980 - val_loss: 0.7043
Epoch 4/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 20ms/step - accuracy: 0.9894 - loss: 0.0349 - val_accuracy: 0.7855 - val_loss: 0.8708


In [48]:
# The evaluation cells call scikit-learn's f1_score. Re-importing it (rather than
# `del f1_score`) restores the function if the name was shadowed by a variable,
# and is harmless on a fresh Restart & Run All, where `del` would remove the
# imported function itself and make the later f1_score(...) calls fail.
from sklearn.metrics import f1_score

In [51]:
# Loss/accuracy of the bidirectional SimpleRNN on the test split.
bi_test_metrics = model_bi.evaluate(X_test, y_test_one_hot)
test_loss, test_acc = bi_test_metrics
print('Test accuracy:', test_acc)

# Macro F1 needs class ids, so collapse one-hot targets and softmax outputs via argmax.
y_pred = model_bi.predict(X_test)
true_ids = y_test_one_hot.argmax(axis=1)
pred_ids = y_pred.argmax(axis=1)
f1_bi = f1_score(true_ids, pred_ids, average='macro')
print('F1 score:', f1_bi)

[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7830 - loss: 0.8831
Test accuracy: 0.7868602871894836
[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
F1 score: 0.7451228706204442


In [26]:
# Two stacked bidirectional GRU layers on top of 64-d word embeddings.
gru_bi = tf.keras.models.Sequential([
    # Token ids run 1..VOCAB_SIZE and 0 is the padding id, so the table needs
    # VOCAB_SIZE + 1 rows; with only VOCAB_SIZE rows the highest id is out of range.
    tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64),
    # return_sequences=True feeds the full sequence to the second recurrent layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One softmax unit per dialect class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])


In [28]:
# Default Adam and categorical cross-entropy on one-hot targets; track accuracy.
gru_bi.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [29]:
# Train the bidirectional GRU with the shared early-stopping callback.
history_gru_bi = gru_bi.fit(
    X_train,
    y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 24ms/step - accuracy: 0.6334 - loss: 0.9370 - val_accuracy: 0.8205 - val_loss: 0.5106
Epoch 2/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.9206 - loss: 0.2432 - val_accuracy: 0.8241 - val_loss: 0.5369
Epoch 3/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.9722 - loss: 0.0909 - val_accuracy: 0.8093 - val_loss: 0.6521
Epoch 4/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 27ms/step - accuracy: 0.9847 - loss: 0.0500 - val_accuracy: 0.8034 - val_loss: 0.7696


In [53]:
# Loss/accuracy of the bidirectional GRU on the test split.
gru_test_metrics = gru_bi.evaluate(X_test, y_test_one_hot)
test_loss, test_acc = gru_test_metrics
print('Test accuracy:', test_acc)

# Macro F1 on class ids recovered with argmax from targets and predictions.
y_pred = gru_bi.predict(X_test)
true_ids = y_test_one_hot.argmax(axis=1)
pred_ids = y_pred.argmax(axis=1)
f1_gru = f1_score(true_ids, pred_ids, average='macro')
print('F1 score:', f1_gru)

[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8017 - loss: 0.7676
Test accuracy: 0.8050205111503601
[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
F1 score: 0.7694128778238811


In [38]:
# Two stacked bidirectional LSTM layers on top of 64-d word embeddings.
lstm_bi = tf.keras.models.Sequential([
    # Token ids run 1..VOCAB_SIZE and 0 is the padding id, so the table needs
    # VOCAB_SIZE + 1 rows; with only VOCAB_SIZE rows the highest id is out of range.
    tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64),
    # return_sequences=True feeds the full sequence to the second recurrent layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One softmax unit per dialect class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [39]:
# Default Adam and categorical cross-entropy on one-hot targets; track accuracy.
lstm_bi.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [40]:
# Train the stacked bidirectional LSTM with the shared early-stopping callback.
history_lstm_bi = lstm_bi.fit(
    X_train,
    y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 25ms/step - accuracy: 0.6207 - loss: 0.9693 - val_accuracy: 0.8212 - val_loss: 0.5039
Epoch 2/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 25ms/step - accuracy: 0.9233 - loss: 0.2418 - val_accuracy: 0.8251 - val_loss: 0.5222
Epoch 3/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 25ms/step - accuracy: 0.9722 - loss: 0.0947 - val_accuracy: 0.8134 - val_loss: 0.6271
Epoch 4/10
[1m923/923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 25ms/step - accuracy: 0.9852 - loss: 0.0504 - val_accuracy: 0.8077 - val_loss: 0.7346


In [54]:
# Loss/accuracy of the stacked bidirectional LSTM on the test split.
lstm_bi_test_metrics = lstm_bi.evaluate(X_test, y_test_one_hot)
test_loss, test_acc = lstm_bi_test_metrics
print('Test accuracy:', test_acc)

y_pred = lstm_bi.predict(X_test)
# Reduce one-hot targets and softmax outputs to class ids.
y_true = y_test_one_hot.argmax(axis=1)
y_pred_classes = y_pred.argmax(axis=1)
# Macro averaging weights every class equally ('weighted' and 'micro' are the alternatives).
f1_lstm = f1_score(y_true=y_true, y_pred=y_pred_classes, average='macro')
print('F1 score:', f1_lstm)

[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8230 - loss: 0.4961
Test accuracy: 0.8234347701072693
[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
F1 score: 0.7873179143684238


In [59]:
# Save the stacked bidirectional LSTM to 'lstm_bi.h5' (path relative to the working directory).
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format — confirm
# that this is the intended format for whoever loads the file.
lstm_bi.save('lstm_bi.h5')



array([4, 1, 2, ..., 1, 1, 0])

In [None]:
# Mount Google Drive at /content/drive. This relies on the google.colab package,
# so the cell only works inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [93]:
# Single bidirectional LSTM layer on top of 64-d word embeddings.
model_lstm = tf.keras.models.Sequential([
    # Token ids run 1..VOCAB_SIZE and 0 is the padding id, so the table needs
    # VOCAB_SIZE + 1 rows; with only VOCAB_SIZE rows the highest id is out of range.
    tf.keras.layers.Embedding(VOCAB_SIZE + 1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One softmax unit per dialect class.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [94]:
# Default Adam and categorical cross-entropy on one-hot targets; track accuracy.
model_lstm.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [95]:
# Train the single-layer bidirectional LSTM; note the smaller batch size (16) used here.
history_lstm = model_lstm.fit(
    X_train,
    y_train_one_hot,
    validation_data=(X_val, y_val_one_hot),
    epochs=10,
    batch_size=16,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m7383/7383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 18ms/step - accuracy: 0.7089 - loss: 0.7829 - val_accuracy: 0.8317 - val_loss: 0.4839
Epoch 2/10
[1m7383/7383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 18ms/step - accuracy: 0.9316 - loss: 0.2163 - val_accuracy: 0.8193 - val_loss: 0.5440
Epoch 3/10
[1m7383/7383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 18ms/step - accuracy: 0.9710 - loss: 0.0912 - val_accuracy: 0.8183 - val_loss: 0.6755
Epoch 4/10
[1m7383/7383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 18ms/step - accuracy: 0.9845 - loss: 0.0475 - val_accuracy: 0.8127 - val_loss: 0.8132


In [96]:
# Loss/accuracy of the single-layer bidirectional LSTM on the test split.
lstm_test_metrics = model_lstm.evaluate(X_test, y_test_one_hot)
test_loss, test_acc = lstm_test_metrics
print('Test accuracy:', test_acc)

# Macro F1 on class ids recovered with argmax. This rebinds f1_lstm, which the
# lstm_bi evaluation cell also assigns.
y_pred = model_lstm.predict(X_test)
true_ids = y_test_one_hot.argmax(axis=1)
pred_ids = y_pred.argmax(axis=1)
f1_lstm = f1_score(true_ids, pred_ids, average='macro')
print('F1 score:', f1_lstm)

[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8332 - loss: 0.4740
Test accuracy: 0.83406001329422
[1m739/739[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
F1 score: 0.8037958800430696


In [97]:
# Save the single-layer bidirectional LSTM to 'model_lstm_last.h5' (relative path).
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format — confirm
# that this is the intended format for whoever loads the file.
model_lstm.save('model_lstm_last.h5')

