In [1]:
import pandas as pd
import numpy as np
import torch
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from adapters import AutoAdapterModel, AdapterConfig
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import string


In [None]:
# Report whether PyTorch can see a CUDA device, and which one is active.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available. Using CPU.")
else:
    print("CUDA is available. GPU is ready to use.")
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_count}")
    active_gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())
    print(f"Current GPU: {active_gpu_name}")

CUDA is available. GPU is ready to use.
Number of GPUs available: 1
Current GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
import os
from pathlib import Path

# Location of the dialect corpus. The default is the original author's local
# path; set the DIALECT_DATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "DIALECT_DATA_CSV",
        "C:/Users/saifk/OneDrive/Desktop/FINAL_data2.csv",
    )
)

df_final = pd.read_csv(DATA_PATH)

# Later cells read exactly these two columns, so fail early if they are absent.
missing_columns = {"text", "dialect"} - set(df_final.columns)
if missing_columns:
    raise KeyError(
        f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# NOTE(review): the captured output of this cell shows country codes (AE, OM,
# SA, ...) in `dialect`, while the tokenization cell expects regional group
# names (Khaleeji, Levantine, ...). The relabelling step is not visible in
# this notebook - confirm where it happens before a fresh Run All.
df_final

Unnamed: 0.1,Unnamed: 0,text,dialect
0,0,المفروض انلوم ربعنا الي يعطون المجال,AE
1,1,ربعنا مب مقصرين اهم شي عندهم المظاهر مديون سنه,AE
2,2,المشكله انه مب المشكله انه,AE
3,3,باختصار قاعد اقولك هالكلام كخليجي,AE
4,4,اذكر ان شفت فيديو بنت اوروبا الشرقيه تقول انا ...,AE
...,...,...,...
1091605,1091605,منع الخمر عمان,OM
1091606,1091606,ياحظ عنده ريهام,SA
1091607,1091607,قولك شلونك ابن العم,SY
1091608,1091608,مجنون سهيله الابواب,DZ


In [None]:
# Pull the two columns out as plain lists, then carve off a 20% validation
# split that keeps the per-dialect proportions (stratified, fixed seed).
all_texts = df_final['text'].tolist()
all_dialects = df_final['dialect'].tolist()

split_parts = train_test_split(
    all_texts,
    all_dialects,
    test_size=0.2,
    random_state=42,
    stratify=df_final['dialect'],
)
train_texts, val_texts, train_labels, val_labels = split_parts


In [15]:
def _as_hf_dataset(texts, dialects):
    """Wrap parallel text / dialect lists in a ``Dataset`` with those two columns."""
    return Dataset.from_dict({'text': texts, 'dialect': dialects})


# One Dataset per split, bundled under the "train" / "validation" keys.
train_dataset = _as_hf_dataset(train_texts, train_labels)
val_dataset = _as_hf_dataset(val_texts, val_labels)
datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
# The same checkpoint supplies both the tokenizer and the classifier backbone.
checkpoint_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Class-id <-> dialect-group tables stored in the model config (7 groups).
class_names_by_id = {
    0: 'Khaleeji',
    1: 'Levantine',
    2: 'Maghrebi',
    3: 'Iraqi',
    4: 'Lybia',
    5: 'Egyptian',
    6: 'Sudani',
}
class_ids_by_name = {
    'Khaleeji': 0,
    'Levantine': 1,
    'Maghrebi': 2,
    'Iraqi': 3,
    'Lybia': 4,
    'Egyptian': 5,
    'Sudani': 6,
}

base_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=7,
    id2label=class_names_by_id,
    label2id=class_ids_by_name,
)
base_model.gradient_checkpointing_enable()

# LoRA adapter settings for sequence classification: rank 8, alpha 16,
# dropout 0.1, no bias terms.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(base_model, lora_config)

# Move the wrapped model to the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Dialect-group name -> class id. Built once instead of on every batch.
DIALECT_TO_ID = {'Khaleeji': 0, 'Levantine': 1, 'Maghrebi': 2, 'Iraqi': 3, 'Lybia': 4, 'Egyptian': 5, 'Sudani': 6}


def tokenize_function(examples):
    """Tokenize a batch of texts and attach integer class labels.

    Args:
        examples: Batch mapping with a ``"text"`` list and a parallel
            ``"dialect"`` list of dialect-group names.

    Returns:
        The tokenizer output for ``examples["text"]`` (padded/truncated to
        280 tokens) with an added ``"labels"`` list of class ids.

    Raises:
        KeyError: If any dialect value is not one of the seven known group
            names; the message lists the offending values.
    """
    dialects = examples["dialect"]

    # Validate labels first so a bad batch fails with an actionable message
    # rather than a bare KeyError on the first unknown value.
    unknown = sorted({d for d in dialects if d not in DIALECT_TO_ID}, key=str)
    if unknown:
        raise KeyError(
            f"Unknown dialect label(s) {unknown}; "
            f"expected one of {list(DIALECT_TO_ID)}"
        )

    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=280
    )
    tokenized_inputs["labels"] = [DIALECT_TO_ID[d] for d in dialects]
    return tokenized_inputs

# Apply tokenization and label encoding to every split, one batch at a time.
tokenized_datasets = datasets.map(function=tokenize_function, batched=True)

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class for each row is the index of its largest logit
    along the last axis.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    scores, gold_ids = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold_ids, predicted_ids)
    return dict(accuracy=accuracy)

# Mixed-precision (fp16) training needs a CUDA device. The notebook otherwise
# supports a CPU fallback, so enable fp16 only when a GPU is present instead
# of failing at argument construction on CPU-only machines.
use_fp16 = torch.cuda.is_available()

# Evaluate and checkpoint once per epoch; with load_best_model_at_end the
# best checkpoint is restored when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    fp16=use_fp16,
)

# Stop early once the monitored metric has failed to improve by at least
# 0.01 for two consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01,
)

# Collect the Trainer wiring in one place: model, arguments, both tokenized
# splits, the tokenizer and the accuracy metric.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_kwargs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100736 [00:00<?, ? examples/s]

Map:   0%|          | 0/25185 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:
# Run fine-tuning, then write the resulting model to ./final_model.
final_model_dir = "./final_model"
trainer.train()
trainer.save_model(output_dir=final_model_dir)

  0%|          | 0/31480 [00:00<?, ?it/s]

{'loss': 1.8804, 'grad_norm': 5.238877296447754, 'learning_rate': 1.968360864040661e-05, 'epoch': 0.08}
{'loss': 1.5839, 'grad_norm': 6.075682640075684, 'learning_rate': 1.9366581956797968e-05, 'epoch': 0.16}
{'loss': 1.3925, 'grad_norm': 7.659396648406982, 'learning_rate': 1.904955527318933e-05, 'epoch': 0.24}
{'loss': 1.3487, 'grad_norm': 7.270263195037842, 'learning_rate': 1.873189326556544e-05, 'epoch': 0.32}
{'loss': 1.2967, 'grad_norm': 8.4639253616333, 'learning_rate': 1.841423125794155e-05, 'epoch': 0.4}
{'loss': 1.2623, 'grad_norm': 9.559929847717285, 'learning_rate': 1.8096569250317664e-05, 'epoch': 0.48}
{'loss': 1.2441, 'grad_norm': 5.662931442260742, 'learning_rate': 1.7778907242693773e-05, 'epoch': 0.56}
{'loss': 1.2557, 'grad_norm': 10.234264373779297, 'learning_rate': 1.746124523506989e-05, 'epoch': 0.64}
{'loss': 1.2519, 'grad_norm': 8.399125099182129, 'learning_rate': 1.7143583227446e-05, 'epoch': 0.71}
{'loss': 1.2381, 'grad_norm': 9.034385681152344, 'learning_rate':

  0%|          | 0/1575 [00:00<?, ?it/s]

{'eval_loss': 1.1545453071594238, 'eval_accuracy': 0.5920984713122891, 'eval_runtime': 382.4308, 'eval_samples_per_second': 65.855, 'eval_steps_per_second': 4.118, 'epoch': 1.0}
{'loss': 1.2082, 'grad_norm': 7.578114986419678, 'learning_rate': 1.5873570520965694e-05, 'epoch': 1.03}
{'loss': 1.18, 'grad_norm': 6.142395496368408, 'learning_rate': 1.5555908513341807e-05, 'epoch': 1.11}
{'loss': 1.1849, 'grad_norm': 8.535268783569336, 'learning_rate': 1.5238246505717918e-05, 'epoch': 1.19}
{'loss': 1.1679, 'grad_norm': 10.79787540435791, 'learning_rate': 1.4920584498094029e-05, 'epoch': 1.27}
{'loss': 1.1673, 'grad_norm': 5.017571449279785, 'learning_rate': 1.4602922490470141e-05, 'epoch': 1.35}
{'loss': 1.1895, 'grad_norm': 6.41213846206665, 'learning_rate': 1.4285260482846252e-05, 'epoch': 1.43}
{'loss': 1.1625, 'grad_norm': 6.441224575042725, 'learning_rate': 1.3967598475222364e-05, 'epoch': 1.51}
{'loss': 1.1586, 'grad_norm': 8.88980770111084, 'learning_rate': 1.3649936467598477e-05, '

  0%|          | 0/1575 [00:00<?, ?it/s]

{'eval_loss': 1.0915614366531372, 'eval_accuracy': 0.6142942227516379, 'eval_runtime': 380.0901, 'eval_samples_per_second': 66.261, 'eval_steps_per_second': 4.144, 'epoch': 2.0}
{'loss': 1.1393, 'grad_norm': 8.064663887023926, 'learning_rate': 1.1744599745870395e-05, 'epoch': 2.06}
{'loss': 1.1244, 'grad_norm': 11.802730560302734, 'learning_rate': 1.1426937738246507e-05, 'epoch': 2.14}
{'loss': 1.1268, 'grad_norm': 8.948357582092285, 'learning_rate': 1.110927573062262e-05, 'epoch': 2.22}
{'loss': 1.132, 'grad_norm': 8.53919506072998, 'learning_rate': 1.079161372299873e-05, 'epoch': 2.3}
{'loss': 1.1239, 'grad_norm': 8.62474250793457, 'learning_rate': 1.0473951715374841e-05, 'epoch': 2.38}
{'loss': 1.1168, 'grad_norm': 8.862709045410156, 'learning_rate': 1.0156289707750955e-05, 'epoch': 2.46}
{'loss': 1.1501, 'grad_norm': 7.885191917419434, 'learning_rate': 9.839263024142314e-06, 'epoch': 2.54}
{'loss': 1.1142, 'grad_norm': 17.14594078063965, 'learning_rate': 9.521601016518425e-06, 'epo

  0%|          | 0/1575 [00:00<?, ?it/s]

{'eval_loss': 1.0675532817840576, 'eval_accuracy': 0.6212428032559063, 'eval_runtime': 345.4005, 'eval_samples_per_second': 72.915, 'eval_steps_per_second': 4.56, 'epoch': 3.0}
{'loss': 1.1144, 'grad_norm': 11.215519905090332, 'learning_rate': 7.93456162642948e-06, 'epoch': 3.02}
{'loss': 1.112, 'grad_norm': 11.179337501525879, 'learning_rate': 7.616899618805591e-06, 'epoch': 3.1}
{'loss': 1.1092, 'grad_norm': 8.941370964050293, 'learning_rate': 7.2992376111817035e-06, 'epoch': 3.18}
{'loss': 1.1114, 'grad_norm': 8.120783805847168, 'learning_rate': 6.981575603557815e-06, 'epoch': 3.26}
{'loss': 1.108, 'grad_norm': 10.256205558776855, 'learning_rate': 6.663913595933928e-06, 'epoch': 3.34}
{'loss': 1.0928, 'grad_norm': 9.199749946594238, 'learning_rate': 6.3462515883100385e-06, 'epoch': 3.41}
{'loss': 1.1098, 'grad_norm': 8.572525024414062, 'learning_rate': 6.029224904701399e-06, 'epoch': 3.49}
{'loss': 1.1024, 'grad_norm': 7.057705879211426, 'learning_rate': 5.71156289707751e-06, 'epoch

  0%|          | 0/1575 [00:00<?, ?it/s]

{'eval_loss': 1.0541532039642334, 'eval_accuracy': 0.6258884256501887, 'eval_runtime': 366.0296, 'eval_samples_per_second': 68.806, 'eval_steps_per_second': 4.303, 'epoch': 4.0}
{'loss': 1.0837, 'grad_norm': 7.6045241355896, 'learning_rate': 3.8062261753494285e-06, 'epoch': 4.05}
{'loss': 1.1119, 'grad_norm': 11.70776081085205, 'learning_rate': 3.4885641677255406e-06, 'epoch': 4.13}
{'loss': 1.0999, 'grad_norm': 11.236322402954102, 'learning_rate': 3.170902160101652e-06, 'epoch': 4.21}
{'loss': 1.097, 'grad_norm': 9.425206184387207, 'learning_rate': 2.8532401524777634e-06, 'epoch': 4.29}
{'loss': 1.1015, 'grad_norm': 10.701169967651367, 'learning_rate': 2.535578144853876e-06, 'epoch': 4.37}
{'loss': 1.1036, 'grad_norm': 9.513116836547852, 'learning_rate': 2.2179161372299876e-06, 'epoch': 4.45}
{'loss': 1.092, 'grad_norm': 9.019708633422852, 'learning_rate': 1.9002541296060993e-06, 'epoch': 4.53}
{'loss': 1.0855, 'grad_norm': 9.877575874328613, 'learning_rate': 1.582592121982211e-06, 'e

  0%|          | 0/1575 [00:00<?, ?it/s]

{'eval_loss': 1.0489318370819092, 'eval_accuracy': 0.6270399047051817, 'eval_runtime': 338.4404, 'eval_samples_per_second': 74.415, 'eval_steps_per_second': 4.654, 'epoch': 5.0}
{'train_runtime': 23534.3822, 'train_samples_per_second': 21.402, 'train_steps_per_second': 1.338, 'train_loss': 1.1664466964547182, 'epoch': 5.0}


In [None]:
# Sample paragraph to classify.
text = " شنو القصة اليوم؟ عايزين نطلع ولا كدي؟ أنا زهجت من القعدة في البيت، عايز أروح مكان زول، يمكن كافيه في الخرطوم، نشرب قهوة ونسمع أغاني. أمس شفت فلم، بس كان شوية ممل. لو بنطلع، لازم نحدد بدري، عشان الزحمة بتكون كتيرة بالليل. ولو عايزين ناكل، شي خفيف زي فول أو تميس. شنو رأيكم، نروح السينما ولا نقعد في البيت؟ أنا والله نفسي أغير جو، يعني لو فيه مطعم جديد، بيكون أحسن كتير."

# Tokenize with the same 280-token limit used for training.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=280)

# Put the input tensors on whatever device the model lives on.
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Read the class-id -> name table from the model config (set when the model
# was loaded) instead of re-declaring it by hand, so this cell cannot drift
# from the labels the model was trained with.
reverse_dialect_mapping = {int(class_id): name for class_id, name in model.config.id2label.items()}
label2id = {name: class_id for class_id, name in reverse_dialect_mapping.items()}

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    predicted_dialect = reverse_dialect_mapping[predicted_class]

print(f"Input Text: {text}")
print(f"Predicted Dialect: {predicted_dialect}")
print("Probabilities for each dialect:")
# Look each name up by class index rather than relying on dict iteration
# order lining up with the logit positions.
for class_id, prob in enumerate(probabilities[0]):
    print(f"{reverse_dialect_mapping[class_id]}: {prob:.4f}")

Input Text:  شنو القصة اليوم؟ عايزين نطلع ولا كدي؟ أنا زهجت من القعدة في البيت، عايز أروح مكان زول، يمكن كافيه في الخرطوم، نشرب قهوة ونسمع أغاني. أمس شفت فلم، بس كان شوية ممل. لو بنطلع، لازم نحدد بدري، عشان الزحمة بتكون كتيرة بالليل. ولو عايزين ناكل، شي خفيف زي فول أو تميس. شنو رأيكم، نروح السينما ولا نقعد في البيت؟ أنا والله نفسي أغير جو، يعني لو فيه مطعم جديد، بيكون أحسن كتير.
Predicted Dialect: Sudani
Probabilities for each dialect:
Khaleeji: 0.0030
Levantine: 0.0035
Maghrebi: 0.0017
Iraqi: 0.0018
Lybia: 0.0034
Egyptian: 0.0078
Sudani: 0.9789


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt


# Score the validation split with the trained model: argmax over the class
# scores gives predicted ids, label_ids holds the ground truth.
predictions = trainer.predict(tokenized_datasets["validation"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

label2id = {
    'Khaleeji': 0,
    'Levantine': 1,
    'Maghrebi': 2,
    'Iraqi': 3,
    'Lybia': 4,
    'Egyptian': 5,
    'Sudani': 6
}
inverse_dialect_mapping = {v: k for k, v in label2id.items()}

# Pin the class order explicitly so report rows and matrix axes always match
# the names, even if some class is absent from the predictions.
class_ids = sorted(inverse_dialect_mapping)
class_names = [inverse_dialect_mapping[class_id] for class_id in class_ids]

report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    output_dict=True
)
overall_metrics = report['weighted avg']
# Computed directly so it does not depend on which summary keys the report
# emits when `labels` is supplied.
accuracy = float(np.mean(y_true == y_pred))

print("Overall Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {overall_metrics['precision']:.4f}")
print(f"Recall: {overall_metrics['recall']:.4f}")
print(f"F1-Score: {overall_metrics['f1-score']:.4f}")

print("\nPer-Dialect Metrics:")
for dialect in class_names:
    print(f"{dialect}:")
    print(f"  Precision: {report[dialect]['precision']:.4f}")
    print(f"  Recall: {report[dialect]['recall']:.4f}")
    print(f"  F1-Score: {report[dialect]['f1-score']:.4f}")

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
# Create the axes up front and hand them to disp.plot(); without `ax` the
# display opens its own figure and the requested 10x8 size is never applied.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title("Confusion Matrix for Dialect Classification")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

  0%|          | 0/1575 [00:00<?, ?it/s]

Overall Evaluation Metrics:
Accuracy: 0.6270
Precision: 0.6362
Recall: 0.6270
F1-Score: 0.6269

Per-Dialect Metrics:
Khaleeji:
  Precision: 0.4715
  Recall: 0.6589
  F1-Score: 0.5497
Levantine:
  Precision: 0.6755
  Recall: 0.6595
  F1-Score: 0.6674
Maghrebi:
  Precision: 0.7007
  Recall: 0.6567
  F1-Score: 0.6780
Iraqi:
  Precision: 0.6768
  Recall: 0.5771
  F1-Score: 0.6230
Lybia:
  Precision: 0.5260
  Recall: 0.4190
  F1-Score: 0.4664
Egyptian:
  Precision: 0.6556
  Recall: 0.7599
  F1-Score: 0.7039
Sudani:
  Precision: 0.7471
  Recall: 0.6582
  F1-Score: 0.6999


  plt.show()


In [None]:
import json
# Persist the per-class and aggregate metrics as JSON.
with open("evaluation_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=4)

# Save confusion matrix plot.
# The figure was already shown in an earlier cell, so the pyplot "current
# figure" is no longer the confusion matrix and plt.savefig would write a
# blank image. Save the figure object held by the display instead.
disp.figure_.savefig("confusion_matrix.png", bbox_inches="tight")
plt.close(disp.figure_)

In [None]:
from gtts import gTTS
import os
import subprocess
import sys

# Build the sentence to speak: the predicted dialect followed by the text.
tts_text = f"النص باللهجة {predicted_dialect}: {text}"
tts = gTTS(text=tts_text, lang='ar')

audio_path = "output_audio.mp3"
tts.save(audio_path)

# Open the file with the platform's default player. `start` is a Windows
# shell builtin, so the original os.system call did nothing useful elsewhere.
if sys.platform.startswith("win"):
    os.startfile(audio_path)
elif sys.platform == "darwin":
    subprocess.run(["open", audio_path], check=False)
else:
    subprocess.run(["xdg-open", audio_path], check=False)

0