In [None]:
import pandas as pd
import numpy as np
import torch
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from adapters import AutoAdapterModel, AdapterConfig
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import string


In [None]:
# Install pinned torch / torchvision / torchaudio builds from the PyTorch cu118 wheel index.
# NOTE(review): this cell sits after the cell that already does `import torch`; on a
# fresh environment it presumably has to be run first (followed by a kernel restart)
# for the pinned build to be the one that gets imported — confirm.
%pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Report which device the rest of the notebook will run on.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available. Using CPU.")
else:
    active_gpu_index = torch.cuda.current_device()
    print("CUDA is available. GPU is ready to use.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(active_gpu_index)}")

In [None]:
import os

# Location of the labelled corpus. Set the DIALECT_DATA_CSV environment variable to
# run the notebook on another machine; the default is the original author's path.
DATA_CSV_PATH = os.environ.get(
    "DIALECT_DATA_CSV",
    "C:/Users/saifk/OneDrive/Desktop/FINAL_data2.csv",
)
df_final = pd.read_csv(DATA_CSV_PATH)

# Fail here, with a readable message, if the two columns the later cells rely on
# ('text' as model input, 'dialect' as the class label) are not in the file.
missing_columns = {"text", "dialect"} - set(df_final.columns)
if missing_columns:
    raise KeyError(
        f"{DATA_CSV_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# Preview the first rows only instead of rendering the whole frame.
df_final.head()

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the dialect
# column so both parts keep the same class proportions, and seeded so it is repeatable.
dialect_column = df_final['dialect']
split_parts = train_test_split(
    df_final['text'].tolist(),
    dialect_column.tolist(),
    stratify=dialect_column,
    random_state=42,
    test_size=0.2,
)
train_texts, val_texts, train_labels, val_labels = split_parts


In [None]:
# Wrap the two splits as Hugging Face datasets and group them under the
# "train" / "validation" keys used by the training and evaluation cells.
datasets = DatasetDict(
    {
        "train": Dataset.from_dict({'text': train_texts, 'dialect': train_labels}),
        "validation": Dataset.from_dict({'text': val_texts, 'dialect': val_labels}),
    }
)
train_dataset = datasets["train"]
val_dataset = datasets["validation"]

In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
# Seed PyTorch before any weights are created. The classification head added on top
# of the pretrained encoder and the LoRA adapter weights are randomly initialised in
# this cell, i.e. before the Trainer gets a chance to apply its own seed, so without
# this line two runs of the notebook start from different weights.
torch.manual_seed(42)

MODEL_NAME = "aubmindlab/bert-base-arabertv02"

# Single definition of the class table; id2label is derived from it so the two
# directions cannot drift apart.
DIALECT_LABEL2ID = {
    'Khaleeji': 0,
    'Levantine': 1,
    'Maghrebi': 2,
    'Iraqi': 3,
    'Lybia': 4,
    'Egyptian': 5,
    'Sudani': 6,
}
DIALECT_ID2LABEL = {class_id: name for name, class_id in DIALECT_LABEL2ID.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(DIALECT_LABEL2ID),  # 7 dialect groups
    id2label=DIALECT_ID2LABEL,
    label2id=DIALECT_LABEL2ID,
)
# Trade compute for memory: activations are recomputed during the backward pass.
base_model.gradient_checkpointing_enable()

# LoRA adapter for sequence classification: rank 8, scaling alpha 16, 10% dropout on
# the adapter path, bias terms left untouched.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
model = get_peft_model(base_model, lora_config)

# Use the GPU when there is one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label -> class-id table, copied once from the model configuration so the training
# labels always agree with the ids the classifier head was created with. Kept as a
# small module-level dict so it is not rebuilt on every batch.
DIALECT_TO_ID = dict(base_model.config.label2id)


def tokenize_function(examples):
    """Tokenize a batch of rows and attach integer class labels.

    Args:
        examples: a batch mapping with a "text" list (input strings) and a
            "dialect" list (label names), as passed by ``map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, padded/truncated to 280 tokens, with
        an added "labels" list holding the class id of each row.

    Raises:
        KeyError: if any value in "dialect" is not a known label name.
    """
    dialects = examples["dialect"]

    # Report every unexpected label at once instead of failing on the first one
    # with a bare KeyError that gives no hint about the valid values.
    unknown = sorted({d for d in dialects if d not in DIALECT_TO_ID}, key=str)
    if unknown:
        raise KeyError(
            f"Unknown dialect label(s) {unknown}; expected one of {sorted(DIALECT_TO_ID)}"
        )

    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=280
    )
    tokenized_inputs["labels"] = [DIALECT_TO_ID[d] for d in dialects]
    return tokenized_inputs

# Apply tokenize_function to both splits, one batch of rows at a time.
tokenized_datasets = datasets.map(
    function=tokenize_function,
    batched=True,
)

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of each
    row is the index of its largest logit along the last axis.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    return {"accuracy": accuracy}

# Training configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be restored when training ends.
# No metric_for_best_model is given, so with load_best_model_at_end=True the
# "best" checkpoint (and the early-stopping callback) is judged on evaluation loss.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    # Mixed precision needs a GPU. The notebook explicitly supports a CPU fallback,
    # and requesting fp16 unconditionally makes this constructor fail there.
    fp16=torch.cuda.is_available(),
)

# Stop once the monitored metric has failed to improve by more than 0.01 for two
# evaluations in a row.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01,
)

# Bundle model, data, tokenizer and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning, then write the resulting model to ./final_model.
train_result = trainer.train()
trainer.save_model(output_dir="./final_model")

In [None]:
# Sample text used for a quick manual check of the fine-tuned classifier.
text = " شنو القصة اليوم؟ عايزين نطلع ولا كدي؟ أنا زهجت من القعدة في البيت، عايز أروح مكان زول، يمكن كافيه في الخرطوم، نشرب قهوة ونسمع أغاني. أمس شفت فلم، بس كان شوية ممل. لو بنطلع، لازم نحدد بدري، عشان الزحمة بتكون كتيرة بالليل. ولو عايزين ناكل، شي خفيف زي فول أو تميس. شنو رأيكم، نروح السينما ولا نقعد في البيت؟ أنا والله نفسي أغير جو، يعني لو فيه مطعم جديد، بيكون أحسن كتير."

# Encode the text as PyTorch tensors, truncated to at most 280 tokens.
encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=280)

# Move every input tensor to the device the model is on.
device = model.device
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Class table and its inverse (class id -> dialect name).
label2id = {'Khaleeji': 0, 'Levantine': 1, 'Maghrebi': 2, 'Iraqi': 3, 'Lybia': 4, 'Egyptian': 5, 'Sudani': 6}
reverse_dialect_mapping = {class_id: name for name, class_id in label2id.items()}

# Forward pass in eval mode, without tracking gradients.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
    predicted_class = torch.argmax(logits, dim=-1).item()
predicted_dialect = reverse_dialect_mapping[predicted_class]

print(f"Input Text: {text}")
print(f"Predicted Dialect: {predicted_dialect}")
print("Probabilities for each dialect:")
for class_id, prob in enumerate(probabilities[0]):
    print(f"{reverse_dialect_mapping[class_id]}: {prob:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt


# Run the fine-tuned model over the validation split and take the highest-scoring
# class of each row as the prediction.
predictions = trainer.predict(tokenized_datasets["validation"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

label2id = {
    'Khaleeji': 0,
    'Levantine': 1,
    'Maghrebi': 2,
    'Iraqi': 3,
    'Lybia': 4,
    'Egyptian': 5,
    'Sudani': 6
}
inverse_dialect_mapping = {v: k for k, v in label2id.items()}

# Fix the class order explicitly. Without `labels=` scikit-learn only reports the
# classes that actually occur in y_true / y_pred, so a class missing from both
# would no longer line up with the seven names passed as target/display labels.
class_ids = sorted(inverse_dialect_mapping)
class_names = [inverse_dialect_mapping[class_id] for class_id in class_ids]

report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    output_dict=True
)
overall_metrics = report['weighted avg']

print("Overall Evaluation Metrics:")
print(f"Accuracy: {report['accuracy']:.4f}")
print(f"Precision: {overall_metrics['precision']:.4f}")
print(f"Recall: {overall_metrics['recall']:.4f}")
print(f"F1-Score: {overall_metrics['f1-score']:.4f}")

print("\nPer-Dialect Metrics:")
for dialect in class_names:
    print(f"{dialect}:")
    print(f"  Precision: {report[dialect]['precision']:.4f}")
    print(f"  Recall: {report[dialect]['recall']:.4f}")
    print(f"  F1-Score: {report[dialect]['f1-score']:.4f}")

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Create the figure first and hand its axes to disp.plot(). Called without `ax`,
# plot() opens a figure of its own, which left the 10x8 figure empty and the
# matrix drawn at the default size.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation=45)
ax.set_title("Confusion Matrix for Dialect Classification")
fig.tight_layout()
plt.show()

In [None]:
import json
# Persist the per-class and overall metrics computed in the evaluation cell.
with open("evaluation_report.json", "w", encoding="utf-8") as f:
    json.dump(report, f, indent=4)

# Save confusion matrix plot
# The figure was already shown (and closed) by the previous cell, so plt.savefig()
# here would write a new, empty figure. Save through the figure object that the
# ConfusionMatrixDisplay kept from its plot() call instead.
disp.figure_.savefig("confusion_matrix.png", bbox_inches="tight")
plt.close(disp.figure_)

In [None]:
from gtts import gTTS
import os
import subprocess
import sys

# Build the sentence to be spoken from the predicted dialect and the input text,
# synthesise it with gTTS (Arabic voice) and write it to an MP3 file.
AUDIO_PATH = "output_audio.mp3"
tts_text = f"النص باللهجة {predicted_dialect}: {text}"
tts = gTTS(text=tts_text, lang='ar')
tts.save(AUDIO_PATH)

# Open the file with the system's default player. The `start` shell command only
# exists on Windows, so pick the equivalent launcher per platform. Playback is a
# convenience: a missing launcher is reported, not raised.
try:
    if sys.platform.startswith("win"):
        os.startfile(AUDIO_PATH)
    elif sys.platform == "darwin":
        subprocess.run(["open", AUDIO_PATH], check=False)
    else:
        subprocess.run(["xdg-open", AUDIO_PATH], check=False)
except OSError as exc:
    print(f"Saved {AUDIO_PATH}, but could not open it automatically: {exc}")