In [None]:
!pip install --upgrade transformers datasets scikit-learn



In [None]:
# Record the installed transformers version alongside the notebook outputs.
import transformers
print(transformers.__version__)

4.57.1


In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV files in this notebook are read from /content/ directly,
# not from /content/drive -- confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the twelve "Comments N.csv" files (N = 1..12) and stack them into one frame.
# Each file's first row is skipped and the four columns are named explicitly.
dfs = [
    pd.read_csv(
        f'/content/Comments {i}.csv',
        header=None,
        names=['review', 'City', 'District', 'Label'],
        skiprows=1,
    )
    for i in range(1, 13)
]
df = pd.concat(dfs, ignore_index=True)
df.head(10)

Unnamed: 0,review,City,District,Label
0,الخدمة كانت ممتازة والسلع كلها موجودة النهاردة.,Cairo,Nasr City,Positive
1,الأسعار أغلى من اللي متوقعينه وخارج الميزانية.,Cairo,Heliopolis,Negative
2,استلمت حصتي زي كل شهر بدون أي تغيير.,Cairo,Maadi,Neutral
3,التاجر محترم وبيتعامل بذوق قوي مع الناس.,Cairo,Downtown Cairo,Positive
4,التاجر بيتعامل بحدة ومش بيسمع الشكاوي خالص.,Cairo,Zamalek,Negative
5,الفرع كان هادي والانتظار كان معقول.,Giza,Dokki,Neutral
6,الأسعار مناسبة جدا والوزن مظبوط في كل كيس.,Giza,Mohandessin,Positive
7,السلع الأساسية خلصت بدري خالص قبل ما نوصل.,Giza,6th of October City,Negative
8,بعض السلع موجودة والباقي لسه مجاش.,Giza,Haram,Neutral
9,المكان نظيف ومنظم والموظفين بشوشين.,Giza,Faisal,Positive


In [None]:
# Convert a numeric star rating into an Arabic sentiment category.
def stars_to_label(stars):
    """Map a star rating to an Arabic sentiment label.

    Ratings of 2 or less map to "سلبي" (negative), exactly 3 maps to
    "محايد" (neutral), and every other value maps to "إيجابي" (positive).
    """
    if stars <= 2:
        return "سلبي"
    return "محايد" if stars == 3 else "إيجابي"

# Derive an Arabic "label" column from star ratings, but only when the data has them.
# The frame loaded in this notebook has the columns review/City/District/Label and
# no "stars" column, so an unconditional df["stars"] lookup raises KeyError and
# breaks a fresh Restart & Run All.
if "stars" in df.columns:
    df["label"] = df["stars"].apply(stars_to_label)

In [None]:
# Peek at five random rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

Unnamed: 0,review,City,District,Label
331,المنفذ وفر أكياس قماش بديلة عن البلاستيك.,Matruh,Barani,Positive
1576,المتابعة الأسبوعية شغالة على نفس الوتيرة.,Luxor,Al Boghdadi,Neutral
203,الأكياس اتقطعت من كتر الوزن ومحدش ساعد.,Cairo,Zamalek,Negative
631,السلع كلها متاحة والتعبئة شكلها محترم.,Aswan,Darau,Positive
518,مفيش موظفين كفاية يخدموا الزباين.,Alexandria,Miami,Negative


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Turn the string sentiment in "Label" into integer class ids stored under "labels".
le = LabelEncoder()
df["labels"] = le.fit_transform(df["Label"])

# Train / validation split: hold out 30% of the rows, with a fixed seed for repeatability.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.3)

# Wrap both pandas splits as Dataset objects.
train_ds, val_ds = (Dataset.from_pandas(split) for split in (train_df, val_df))

In [None]:
# Sanity check: show the training Dataset's summary and its first record.
print(train_ds)
display(train_ds[0])

Dataset({
    features: ['review', 'City', 'District', 'Label', 'labels', '__index_level_0__'],
    num_rows: 1180
})


{'review': 'الأرز كان فيه سوس صغير.',
 'City': 'Giza',
 'District': 'Warraq',
 'Label': 'Negative',
 'labels': 0,
 '__index_level_0__': 287}

In [None]:
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``review`` texts of a batch, truncated and padded to 107 tokens."""
    encoding_options = dict(
        padding='max_length',  # every sequence is padded out to max_length
        truncation=True,
        max_length=107,
    )
    return tokenizer(batch["review"], **encoding_options)


def _drop_non_model_columns(ds):
    """Return ``ds`` with every column except input_ids / attention_mask / labels removed."""
    wanted = ("input_ids", "attention_mask", "labels")
    return ds.remove_columns([name for name in ds.column_names if name not in wanted])


train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Keep only the required columns.
train_ds = _drop_non_model_columns(train_ds)
val_ds = _drop_non_model_columns(val_ds)

Map:   0%|          | 0/1180 [00:00<?, ? examples/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained encoder with a classification head sized to the label set.
# id2label / label2id are written into the model config so that the checkpoint
# saved later carries the class names from the LabelEncoder instead of only
# anonymous integer ids.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids) and ``predictions``
            (per-class scores; the arg-max along axis 1 is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Re-print the transformers version (same check as the cell near the top of the notebook).
import transformers
print(transformers.__version__)

4.57.1


In [None]:
from transformers import DataCollatorWithPadding

# Build a padding collator around the tokenizer loaded above.
# NOTE(review): the datasets are already padded to a fixed max_length at tokenization
# time, so this collator has little left to pad -- confirm it is still wanted.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration passed to the Trainer.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    eval_strategy="epoch",      # used instead of evaluation_strategy
    logging_steps=50,           # log every 50 steps
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Assemble the Trainer from the model, arguments, tokenized splits and metric function.
# data_collator is passed explicitly: the notebook builds a DataCollatorWithPadding
# but previously never handed it to the Trainer, leaving it as dead code.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4955,0.104771,0.970414,0.96856
2,0.0657,0.067692,0.986193,0.985875
3,0.041,0.050895,0.988166,0.987867


TrainOutput(global_step=444, training_loss=0.2338569158906335, metrics={'train_runtime': 96.6172, 'train_samples_per_second': 36.639, 'train_steps_per_second': 4.595, 'total_flos': 194654287470240.0, 'train_loss': 0.2338569158906335, 'epoch': 3.0})

In [None]:
import torch  # imported before the function that depends on it


def predict_sentiment(text):
    """Predict the sentiment label of a single review.

    Args:
        text: The review text to classify.

    Returns:
        The label obtained by passing the highest-scoring class id through
        ``le.inverse_transform``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=107)
    # Move inputs to the same device as the model (e.g., GPU if available)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Switch to eval mode explicitly so dropout is disabled, instead of relying on
    # whatever mode the last training/evaluation call left the model in.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Arg-max over the class dimension of the (single) input row, rather than over
    # the flattened logits tensor.
    predicted_class_id = outputs.logits.argmax(dim=-1)[0].item()
    return le.inverse_transform([predicted_class_id])[0]

# Example usage: three hand-written reviews, each classified and printed in turn.
new_review = "خدمة التموين كانت ممتازة وسريعة جداً، أنا راضي تماماً."
new_review_2 = "الخدمة سيئة للغاية، لا أنصح بها أبداً."
new_review_3 = "كانت الخدمة لا بأس بها، ليست ممتازة وليست سيئة."

# First review (no leading blank line in the output).
prediction = predict_sentiment(new_review)
print(f"Review: {new_review}")
print(f"Predicted sentiment: {prediction}")

# Second review.
prediction_2 = predict_sentiment(new_review_2)
print(f"\nReview: {new_review_2}")
print(f"Predicted sentiment: {prediction_2}")

# Third review.
prediction_3 = predict_sentiment(new_review_3)
print(f"\nReview: {new_review_3}")
print(f"Predicted sentiment: {prediction_3}")

Review: خدمة التموين كانت ممتازة وسريعة جداً، أنا راضي تماماً.
Predicted sentiment: Positive

Review: الخدمة سيئة للغاية، لا أنصح بها أبداً.
Predicted sentiment: Negative

Review: كانت الخدمة لا بأس بها، ليست ممتازة وليست سيئة.
Predicted sentiment: Neutral


In [None]:
# Evaluate the trained model on the Trainer's eval dataset and print the metric dict.
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.05089456960558891, 'eval_accuracy': 0.9881656804733728, 'eval_f1': 0.9878673813796822, 'eval_runtime': 3.2046, 'eval_samples_per_second': 158.211, 'eval_steps_per_second': 19.971, 'epoch': 3.0}


In [None]:
# Run the fine-tuned model over the validation split.
predictions = trainer.predict(val_ds)

# Gold class ids and raw logits come straight from the prediction output;
# the predicted class id is the arg-max of each row of logits.
predicted_logits, true_labels = predictions.predictions, predictions.label_ids
predicted_labels = np.argmax(predicted_logits, axis=1)

# Show the first five of each for a quick side-by-side check.
print("True Labels:", true_labels[:5])
print("Predicted Labels (IDs):", predicted_labels[:5])

True Labels: [2 1 0 2 0]
Predicted Labels (IDs): [2 1 0 2 0]


In [None]:
# Boolean mask over the validation rows: True where the predicted class differs from the gold class.
misclassified_mask = (true_labels != predicted_labels)
misclassified_ds = val_ds.filter(lambda example, idx: misclassified_mask[idx], with_indices=True)

# After tokenization val_ds only holds token ids, which makes the errors unreadable.
# val_df is the frame val_ds was built from (same row order), so re-attach the raw
# review text for the misclassified rows when the two still line up.
if "review" not in misclassified_ds.column_names and len(val_df) == len(misclassified_mask):
    misclassified_ds = misclassified_ds.add_column(
        "review", val_df["review"].to_numpy()[misclassified_mask].tolist()
    )

# Attach the gold and predicted class ids next to each misclassified example.
misclassified_ds = misclassified_ds.add_column("true_label_id", true_labels[misclassified_mask].tolist())
misclassified_ds = misclassified_ds.add_column("predicted_label_id", predicted_labels[misclassified_mask].tolist())

print(f"Number of misclassified examples: {len(misclassified_ds)}")
display(misclassified_ds.to_pandas().head())

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/6 [00:00<?, ? examples/s]

Number of misclassified examples: 6


Unnamed: 0,labels,input_ids,attention_mask,true_label_id,predicted_label_id
0,2,"[2, 11776, 2411, 201, 3781, 22654, 25664, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",2,1
1,1,"[2, 394, 5684, 325, 208, 559, 11458, 305, 5914...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",1,0
2,0,"[2, 16270, 323, 13480, 678, 55280, 197, 20, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",0,1
3,1,"[2, 41681, 10608, 1177, 27006, 4154, 20, 3, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",1,0
4,2,"[2, 24365, 38609, 201, 21430, 418, 9526, 9613,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",2,1


In [None]:
# Convert to pandas and translate both id columns back to their sentiment names
# through the fitted LabelEncoder.
misclassified_df = misclassified_ds.to_pandas().assign(
    true_sentiment=lambda frame: le.inverse_transform(frame['true_label_id']),
    predicted_sentiment=lambda frame: le.inverse_transform(frame['predicted_label_id']),
)

print("Misclassified examples with sentiment labels:")
display(misclassified_df.head())


Misclassified examples with sentiment labels:


Unnamed: 0,labels,input_ids,attention_mask,true_label_id,predicted_label_id,true_sentiment,predicted_sentiment
0,2,"[2, 11776, 2411, 201, 3781, 22654, 25664, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",2,1,Positive,Neutral
1,1,"[2, 394, 5684, 325, 208, 559, 11458, 305, 5914...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",1,0,Neutral,Negative
2,0,"[2, 16270, 323, 13480, 678, 55280, 197, 20, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",0,1,Negative,Neutral
3,1,"[2, 41681, 10608, 1177, 27006, 4154, 20, 3, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",1,0,Neutral,Negative
4,2,"[2, 24365, 38609, 201, 21430, 418, 9526, 9613,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",2,1,Positive,Neutral


In [None]:
# Count misclassifications per (true, predicted) sentiment pair.
misclassification_summary = (
    misclassified_df
    .groupby(['true_sentiment', 'predicted_sentiment'])
    .size()
    .reset_index(name='count')
)
print("Misclassification Summary:")
display(misclassification_summary)


def _confused(true_name, predicted_name):
    """Return the misclassified rows whose true label is true_name and predicted label is predicted_name."""
    return misclassified_df[
        (misclassified_df['true_sentiment'] == true_name)
        & (misclassified_df['predicted_sentiment'] == predicted_name)
    ]


# Further analysis: check specific confusion patterns.
# The sentiment columns hold the LabelEncoder's class names, which come from the
# English "Label" column ('Negative' / 'Neutral' / 'Positive'). Filtering on the
# Arabic names matched no rows, so every count below was always 0.
negative_to_neutral = _confused('Negative', 'Neutral')
positive_to_neutral = _confused('Positive', 'Neutral')
neutral_to_positive = _confused('Neutral', 'Positive')
neutral_to_negative = _confused('Neutral', 'Negative')

# The Arabic names in these messages are display text only (negative / neutral / positive).
print(f"\nNumber of 'سلبي' reviews predicted as 'محايد': {len(negative_to_neutral)}")
print(f"Number of 'إيجابي' reviews predicted as 'محايد': {len(positive_to_neutral)}")
print(f"Number of 'محايد' reviews predicted as 'إيجابي': {len(neutral_to_positive)}")
print(f"Number of 'محايد' reviews predicted as 'سلبي': {len(neutral_to_negative)}")

Misclassification Summary:


Unnamed: 0,true_sentiment,predicted_sentiment,count
0,Negative,Neutral,1
1,Neutral,Negative,2
2,Positive,Neutral,2



Number of 'سلبي' reviews predicted as 'محايد': 0
Number of 'إيجابي' reviews predicted as 'محايد': 0
Number of 'محايد' reviews predicted as 'إيجابي': 0
Number of 'محايد' reviews predicted as 'سلبي': 0


## Testing Again

In [None]:
# Load a separate reviews file to score with the fine-tuned model.
NewData = pd.read_csv(r"/content/reviews_data.csv")

# Classify every entry of the 'review' column, one predict_sentiment call per row.
NewData['predicted_sentiment'] = NewData['review'].map(predict_sentiment)

# Keep just the text and its predicted label, and show the first rows.
result_df = NewData.loc[:, ['review', 'predicted_sentiment']]
display(result_df.head())


Unnamed: 0,review,predicted_sentiment
0,"أنا مش راضي على خدمات التموين في البحيرة, ده ل...",Negative
1,أنا مش فاهم حاجة! أعمل إيه?,Negative
2,"أنا مش راضي تماما على خدمة التموين في القاهرة,...",Negative
3,أنا سعيد جدا بخدمة التموين دي. هي فعلا بتوفر ل...,Positive
4,"أنا سعيد جدا بالخدمات اللي بتقدمها الدولة, خاص...",Positive


In [None]:
# Display the results frame (pandas truncates the middle rows in the rendered output).
result_df

Unnamed: 0,review,predicted_sentiment
0,"أنا مش راضي على خدمات التموين في البحيرة, ده ل...",Negative
1,أنا مش فاهم حاجة! أعمل إيه?,Negative
2,"أنا مش راضي تماما على خدمة التموين في القاهرة,...",Negative
3,أنا سعيد جدا بخدمة التموين دي. هي فعلا بتوفر ل...,Positive
4,"أنا سعيد جدا بالخدمات اللي بتقدمها الدولة, خاص...",Positive
...,...,...
6131,"أنا مش راضيا على خدمات التموين هنا, فهي بطيئة ...",Positive
6132,أنا مش راضيه على خدمه التوزيع دي خالص! هي بتاخ...,Negative
6133,"أنا مش راضي تمام على خدمة التموين, لأن الأكل د...",Negative
6134,أنا سعيد جدا بتلقي تعليقاتكم حول خدمات التموين...,Positive


In [None]:
import os
import pickle

# Directory that receives the model, tokenizer and label encoder; created if missing.
save_directory = "./my_sentiment_model"
os.makedirs(save_directory, exist_ok=True)

# Save the fine-tuned model into the directory
model.save_pretrained(save_directory)

# Save the tokenizer next to the model so both can be reloaded from the same path
tokenizer.save_pretrained(save_directory)

# Save the fitted LabelEncoder (needed to map predicted class ids back to label names).
# NOTE: this is a pickle file -- only unpickle label_encoder.pkl from a trusted source.
with open(os.path.join(save_directory, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)

print(f"Model, tokenizer, and LabelEncoder saved to: {save_directory}")

# Print a ready-to-copy snippet showing how to reload the three saved artifacts.
print("\nTo load the model in your project, you can use the following code:")
print("""
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import os

load_directory = "./my_sentiment_model" # Or the path where you saved it

# Load tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(load_directory)

# Load model
loaded_model = AutoModelForSequenceClassification.from_pretrained(load_directory)

# Load LabelEncoder
with open(os.path.join(load_directory, "label_encoder.pkl"), "rb") as f:
    loaded_le = pickle.load(f)

print("Model, tokenizer, and LabelEncoder loaded successfully!")
""")

Model, tokenizer, and LabelEncoder saved to: ./my_sentiment_model

To load the model in your project, you can use the following code:

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import os

load_directory = "./my_sentiment_model" # Or the path where you saved it

# Load tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(load_directory)

# Load model
loaded_model = AutoModelForSequenceClassification.from_pretrained(load_directory)

# Load LabelEncoder
with open(os.path.join(load_directory, "label_encoder.pkl"), "rb") as f:
    loaded_le = pickle.load(f)

print("Model, tokenizer, and LabelEncoder loaded successfully!")



In [None]:
import shutil
from google.colab import files

# Directory to be zipped and the name of the archive that will be downloaded.
# NOTE(review): zip_filename is a separate literal; it must keep matching the archive
# that make_archive writes (base name save_directory plus the ".zip" extension).
save_directory = "./my_sentiment_model"
zip_filename = "my_sentiment_model.zip"

# Create a zip archive of the directory's contents, named after the directory
shutil.make_archive(save_directory, 'zip', save_directory)

print(f"'{save_directory}' has been zipped to '{zip_filename}'")

# Trigger a browser download of the zip file from the Colab runtime
files.download(zip_filename)

print("Your model folder is being downloaded!")

'./my_sentiment_model' has been zipped to 'my_sentiment_model.zip'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Your model folder is being downloaded!
