# Installation

In [None]:
!pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Colab drive

In [None]:
# Mount Google Drive into the Colab filesystem so that files under
# /content/drive (the dataset and the saved model) are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [None]:
import pandas as pd
import numpy as np

import torch
import transformers


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

from tqdm import tqdm

# Reading Data

In [None]:
# Load the preprocessed CSV from Drive, discard the leftover index column
# written by a previous to_csv call, and drop every row with a missing value.
data = (
    pd.read_csv("/content/drive/MyDrive/preprocessed_data.csv")
    .drop(columns='Unnamed: 0')
    .dropna()
)

In [None]:
# The target column is called 'dialect' in the CSV; expose it as 'label'.
data = data.rename(columns={'dialect': 'label'})

In [None]:
# Preview the first rows of the frame (text and label columns).
data.head()

Unnamed: 0,text,label
0,قليلين ادب ومنافقين اختهم او قريبتهم تتعاكس تق...,2
1,الليبيين متقلبين بالنسبه ليا انا ميليشياوي زما...,2
2,تانيه شاب ليبي بيرتاح لبنت مختلفه ويلاحظ انها ...,2
3,رانيا عقليتك متخلفه اولا الانسان يلي يحتاج اهل...,2
4,شكلك متعقده علشان الراجل تحبيه ازوج بنت يتيمه ...,2


# Prepare data for model

In [None]:
# Seed torch's global RNG.
torch.manual_seed(42)

# Pretrained checkpoint used for both the tokenizer and (later) the classifier.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Trim surrounding whitespace from every text and derive an integer id
# for each label via pandas' categorical codes.
data["text"] = data["text"].map(str.strip)
data["label"] = data["label"].astype("category")
data["label_id"] = data["label"].cat.codes

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Plain Python lists are what the tokenizer and torch.tensor consume below.
train_texts, train_labels = train_data["text"].tolist(), train_data["label_id"].tolist()
test_texts, test_labels = test_data["text"].tolist(), test_data["label_id"].tolist()


# Tokenization, DataLoaders, model and optimizer

In [None]:
# Tokenize text and convert to input tensors.
# NOTE(review): each split is tokenized in a single call with padding=True, so
# every example is presumably padded to the longest sequence in that split —
# consider an explicit max_length if memory becomes a problem.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
train_labels = torch.tensor(train_labels)

test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
test_labels = torch.tensor(test_labels)

# Create PyTorch DataLoader for training and testing datasets.
# Every batch is a tuple (input_ids, attention_mask, labels).
train_dataset = torch.utils.data.TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Create model and optimizer.
# The classification head gets one output per label category.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(data["label"].cat.categories))
# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimizer configuration is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()



Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

# Train AraBert

In [None]:

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
model.train()

for epoch in range(1):

    for batch in tqdm(train_loader):

        # Each batch is (input_ids, attention_mask, labels), in the order the
        # TensorDataset was built.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()

        # Labels are intentionally NOT passed to the model: when given labels it
        # computes its own loss internally, which would be thrown away because
        # the loss used for backprop is computed explicitly with loss_fn below.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        optimizer.step()


100%|██████████| 7383/7383 [38:27<00:00,  3.20it/s]


# Validate AraBert

In [None]:

# Put the model in evaluation mode before scoring the held-out split.
model.eval()

# Ground-truth and predicted class ids, accumulated batch by batch.
true_labels, predicted_labels = [], []

# No gradients are needed for evaluation.
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        logits = model(input_ids.to(device), attention_mask.to(device)).logits
        # Highest-scoring class per example.
        predicted = torch.argmax(logits, dim=-1)

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

# Support-weighted precision/recall/F1 plus plain accuracy.
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predicted_labels, average='weighted'
)

for metric_title, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 score:", f1),
):
    print(metric_title, metric_value)

Accuracy: 0.8444241254360121
Precision: 0.8446223022344934
Recall: 0.8444241254360121
F1 score: 0.8436069491395654


# Inference 

In [None]:
# Tokenize one example sentence and move the resulting tensors to the model's device.
inputs = tokenizer("انا بتكلم مصري و ايش بدك مني لك ولو معلم", return_tensors="pt").to(device)

# Make the cell independent of what ran before it: ensure evaluation mode and
# skip autograd bookkeeping, exactly as the validation loop does.
model.eval()
with torch.no_grad():
    outs = model(**inputs).logits.to('cpu')

# Index of the highest-scoring class (a label_id, not the original label value).
print(torch.argmax(outs))

tensor(1)


# Save Model

In [None]:
# Serialize the whole model object (not just its state_dict) to a file in the
# current working directory.
torch.save(model, "arabert_arabic_lahga.pth")


In [None]:
# Copy the saved checkpoint from the Colab VM to Google Drive.
# NOTE(review): assumes /content/drive/MyDrive/Models/ already exists — confirm.
!cp /content/arabert_arabic_lahga.pth /content/drive/MyDrive/Models/

In [None]:
#https://drive.google.com/file/d/1J8IdAT_xNDXumZKabT2xnk33AhEjR7ce/view?usp=sharing

# Load model

In [None]:
# Restore the full model object written by torch.save(model, ...).
# SECURITY: this unpickles arbitrary Python objects — only load checkpoints you trust.
# - map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# - weights_only=False is needed on PyTorch releases where torch.load defaults to
#   weights-only loading, which refuses a pickled nn.Module.
model = torch.load(
    "/content/arabert_arabic_lahga.pth",
    map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    weights_only=False,
)
