In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install Dependencies

In [2]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [5]:
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.stem.isri import ISRIStemmer


import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm
import io
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [14]:
data = pd.read_csv('/content/drive/MyDrive/NLP/data')

# Apply Stemming

In [15]:
st = ISRIStemmer()
def stemming(text):
    return " ".join([st.stem(word) for word in str(text).split()])

In [16]:
data['text'] = data['text'].apply(lambda x: stemming(x))

# Encode the labels

In [17]:
data['encoded_label'] = data['label'].astype('category').cat.codes
data.head(20)

Unnamed: 0,text,label,encoded_label
0,كشف جيسوس افس جنح مانشستر سيت جدد قدم ارس نغر ...,Sports,5
1,دبي حمد سعدتعتبر تطر بدر تقد دبي جال مدن ذكه ر...,Tech,6
2,عقد ندي سسك وسكو روس هجم دول يجر حمد وسي لمد خ...,Sports,5
3,اشد دجاردو بوز درب ارج نجم نخب ونل يسي وقل «ال...,Sports,5
4,توف امس بئع غرب تجل نور دين عدن دين رمو يطل تث...,Politics,3
5,حرج شعيباذ بيئ صدقاء كثر يسع لحم حفظ عليها ركز...,Religion,4
6,وظب سعد يطر جوء حذر رقب سوق مال دول جلس امس ضع...,Finance,1
7,دهش كتب قاص عبدالرضا سجا نول ذكر قدم فهي كثر ح...,Culture,0
8,متع كثر جهز رقم صغر حجم وقت حضر بقه ادء حسب اع...,Tech,6
9,زوج عرف وفء قدم لزج عرف بدي حيت زوج عند فتح ال...,Religion,4


In [18]:
class_mapping = {}

# Manually add entries to the mapping dictionary
class_mapping[0] = "Culture"
class_mapping[1] = "Finance"
class_mapping[2] = "Medical"
class_mapping[3] = "Politics"
class_mapping[4] = "Religion"
class_mapping[5] = "Sports"
class_mapping[6] = "Tech"

for code, class_label in class_mapping.items():
    print(f"Code: {code} - Class: {class_label}")


Code: 0 - Class: Culture
Code: 1 - Class: Finance
Code: 2 - Class: Medical
Code: 3 - Class: Politics
Code: 4 - Class: Religion
Code: 5 - Class: Sports
Code: 6 - Class: Tech


# Import MARBERT

In [19]:
# Set the device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = 7  # Number of classes in the SANAD dataset
pretrained_model_name = "aubmindlab/bert-base-arabertv2"  # Pretrained MARBERT model
max_seq_length = 128  # Maximum sequence length for input text
batch_size = 8
num_epochs = 8
learning_rate = 2e-5

# Load the tokenizer and the pretrained model
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_classes)
model.to(device)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Split the data

In [23]:
train_df, val_test_df = train_test_split(data, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

In [24]:
train_texts = train_df['text'].tolist()
train_labels = train_df['encoded_label'].tolist()
val_texts = val_df['text'].tolist()
val_labels = val_df['encoded_label'].tolist()

# Encode the training and validation datasets

In [25]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_seq_length)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_seq_length)


In [26]:
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)
val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

In [28]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [29]:
# Define the optimizer and the loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()





# Fine-tune MARBERT

In [33]:
# Fine-tuning loop
for epoch in range(num_epochs): 

    print(f"Epoch {epoch+1}/{num_epochs}")
    print("-" * 10)

    # Training
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            _, predicted_labels = torch.max(logits, dim=1)

Epoch 1/8
----------
Training Loss: 1.9641
Epoch 2/8
----------
Training Loss: 1.9622
Epoch 3/8
----------
Training Loss: 1.9610
Epoch 4/8
----------
Training Loss: 1.9607
Epoch 5/8
----------
Training Loss: 1.9596
Epoch 6/8
----------
Training Loss: 1.9589
Epoch 7/8
----------
Training Loss: 1.6442
Epoch 8/8
----------
Training Loss: 1.8464


# Save model & tokenizer


In [34]:
model.save_pretrained('/content/model/model')
tokenizer.save_pretrained('/content/model/tokenizer')

('/content/model/tokenizer/tokenizer_config.json',
 '/content/model/tokenizer/special_tokens_map.json',
 '/content/model/tokenizer/vocab.txt',
 '/content/model/tokenizer/added_tokens.json')

# Load model & tokenizer

In [35]:
tokenizer = BertTokenizer.from_pretrained('/content/model/tokenizer')
model = BertForSequenceClassification.from_pretrained('/content/model/model')

# Some Predictions

In [45]:
def predict_class(input_text):
  input_encoding = tokenizer(input_text, truncation=True, padding=True, max_length=max_seq_length)

  # Create input tensors
  input_ids = torch.tensor(input_encoding['input_ids']).unsqueeze(0).to(device)
  attention_mask = torch.tensor(input_encoding['attention_mask']).unsqueeze(0).to(device)

  # Make the prediction
  model.to(device)
  model.eval()
  with torch.no_grad():
      inputs = {
          'input_ids': input_ids,
          'attention_mask': attention_mask
      }
      outputs = model(**inputs)
      logits = outputs.logits
      probabilities = torch.softmax(logits, dim=1)
      predicted_label = torch.argmax(probabilities, dim=1).item()
      return class_mapping.get(predicted_label)

In [46]:
print(test_texts[0])
print(test_labels[0])
print(predict_class(test_texts[0]))

كثر عجب قرن كثر عجز وقف تني نص قرا تضع تمل افق وسع عظم كتب سمو خلد جاء قرن كرم زخر باي حدث ظهر حيت كون تلف ظار حقق ومن لنص قرا كرم جعل ماء شيء حي دار قال هذل انس رغم قدم هئل حرز ميد علم برح قدم بره تلو بره ضعف وقل حيل ازء عرف سرر حيه حول واد يته تبد سمه سمت حيه كئن حيه تمل ارض بهج جمل تضع انس عظم خلق سبح وحد ظهر عجز حد ذهل يبق حل لغز حيل ايه كرم ويسالونك روح قل روح امر ربي يقر انس ضعف عظم علم ربه كمل ايه كرم وما اوت علم قلل سرء صيغ عظم تمل وصل انس حقق علم كثر شكل جمل خلف علم تعلق كئن حيه نتج لنص دار نقش شكل قعد اهم قعد سسي تقم حيه لرب سبب دعي لاستخدام صيغ عظم وهي كلم فرد بصغ جمع عبر فعل عجز ذهل خلق حيء اده بسط ماء عبر بصغ فرد فقل سبح جعل خدم صيغ فرد عرف خدم صيغ عظم عبر فعل نسب لله برك تعل تقف ورء حكم حقق علم علم باكملهالا ارد ثقل قرئ كرم سرد دور ماء وهم حيه كئن حيه بتت عرف كشف انس يشق طرق ياد علم بحث علم ارد سلط ضوء سلب قرا فرد طرق عرض حقق علم جنب علق لنص نصص اخر علق شرح نضح لنص شرف عجز علميهفهذه ايه كرم تات فرد جءت لسل ايت حدث صور حيه خلف بثه الل علي كون قدم حيه سان تضح سيق ايت كثر ور

In [47]:
text = "امبارح المظاهرات كانت عنيفة جدا والشرطة استخدمت الغاز المسيل للدموع عشلن تفرق المتظاهرين"
print(predict_class(text))

Sports


In [48]:
text = "انقذوا مستشفى 57357"
print(predict_class(text))

Sports


In [50]:
text = "عيد الشرطة يأتي في الخامس والعشرين من يناير كل عام وذلك لإحياء ذكرى استشهاد ٥٠ ضابطا وجنديا وإصابة ٨٠ من قوات الشرطة المصرية البواسل في مدينة الإسماعيلية وهم يدافعون عن ثغورهم من الاحتلال الانجليزي في ١٩٥٢. ربنا يرحم كل ضابط مات دفاعا عن وطنه ودينه وأهله وناسه 🖤🤍"
print('Politics')

Politics


In [52]:
text = "ليلة غُسلت فيها أحزان الحبيب  المصطفى بعد عام الحزن  اللهم كما جعلتها ليلة دخول الفرح والسرور على قلبه الشريف بعد أن طال حزنه فَاجعلها ليلة فرح وسرور علينا وعلى أمة سيدنا محمد صلى الله عليه وسلم"
print('Religion')

Religion


# Evaluate the model

In [38]:
test_texts = test_df['text'].tolist()
test_labels = test_df['encoded_label'].tolist()

In [39]:
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_seq_length)


In [40]:
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels)
)

In [41]:
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
model = model.to(device)
def test_model(model, dataloader):
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predicted_class.tolist())

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='macro')
    recall = recall_score(true_labels, predicted_labels, average='macro')
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    return accuracy, precision, recall, f1


In [43]:
test_accuracy, test_precision, test_recall, test_f1 = test_model(model, test_dataloader)



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [44]:
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.2596
Test Precision: 0.1184
Test Recall: 0.2743
Test F1 Score: 0.1506
