In [None]:
# Install libraries
!pip install transformers datasets evaluate hazm torch transformers[torch] accelerate -U

In [None]:
# Data processing
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import hazm

# Modeling
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
# NOTE: this name is shadowed by `torch.utils.data.Dataset` imported further
# down; keep the torch import last so `Dataset` refers to the torch class.
from datasets import Dataset

# Model performance evaluation
import evaluate
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

import torch
from torch.utils.data import Dataset, DataLoader


In [None]:
# Fraction of rows held out for validation and the seed that makes the
# shuffle reproducible between runs.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

# Read the full corpus from disk.
data = pd.read_csv('data.csv')

# Split the dataset into training and validation sets (80:20 split)
train_data, val_data = train_test_split(
    data,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Hub identifier of the checkpoint that is fine-tuned in this notebook.
model_name = "classla/xlm-roberta-base-multilingual-text-genre-classifier"

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Classification head settings: 7 output labels are requested and size
# mismatches with the checkpoint are tolerated.
# NOTE(review): presumably the checkpoint's own head has a different number of
# labels, which is why ignore_mismatched_sizes is needed - confirm.
head_options = dict(num_labels=7, ignore_mismatched_sizes=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **head_options)

In [None]:
class PersianNewsDataset(Dataset):
    """Torch dataset that turns rows of a DataFrame into model-ready tensors.

    Each item is produced by normalising the row's ``"text"`` with hazm and
    encoding it with the module-level ``tokenizer`` (padded/truncated to
    ``max_length``), paired with the row's ``"label"`` as a long tensor.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[idx]``
            whose rows expose ``"text"`` and ``"label"``.
        max_length: Sequence length every example is padded/truncated to.
            Defaults to 128.
    """

    def __init__(self, data, max_length=128):
        self.data = data
        self.max_length = max_length
        # Build the normalizer once; creating it for every item is wasted work.
        self.normalizer = hazm.Normalizer()

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded example at integer position ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (1-D tensors of
            length ``max_length``) and ``"labels"`` (scalar long tensor).
        """
        row = self.data.iloc[idx]
        text = row["text"]
        label = row["label"]

        # Normalize the text
        normalized_text = self.normalizer.normalize(text)

        # Encode with the module-level tokenizer. Calling the tokenizer
        # directly is the documented entry point (encode_plus is legacy).
        inputs = tokenizer(
            normalized_text,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension of size 1; drop
        # exactly that dimension so the result stays 1-D for any max_length.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),  # Convert labels to long data type
        }

In [None]:
# Wrap each DataFrame split in the torch dataset defined above.
train_dataset, val_dataset = (
    PersianNewsDataset(split) for split in (train_data, val_data)
)

In [None]:
# Hyper-parameters plus the logging / evaluation / checkpoint schedule.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # rejected by current transformers releases, which the install cell's
    # `-U` flag pulls in.
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# Evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, weighted F1 and the confusion matrix for a prediction.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, or a tuple whose first element
            holds them).

    Returns:
        dict with ``"accuracy"`` and ``"f1"`` as Python floats and
        ``"confusion_matrix"`` as a nested list of ints.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return extra outputs alongside the logits; keep the logits.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)

    accuracy = float(accuracy_score(labels, preds))
    f1 = float(f1_score(labels, preds, average="weighted"))
    # Convert to plain lists: the metrics dict ends up in the Trainer's log
    # history, which is written out as JSON and cannot hold numpy arrays.
    cm = confusion_matrix(labels, preds).tolist()
    return {"accuracy": accuracy, "f1": f1, "confusion_matrix": cm}

# Stop training after 3 evaluations without improvement.
# NOTE(review): training_args requests a single epoch with per-epoch
# evaluation, so this patience is never reached as configured - confirm intent.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Collect everything the Trainer needs, then build it in one call.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_options)

In [None]:
# Fine-tune the model using the `trainer` configured in the previous cell.
# As the last expression of the cell, the call's return value is displayed.
trainer.train()

In [None]:
# Evaluate on the dataset passed to the Trainer as `eval_dataset`
# (`val_dataset`); the returned value is displayed as the cell output.
trainer.evaluate()

In [None]:
# Run inference on the validation split and take the highest-scoring class.
val_predictions = trainer.predict(val_dataset)
val_predicted_labels = np.argmax(val_predictions.predictions, axis=1)
# Take the true labels from the prediction output instead of slicing the
# dataset: PersianNewsDataset.__getitem__ is written for one integer index at
# a time, so `val_dataset[:]` does not produce a batch of labels.
val_true_labels = val_predictions.label_ids

# Same metrics as compute_metrics, computed once more for the printed report.
val_accuracy = accuracy_score(val_true_labels, val_predicted_labels)
val_f1 = f1_score(val_true_labels, val_predicted_labels, average="weighted")
val_cm = confusion_matrix(val_true_labels, val_predicted_labels)

print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation F1 Score: {val_f1}")
print("Validation Confusion Matrix:")
print(val_cm)

In [None]:
# Directory that receives both the model weights and the tokenizer files.
# NOTE(review): this is an absolute path at the filesystem root - confirm it is
# writable in the target environment and that a relative path was not intended.
output_dir = "/saved_model"

# Persist the fine-tuned model and its tokenizer side by side.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Reload both from disk to check that the saved artefacts are usable.
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)