In [None]:
pip install transformers


In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)


In [None]:

# Read the labelled corpus; the CSV is expected to expose 'text' and 'label' columns.
df = pd.read_csv('data.csv')

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(
    'HooshvareLab/bert-fa-base-uncased-clf-persiannews'
)

# Encode each split as a plain list of strings, truncating over-long inputs and
# padding the rest so that every sequence in a split has the same length.
train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding=True,
)

# Create PyTorch datasets
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values can be indexed per example, and
    ``labels`` is an indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every key of the encodings mapping is carried over with the value for
        this example converted to a tensor; the label is added under 'labels'.
        """
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and plain-list labels in a MyDataset.
train_dataset = MyDataset(
    encodings=train_encodings,
    labels=train_labels.tolist(),
)
val_dataset = MyDataset(
    encodings=val_encodings,
    labels=val_labels.tolist(),
)


In [None]:
# Size the classification head from the distinct labels in the loaded data
# (`num_labels` was previously used without ever being defined).
# NOTE(review): assumes labels are integer-encoded as 0..num_labels-1 — confirm against data.csv.
num_labels = df['label'].nunique()

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'HooshvareLab/bert-fa-base-uncased-clf-persiannews',
    num_labels=num_labels,
    # This checkpoint already ships a classification head; if its size differs
    # from num_labels, re-initialise the head instead of failing on a shape mismatch.
    ignore_mismatched_sizes=True,
)

# Set up the training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # Directory where model checkpoints and evaluation results will be saved
    num_train_epochs=1,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=200,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory where logs will be saved
    logging_steps=100,               # Number of steps between logging
    evaluation_strategy='steps',     # Evaluate the model periodically
    eval_steps=500,                  # Number of steps between evaluation
    # load_best_model_at_end requires the save strategy to match the evaluation
    # strategy, with save_steps a round multiple of eval_steps.
    save_strategy='steps',           # Save checkpoints on the same schedule as evaluation
    save_steps=500,                  # Number of steps between checkpoints
    save_total_limit=1,              # Limit the total number of checkpoints
    load_best_model_at_end=True,     # Reload the best checkpoint when training finishes
)


In [None]:
# Use the CUDA device when one is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Move the model onto the selected device.
model.to(device)

# Evaluation metrics
def compute_metrics(pred):
    """Compute evaluation metrics from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with ``"accuracy"``, the weighted ``"f1"`` score, and
        ``"confusion_matrix"`` as a nested list of counts.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
        # Return plain nested lists: a NumPy ndarray is not JSON-serializable,
        # which breaks writing the logged metrics out as JSON.
        "confusion_matrix": confusion_matrix(labels, preds).tolist(),
    }

# Stop training early once the monitored metric has failed to improve for
# three consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Bundle the model, its training configuration, both datasets, the metric
# function and the early-stopping callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()