_Imports_

In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay

_Model Setup_

In [None]:
class TextClassifier:
    """Fine-tune an XLM-RoBERTa checkpoint for single-label text classification.

    Construction loads a parquet file that must contain a ``text`` column and a
    ``category`` column, splits it into train/test/eval subsets, and creates the
    tokenizer and the classification model. ``train_and_evaluate`` then runs
    training and returns the metrics measured on the test subset.
    """

    # Hugging Face checkpoint used for both the tokenizer and the model.
    MODEL_NAME = "FacebookAI/xlm-roberta-base"
    # Maximum number of tokens kept per text when truncating.
    MAX_LENGTH = 512

    def __init__(self, data_path):
        """Load the data at ``data_path`` and set up tokenizer and model."""
        # Train on the GPU when one is available, otherwise continue on the CPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Using device: {self.device}")
        # Load the data and split it into train/eval/test subsets.
        self.load_data(data_path)
        print(f"Loaded data with {len(self.train_texts)} train, {len(self.eval_texts)} eval, {len(self.test_texts)} test samples")
        # Initialize the tokenizer; model_max_length is the limit applied when
        # truncation is requested without an explicit max_length.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME, model_max_length=self.MAX_LENGTH)
        print("Tokenizer loaded")
        self.setup_model()
        print("Model loaded and moved to device")

    def load_data(self, path):
        """Read the parquet file at ``path`` and populate the split attributes."""
        self._prepare_splits(pd.read_parquet(path))

    def _prepare_splits(self, df):
        """Shuffle ``df``, encode its categories, and store the three splits.

        Sets ``train_/test_/eval_texts``, ``train_/test_/eval_labels``,
        ``num_labels``, ``id2label`` and ``label2id`` on the instance. The
        caller's frame is not modified.
        """
        # Shuffle with a fixed seed so the split is reproducible.
        shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

        texts = shuffled['text'].tolist()
        # Strip BEFORE de-duplicating: categories that differ only by
        # surrounding whitespace must collapse into a single class, otherwise
        # num_labels is inflated and id2label/label2id disagree.
        categories = [category.strip() for category in shuffled['category']]
        # dict.fromkeys keeps first-appearance order while removing duplicates.
        label_names = list(dict.fromkeys(categories))

        # Integer ids are what the model trains on; names are kept for display.
        id2label = dict(enumerate(label_names))
        label2id = {name: index for index, name in id2label.items()}
        label_ids = [label2id[category] for category in categories]

        # Split into train (70%), test (20%) and validation (10%).
        size = len(shuffled)
        train_end = int(size * 0.7)
        test_end = int(size * 0.9)
        self.train_texts = texts[:train_end]
        self.test_texts = texts[train_end:test_end]
        self.eval_texts = texts[test_end:]
        self.train_labels = label_ids[:train_end]
        self.test_labels = label_ids[train_end:test_end]
        self.eval_labels = label_ids[test_end:]
        self.num_labels = len(label_names)
        self.id2label = id2label
        self.label2id = label2id

    def setup_model(self):
        """Create the classification model and move it to ``self.device``."""
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.MODEL_NAME,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id
        )
        self.model.to(self.device)

    def encode_data(self, texts):
        """Tokenize ``texts`` with truncation and padding and return the encodings."""
        print(f"Encoding {len(texts)} texts...")
        return self.tokenizer(texts, truncation=True, padding=True, max_length=self.MAX_LENGTH)

    def create_data_loader(self, encodings, labels):
        """Wrap ``encodings`` and ``labels`` in a torch ``Dataset``.

        ``encodings`` must expose ``.items()`` yielding per-field sequences that
        can be indexed by sample position; ``labels`` is indexed the same way.
        Despite the name, the returned object is a ``Dataset``, not a
        ``DataLoader``.
        """
        class CustomDataset(Dataset):
            def __init__(self, encodings, labels):
                self.encodings = encodings
                self.labels = labels

            def __getitem__(self, idx):
                # One tensor per encoded field plus the target under 'labels'.
                item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
                item['labels'] = torch.tensor(self.labels[idx])
                return item

            def __len__(self):
                return len(self.labels)

        print(f"Creating dataset with {len(labels)} samples")
        return CustomDataset(encodings, labels)

    def compute_metrics(self, pred):
        """Return accuracy and weighted precision/recall/F1 for ``pred``.

        ``pred`` must provide ``label_ids`` and ``predictions``; the predicted
        class is the argmax over the last axis of ``predictions``.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # zero_division=0 scores classes that were never predicted as 0
        # explicitly instead of relying on the warn-and-zero default.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average='weighted', zero_division=0
        )
        acc = accuracy_score(labels, preds)
        return {
            'Accuracy': acc,
            'Weighted F1': f1,
            'Precision': precision,
            'Recall': recall
        }

    def train_and_evaluate(self):
        """Train on the train split and return the metrics on the test split.

        The eval split is passed to the ``Trainer`` as its evaluation dataset
        during training; the test split is only used for the final
        ``evaluate`` call whose result is returned.
        """
        train_encodings = self.encode_data(self.train_texts)
        eval_encodings = self.encode_data(self.eval_texts)
        test_encodings = self.encode_data(self.test_texts)
        train_dataset = self.create_data_loader(train_encodings, self.train_labels)
        eval_dataset = self.create_data_loader(eval_encodings, self.eval_labels)
        test_dataset = self.create_data_loader(test_encodings, self.test_labels)
        training_args = TrainingArguments(
            output_dir='./results',
            report_to="none",
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True
        )
        print("Starting training...")
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self.compute_metrics
        )
        trainer.train()
        print("Training finished. Evaluating on test set...")
        results = trainer.evaluate(eval_dataset=test_dataset)
        print("Evaluation done.")
        return results

In [None]:
# Start training: build the classifier from the parquet file, fine-tune it,
# and print the metrics returned for the test split.
classifier = TextClassifier("./data/data.parquet")
results = classifier.train_and_evaluate()
print(results)

_Confusion Matrix_

In [None]:
# Re-encode the test split and predict on it with the trained model.
test_encodings = classifier.encode_data(classifier.test_texts)
test_dataset = classifier.create_data_loader(test_encodings, classifier.test_labels)

trainer = Trainer(model=classifier.model, args=TrainingArguments(output_dir='./results', report_to="none"))
predictions_output = trainer.predict(test_dataset)
preds = predictions_output.predictions.argmax(-1)
labels = predictions_output.label_ids

# Pass every known class id explicitly so the matrix always has one row and
# column per class, even when a class is absent from the test split; otherwise
# the matrix shape would not match the display labels below.
class_ids = sorted(classifier.id2label)
cm = confusion_matrix(labels, preds, labels=class_ids, normalize='true')
# Look names up by id so each tick label is guaranteed to match its row.
labels_names = [classifier.id2label[class_id] for class_id in class_ids]

fig, ax = plt.subplots(figsize=(20, 12))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_names)
disp.plot(cmap='YlOrRd', ax=ax,  xticks_rotation=45,  values_format=".2f")
# normalize='true' yields row fractions in [0, 1], not percentages, so the
# title no longer claims "%".
plt.title("Normalized Confusion Matrix", fontsize=16)
plt.grid(False)
plt.tight_layout()
plt.show()

_Save Model_

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
# so that both can later be reloaded from the same path.
save_path = "/content/model"
for artifact in (classifier.model, classifier.tokenizer):
    artifact.save_pretrained(save_path)