In [None]:
import torch
import numpy as np
import pandas as pd
import time
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn.functional as F


# Load the train/test splits from the two local CSV files.
dataset = load_dataset(
    "csv",
    data_files={"train": "ISEAR_train.csv", "test": "ISEAR_test.csv"},
)


# Build the emotion -> integer id mapping from the emotions seen in the
# training split; sorting keeps the assignment deterministic across runs.
unique_emotions = sorted(set(dataset["train"]["emotion"]))
emotion2label = dict(zip(unique_emotions, range(len(unique_emotions))))
num_labels = len(emotion2label)
print("Emotion to label mapping:", emotion2label)

def map_emotion_to_label(example):
    """Add an integer ``label`` entry to *example* and return it.

    The id is looked up in the module-level ``emotion2label`` mapping using
    the example's ``emotion`` value. The example is updated in place.

    Raises:
        KeyError: If the example's emotion is not in ``emotion2label``.
    """
    label_id = emotion2label[example["emotion"]]
    example["label"] = label_id
    return example

# Attach the integer "label" column computed by map_emotion_to_label.
dataset = dataset.map(map_emotion_to_label)

teacher_model_name = "./roberta_finetuned/checkpoint-1920"  # Teacher: local fine-tuned checkpoint
student_model_name = "distilroberta-base"                   # Student model: DistilRoBERTa
# A single tokenizer (roberta-large) is loaded here.
# NOTE(review): if the same token ids are fed to both teacher and student,
# this assumes distilroberta-base shares the roberta-large vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_name, num_labels=num_labels)
# Store the emotion names in the student's config so that saved checkpoints
# report real emotion names instead of generic LABEL_<i> placeholders.
student_model = AutoModelForSequenceClassification.from_pretrained(
    student_model_name,
    num_labels=num_labels,
    id2label={idx: emotion for emotion, idx in emotion2label.items()},
    label2id=emotion2label,
)


def tokenize_function(examples):
    """Tokenize the ``text`` field of *examples* with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Run tokenize_function over the dataset in batches (batched=True means the
# function receives a batch of examples per call rather than a single one).
dataset = dataset.map(tokenize_function, batched=True)
# Return only the listed columns, formatted as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Precision, recall and F1 use weighted averaging.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    # The fourth element of the result (support) is not reported.
    prf_scores = precision_recall_fscore_support(
        labels, predicted_ids, average="weighted"
    )
    precision, recall, f1 = prf_scores[:3]
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


class DistillationTrainer(Trainer):
    """Trainer that trains a student model against a teacher's soft targets.

    The loss is a weighted sum of two terms:

    * soft-target term: KL divergence between the temperature-softened
      teacher and student distributions, multiplied by ``temperature ** 2``;
    * hard-target term: the loss the student model itself returns for the
      batch (which requires labels in the batch).

    Args:
        teacher_model: Model whose logits provide the soft targets. It is
            moved to ``self.args.device`` and put in eval mode.
        *args: Positional arguments forwarded to ``Trainer.__init__``.
        distillation_alpha: Weight of the soft-target term, in ``[0, 1]``.
            The hard-target term is weighted ``1 - distillation_alpha``.
        temperature: Positive softmax temperature applied to both the
            student and the teacher logits.
        **kwargs: Keyword arguments forwarded to ``Trainer.__init__``.

    Raises:
        ValueError: If ``distillation_alpha`` is outside ``[0, 1]`` or
            ``temperature`` is not positive.
    """

    def __init__(self, teacher_model, *args, distillation_alpha=0.5, temperature=2.0, **kwargs):
        # Validate before the (potentially expensive) Trainer set-up.
        if not 0.0 <= distillation_alpha <= 1.0:
            raise ValueError(
                f"distillation_alpha must be in [0, 1], got {distillation_alpha}"
            )
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model.to(self.args.device)
        # The teacher is only used for inference; eval mode disables dropout.
        self.teacher_model.eval()
        self.distillation_alpha = distillation_alpha
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation + hard-label loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Mapping of keyword arguments for the model's forward call.
            return_outputs: If true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Raises:
            ValueError: If the student model returned no loss (for example
                because the batch contains no labels).
        """
        device = self.args.device
        # Values without a ``.to`` method (non-tensors) pass through unchanged.
        inputs = {
            key: (value.to(device) if hasattr(value, "to") else value)
            for key, value in inputs.items()
        }

        student_outputs = model(**inputs)
        student_logits = student_outputs.logits

        ce_loss = student_outputs.loss
        if ce_loss is None:
            raise ValueError(
                "DistillationTrainer needs labels in the batch: "
                "the student model returned no loss."
            )

        # Only the teacher's logits are read, so labels are withheld from it;
        # this avoids computing a teacher loss that would be thrown away.
        teacher_inputs = {
            key: value for key, value in inputs.items() if key != "labels"
        }
        with torch.no_grad():
            teacher_logits = self.teacher_model(**teacher_inputs).logits

        temperature = self.temperature
        soft_student = F.log_softmax(student_logits / temperature, dim=-1)
        soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
        # kl_div expects log-probabilities as input and probabilities as
        # target. The T^2 factor keeps the soft-target gradient magnitude
        # comparable to the hard-target term as the temperature changes.
        distillation_loss = (
            F.kl_div(soft_student, soft_teacher, reduction="batchmean")
            * (temperature ** 2)
        )

        alpha = self.distillation_alpha
        total_loss = alpha * distillation_loss + (1 - alpha) * ce_loss
        return (total_loss, student_outputs) if return_outputs else total_loss


student_training_args = TrainingArguments(
    # Output and logging locations.
    output_dir="./distilroberta_finetuned",
    logging_dir="./logs_distil",
    logging_steps=50,
    # Optimisation hyperparameters.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the highest "f1" when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)


# Wire the student, the teacher, the data splits and the metric function
# into the distillation trainer.
distillation_trainer = DistillationTrainer(
    teacher_model=teacher_model,
    model=student_model,
    args=student_training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)


# Measure the wall-clock duration of the whole training run.
start_time = time.time()
distillation_trainer.train()
end_time = time.time()
training_time = end_time - start_time  # elapsed seconds
# Reported in minutes with two decimals.
print(f" Training time: {training_time / 60:.2f} minutes")


# Evaluate the trained student. evaluate() is called without arguments, so it
# uses whatever evaluation dataset the trainer was constructed with.
print("Evaluating DistilRoBERTa student model...")
student_results = distillation_trainer.evaluate()
print(" DistilRoBERTa student model evaluation results:")
print(student_results)

Emotion to label mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1532 [00:00<?, ? examples/s]

Training DistilRoBERTa student model with knowledge distillation...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1893,1.418077,0.648825,0.653354,0.648825,0.644124
2,1.5128,1.145596,0.681462,0.691507,0.681462,0.675057
3,1.251,1.023865,0.704308,0.70704,0.704308,0.704345
4,0.9828,1.004551,0.713446,0.715841,0.713446,0.713631
5,0.8425,0.989461,0.712141,0.714218,0.712141,0.713003


🕒 Training time: 13.94 minutes
Evaluating DistilRoBERTa student model...


📊 DistilRoBERTa student model evaluation results:
{'eval_loss': 1.004550576210022, 'eval_accuracy': 0.7134464751958225, 'eval_precision': 0.7158413615889938, 'eval_recall': 0.7134464751958225, 'eval_f1': 0.7136306426575686, 'eval_runtime': 27.9712, 'eval_samples_per_second': 54.771, 'eval_steps_per_second': 1.716, 'epoch': 5.0}
