In [None]:

# Data Augmentation + Cosine Scheduler
import torch
import nltk
import numpy as np
import pandas as pd
import nlpaug.augmenter.word as naw
import time
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    get_cosine_schedule_with_warmup
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fetch the NLTK resources this notebook requires (presumably used by the
# nlpaug synonym augmenter configured further down -- confirm).
for nltk_resource in ("wordnet", "averaged_perceptron_tagger_eng"):
    nltk.download(nltk_resource)


# Load the ISEAR train/test splits from local CSV files.
isear_files = {"train": "ISEAR_train.csv", "test": "ISEAR_test.csv"}
dataset = load_dataset("csv", data_files=isear_files)

# Give every emotion name seen in the training split an integer id. The names
# are sorted first so the mapping does not depend on row order.
unique_emotions = sorted(set(dataset["train"]["emotion"]))
emotion2label = dict(zip(unique_emotions, range(len(unique_emotions))))
num_labels = len(emotion2label)
print("Emotion to label mapping:", emotion2label)

def map_emotion_to_label(example):
    """Store the integer class id of ``example["emotion"]`` under ``"label"``.

    The id is looked up in the module-level ``emotion2label`` mapping. The
    example is updated in place and also returned.
    """
    label_id = emotion2label[example["emotion"]]
    example["label"] = label_id
    return example


# Add the "label" column to the loaded dataset.
dataset = dataset.map(map_emotion_to_label)

# Data augmentation via nlpaug synonym replacement. Note that aug_p=0.3 is the
# argument handed to the augmenter itself (per the nlpaug docs, the share of
# words to replace in each text); how many examples get augmented is decided
# separately, where the augmented subset is selected.
aug = naw.SynonymAug(aug_p=0.3)


def augment_data(example):
    """Return a synonym-augmented version of a single example.

    Args:
        example: Mapping with at least a ``"text"`` and a ``"label"`` entry.

    Returns:
        A dict with the augmented ``"text"`` and the unchanged ``"label"``.
        If the augmenter yields nothing usable (e.g. for an empty text), the
        original text is kept so the row is never dropped or corrupted.
    """
    original_text = example["text"]
    augmented = aug.augment(original_text)

    # Depending on the nlpaug release, augment() returns either a list of
    # strings or a bare string. Blindly indexing with [0] raises IndexError on
    # an empty list and would keep only the first character of a bare string.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else None
    if not augmented:
        augmented = original_text

    return {"text": augmented, "label": example["label"]}

from datasets import concatenate_datasets

# The augmented subset is sized at 30% of the original training split.
original_train = dataset["train"]
train_size = original_train.num_rows
augment_size = int(train_size * 0.3)
print(f"Original training size: {train_size}, Augmenting {augment_size} examples")

# Shuffle with a fixed seed, take the first `augment_size` rows, and run each
# of them through the synonym augmenter.
shuffled_train = original_train.shuffle(seed=42)
augmented_subset = shuffled_train.select(range(augment_size))
augmented_data = augmented_subset.map(augment_data)

# Training data = every original row plus the augmented rows.
augmented_train = concatenate_datasets([original_train, augmented_data])


# Pretrained checkpoint used for both the tokenizer and the classifier.
roberta_model_name = "roberta-base"
tokenizer_roberta = AutoTokenizer.from_pretrained(roberta_model_name)

# Record the emotion names in the model config as well as the class count, so
# saved checkpoints report emotion names rather than generic LABEL_<n>
# placeholders.
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_model_name,
    num_labels=num_labels,
    id2label={idx: emotion for emotion, idx in emotion2label.items()},
    label2id=emotion2label,
)


def tokenize_roberta(examples):
    """Tokenize the ``"text"`` field of ``examples`` with ``tokenizer_roberta``.

    Truncation is enabled and padding is set to ``"max_length"`` with
    ``max_length=128``.
    """
    texts = examples["text"]
    return tokenizer_roberta(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

# Tokenize the combined (original + augmented) training set and the test split
# in batches.
augmented_train = augmented_train.map(tokenize_roberta, batched=True)
dataset["test"] = dataset["test"].map(tokenize_roberta, batched=True)

# Expose only these columns, as torch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (augmented_train, dataset["test"]):
    tokenized_split.set_format(type="torch", columns=model_columns)


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predictions,
        average="weighted",
        # When a class receives no predictions its precision is undefined.
        # Score it as 0 explicitly instead of emitting an
        # UndefinedMetricWarning on every evaluation.
        zero_division=0,
    )
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Training arguments with Cosine Scheduler
roberta_training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./roberta_finetuned",
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate and checkpoint once per epoch, with a checkpoint limit of two.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    save_on_each_node=False,
    # Best-checkpoint selection uses the "f1" value from compute_metrics
    # (higher is better) and reloads that checkpoint when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Cosine learning-rate schedule with 100 warmup steps.
    lr_scheduler_type="cosine",
    warmup_steps=100,
)

# Build the Trainer for RoBERTa. It trains on the combined original +
# augmented data and evaluates on the test split.
# NOTE(review): the test split also drives best-checkpoint selection
# (load_best_model_at_end in the training arguments), so reported test scores
# are not from a fully held-out set -- consider a separate validation split.
roberta_trainer = Trainer(
    args=roberta_training_args,
    model=roberta_model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset["test"],
    train_dataset=augmented_train,
)


# Fine-tune the model and report how long the run took, in minutes.
start_time = time.time()
roberta_trainer.train()
end_time = time.time()
training_time = end_time - start_time
print(f" Training time: {training_time / 60:.2f} minutes")

# Run a final evaluation on the Trainer's eval dataset and show the metrics.
print("Evaluating RoBERTa model...")
roberta_results = roberta_trainer.evaluate()
print(" RoBERTa evaluation results:", roberta_results, sep="\n")

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Emotion to label mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}
Original training size: 6124, Augmenting 1837 examples


Map:   0%|          | 0/1837 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7961 [00:00<?, ? examples/s]



Training RoBERTa model with 30% Augmented Data and Cosine Scheduler...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0674,0.865065,0.710836,0.722839,0.710836,0.711159
2,0.8092,0.776115,0.740209,0.7437,0.740209,0.7405
3,0.5623,0.772855,0.746736,0.747586,0.746736,0.746079
4,0.437,0.854432,0.741514,0.74054,0.741514,0.740058
5,0.28,0.923992,0.74282,0.745149,0.74282,0.742828
6,0.2111,1.000343,0.744125,0.744837,0.744125,0.743743
7,0.1581,1.093406,0.740209,0.743121,0.740209,0.741153
8,0.112,1.149478,0.741514,0.745836,0.741514,0.742849
9,0.1287,1.15953,0.744125,0.744727,0.744125,0.744383
10,0.0821,1.167142,0.744125,0.745861,0.744125,0.744827


🕒 Training time: 8.29 minutes
Evaluating RoBERTa model...


📊 RoBERTa evaluation results:
{'eval_loss': 0.7728545069694519, 'eval_accuracy': 0.7467362924281984, 'eval_precision': 0.7475860057560603, 'eval_recall': 0.7467362924281984, 'eval_f1': 0.7460792610468808, 'eval_runtime': 2.5153, 'eval_samples_per_second': 609.071, 'eval_steps_per_second': 19.083, 'epoch': 10.0}
