# DL-GENAI PROJECT — RoBERTa-Large
# Name  : Abhishek Saha
# Roll  : 23f1001572
# Model : roberta-large


## IMPORTS AND SETUP

In [1]:
!wandb login 20d9b18a55f275c39d05bf53e51e8b328aeffff5

import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    get_linear_schedule_with_warmup
)

from tqdm.auto import tqdm
import warnings, logging

# Silence Python warnings and keep the Hugging Face loggers at ERROR so the
# training output below is not drowned in library chatter.
warnings.filterwarnings("ignore")
for _logger_name in ("transformers", "tokenizers"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", DEVICE)


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
Using device: cuda


## CONFIGURATION

In [2]:
class Config:
    """Hyper-parameters and run metadata shared by the cells of this notebook.

    All fields are class attributes; ``CFG`` below is the instance the other
    cells read from.
    """

    # --- backbone and tokenisation ---
    model_name = "roberta-large"  # checkpoint name for the tokenizer and encoder
    max_length = 256              # tokenizer max_length (pad / truncate target)

    # --- optimisation ---
    batch_size = 8
    learning_rate = 1e-5
    weight_decay = 0.01
    epochs = 5
    warmup_steps = 200            # warm-up steps of the linear LR schedule
    # NOTE(review): grad_accum is not referenced by any cell visible in this
    # notebook export; confirm whether accumulation was meant to be used.
    grad_accum = 2

    # --- regularisation / loss ---
    dropout_rate = 0.3            # dropout before the classification head
    focal_loss_gamma = 2.0        # exponent of the focal-loss modulating factor
    augmentation = True           # enables word-swap augmentation on the train set

    # --- reproducibility ---
    seed = 42

    # --- Weights & Biases run identity ---
    project = "23f1001572-t32025"
    run_name = "roberta-large-final"


CFG = Config()

# Label columns, in the order used for model outputs and per-class metrics.
TARGET_COLS = [
    "anger",
    "fear",
    "joy",
    "sadness",
    "surprise",
]


## REPRODUCIBILITY

In [3]:
def set_seed(seed=42):
    """Seed every RNG used by the notebook and request deterministic cuDNN.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available), then turns off cuDNN autotuning and asks for
    deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Seeding alone does not make GPU runs repeatable: cuDNN may benchmark and
    # pick different (possibly non-deterministic) kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any data split or model initialisation happens.
set_seed(seed=CFG.seed)


## DATA LOADING

In [4]:
# Locations of the competition CSV files.
TRAIN_PATH = "/kaggle/input/2025-sep-dl-gen-ai-project/train.csv"
TEST_PATH = "/kaggle/input/2025-sep-dl-gen-ai-project/test.csv"

# Read both splits into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(train.shape, test.shape)


(6827, 8) (1707, 2)


## TEXT PREPROCESSING

In [6]:
import re, html

def basic_text_preprocessing(text):
    """Normalise one raw text into a lower-cased, whitespace-collapsed string.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``. Otherwise
    the value is stringified, HTML entities are unescaped, and these rewrites
    are applied in order: URLs and ``@mentions`` are replaced by a space,
    ``#tag`` keeps only ``tag``, every character other than ASCII letters,
    digits, whitespace, ``!``, ``?`` and ``'`` becomes a space, and runs of
    whitespace collapse to a single space.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs; order matters because later rules clean up
    # after earlier ones (e.g. whitespace is collapsed last).
    rewrites = (
        (r"http\S+|www\.\S+", " "),
        (r"@\w+", " "),
        (r"#(\w+)", r"\1"),
        (r"[^A-Za-z0-9\s!?']", " "),
        (r"\s+", " "),
    )

    cleaned = html.unescape(str(text))
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip().lower()

def simple_augmentation(text, p=0.1):
    """Occasionally swap one random pair of adjacent words in ``text``.

    A uniform draw from ``random.random()`` decides whether to augment: when the
    draw exceeds ``p`` the text is returned untouched. Texts with fewer than two
    whitespace-separated words are also returned unchanged. Otherwise the words
    are re-joined with single spaces after the swap.
    """
    skip = random.random() > p
    tokens = [] if skip else text.split()
    if len(tokens) < 2:
        return text

    # Pick the left element of the pair; the right one is its neighbour.
    left = random.randint(0, len(tokens) - 2)
    tokens[left:left + 2] = reversed(tokens[left:left + 2])
    return " ".join(tokens)

# Add a cleaned copy of the raw text to both splits.
for _frame in (train, test):
    _frame["text_processed"] = _frame["text"].apply(basic_text_preprocessing)


## DATASET CLASS

In [7]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of strings (list, NumPy array or pandas Series). It is
            copied into a plain list so items are always looked up by position.
        labels: Optional 2-D array-like with one row of per-class targets per
            text. ``None`` produces items without a ``"labels"`` entry.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors whose leading
            dimension is 1.
        max_length: Length passed to the tokenizer as ``max_length``.
        augment: When true and labels are present, ``simple_augmentation`` is
            applied to the text on every access.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or ``labels`` and ``texts``
            have different lengths.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128, augment=False):
        if tokenizer is None:
            raise ValueError("EmotionDataset requires a tokenizer")

        # A pandas Series (e.g. a column of a shuffled split) indexes by label,
        # not by position; materialising to a list makes idx purely positional.
        self.texts = list(texts)
        self.labels = None if labels is None else np.asarray(labels, dtype=np.float32)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
            )

        self.tokenizer = tokenizer
        self.max_len = max_length
        self.augment = augment

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # Augment only labelled data; unlabelled (inference) text stays as is.
        if self.augment and self.labels is not None:
            text = simple_augmentation(text)

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Drop the leading batch dimension of size 1 added by return_tensors="pt".
        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }
        if self.labels is not None:
            # torch.tensor copies, so the float32 label matrix is never aliased.
            item["labels"] = torch.tensor(self.labels[idx])
        return item


## MODEL ARCHITECTURE

In [8]:
class EmotionClassifier(nn.Module):
    """Pretrained transformer encoder followed by dropout and a linear head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoConfig`` and
            ``AutoModel``.
        num_labels: Number of output logits.
        dropout_rate: Dropout probability applied before the linear head.
    """

    def __init__(self, model_name, num_labels=5, dropout_rate=0.3):
        super().__init__()
        # The config is loaded on its own so the head can be sized from
        # hidden_size.
        self.config = AutoConfig.from_pretrained(model_name)
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.classifier = nn.Linear(
            in_features=self.config.hidden_size,
            out_features=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        encoder_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # The hidden state at sequence position 0 serves as the sentence vector.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token))


## LOSS FUNCTION

In [9]:
class FocalLoss(nn.Module):
    """Focal re-weighting of element-wise binary cross-entropy on logits.

    Each element's BCE is scaled by ``(1 - pt) ** gamma`` with
    ``pt = exp(-BCE)``, and the mean over all elements is returned.
    """

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        per_element = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction="none"
        )
        # exp(-BCE) is the probability the model assigns to the true label.
        prob_correct = torch.exp(-per_element)
        modulator = (1 - prob_correct) ** self.gamma
        return torch.mean(modulator * per_element)

# Shared loss object used by both the training and the validation function.
criterion = FocalLoss(CFG.focal_loss_gamma)


## TRAIN FUNCTION

In [10]:
def train_epoch(model, loader, optimizer, scheduler, device, epoch):
    """Run one training pass over ``loader`` and return the mean batch loss.

    The optimizer and the scheduler are both stepped after every batch, with
    gradients clipped to a global norm of 1.0 beforehand. The loss comes from
    the module-level ``criterion``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Sum of per-batch losses divided by ``len(loader)``.
    """
    model.train()
    optimizer.zero_grad()
    running_loss = 0.0

    progress = tqdm(loader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        batch_loss = criterion(model(input_ids, attention_mask), targets)
        batch_loss.backward()

        # Clip before the update so a single bad batch cannot blow up the step.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


## VALIDATION FUNCTION

In [11]:
def validate_epoch(model, loader, device, epoch):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Sigmoid probabilities are thresholded at 0.5, a binary F1 is computed per
    column of ``TARGET_COLS``, and those scores are averaged into a macro F1.
    The loss comes from the module-level ``criterion``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` tensors.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Tuple ``(macro_f1, mean_loss, f1_scores)`` where ``f1_scores`` maps
        each emotion name to its F1.
    """
    model.eval()
    loss_sum = 0.0
    prob_rows = []
    target_rows = []

    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, targets).item()

            # Collect one row per sample; batches are flattened by extend().
            prob_rows.extend(torch.sigmoid(logits).cpu().numpy())
            target_rows.extend(targets.cpu().numpy())

    all_probs = np.array(prob_rows)
    all_targets = np.array(target_rows)
    hard_preds = (all_probs > 0.5).astype(int)

    f1_scores = {}
    for col, emotion in enumerate(TARGET_COLS):
        f1_scores[emotion] = f1_score(all_targets[:, col], hard_preds[:, col])

    macro_f1 = np.mean(list(f1_scores.values()))
    mean_loss = loss_sum / len(loader)
    return macro_f1, mean_loss, f1_scores


## DATA PREP

In [12]:
# Hold out 10% of the labelled data for validation (seeded split).
train_df, val_df = train_test_split(train, test_size=0.1, random_state=CFG.seed)

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# Training set: augmentation follows the config flag.
train_dataset = EmotionDataset(
    texts=train_df["text_processed"].values,
    labels=train_df[TARGET_COLS].values,
    tokenizer=tokenizer,
    max_length=CFG.max_length,
    augment=CFG.augmentation,
)

# Validation set: never augmented, so the metric is measured on clean text.
val_dataset = EmotionDataset(
    texts=val_df["text_processed"].values,
    labels=val_df[TARGET_COLS].values,
    tokenizer=tokenizer,
    max_length=CFG.max_length,
    augment=False,
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=CFG.batch_size, shuffle=False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## MODEL INIT

In [13]:
# Build the classifier and move it to the selected device.
model = EmotionClassifier(
    model_name=CFG.model_name,
    num_labels=len(TARGET_COLS),
    dropout_rate=CFG.dropout_rate,
)
model.to(DEVICE)

optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
)

# One scheduler step is taken per training batch, so the schedule spans
# batches-per-epoch times the number of epochs.
total_steps = CFG.epochs * len(train_loader)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=CFG.warmup_steps,
    num_training_steps=total_steps,
)


2025-12-01 19:23:15.502854: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764616995.895584      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764616996.027144      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

## wandb init

In [14]:
import wandb

# Config stores its hyper-parameters as class attributes, so the instance's
# __dict__ is empty and would log no configuration at all. Collect the public,
# non-callable attributes explicitly instead.
run_config = {
    attr: getattr(CFG, attr)
    for attr in dir(CFG)
    if not attr.startswith("_") and not callable(getattr(CFG, attr))
}

wandb.init(
    project=CFG.project,
    name=CFG.run_name,
    config=run_config
)

# Log gradients / parameter histograms of the model every 50 steps.
wandb.watch(model, log_freq=50)


[34m[1mwandb[0m: Currently logged in as: [33mabhisheksaha[0m ([33mabhisheksahaiitm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## TRAIN LOOP

In [15]:
# Start below any reachable score so the first epoch always writes a
# checkpoint. With 0.0 and a strict ">", a run whose macro-F1 stays at 0.0 would
# never create "best_model.pth" and the later reload of that file would fail.
best_f1 = float("-inf")

for epoch in range(CFG.epochs):
    print(f"\nEpoch {epoch+1}/{CFG.epochs}")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, DEVICE, epoch)
    val_f1, val_loss, emo_f1s = validate_epoch(model, val_loader, DEVICE, epoch)

    print("Train Loss:", train_loss)
    print("Val Loss:", val_loss)
    print("Val F1:", val_f1)
    print("Per Emotion:", emo_f1s)

    # One W&B record per epoch: aggregate metrics plus one f1_<emotion> each.
    log_data = {
        "epoch": epoch+1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "val_macro_f1": val_f1
    }
    log_data.update({f"f1_{k}": v for k, v in emo_f1s.items()})

    wandb.log(log_data)

    # Keep only the weights of the epoch with the best validation macro-F1.
    if val_f1 > best_f1:
        torch.save(model.state_dict(), "best_model.pth")
        best_f1 = val_f1
        print(" New best model saved")

wandb.finish()



Epoch 1/5


Training Epoch 1:   0%|          | 0/768 [00:00<?, ?it/s]

Validation Epoch 1:   0%|          | 0/86 [00:00<?, ?it/s]

Train Loss: 0.13162195727151507
Val Loss: 0.0822086225285433
Val F1: 0.7263431148440889
Per Emotion: {'anger': 0.6466165413533834, 'fear': 0.8689320388349514, 'joy': 0.7147766323024056, 'sadness': 0.6883468834688347, 'surprise': 0.7130434782608694}
🔥 New best model saved

Epoch 2/5


Training Epoch 2:   0%|          | 0/768 [00:00<?, ?it/s]

Validation Epoch 2:   0%|          | 0/86 [00:00<?, ?it/s]

Train Loss: 0.0783418577023743
Val Loss: 0.07184008461271607
Val F1: 0.8211575275931473
Per Emotion: {'anger': 0.7712418300653595, 'fear': 0.8675675675675676, 'joy': 0.8038585209003215, 'sadness': 0.8284313725490196, 'surprise': 0.8346883468834688}
🔥 New best model saved

Epoch 3/5


Training Epoch 3:   0%|          | 0/768 [00:00<?, ?it/s]

Validation Epoch 3:   0%|          | 0/86 [00:00<?, ?it/s]

Train Loss: 0.05027232604152232
Val Loss: 0.07020375485055495
Val F1: 0.8413627819397312
Per Emotion: {'anger': 0.7814569536423841, 'fear': 0.9051833122629582, 'joy': 0.8074534161490682, 'sadness': 0.8433734939759037, 'surprise': 0.8693467336683418}
🔥 New best model saved

Epoch 4/5


Training Epoch 4:   0%|          | 0/768 [00:00<?, ?it/s]

Validation Epoch 4:   0%|          | 0/86 [00:00<?, ?it/s]

Train Loss: 0.033636351379527696
Val Loss: 0.08561704498661551
Val F1: 0.8569932172047885
Per Emotion: {'anger': 0.8211920529801324, 'fear': 0.9124668435013261, 'joy': 0.8294117647058823, 'sadness': 0.8444444444444444, 'surprise': 0.8774509803921569}
🔥 New best model saved

Epoch 5/5


Training Epoch 5:   0%|          | 0/768 [00:00<?, ?it/s]

Validation Epoch 5:   0%|          | 0/86 [00:00<?, ?it/s]

Train Loss: 0.0223057640390986
Val Loss: 0.09066195616384881
Val F1: 0.8721339593023945
Per Emotion: {'anger': 0.8496732026143791, 'fear': 0.9286640726329443, 'joy': 0.8501529051987767, 'sadness': 0.8483412322274881, 'surprise': 0.8838383838383838}
🔥 New best model saved


0,1
epoch,▁▃▅▆█
f1_anger,▁▅▆▇█
f1_fear,▁▁▅▆█
f1_joy,▁▆▆▇█
f1_sadness,▁▇███
f1_surprise,▁▆▇██
train_loss,█▅▃▂▁
val_loss,▅▂▁▆█
val_macro_f1,▁▆▇▇█

0,1
epoch,5.0
f1_anger,0.84967
f1_fear,0.92866
f1_joy,0.85015
f1_sadness,0.84834
f1_surprise,0.88384
train_loss,0.02231
val_loss,0.09066
val_macro_f1,0.87213


In [16]:
# Restore the best checkpoint written by the training loop. The file holds a
# plain state_dict, so weights_only=True is sufficient and prevents torch.load
# from unpickling arbitrary objects.
best_state = torch.load("best_model.pth", map_location=DEVICE, weights_only=True)
model.load_state_dict(best_state)
model.to(DEVICE)
# Switch to eval mode (disables dropout) before running inference.
model.eval()


EmotionClassifier(
  (transformer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
        

In [17]:
# Unlabelled test set: no labels, no augmentation, original row order kept.
test_dataset = EmotionDataset(
    texts=test["text_processed"].values,
    labels=None,
    tokenizer=tokenizer,
    max_length=CFG.max_length,
    augment=False,
)

test_loader = DataLoader(dataset=test_dataset, batch_size=CFG.batch_size, shuffle=False)

all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        batch_logits = model(
            batch["input_ids"].to(DEVICE),
            batch["attention_mask"].to(DEVICE),
        )
        # extend() flattens the batch into one probability row per sample.
        all_preds.extend(torch.sigmoid(batch_logits).cpu().numpy())

all_preds = np.array(all_preds)
binary_preds = (all_preds > 0.5).astype(int)

# One 0/1 column per emotion, in the same order as the model's output logits.
submission = pd.DataFrame({
    "id": test["id"],
    **{
        emotion: binary_preds[:, col]
        for col, emotion in enumerate(("anger", "fear", "joy", "sadness", "surprise"))
    },
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


Testing:   0%|          | 0/214 [00:00<?, ?it/s]

Saved submission.csv
