In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl


In [9]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import pickle

# -----------------------------
# 2. Use GPU if available
# -----------------------------
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using device: {device}")

# -----------------------------
# 3. Load and preprocess AGNEWS dataset
# -----------------------------
# Fetch the AG News splits and the tokenizer that matches the RoBERTa checkpoint.
dataset = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize a batch of examples read from its ``"text"`` field.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded rows have the same length.
    """
    return tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )


# Encode every split in batches, then rename "label" -> "labels" for the Trainer.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Expose only the model inputs and targets, as torch tensors.
tokenized_dataset.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)


Using device: cuda


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [10]:
# ============================================================
# 4. 🧠 Load RoBERTa Model with LoRA Adapters
# ============================================================

from transformers import RobertaForSequenceClassification, set_seed

# Seed the Python, NumPy and PyTorch RNGs *before* any weights are created.
# The classification head is newly initialised (see the from_pretrained
# warning), so without a seed each kernel restart starts from different weights.
SEED = 42
set_seed(SEED)

# Load pre-trained RoBERTa-base model for 4-class classification
base_model = "roberta-base"
model = RobertaForSequenceClassification.from_pretrained(base_model, num_labels=4)

# LoRA Adapter Configuration
lora_config = LoraConfig(
    r=8,                                # Rank of the LoRA update matrices
    lora_alpha=32,                      # LoRA scaling factor (here 4x r)
    lora_dropout=0.1,                   # Dropout applied on the LoRA path
    bias="none",                        # Leave bias terms frozen
    target_modules=["query", "value"],  # Adapt the attention query/value projections
    task_type=TaskType.SEQ_CLS          # Sequence classification task
)

# Wrap the base model with the LoRA adapters and move it to the selected device
model = get_peft_model(model, lora_config)
model.to(device)

# Show trainable parameter count for sanity check
model.print_trainable_parameters()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


In [11]:
# ============================================================
# 5. ⚙️ Training Configuration — Optimized for LoRA
# ============================================================

from transformers import TrainingArguments

# Hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results_lora_agnews",     # Where checkpoints are written
    eval_strategy="epoch",                  # Evaluate every epoch
    save_strategy="epoch",                  # Save once per epoch
    save_total_limit=1,                     # Prune old checkpoints; with load_best_model_at_end the
                                            # best checkpoint is kept as well as the most recent one
    load_best_model_at_end=True,            # Restore best weights after training
    metric_for_best_model="eval_accuracy",  # "accuracy" from compute_metrics, prefixed with "eval_"
    greater_is_better=True,

    learning_rate=2e-5,
    per_device_train_batch_size=8,          # Small batch size with accum steps
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,          # Effective batch size = 16 per device
    num_train_epochs=5,

    warmup_ratio=0.1,                       # Warm up over the first 10% of training steps
    weight_decay=0.01,                      # Light regularization

    lr_scheduler_type="cosine",             # Cosine decay after warmup
    # Enable mixed precision only when a CUDA device is present, so the
    # notebook still runs on a CPU-only session instead of requesting fp16
    # unconditionally.
    fp16=torch.cuda.is_available(),

    logging_dir="./logs",                   # Where to save logs
    logging_steps=50,                       # Log the training loss every 50 steps
    report_to="none"                        # Disable external reporting integrations (e.g. W&B, TensorBoard)
)


In [None]:
# ============================================================
# 6. 🚀 Train the Model
# ============================================================

# Define metric computation function
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of rows whose predicted class equals the label>}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# NOTE(review): the "test" split is used as the per-epoch evaluation set, so
# best-checkpoint selection sees the same examples that are later reported as
# the final accuracy. Consider holding out part of "train" for validation.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

# Wire the model, data splits, tokenizer, metric function and config together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()


In [None]:
import matplotlib.pyplot as plt

# Plot training / evaluation curves from the Trainer's log history.
log_history = trainer.state.log_history

# Training loss is logged every `logging_steps` optimizer steps, while the
# evaluation metrics are logged once per epoch, so the series have different
# lengths. Each log entry records the (fractional) epoch it was written at;
# use that as the x coordinate so all curves share one correctly scaled axis
# instead of pairing eval points with the first few training log indices.
train_logs = [entry for entry in log_history if "loss" in entry]
eval_loss_logs = [entry for entry in log_history if "eval_loss" in entry]
eval_acc_logs = [entry for entry in log_history if "eval_accuracy" in entry]

train_loss = [entry["loss"] for entry in train_logs]
eval_loss = [entry["eval_loss"] for entry in eval_loss_logs]
eval_accuracy = [entry["eval_accuracy"] for entry in eval_acc_logs]

train_epochs = [entry["epoch"] for entry in train_logs]
eval_loss_epochs = [entry["epoch"] for entry in eval_loss_logs]
eval_acc_epochs = [entry["epoch"] for entry in eval_acc_logs]

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: training vs. evaluation loss.
loss_ax.plot(train_epochs, train_loss, label='Train Loss')
loss_ax.plot(eval_loss_epochs, eval_loss, marker='o', label='Eval Loss')
loss_ax.set_title("Loss per Epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

# Right panel: evaluation accuracy at the end of each epoch.
acc_ax.plot(eval_acc_epochs, eval_accuracy, marker='o', label='Eval Accuracy')
acc_ax.set_title("Eval Accuracy per Epoch")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()

fig.tight_layout()
plt.show()


In [9]:
from sklearn.metrics import accuracy_score
import numpy as np

# Run inference over the labelled test split with the already-trained model.
outputs = trainer.predict(tokenized_dataset["test"])

# Ground-truth labels and the argmax class of each row of logits.
labels = outputs.label_ids
predictions = np.argmax(outputs.predictions, axis=1)

# Report the share of correctly classified examples.
acc = accuracy_score(labels, predictions)
print("Final Evaluation Accuracy:", acc)




Final Evaluation Accuracy: 0.9527631578947369


In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader

# Load the unlabelled competition test set.
# NOTE(security): pickle.load can execute arbitrary code; only use it on
# trusted files such as this competition-provided input.
with open("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl", "rb") as f:
    raw_test = pickle.load(f)

# Rebuild a HuggingFace Dataset holding only the "text" column
# (presumably the pickle already holds a Dataset -- TODO confirm).
test_dataset = Dataset.from_dict({"text": raw_test["text"]})


def preprocess_function(examples):
    """Tokenize unlabelled examples exactly as the training data was tokenized.

    Delegates to ``tokenize_function`` so the truncation/padding length used at
    inference always matches training, instead of a separately hard-coded
    (and previously shorter) ``max_length``.
    """
    return tokenize_function(examples)


# Apply tokenizer and expose only the model inputs as torch tensors
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create PyTorch DataLoader for batching (default order is preserved: no shuffling)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Prediction loop: eval mode disables dropout, no_grad skips autograd bookkeeping
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch_logits = model(**batch).logits
        # Store plain Python ints, one predicted class id per example
        all_predictions.extend(torch.argmax(batch_logits, dim=-1).cpu().tolist())

# Every input row must have produced exactly one prediction
assert len(all_predictions) == len(test_dataset), "prediction count does not match test set size"


In [12]:
# -----------------------------
# 10. Save predictions to CSV
# -----------------------------
# Build the submission table: a 0-based row ID alongside each predicted label.
num_predictions = len(all_predictions)
submission_columns = {
    "ID": list(range(num_predictions)),
    "label": all_predictions,
}
df = pd.DataFrame(submission_columns)

# Write without the pandas index so the file has only the ID and label columns.
df.to_csv("submission.csv", index=False)
print("✅ Batched predictions complete. Saved to submission.csv.")

✅ Batched predictions complete. Saved to submission.csv.
