In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime

import ml_collections
import datasets
import torch
import transformers
import evaluate
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, DistributedType
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

from utils import clean_text, preprocess_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration: data splits, optimisation, base checkpoint and LoRA settings
datetime_now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


def model_config():
    """Assemble the hyperparameters for this run into a frozen config.

    Returns:
        ml_collections.FrozenConfigDict: the settings below, read by attribute
        (e.g. ``cfg.lr``) in the rest of the notebook.
    """
    data_settings = {
        "data_path": "../data/data.csv",
        "test_split_size": 0.2,
        "validation_split_size": 0.2,
    }
    batch_settings = {
        "train_batch_size": 32,
        "eval_batch_size": 32,
    }
    optimisation_settings = {
        "epochs": 5,
        "adam_epsilon": 1e-8,
        "lr": 1e-4,  # LoRA is usually trained at a higher rate (about 1e-4 to 3e-4)
        "num_warmup_steps": 10,
    }
    model_settings = {
        "max_length": 128,
        "random_seed": 42,
        "num_labels": 3,
        "model_checkpoint": "FacebookAI/roberta-large",
    }
    lora_settings = {
        "lora_r": 16,  # rank of the low-rank update matrices
        "lora_alpha": 32,  # scaling factor, here 2 x lora_r
        "lora_dropout": 0.1,  # dropout applied inside the LoRA layers
    }

    # Merge in the original key order so iteration over the config is unchanged.
    return ml_collections.FrozenConfigDict(
        {
            **data_settings,
            **batch_settings,
            **optimisation_settings,
            **model_settings,
            **lora_settings,
        }
    )


cfg = model_config()

In [3]:
def create_dataset(dataframe):
    """Split a labelled frame into stratified train / validation / test datasets.

    The test split is carved out first (``cfg.test_split_size`` of all rows);
    the validation split is then taken from the remaining rows
    (``cfg.validation_split_size`` of what is left, not of the full frame).
    Both splits are stratified on the ``labels`` column and seeded with
    ``cfg.random_seed``.

    Args:
        dataframe: pandas DataFrame with a ``labels`` column.

    Returns:
        DatasetDict with the keys ``"train"``, ``"validation"`` and ``"test"``.
    """

    def _stratified_split(frame, holdout_fraction):
        # Returns (kept_rows, held_out_rows), preserving the class proportions.
        return train_test_split(
            frame,
            test_size=holdout_fraction,
            random_state=cfg.random_seed,
            stratify=frame.labels.values,
        )

    remaining_df, test_df = _stratified_split(dataframe, cfg.test_split_size)
    train_df, val_df = _stratified_split(remaining_df, cfg.validation_split_size)

    frames_by_split = {"train": train_df, "validation": val_df, "test": test_df}
    return DatasetDict(
        {name: Dataset.from_pandas(frame) for name, frame in frames_by_split.items()}
    )

In [4]:
def preprocess_csv(csv_file: str) -> pd.DataFrame:
    """Load the sentiment CSV, encode the labels, de-duplicate and clean the text.

    Args:
        csv_file: Path to a CSV file with ``Sentence`` and ``Sentiment`` columns.

    Returns:
        The frame returned by ``clean_text``, with an integer ``labels`` column
        and the text / class columns renamed to ``sentence`` / ``sentiment``.
    """
    df = pd.read_csv(csv_file)

    labelencoder = LabelEncoder()
    df["labels"] = labelencoder.fit_transform(df["Sentiment"])
    df.drop_duplicates(subset=["Sentence"], keep="first", inplace=True)

    cleaned_df = clean_text(df, "Sentence")

    # Rename on the frame that is actually returned. The previous version
    # renamed `df`, which only reached the caller if `clean_text` handed back
    # the very same object rather than a copy.
    cleaned_df = cleaned_df.rename(
        columns={"Sentiment": "sentiment", "Sentence": "sentence"}
    )

    return cleaned_df


def tokenize_dataset():
    """Build the train/validation/test splits from ``cfg.data_path`` and tokenize them.

    Every ``sentence`` is truncated and padded to exactly ``cfg.max_length``
    tokens. The raw text columns and the serialized pandas index are dropped;
    the ``labels`` column is kept so batches can be fed to the model as-is.

    Returns:
        DatasetDict in ``torch`` format holding the tokenizer outputs plus ``labels``.
    """
    dataset = create_dataset(preprocess_csv(cfg.data_path))
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint, use_fast=True)

    def tokenize_function(sample):
        return tokenizer(
            sample["sentence"],
            truncation=True,
            padding="max_length",
            max_length=cfg.max_length,
        )

    # "__index_level_0__" exists only when the pandas index was serialized as a
    # column, so drop just the helper columns that are really present instead
    # of failing on a missing name.
    present_columns = set(dataset["train"].column_names)
    columns_to_drop = [
        column
        for column in ("sentence", "sentiment", "__index_level_0__")
        if column in present_columns
    ]

    tokenized_datasets = dataset.map(
        tokenize_function, batched=True, remove_columns=columns_to_drop
    )
    # The target column is already called "labels" (set in preprocess_csv),
    # so no renaming is needed before handing batches to the model.
    tokenized_datasets.set_format("torch")

    return tokenized_datasets

In [5]:
def create_dataloaders(tokenized_datasets):
    """Wrap the train and validation splits in PyTorch DataLoaders.

    Args:
        tokenized_datasets: mapping with ``"train"`` and ``"validation"`` splits.

    Returns:
        Tuple ``(train_dataloader, eval_dataloader)``. Only the training loader
        shuffles; batch sizes come from ``cfg.train_batch_size`` and
        ``cfg.eval_batch_size``.
    """
    loader_specs = (
        # (split name, shuffle, batch size)
        ("train", True, cfg.train_batch_size),
        ("validation", False, cfg.eval_batch_size),
    )
    train_dataloader, eval_dataloader = (
        DataLoader(tokenized_datasets[split], shuffle=do_shuffle, batch_size=size)
        for split, do_shuffle, size in loader_specs
    )
    return train_dataloader, eval_dataloader

In [6]:
def training_function():
    """Fine-tune the configured checkpoint with LoRA and keep the best epoch.

    Steps:
      1. Tokenize the data and build the train / validation loaders.
      2. Load the base sequence-classification model and wrap it with LoRA.
      3. Train for ``cfg.epochs`` epochs with AdamW and a linear warmup schedule.
      4. After every epoch, evaluate on the validation split (accuracy,
         micro / macro F1, macro one-vs-rest AUROC).
      5. Whenever validation macro F1 improves, save the adapter and tokenizer
         to ``../results/checkpoints/roberta-large-lora-best``.

    Returns:
        None. Metrics are reported through ``accelerator.print`` and the best
        adapter is left on disk.
    """
    accelerator = Accelerator()

    set_seed(cfg.random_seed)
    tokenized_datasets = tokenize_dataset()
    validation_size = len(tokenized_datasets["validation"])

    # Load metrics
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    roc_auc_metric = evaluate.load("roc_auc", "multiclass")

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint)

    train_dataloader, eval_dataloader = create_dataloaders(tokenized_datasets)

    # Load base model
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_checkpoint, num_labels=cfg.num_labels
    )

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
        lora_dropout=cfg.lora_dropout,
        # query/key/value projections plus every module named "dense"; the
        # latter is not limited to attention (e.g. encoder `output.dense`).
        target_modules=["query", "key", "value", "dense"],
        bias="none",
    )

    # Apply LoRA to the model
    model = get_peft_model(model, lora_config)

    # Print trainable parameters info
    model.print_trainable_parameters()

    optimizer = torch.optim.AdamW(
        params=model.parameters(), eps=cfg.adam_epsilon, lr=cfg.lr
    )
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Computed after prepare() so the step count matches the prepared loader.
    total_training_steps = len(train_dataloader) * cfg.epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=cfg.num_warmup_steps,
        num_training_steps=total_training_steps,
    )
    # Show a single bar even when several processes are running.
    progress_bar = tqdm(
        range(total_training_steps),
        disable=not accelerator.is_local_main_process,
    )

    best_macro_f1 = 0
    checkpoint_dir = "../results/checkpoints"
    best_checkpoint_path = f"{checkpoint_dir}/roberta-large-lora-best"

    # Model Training
    for epoch in range(cfg.epochs):
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        all_predictions = []
        all_probabilities = []
        all_labels = []

        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            predictions = logits.argmax(dim=-1)

            all_predictions.append(accelerator.gather(predictions))
            all_probabilities.append(accelerator.gather(probabilities))
            all_labels.append(accelerator.gather(batch["labels"]))

        # Trim to the real validation size (gathered tensors can hold surplus
        # samples in multi-process runs) and move to CPU NumPy, matching what
        # the test-set evaluation passes to the metrics.
        all_predictions = torch.cat(all_predictions)[:validation_size].cpu().numpy()
        all_probabilities = (
            torch.cat(all_probabilities)[:validation_size].cpu().numpy()
        )
        all_labels = torch.cat(all_labels)[:validation_size].cpu().numpy()

        # Compute metrics
        eval_accuracy = accuracy_metric.compute(
            predictions=all_predictions, references=all_labels
        )["accuracy"]
        eval_micro_f1 = f1_metric.compute(
            predictions=all_predictions, references=all_labels, average="micro"
        )["f1"]
        eval_macro_f1 = f1_metric.compute(
            predictions=all_predictions, references=all_labels, average="macro"
        )["f1"]
        eval_macro_auroc = roc_auc_metric.compute(
            references=all_labels,
            prediction_scores=all_probabilities,
            multi_class="ovr",
            average="macro",
        )["roc_auc"]

        accelerator.print(
            f"epoch {epoch}: accuracy={eval_accuracy:.4f}, micro_f1={eval_micro_f1:.4f}, "
            f"macro_f1={eval_macro_f1:.4f}, macro_auroc={eval_macro_auroc:.4f}"
        )

        # Save a checkpoint when this epoch has the best validation macro F1 so far
        if eval_macro_f1 > best_macro_f1:
            best_macro_f1 = eval_macro_f1
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            # Save only the LoRA adapters (much smaller than full model);
            # only the main process writes, so ranks do not race on the files.
            unwrapped_model.save_pretrained(
                best_checkpoint_path,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
            )
            # Also save the tokenizer for easy loading later
            if accelerator.is_main_process:
                tokenizer.save_pretrained(best_checkpoint_path)
            accelerator.print(f"Saved new best LoRA adapter with macro_f1: {best_macro_f1:.4f}")

    # Close the bar so it is not redrawn after the last epoch's log lines.
    progress_bar.close()

In [7]:
## TRAINING
# Runs the full LoRA fine-tuning loop: prints validation metrics after every
# epoch and checkpoints the adapter whenever validation macro F1 improves.
training_function()

Map: 100%|██████████| 3405/3405 [00:00<00:00, 9521.58 examples/s]
Map: 100%|██████████| 852/852 [00:00<00:00, 9599.21 examples/s]
Map: 100%|██████████| 1065/1065 [00:00<00:00, 11087.37 examples/s]
Loading weights: 100%|██████████| 389/389 [00:00<00:00, 473.99it/s, Materializing param=roberta.encoder.layer.23.output.dense.weight]              
RobertaForSequenceClassification LOAD REPORT from: FacebookAI/roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

No

trainable params: 8,130,563 || all params: 363,493,382 || trainable%: 2.2368


 20%|██        | 107/535 [01:26<05:05,  1.40it/s]

epoch 0: accuracy=0.7383, micro_f1=0.7383, macro_f1=0.5716, macro_auroc=0.8916
Saved new best LoRA adapter with macro_f1: 0.5716


 40%|████      | 214/535 [03:03<03:49,  1.40it/s]

epoch 1: accuracy=0.8803, micro_f1=0.8803, macro_f1=0.8390, macro_auroc=0.9715
Saved new best LoRA adapter with macro_f1: 0.8390


 60%|██████    | 321/535 [04:40<02:32,  1.40it/s]

epoch 2: accuracy=0.8885, micro_f1=0.8885, macro_f1=0.8511, macro_auroc=0.9772
Saved new best LoRA adapter with macro_f1: 0.8511


 80%|████████  | 428/535 [06:17<01:16,  1.40it/s]

epoch 3: accuracy=0.8850, micro_f1=0.8850, macro_f1=0.8486, macro_auroc=0.9780


100%|██████████| 535/535 [07:53<00:00,  1.40it/s]

epoch 4: accuracy=0.8920, micro_f1=0.8920, macro_f1=0.8522, macro_auroc=0.9785


100%|██████████| 535/535 [08:04<00:00,  1.10it/s]

Saved new best LoRA adapter with macro_f1: 0.8522





GPU memory usage: 10296 MiB (about 10% less than full fine-tuning)

Training time: ~8 min

In [8]:
## Save results from test set for evaluation later

def evaluate_and_save_test_results(
    output_path="../results/predictions/roberta_large_lora_predictions.pkl",
    checkpoint_path="../results/checkpoints/roberta-large-lora-best",
):
    """Evaluate a saved LoRA adapter on the test split and pickle the results.

    The dataset is rebuilt with the same seed and config as in training, the
    base model is loaded, and the adapter at ``checkpoint_path`` is applied on
    top. Predictions, labels, class probabilities and a dictionary of metrics
    are printed and written to ``output_path``.

    Args:
        output_path: Destination pickle file. Its parent directory is created
            when the path has one.
        checkpoint_path: Directory holding the saved LoRA adapter.

    Returns:
        Tuple ``(predictions, labels, probabilities)`` of NumPy arrays, trimmed
        to the number of test examples.
    """
    import os
    import pickle

    accelerator = Accelerator()
    set_seed(cfg.random_seed)

    # Recreate tokenized datasets
    tokenized_datasets = tokenize_dataset()
    test_size = len(tokenized_datasets["test"])

    # Load metrics
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    roc_auc_metric = evaluate.load("roc_auc", "multiclass")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Load the base model first
    base_model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_checkpoint, num_labels=cfg.num_labels
    )

    # Load the LoRA adapters on top of the base model
    model = PeftModel.from_pretrained(base_model, checkpoint_path)

    # Create test dataloader
    test_dataloader = DataLoader(
        tokenized_datasets["test"], shuffle=False, batch_size=cfg.eval_batch_size
    )

    model, test_dataloader = accelerator.prepare(model, test_dataloader)
    model.eval()

    all_predictions = []
    all_labels = []
    all_probabilities = []

    for batch in tqdm(test_dataloader, desc="Evaluating test set"):
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predictions = logits.argmax(dim=-1)

        all_predictions.append(accelerator.gather(predictions).cpu().numpy())
        all_labels.append(accelerator.gather(batch["labels"]).cpu().numpy())
        all_probabilities.append(accelerator.gather(probabilities).cpu().numpy())

    # Concatenate and trim to actual test set size
    all_predictions = np.concatenate(all_predictions)[:test_size]
    all_labels = np.concatenate(all_labels)[:test_size]
    all_probabilities = np.concatenate(all_probabilities)[:test_size]

    # Compute all metrics. Insertion order matches the layout of the saved
    # "metrics" dictionary: accuracy, F1, precision, recall, AUROC.
    metrics = {
        "accuracy": accuracy_metric.compute(
            predictions=all_predictions, references=all_labels
        )["accuracy"]
    }
    for average in ("micro", "macro", "weighted"):
        metrics[f"{average}_f1"] = f1_metric.compute(
            predictions=all_predictions, references=all_labels, average=average
        )["f1"]
    for average in ("micro", "macro"):
        metrics[f"{average}_precision"] = precision_metric.compute(
            predictions=all_predictions, references=all_labels, average=average
        )["precision"]
    for average in ("micro", "macro"):
        metrics[f"{average}_recall"] = recall_metric.compute(
            predictions=all_predictions, references=all_labels, average=average
        )["recall"]
    for average in ("macro", "weighted"):
        metrics[f"{average}_auroc"] = roc_auc_metric.compute(
            references=all_labels,
            prediction_scores=all_probabilities,
            multi_class="ovr",
            average=average,
        )["roc_auc"]

    # Report and persist from the main process only, so a multi-process run
    # neither repeats the report nor has several ranks write the same file.
    if accelerator.is_main_process:
        # Print all metrics
        print("=" * 50)
        print("TEST SET EVALUATION RESULTS")
        print("=" * 50)
        print(f"Accuracy:          {metrics['accuracy']:.4f}")
        print("-" * 50)
        print("F1 Scores:")
        print(f"  Micro F1:        {metrics['micro_f1']:.4f}")
        print(f"  Macro F1:        {metrics['macro_f1']:.4f}")
        print(f"  Weighted F1:     {metrics['weighted_f1']:.4f}")
        print("-" * 50)
        print("Precision:")
        print(f"  Micro Precision: {metrics['micro_precision']:.4f}")
        print(f"  Macro Precision: {metrics['macro_precision']:.4f}")
        print("-" * 50)
        print("Recall:")
        print(f"  Micro Recall:    {metrics['micro_recall']:.4f}")
        print(f"  Macro Recall:    {metrics['macro_recall']:.4f}")
        print("-" * 50)
        print("ROC-AUC:")
        print(f"  Macro AUROC:     {metrics['macro_auroc']:.4f}")
        print(f"  Weighted AUROC:  {metrics['weighted_auroc']:.4f}")
        print("=" * 50)

        # Save results using pickle (consistent with ml_baselines notebook).
        # A bare file name has an empty dirname, and os.makedirs("") raises,
        # so only create the directory when there is one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        results = {
            "RoBERTa-large-LoRA": all_predictions,
            "y_true": all_labels,
            "probabilities": all_probabilities,
            "metrics": metrics,
        }

        with open(output_path, 'wb') as f:
            pickle.dump(results, f)

        print(f"\nResults saved to {output_path}")

    return all_predictions, all_labels, all_probabilities

# Run evaluation and save: scores the saved best adapter on the test split,
# writes the pickle, and keeps the returned arrays available in the kernel.
predictions, labels, probabilities = evaluate_and_save_test_results()

Map: 100%|██████████| 3405/3405 [00:00<00:00, 11061.12 examples/s]
Map: 100%|██████████| 852/852 [00:00<00:00, 11412.67 examples/s]
Map: 100%|██████████| 1065/1065 [00:00<00:00, 11107.55 examples/s]
Downloading builder script: 7.56kB [00:00, 10.2MB/s]
Downloading builder script: 7.38kB [00:00, 10.4MB/s]
Loading weights: 100%|██████████| 389/389 [00:00<00:00, 580.72it/s, Materializing param=roberta.encoder.layer.23.output.dense.weight]              
RobertaForSequenceClassification LOAD REPORT from: FacebookAI/roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSI

TEST SET EVALUATION RESULTS
Accuracy:          0.8704
--------------------------------------------------
F1 Scores:
  Micro F1:        0.8704
  Macro F1:        0.8194
  Weighted F1:     0.8688
--------------------------------------------------
Precision:
  Micro Precision: 0.8704
  Macro Precision: 0.8309
--------------------------------------------------
Recall:
  Micro Recall:    0.8704
  Macro Recall:    0.8101
--------------------------------------------------
ROC-AUC:
  Macro AUROC:     0.9715
  Weighted AUROC:  0.9684

Results saved to ../results/predictions/roberta_large_lora_predictions.pkl
