In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime

import ml_collections
import datasets
import torch
import transformers
import evaluate
from sklearn.model_selection import train_test_split
from accelerate import Accelerator, DistributedType
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from utils import clean_text, preprocess_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setting up the model hyperparameters
# Wall-clock timestamp taken when this cell runs, formatted as YYYY-MM-DD_HH-MM-SS.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
datetime_now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
def model_config():
    """Build the immutable run configuration.

    Returns:
        ml_collections.FrozenConfigDict holding the data/split settings, the
        model path, batch sizes, optimisation settings, and the
        tokenisation/seed/label-count/checkpoint values.
    """
    data_settings = {
        "data_path": "../data/data.csv",
        "test_split_size": 0.2,
        "validation_split_size": 0.2,
    }
    artifact_settings = {
        "model_path": "/model/roberta_base_model.h5",
    }
    batch_settings = {
        "train_batch_size": 32,
        "eval_batch_size": 32,
    }
    optimisation_settings = {
        "epochs": 5,
        "adam_epsilon": 1e-8,
        "lr": 3e-5,
        "num_warmup_steps": 10,
    }
    model_settings = {
        "max_length": 128,
        "random_seed": 42,
        "num_labels": 3,
        "model_checkpoint": "FacebookAI/roberta-base",
    }

    # Merge the groups in their original order, then freeze so the values
    # cannot be reassigned after construction.
    return ml_collections.FrozenConfigDict(
        {
            **data_settings,
            **artifact_settings,
            **batch_settings,
            **optimisation_settings,
            **model_settings,
        }
    )
# Build the frozen config once at module level; the functions defined in the
# following cells read it as the global `cfg`.
cfg = model_config()

In [3]:
def create_dataset(dataframe):
    """Split ``dataframe`` into stratified train/validation/test sets.

    The test set is carved out first (``cfg.test_split_size`` of all rows);
    the validation set is then taken from what remains
    (``cfg.validation_split_size`` of the remaining rows). Both splits
    stratify on the ``labels`` column and use ``cfg.random_seed``.

    Args:
        dataframe: pandas DataFrame that has a ``labels`` column.

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` entries, each
        built with ``Dataset.from_pandas``.
    """
    shared_split_kwargs = {"random_state": cfg.random_seed}

    remaining_df, test_df = train_test_split(
        dataframe,
        test_size=cfg.test_split_size,
        stratify=dataframe.labels.values,
        **shared_split_kwargs,
    )
    train_df, val_df = train_test_split(
        remaining_df,
        test_size=cfg.validation_split_size,
        stratify=remaining_df.labels.values,
        **shared_split_kwargs,
    )

    frames_by_split = {"train": train_df, "validation": val_df, "test": test_df}
    return DatasetDict(
        {
            split_name: Dataset.from_pandas(frame)
            for split_name, frame in frames_by_split.items()
        }
    )

In [4]:
def preprocess_csv(csv_file: str) -> pd.DataFrame:
    """Load the CSV, encode labels, de-duplicate and clean the text column.

    Steps, in order:
      1. Read ``csv_file`` with pandas.
      2. Add an integer ``labels`` column by label-encoding ``Sentiment``.
      3. Drop rows whose ``Sentence`` duplicates an earlier row (first kept).
      4. Pass the frame through ``clean_text`` for the ``Sentence`` column.
      5. Rename ``Sentiment``/``Sentence`` to ``sentiment``/``sentence``.

    Args:
        csv_file: Path to a CSV containing ``Sentence`` and ``Sentiment``
            columns.

    Returns:
        The cleaned DataFrame with ``sentence``, ``sentiment`` and ``labels``
        columns.
    """
    df = pd.read_csv(csv_file)

    labelencoder = LabelEncoder()
    df["labels"] = labelencoder.fit_transform(df["Sentiment"])
    df.drop_duplicates(subset=["Sentence"], keep="first", inplace=True)

    cleaned_df = clean_text(df, "Sentence")

    # Rename on the frame that is actually returned. Renaming `df` instead only
    # works when `clean_text` hands back the very same object; if it returns a
    # copy, the lowercase columns expected downstream would never exist.
    return cleaned_df.rename(
        columns={"Sentiment": "sentiment", "Sentence": "sentence"}
    )


def tokenize_dataset():
    """Load, split and tokenize the data described by ``cfg``.

    Builds the train/validation/test splits from ``cfg.data_path``, tokenizes
    the ``sentence`` column with the tokenizer for ``cfg.model_checkpoint``
    (truncated and padded to ``cfg.max_length``), removes the ``sentence``,
    ``sentiment`` and ``__index_level_0__`` columns, and sets the output
    format to torch.

    Returns:
        The tokenized DatasetDict in torch format.
    """
    raw_splits = create_dataset(preprocess_csv(cfg.data_path))
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint, use_fast=True)

    def tokenize_function(sample):
        # Every example is padded to the same fixed length.
        return tokenizer(
            sample["sentence"],
            truncation=True,
            padding="max_length",
            max_length=cfg.max_length,
        )

    columns_to_drop = ["sentence", "sentiment", "__index_level_0__"]
    tokenized_datasets = raw_splits.map(
        tokenize_function,
        batched=True,
        remove_columns=columns_to_drop,
    )

    # Have the datasets return torch tensors when indexed.
    tokenized_datasets.set_format("torch")
    return tokenized_datasets

In [5]:
def create_dataloaders(tokenized_datasets):
    """Wrap the train and validation splits in DataLoaders.

    Args:
        tokenized_datasets: Mapping with ``"train"`` and ``"validation"``
            entries.

    Returns:
        ``(train_dataloader, eval_dataloader)``: the training loader shuffles
        and uses ``cfg.train_batch_size``; the evaluation loader keeps order
        and uses ``cfg.eval_batch_size``.
    """
    train_split = tokenized_datasets["train"]
    validation_split = tokenized_datasets["validation"]

    train_dataloader = DataLoader(
        dataset=train_split,
        batch_size=cfg.train_batch_size,
        shuffle=True,
    )
    eval_dataloader = DataLoader(
        dataset=validation_split,
        batch_size=cfg.eval_batch_size,
        shuffle=False,
    )
    return train_dataloader, eval_dataloader

In [None]:
def training_function():
    """Fine-tune ``cfg.model_checkpoint`` and keep the best-scoring weights.

    Tokenizes the data, trains for ``cfg.epochs`` epochs with AdamW and a
    linear warmup schedule, and evaluates accuracy on the validation split
    after every epoch. Whenever validation accuracy improves, the model and
    its tokenizer are written to ``../results/checkpoints/roberta-base-best``
    so the directory can be loaded on its own later.

    Returns:
        None. Progress and per-epoch accuracy are printed.
    """
    accelerator = Accelerator()

    set_seed(cfg.random_seed)
    tokenized_datasets = tokenize_dataset()
    accuracy = evaluate.load("accuracy")

    train_dataloader, eval_dataloader = create_dataloaders(tokenized_datasets)
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_checkpoint, num_labels=cfg.num_labels
    )
    optimizer = torch.optim.AdamW(
        params=model.parameters(), eps=cfg.adam_epsilon, lr=cfg.lr
    )
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Step count is taken from the prepared dataloader.
    total_training_steps = len(train_dataloader) * cfg.epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=cfg.num_warmup_steps,
        num_training_steps=total_training_steps,
    )
    progress_bar = tqdm(range(total_training_steps))

    best_accuracy = 0
    checkpoint_dir = "../results/checkpoints"
    # Model and tokenizer share one directory so that directory alone is
    # enough to reload both.
    best_model_dir = f"{checkpoint_dir}/roberta-base-best"
    # Load the tokenizer once instead of on every improvement.
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_checkpoint)
    num_validation_examples = len(tokenized_datasets["validation"])

    for epoch in range(cfg.epochs):
        # ---- training pass ----
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # ---- validation pass ----
        model.eval()
        all_predictions = []
        all_labels = []

        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)

            # Gather predictions and labels from all processes (if applicable).
            all_predictions.append(accelerator.gather(predictions))
            all_labels.append(accelerator.gather(batch["labels"]))

        # Trim to the real validation size after concatenating.
        all_predictions = torch.cat(all_predictions)[:num_validation_examples]
        all_labels = torch.cat(all_labels)[:num_validation_examples]

        eval_accuracy = accuracy.compute(
            predictions=all_predictions, references=all_labels
        )

        # accelerator.print prints only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_accuracy)

        # Save a checkpoint if this is the best model so far.
        if eval_accuracy["accuracy"] > best_accuracy:
            best_accuracy = eval_accuracy["accuracy"]
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                best_model_dir,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save,
            )
            # Only one process writes the tokenizer files.
            if accelerator.is_main_process:
                tokenizer.save_pretrained(best_model_dir)
            accelerator.print(f"Saved new best model with accuracy: {best_accuracy:.4f}")

In [10]:
## TRAINING
# Runs the full fine-tuning loop defined above; it prints validation accuracy
# after each epoch and writes the best checkpoint under ../results/checkpoints.
training_function()

Map: 100%|██████████| 3405/3405 [00:00<00:00, 10777.10 examples/s]
Map: 100%|██████████| 852/852 [00:00<00:00, 11429.50 examples/s]
Map: 100%|██████████| 1065/1065 [00:00<00:00, 3285.29 examples/s]
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 572.34it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
RobertaForSequenceClassification LOAD REPORT from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

No

epoch 0: {'accuracy': 0.8497652582159625}


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.26it/s]
 20%|██        | 108/535 [00:35<10:11,  1.43s/it]

Saved new best model with accuracy: 0.8498


 40%|████      | 215/535 [01:09<05:28,  1.03s/it]

epoch 1: {'accuracy': 0.8438967136150235}


 60%|██████    | 321/535 [01:40<01:02,  3.40it/s]

epoch 2: {'accuracy': 0.8720657276995305}


Writing model shards: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]
 60%|██████    | 322/535 [01:46<07:17,  2.05s/it]

Saved new best model with accuracy: 0.8721


 80%|████████  | 428/535 [02:18<00:31,  3.38it/s]

epoch 3: {'accuracy': 0.8732394366197183}


Writing model shards: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
 80%|████████  | 429/535 [02:24<03:35,  2.04s/it]

Saved new best model with accuracy: 0.8732


100%|██████████| 535/535 [02:55<00:00,  3.39it/s]

epoch 4: {'accuracy': 0.8744131455399061}


Writing model shards: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]
100%|██████████| 535/535 [03:01<00:00,  2.94it/s]

Saved new best model with accuracy: 0.8744





GPU memory usage:  4492MiB 

In [None]:
## Save results from test set for evaluation later

def evaluate_and_save_test_results(output_path="../results/predictions/roberta_base_predictions.pkl"):
    """Evaluate the saved best checkpoint on the test split and pickle the outputs.

    Re-creates the tokenized datasets, loads the model from
    ``../results/checkpoints/roberta-base-best``, runs it over the test split
    without gradients, prints the test accuracy, and pickles a dict with keys
    ``"RoBERTa-base"`` (predicted class ids), ``"y_true"`` (labels) and
    ``"probabilities"`` (softmax over the logits) to ``output_path``.

    Args:
        output_path: Destination pickle file. Its parent directory is created
            if it does not exist; a bare filename (no directory part) is
            written to the current working directory.

    Returns:
        ``(all_predictions, all_labels, all_probabilities)`` as numpy arrays
        trimmed to the size of the test split.
    """
    import os
    import pickle

    accelerator = Accelerator()
    set_seed(cfg.random_seed)

    # Recreate tokenized datasets
    tokenized_datasets = tokenize_dataset()

    # Load the fine-tuned model (not the pretrained checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        "../results/checkpoints/roberta-base-best", num_labels=cfg.num_labels
    )

    test_dataloader = DataLoader(
        tokenized_datasets["test"], shuffle=False, batch_size=cfg.eval_batch_size
    )

    model, test_dataloader = accelerator.prepare(model, test_dataloader)
    model.eval()

    all_predictions = []
    all_labels = []
    all_probabilities = []

    for batch in tqdm(test_dataloader, desc="Evaluating test set"):
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)
        predictions = logits.argmax(dim=-1)

        all_predictions.append(accelerator.gather(predictions).cpu().numpy())
        all_labels.append(accelerator.gather(batch["labels"]).cpu().numpy())
        all_probabilities.append(accelerator.gather(probabilities).cpu().numpy())

    # Concatenate and trim to actual test set size
    test_size = len(tokenized_datasets["test"])
    all_predictions = np.concatenate(all_predictions)[:test_size]
    all_labels = np.concatenate(all_labels)[:test_size]
    all_probabilities = np.concatenate(all_probabilities)[:test_size]

    accuracy = (all_predictions == all_labels).mean()
    accelerator.print(f"Test Accuracy: {accuracy:.4f}")

    results = {
        "RoBERTa-base": all_predictions,
        "y_true": all_labels,
        "probabilities": all_probabilities,
    }

    # Only one process writes the file, so concurrent processes cannot clobber it.
    if accelerator.is_main_process:
        # os.makedirs("") raises, so skip directory creation for bare filenames.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "wb") as f:
            pickle.dump(results, f)

    accelerator.print(f"Results saved to {output_path}")

    return all_predictions, all_labels, all_probabilities

# Run evaluation and save
# Uses the default output_path and keeps the three returned arrays as
# module-level variables.
predictions, labels, probabilities = evaluate_and_save_test_results()

Map:  59%|█████▊    | 2000/3405 [00:00<00:00, 10816.42 examples/s]