# MLOps Final Project:

Comparing performance of fine-tuned LLM vs out of the box pretrained LLM on sentiment analysis task for IMDB reviews dataset

# Data Preprocessing

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

df = pd.read_csv('movie_short.csv')

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])




Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# DistilBERT Fine-Tuning

In [2]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.7011, 'grad_norm': 1.1648861169815063, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7, 'grad_norm': 1.393163800239563, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6905, 'grad_norm': 1.2457865476608276, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.692, 'grad_norm': 1.8890949487686157, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6888, 'grad_norm': 1.626924991607666, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6994, 'grad_norm': 1.5356136560440063, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.693, 'grad_norm': 2.3892295360565186, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.6826, 'grad_norm': 2.665776252746582, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6768, 'grad_norm': 2.523048162460327, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6404, 'grad_norm': 1.8623087406158447, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6054, 'grad_norm': 2.162881374359131

TrainOutput(global_step=300, training_loss=0.4570730966329575, metrics={'train_runtime': 163.9231, 'train_samples_per_second': 14.623, 'train_steps_per_second': 1.83, 'total_flos': 317524354578432.0, 'train_loss': 0.4570730966329575, 'epoch': 3.0})

# Evaulating DistilBERT fine-tuned vs pretrained

In [4]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # Use standard tqdm
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score


def evaluate_model(model, dataset, batch_size=8):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    preds = []
    labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):  
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['label'].cpu().numpy())
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return accuracy, f1

finetuned_model = model
finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, val_dataset)

pretrained_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


100%|██████████| 25/25 [00:14<00:00,  1.69it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 25/25 [00:14<00:00,  1.73it/s]

Fine-tuned Model - Accuracy: 0.86, F1 Score: 0.8494623655913979
Pre-trained Model - Accuracy: 0.51, F1 Score: 0.0





# Fine tuning with LoRA instead

In [5]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

lora_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules= ["attention.q_lin", "attention.k_lin", "attention.v_lin"], #["query", "key", "value"],  # Specify the target modules
    bias="none",
)

lora_model = get_peft_model(lora_model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

lora_finetuned_model = lora_model
lora_finetuned_accuracy, lora_finetuned_f1 = evaluate_model(lora_finetuned_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {lora_finetuned_accuracy}, F1 Score: {lora_finetuned_f1}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.7221, 'grad_norm': 0.10963136702775955, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7116, 'grad_norm': 0.1270284503698349, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.706, 'grad_norm': 0.10134343057870865, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.7136, 'grad_norm': 0.1646464467048645, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.7533, 'grad_norm': 0.15595752000808716, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7051, 'grad_norm': 0.11388856917619705, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.7031, 'grad_norm': 0.21470071375370026, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.6796, 'grad_norm': 0.13678190112113953, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.7245, 'grad_norm': 0.1396872103214264, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.7288, 'grad_norm': 0.164378821849823, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.7124, 'grad_norm': 0.12

100%|██████████| 25/25 [00:16<00:00,  1.55it/s]

Fine-tuned Model - Accuracy: 0.645, F1 Score: 0.6926406926406926





# Same process as above but on DistillRoBERTa

In [6]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.6999, 'grad_norm': 1.3030778169631958, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7006, 'grad_norm': 2.0900352001190186, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6983, 'grad_norm': 1.2767045497894287, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6956, 'grad_norm': 1.7100857496261597, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6823, 'grad_norm': 1.9790457487106323, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7201, 'grad_norm': 1.873080849647522, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.7134, 'grad_norm': 3.24796986579895, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.7103, 'grad_norm': 3.5436837673187256, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6854, 'grad_norm': 2.443457841873169, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6914, 'grad_norm': 2.5282208919525146, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.692, 'grad_norm': 2.08798003196

TrainOutput(global_step=300, training_loss=0.6957423639297485, metrics={'train_runtime': 153.7467, 'train_samples_per_second': 15.591, 'train_steps_per_second': 1.951, 'total_flos': 317524354578432.0, 'train_loss': 0.6957423639297485, 'epoch': 3.0})

In [7]:

finetuned_model = model
finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, val_dataset)

pretrained_model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)
pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


100%|██████████| 25/25 [00:14<00:00,  1.67it/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 25/25 [00:14<00:00,  1.70it/s]

Fine-tuned Model - Accuracy: 0.545, F1 Score: 0.3357664233576642
Pre-trained Model - Accuracy: 0.49, F1 Score: 0.6577181208053692





In [8]:
lora_model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules= ["attention.self.query", "attention.self.key", "attention.self.value"], #["query", "key", "value"],  # Specify the target modules
    bias="none",
)

lora_model = get_peft_model(lora_model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

lora_finetuned_model = lora_model
lora_finetuned_accuracy, lora_finetuned_f1 = evaluate_model(lora_finetuned_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {lora_finetuned_accuracy}, F1 Score: {lora_finetuned_f1}')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.6945, 'grad_norm': 0.15243421494960785, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6931, 'grad_norm': 0.15644918382167816, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.696, 'grad_norm': 0.12537923455238342, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6914, 'grad_norm': 0.16847458481788635, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6838, 'grad_norm': 0.22018037736415863, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6901, 'grad_norm': 0.1848909705877304, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6942, 'grad_norm': 0.29606980085372925, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.7001, 'grad_norm': 0.3684787154197693, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6931, 'grad_norm': 0.28186720609664917, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6878, 'grad_norm': 0.1738651543855667, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6935, 'grad_norm': 0.

100%|██████████| 25/25 [00:16<00:00,  1.53it/s]

Fine-tuned Model - Accuracy: 0.51, F1 Score: 0.0



