# MLOps Final Project:

Comparing performance of fine-tuned LLM vs out of the box pretrained LLM on sentiment analysis task for IMDB reviews dataset

# Data Preprocessing

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load the dataset
df = pd.read_csv('movie_short.csv')

# Split into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Prepare the datasets for training
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 799/799 [00:01<00:00, 605.12 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 561.29 examples/s]


# DistilBERT Fine-Tuning

In [2]:
# Load the pre-trained model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 10/300 [00:06<02:43,  1.77it/s]

{'loss': 0.691, 'grad_norm': 1.2544898986816406, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}


  7%|▋         | 20/300 [00:11<02:32,  1.84it/s]

{'loss': 0.7051, 'grad_norm': 1.4241828918457031, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}


 10%|█         | 30/300 [00:16<02:27,  1.83it/s]

{'loss': 0.6986, 'grad_norm': 1.3029305934906006, 'learning_rate': 3e-06, 'epoch': 0.3}


 13%|█▎        | 40/300 [00:22<02:21,  1.83it/s]

{'loss': 0.687, 'grad_norm': 1.8051680326461792, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}


 17%|█▋        | 50/300 [00:27<02:15,  1.85it/s]

{'loss': 0.6801, 'grad_norm': 1.640710473060608, 'learning_rate': 5e-06, 'epoch': 0.5}


 20%|██        | 60/300 [00:33<02:07,  1.88it/s]

{'loss': 0.6967, 'grad_norm': 1.4054498672485352, 'learning_rate': 6e-06, 'epoch': 0.6}


 23%|██▎       | 70/300 [00:38<02:03,  1.86it/s]

{'loss': 0.6846, 'grad_norm': 2.5806026458740234, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}


 27%|██▋       | 80/300 [00:43<01:58,  1.86it/s]

{'loss': 0.6921, 'grad_norm': 2.801276445388794, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}


 30%|███       | 90/300 [00:49<01:52,  1.87it/s]

{'loss': 0.682, 'grad_norm': 2.792407512664795, 'learning_rate': 9e-06, 'epoch': 0.9}


 33%|███▎      | 100/300 [00:54<02:01,  1.64it/s]

{'loss': 0.6564, 'grad_norm': 1.937863826751709, 'learning_rate': 1e-05, 'epoch': 1.0}


 37%|███▋      | 110/300 [01:00<01:42,  1.86it/s]

{'loss': 0.6252, 'grad_norm': 2.1747634410858154, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}


 40%|████      | 120/300 [01:05<01:38,  1.83it/s]

{'loss': 0.574, 'grad_norm': 3.0119564533233643, 'learning_rate': 1.2e-05, 'epoch': 1.2}


 43%|████▎     | 130/300 [01:11<01:31,  1.85it/s]

{'loss': 0.4803, 'grad_norm': 8.573603630065918, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}


 47%|████▋     | 140/300 [01:16<01:25,  1.86it/s]

{'loss': 0.4262, 'grad_norm': 12.746912956237793, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}


 50%|█████     | 150/300 [01:21<01:20,  1.86it/s]

{'loss': 0.3943, 'grad_norm': 16.025938034057617, 'learning_rate': 1.5e-05, 'epoch': 1.5}


 53%|█████▎    | 160/300 [01:27<01:15,  1.86it/s]

{'loss': 0.4352, 'grad_norm': 11.32339096069336, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.6}


 57%|█████▋    | 170/300 [01:32<01:10,  1.85it/s]

{'loss': 0.3425, 'grad_norm': 12.288833618164062, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.7}


 60%|██████    | 180/300 [01:38<01:04,  1.86it/s]

{'loss': 0.3961, 'grad_norm': 15.891372680664062, 'learning_rate': 1.8e-05, 'epoch': 1.8}


 63%|██████▎   | 190/300 [01:43<00:59,  1.86it/s]

{'loss': 0.4395, 'grad_norm': 8.1889009475708, 'learning_rate': 1.9e-05, 'epoch': 1.9}


 67%|██████▋   | 200/300 [01:48<00:53,  1.89it/s]

{'loss': 0.3082, 'grad_norm': 4.455150604248047, 'learning_rate': 2e-05, 'epoch': 2.0}


 70%|███████   | 210/300 [01:54<00:48,  1.85it/s]

{'loss': 0.1632, 'grad_norm': 0.8421816825866699, 'learning_rate': 2.1e-05, 'epoch': 2.1}


 73%|███████▎  | 220/300 [01:59<00:42,  1.87it/s]

{'loss': 0.2259, 'grad_norm': 17.32733726501465, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.2}


 77%|███████▋  | 230/300 [02:04<00:37,  1.87it/s]

{'loss': 0.087, 'grad_norm': 0.2724447548389435, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.3}


 80%|████████  | 240/300 [02:10<00:32,  1.85it/s]

{'loss': 0.224, 'grad_norm': 0.6501181721687317, 'learning_rate': 2.4e-05, 'epoch': 2.4}


 83%|████████▎ | 250/300 [02:15<00:26,  1.87it/s]

{'loss': 0.1617, 'grad_norm': 1.7636594772338867, 'learning_rate': 2.5e-05, 'epoch': 2.5}


 87%|████████▋ | 260/300 [02:21<00:21,  1.87it/s]

{'loss': 0.3547, 'grad_norm': 34.078372955322266, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.6}


 90%|█████████ | 270/300 [02:26<00:16,  1.85it/s]

{'loss': 0.6289, 'grad_norm': 73.39103698730469, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.7}


 93%|█████████▎| 280/300 [02:31<00:10,  1.85it/s]

{'loss': 0.2679, 'grad_norm': 1.448103904724121, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.8}


 97%|█████████▋| 290/300 [02:37<00:05,  1.86it/s]

{'loss': 0.2979, 'grad_norm': 0.3696131706237793, 'learning_rate': 2.9e-05, 'epoch': 2.9}


100%|██████████| 300/300 [02:42<00:00,  1.84it/s]

{'loss': 0.1945, 'grad_norm': 48.17932891845703, 'learning_rate': 3e-05, 'epoch': 3.0}
{'train_runtime': 162.6225, 'train_samples_per_second': 14.74, 'train_steps_per_second': 1.845, 'train_loss': 0.4633541278044383, 'epoch': 3.0}





TrainOutput(global_step=300, training_loss=0.4633541278044383, metrics={'train_runtime': 162.6225, 'train_samples_per_second': 14.74, 'train_steps_per_second': 1.845, 'total_flos': 317524354578432.0, 'train_loss': 0.4633541278044383, 'epoch': 3.0})

In [6]:
# from transformers import pipeline
# from sklearn.metrics import accuracy_score, f1_score

# def evaluate_model(model, tokenizer, dataset):
#     nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
#     texts = dataset['text'].tolist()  # Convert the column to a list of strings
#     preds = [1 if item['label'] == 'POSITIVE' else 0 for item in nlp(texts)]
#     labels = dataset['label'].tolist()  # Convert the column to a list of labels
#     accuracy = accuracy_score(labels, preds)
#     f1 = f1_score(labels, preds)
#     return accuracy, f1

# # Evaluate fine-tuned model
# finetuned_model = model
# finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, tokenizer, val_df)

# # Load the original pre-trained model (non-fine-tuned)
# pretrained_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, tokenizer, val_df)

# # Print the results
# print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
# print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


# Evaulating DistilBERT fine-tuned vs pretrained

In [5]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # Use standard tqdm

def evaluate_model(model, dataset, batch_size=8):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    preds = []
    labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):  # Use standard tqdm
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['label'].cpu().numpy())
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return accuracy, f1

# Evaluate fine-tuned model
finetuned_model = model
finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, val_dataset)

# Load the original pre-trained model (non-fine-tuned)
pretrained_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, val_dataset)

# Print the results
print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


100%|██████████| 25/25 [00:14<00:00,  1.73it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 25/25 [00:14<00:00,  1.75it/s]

Fine-tuned Model - Accuracy: 0.88, F1 Score: 0.8723404255319149
Pre-trained Model - Accuracy: 0.51, F1 Score: 0.0





# Same process as above but on another model?