# MLOps Final Project:

Comparing performance of fine-tuned LLM vs out of the box pretrained LLM on sentiment analysis task for IMDB reviews dataset

# Data Preprocessing

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load dataset
# dataset = load_dataset('imdb')
# train_dataset = dataset['train']
# test_dataset = dataset['test']
df = pd.read_csv('movie_short.csv')

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(val_df)


# Preprocess the data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Load model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

# Evaluate model
eval_result = trainer.evaluate()
print(eval_result)





Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.7016, 'grad_norm': 1.2517576217651367, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6893, 'grad_norm': 1.4068324565887451, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6914, 'grad_norm': 1.2125697135925293, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6856, 'grad_norm': 2.0932648181915283, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6793, 'grad_norm': 1.796614170074463, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6942, 'grad_norm': 1.592820405960083, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6838, 'grad_norm': 2.9470982551574707, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.6581, 'grad_norm': 2.7445967197418213, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6594, 'grad_norm': 3.3230276107788086, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.5784, 'grad_norm': 3.5184383392333984, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.5139, 'grad_norm': 4.00328588

  0%|          | 0/25 [00:00<?, ?it/s]

TypeError: argmax(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load the dataset
df = pd.read_csv('movie_short.csv')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True)

# Convert to torch dataset
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_df['label'].tolist())
val_dataset = IMDbDataset(val_encodings, val_df['label'].tolist())

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the evaluation function
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=0,  # No training, just evaluation
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Evaluate the pre-trained model
eval_result = trainer.evaluate()
print(eval_result)

# Training arguments for fine-tuning
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # You can adjust this as needed
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Evaluate the fine-tuned model
eval_result_fine_tuned = trainer.evaluate()
print(eval_result_fine_tuned)

print("Pre-trained DistilBERT evaluation:")
print(eval_result)

print("Fine-tuned DistilBERT evaluation:")
print(eval_result_fine_tuned)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6929181814193726, 'eval_accuracy': 0.525, 'eval_f1': 0.31654676258992803, 'eval_precision': 0.5365853658536586, 'eval_recall': 0.22448979591836735, 'eval_runtime': 4.4391, 'eval_samples_per_second': 45.054, 'eval_steps_per_second': 2.929}


  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 0.6974, 'grad_norm': 1.4938287734985352, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}
{'loss': 0.6979, 'grad_norm': 1.3188352584838867, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.4}
{'loss': 0.6834, 'grad_norm': 1.2274770736694336, 'learning_rate': 3e-06, 'epoch': 0.6}
{'loss': 0.6947, 'grad_norm': 1.697504997253418, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6842, 'grad_norm': 0.967103123664856, 'learning_rate': 5e-06, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6836497783660889, 'eval_accuracy': 0.65, 'eval_f1': 0.5138888888888888, 'eval_precision': 0.8043478260869565, 'eval_recall': 0.37755102040816324, 'eval_runtime': 4.4577, 'eval_samples_per_second': 44.866, 'eval_steps_per_second': 2.916, 'epoch': 1.0}
{'loss': 0.6825, 'grad_norm': 1.4007461071014404, 'learning_rate': 6e-06, 'epoch': 1.2}
{'loss': 0.6692, 'grad_norm': 3.2846662998199463, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.4}
{'loss': 0.656, 'grad_norm': 2.2971441745758057, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.6}
{'loss': 0.6312, 'grad_norm': 1.886910080909729, 'learning_rate': 9e-06, 'epoch': 1.8}
{'loss': 0.5765, 'grad_norm': 2.567556619644165, 'learning_rate': 1e-05, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.5507641434669495, 'eval_accuracy': 0.78, 'eval_f1': 0.8070175438596491, 'eval_precision': 0.7076923076923077, 'eval_recall': 0.9387755102040817, 'eval_runtime': 4.3776, 'eval_samples_per_second': 45.687, 'eval_steps_per_second': 2.97, 'epoch': 2.0}
{'loss': 0.4901, 'grad_norm': 2.909862518310547, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.2}
{'loss': 0.369, 'grad_norm': 3.7406013011932373, 'learning_rate': 1.2e-05, 'epoch': 2.4}
{'loss': 0.3643, 'grad_norm': 8.123082160949707, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.6}
{'loss': 0.3566, 'grad_norm': 9.323834419250488, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.8}
{'loss': 0.2799, 'grad_norm': 5.017178535461426, 'learning_rate': 1.5e-05, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.30060872435569763, 'eval_accuracy': 0.89, 'eval_f1': 0.8829787234042553, 'eval_precision': 0.9222222222222223, 'eval_recall': 0.8469387755102041, 'eval_runtime': 4.3009, 'eval_samples_per_second': 46.501, 'eval_steps_per_second': 3.023, 'epoch': 3.0}
{'train_runtime': 181.8816, 'train_samples_per_second': 13.179, 'train_steps_per_second': 0.825, 'train_loss': 0.5688720019658406, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.30060872435569763, 'eval_accuracy': 0.89, 'eval_f1': 0.8829787234042553, 'eval_precision': 0.9222222222222223, 'eval_recall': 0.8469387755102041, 'eval_runtime': 4.2623, 'eval_samples_per_second': 46.923, 'eval_steps_per_second': 3.05, 'epoch': 3.0}
Pre-trained DistilBERT evaluation:
{'eval_loss': 0.6929181814193726, 'eval_accuracy': 0.525, 'eval_f1': 0.31654676258992803, 'eval_precision': 0.5365853658536586, 'eval_recall': 0.22448979591836735, 'eval_runtime': 4.4391, 'eval_samples_per_second': 45.054, 'eval_steps_per_second': 2.929}
Fine-tuned DistilBERT evaluation:
{'eval_loss': 0.30060872435569763, 'eval_accuracy': 0.89, 'eval_f1': 0.8829787234042553, 'eval_precision': 0.9222222222222223, 'eval_recall': 0.8469387755102041, 'eval_runtime': 4.2623, 'eval_samples_per_second': 46.923, 'eval_steps_per_second': 3.05, 'epoch': 3.0}


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import loralib as lora
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# Load the dataset
df = pd.read_csv('movie_short.csv')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True)

# Convert to torch dataset
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_df['label'].tolist())
val_dataset = IMDbDataset(val_encodings, val_df['label'].tolist())

# Define the evaluation function
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Load the pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Evaluate the pre-trained model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=0,  # No training, just evaluation
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

eval_result = trainer.evaluate()

# Fine-tune the DistilBERT model without LoRA
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

eval_result_fine_tuned = trainer.evaluate()


lora_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules= ["attention.q_lin", "attention.k_lin", "attention.v_lin"], #["query", "key", "value"],  # Specify the target modules
    bias="none",
)

lora_model = get_peft_model(lora_model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

eval_result_lora = trainer.evaluate()


# # Fine-tune the DistilBERT model with LoRA
# class LoRADistilBertForSequenceClassification(DistilBertForSequenceClassification):
#     def __init__(self, config):
#         super().__init__(config)
#         lora.apply_lora(self)

# # Load and wrap the DistilBERT model with LoRA
# lora_model = LoRADistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# trainer = Trainer(
#     model=lora_model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics
# )

# trainer.train()

# eval_result_lora_fine_tuned = trainer.evaluate()





# Print the evaluation results
print("Pre-trained DistilBERT evaluation:")
print(eval_result)

print("Regular fine-tuned DistilBERT evaluation:")
print(eval_result_fine_tuned)

print("LoRA fine-tuned DistilBERT evaluation:")
print(eval_result_lora)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 0.6845, 'grad_norm': 1.402540922164917, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}
{'loss': 0.6878, 'grad_norm': 1.3427975177764893, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.4}
{'loss': 0.6826, 'grad_norm': 1.4069232940673828, 'learning_rate': 3e-06, 'epoch': 0.6}
{'loss': 0.6958, 'grad_norm': 1.7702349424362183, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6714, 'grad_norm': 1.014061689376831, 'learning_rate': 5e-06, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6795913577079773, 'eval_accuracy': 0.655, 'eval_f1': 0.46511627906976744, 'eval_precision': 0.967741935483871, 'eval_recall': 0.30612244897959184, 'eval_runtime': 4.4483, 'eval_samples_per_second': 44.961, 'eval_steps_per_second': 2.922, 'epoch': 1.0}
{'train_runtime': 62.0851, 'train_samples_per_second': 12.869, 'train_steps_per_second': 0.805, 'train_loss': 0.6844189453125, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 0.67, 'grad_norm': 1.5083074569702148, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}
{'loss': 0.6747, 'grad_norm': 1.4883174896240234, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.4}
{'loss': 0.6623, 'grad_norm': 1.5165575742721558, 'learning_rate': 3e-06, 'epoch': 0.6}
{'loss': 0.6727, 'grad_norm': 1.9634284973144531, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6407, 'grad_norm': 1.265491008758545, 'learning_rate': 5e-06, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.648670494556427, 'eval_accuracy': 0.805, 'eval_f1': 0.7936507936507936, 'eval_precision': 0.8241758241758241, 'eval_recall': 0.7653061224489796, 'eval_runtime': 4.4635, 'eval_samples_per_second': 44.808, 'eval_steps_per_second': 2.913, 'epoch': 1.0}
{'train_runtime': 61.1212, 'train_samples_per_second': 13.072, 'train_steps_per_second': 0.818, 'train_loss': 0.6640888595581055, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Pre-trained DistilBERT evaluation:
{'eval_loss': 0.690392017364502, 'eval_accuracy': 0.52, 'eval_f1': 0.04, 'eval_precision': 1.0, 'eval_recall': 0.02040816326530612, 'eval_runtime': 4.4976, 'eval_samples_per_second': 44.468, 'eval_steps_per_second': 2.89}
Regular fine-tuned DistilBERT evaluation:
{'eval_loss': 0.6795913577079773, 'eval_accuracy': 0.655, 'eval_f1': 0.46511627906976744, 'eval_precision': 0.967741935483871, 'eval_recall': 0.30612244897959184, 'eval_runtime': 4.3478, 'eval_samples_per_second': 46.0, 'eval_steps_per_second': 2.99, 'epoch': 1.0}
LoRA fine-tuned DistilBERT evaluation:
{'eval_loss': 0.648670494556427, 'eval_accuracy': 0.805, 'eval_f1': 0.7936507936507936, 'eval_precision': 0.8241758241758241, 'eval_recall': 0.7653061224489796, 'eval_runtime': 4.3584, 'eval_samples_per_second': 45.889, 'eval_steps_per_second': 2.983, 'epoch': 1.0}


In [1]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

df = pd.read_csv('movie_short.csv')

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])




Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

# DistilBERT Fine-Tuning

In [2]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.6878, 'grad_norm': 1.3967981338500977, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6949, 'grad_norm': 1.4446425437927246, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.698, 'grad_norm': 1.2860491275787354, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6927, 'grad_norm': 1.5432770252227783, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6645, 'grad_norm': 1.4422451257705688, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7013, 'grad_norm': 1.577475666999817, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.7076, 'grad_norm': 2.1963300704956055, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.6862, 'grad_norm': 2.656816244125366, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6699, 'grad_norm': 2.4682140350341797, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6375, 'grad_norm': 1.9719682931900024, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6135, 'grad_norm': 2.478562355

TrainOutput(global_step=300, training_loss=0.4530573260784149, metrics={'train_runtime': 177.3973, 'train_samples_per_second': 13.512, 'train_steps_per_second': 1.691, 'total_flos': 317524354578432.0, 'train_loss': 0.4530573260784149, 'epoch': 3.0})

# Evaulating DistilBERT fine-tuned vs pretrained

In [3]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # Use standard tqdm
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score


def evaluate_model(model, dataset, batch_size=8):
    dataloader = DataLoader(dataset, batch_size=batch_size)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    preds = []
    labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader):  
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['label'].cpu().numpy())
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return accuracy, f1

finetuned_model = model
finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, val_dataset)

pretrained_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


100%|██████████| 25/25 [00:15<00:00,  1.60it/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 25/25 [00:15<00:00,  1.66it/s]

Fine-tuned Model - Accuracy: 0.875, F1 Score: 0.8663101604278075
Pre-trained Model - Accuracy: 0.51, F1 Score: 0.0





# Fine tuning with LoRA instead

In [4]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

lora_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules= ["attention.q_lin", "attention.k_lin", "attention.v_lin"], #["query", "key", "value"],  # Specify the target modules
    bias="none",
)

lora_model = get_peft_model(lora_model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

lora_finetuned_model = lora_model
lora_finetuned_accuracy, lora_finetuned_f1 = evaluate_model(lora_finetuned_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {lora_finetuned_accuracy}, F1 Score: {lora_finetuned_f1}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.7221, 'grad_norm': 0.10963136702775955, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7116, 'grad_norm': 0.1270284503698349, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.706, 'grad_norm': 0.10134343057870865, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.7136, 'grad_norm': 0.1646464467048645, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.7533, 'grad_norm': 0.15595752000808716, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7051, 'grad_norm': 0.11388856917619705, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.7031, 'grad_norm': 0.21470071375370026, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.6796, 'grad_norm': 0.13678190112113953, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.7245, 'grad_norm': 0.1396872103214264, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.7288, 'grad_norm': 0.164378821849823, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.7124, 'grad_norm': 0.12

100%|██████████| 25/25 [00:17<00:00,  1.42it/s]

Fine-tuned Model - Accuracy: 0.645, F1 Score: 0.6926406926406926





# Same process as above but on DistillRoBERTa

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.6999, 'grad_norm': 1.3030779361724854, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.7006, 'grad_norm': 2.0900352001190186, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.6983, 'grad_norm': 1.2767045497894287, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6956, 'grad_norm': 1.710085391998291, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6823, 'grad_norm': 1.9790459871292114, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.7201, 'grad_norm': 1.8730825185775757, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.7134, 'grad_norm': 3.2479703426361084, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.7103, 'grad_norm': 3.5436835289001465, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6854, 'grad_norm': 2.443458318710327, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6914, 'grad_norm': 2.5282208919525146, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.692, 'grad_norm': 2.087980270

TrainOutput(global_step=300, training_loss=0.6957423591613769, metrics={'train_runtime': 164.4032, 'train_samples_per_second': 14.58, 'train_steps_per_second': 1.825, 'total_flos': 317524354578432.0, 'train_loss': 0.6957423591613769, 'epoch': 3.0})

In [6]:

finetuned_model = model
finetuned_accuracy, finetuned_f1 = evaluate_model(finetuned_model, val_dataset)

pretrained_model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)
pretrained_accuracy, pretrained_f1 = evaluate_model(pretrained_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {finetuned_accuracy}, F1 Score: {finetuned_f1}')
print(f'Pre-trained Model - Accuracy: {pretrained_accuracy}, F1 Score: {pretrained_f1}')


100%|██████████| 25/25 [00:15<00:00,  1.57it/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 25/25 [00:15<00:00,  1.60it/s]

Fine-tuned Model - Accuracy: 0.545, F1 Score: 0.3357664233576642
Pre-trained Model - Accuracy: 0.49, F1 Score: 0.6577181208053692





In [7]:
lora_model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules= ["attention.self.query", "attention.self.key", "attention.self.value"], #["query", "key", "value"],  # Specify the target modules
    bias="none",
)

lora_model = get_peft_model(lora_model, lora_config)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

lora_finetuned_model = lora_model
lora_finetuned_accuracy, lora_finetuned_f1 = evaluate_model(lora_finetuned_model, val_dataset)

print(f'Fine-tuned Model - Accuracy: {lora_finetuned_accuracy}, F1 Score: {lora_finetuned_f1}')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.6945, 'grad_norm': 0.15243421494960785, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}
{'loss': 0.6931, 'grad_norm': 0.15644918382167816, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}
{'loss': 0.696, 'grad_norm': 0.12537923455238342, 'learning_rate': 3e-06, 'epoch': 0.3}
{'loss': 0.6914, 'grad_norm': 0.16847458481788635, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}
{'loss': 0.6838, 'grad_norm': 0.22018037736415863, 'learning_rate': 5e-06, 'epoch': 0.5}
{'loss': 0.6901, 'grad_norm': 0.1848909705877304, 'learning_rate': 6e-06, 'epoch': 0.6}
{'loss': 0.6942, 'grad_norm': 0.29606980085372925, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}
{'loss': 0.7001, 'grad_norm': 0.3684787154197693, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}
{'loss': 0.6931, 'grad_norm': 0.28186720609664917, 'learning_rate': 9e-06, 'epoch': 0.9}
{'loss': 0.6878, 'grad_norm': 0.1738651543855667, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.6935, 'grad_norm': 0.

100%|██████████| 25/25 [00:17<00:00,  1.45it/s]

Fine-tuned Model - Accuracy: 0.51, F1 Score: 0.0



