In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from utils.dataset_utils import imdbDataSet
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from time import time
import numpy as np


Preprocessing

In [None]:
# Read the full labelled table from the raw CSV.
df = pd.read_csv('data/raw/IMDB_Dataset.csv')

# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on the 'sentiment' column, so both partitions keep the
# same label proportions and the split is repeatable across runs.
train_dataset, test_dataset = train_test_split(
    df,
    stratify=df['sentiment'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

Load Dataset and DistilBERT

In [None]:
model_id = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# Choose the accelerator at runtime instead of hard-coding 'mps', so the
# notebook also runs on CUDA machines and on CPU-only machines. Apple-Silicon
# hosts still resolve to 'mps' exactly as before.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)

# Start in inference mode; the training loop switches back with model.train().
model.eval()

# Wrap the training split in the project dataset class and batch it.
data = imdbDataSet(df=train_dataset, tokenizer=tokenizer)
loader = DataLoader(
    data,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    prefetch_factor=4,
    # Boolean flag (was the int 4, which only worked because it is truthy):
    # keep the worker processes alive between epochs instead of respawning.
    persistent_workers=True,
)

In [None]:
# Disabled baseline evaluation. Everything between the triple quotes below is a
# single string literal, so none of this code executes; as the only expression
# in the cell, the raw string is what the cell displays.
#
# If re-enabled, the code would run `model` over `loader` without gradients,
# softmax the logits, label a row 'positive' when column 0 exceeds 0.5, and
# report the fraction of labels matching df['sentiment'].
#
# NOTE(review): `loader` is built from `train_dataset` with shuffle=True, while
# the comparison uses the first len(probs) rows of `df` in file order -- the
# predictions and labels do not look aligned; confirm before trusting the number.
# NOTE(review): confirm that logit column 0 is the positive class for this
# checkpoint (check model.config.id2label) before re-enabling.
# NOTE(review): the block also rebinds `df`, which later cells may rely on.
'''''
num_batches = 0 
probs = []
with torch.no_grad():
    for batch in loader:
        start_time= time()
        
        input_id = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids=input_id, attention_mask=attention_mask)
        batch_probs = F.softmax(outputs.logits, dim=1)
        
        probs.extend(batch_probs.cpu().tolist())
        
        num_batches += 1
        end_time = time()
        elapsed = end_time - start_time
        if num_batches % 50 == 0:
            print(f"Batch Number: {num_batches} , Time Elapsed:{elapsed:.4f} seconds")
            
probs = np.array(probs)
predicted_labels = np.where(probs[:, 0] > 0.5, 'positive', 'negative')
df = df.iloc[:len(probs)]
accuracy = (predicted_labels == df['sentiment'].values).mean()

print(f"Accuracy{accuracy:.4f}")
        
        
'''        
    


Fine-tuning with LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
from torch.optim import AdamW
from tqdm import tqdm


In [None]:
# LoRA adapter settings for sequence classification: rank-8 adapters scaled by
# lora_alpha=16, 5% adapter dropout, and no bias terms trained.
# The adapters attach to the modules named "q_lin" and "v_lin" -- presumably
# DistilBERT's query/value attention projections; confirm against the model.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_lin", "v_lin"],
    bias='none',
)

# Wrap the base model with the adapters, then make sure the result lives on
# the compute device.
# NOTE(review): this rebinds `model`, so re-running the cell passes the
# already-wrapped model back into get_peft_model.
model = get_peft_model(model, config)
model = model.to(device)


In [None]:
from torch.nn.utils import clip_grad_norm_

# Training hyperparameters, named so they can be tuned in one place.
num_epochs = 5
learning_rate = 2e-5
max_grad_norm = 1.0
log_every = 100  # batches between running-metric reports

# Give the optimizer (and gradient clipping) only the parameters that require
# gradients. Frozen parameters never receive a gradient, so the updates are
# unchanged; this just avoids tracking tensors that are never stepped.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0     # sum of per-sample losses seen so far this epoch
    total_samples = 0
    total_correct = 0

    progress = tqdm(loader, desc=f"Epoch {epoch+1}")
    for batch_count, batch in enumerate(progress, start=1):
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        # assumes batch['sentiment'] holds integer class ids -- TODO confirm
        # against imdbDataSet
        labels = batch['sentiment'].to(device, non_blocking=True)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        clip_grad_norm_(trainable_params, max_grad_norm)
        optimizer.step()

        # outputs.loss is a batch mean; weight it by batch size so the epoch
        # average stays correct when the last batch is smaller.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        preds = outputs.logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total_samples += batch_size

        if batch_count % log_every == 0:
            avg_loss = total_loss / total_samples
            accuracy = total_correct / total_samples
            # tqdm.write prints above the progress bar; a bare print() inside
            # an active tqdm loop breaks the bar up.
            tqdm.write(f"Epoch {epoch+1} | Batch {batch_count} | "
                       f"Loss: {avg_loss:.4f} | Acc: {accuracy:.4f}")

    # An empty loader would otherwise surface as a bare ZeroDivisionError.
    if total_samples == 0:
        raise RuntimeError("Training loader yielded no batches; nothing to train on.")

    # Metrics come from the training-mode forward passes, so they are running
    # training metrics, not a held-out evaluation.
    avg_loss = total_loss / total_samples
    epoch_accuracy = total_correct / total_samples
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"  Avg Loss: {avg_loss:.4f}")
    print(f"  Accuracy: {epoch_accuracy * 100:.2f}%\n")