<a href="https://colab.research.google.com/github/nursenakok/IMDB-LoRA-Finetuning/blob/main/1_IMDB_LoRA_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Library

!pip install -q transformers datasets peft accelerate # Install required libraries

In [None]:
# 2. Data

from datasets import load_dataset

# Download (or reuse the cached copy of) the IMDb movie-review dataset:
# 50k reviews labeled as positive/negative.
dataset = load_dataset(path="stanfordnlp/imdb")

In [None]:
# 3. Tokenization

from transformers import AutoTokenizer

# Tokenizer that matches the DistilBERT checkpoint fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded or truncated to exactly 256 tokens, so all
    encoded examples have the same length.

    Args:
        examples: mapping with a "text" entry holding the raw review text(s).

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 256}
    return tokenizer(examples["text"], **encoding_options)


# Apply the tokenizer to the entire dataset, processing examples in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)


In [None]:
# 4. Model

from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for sequence classification: rank 8, alpha 16, dropout 0.3,
# no bias training. Adapters are attached to the modules named "q_lin" and "v_lin"
# (presumably DistilBERT's query/value projections - confirm against the model).
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=16,
    lora_dropout=0.3,
    target_modules=["q_lin", "v_lin"],
    bias="none",
)

# Load DistilBERT with a 2-class head and wrap it with the LoRA adapters.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2),
    lora_config,
)

# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# 5. Training Arguments

import torch
from transformers import TrainingArguments, Trainer

# fp16 mixed precision and the CUDA device queries below need a CUDA device;
# detect it once so this cell also runs on a CPU-only runtime.
use_cuda = torch.cuda.is_available()


def report_gpu_memory(title, include_total=False):
    """Print how much CUDA memory PyTorch has allocated.

    Args:
        title: header line printed before the figures.
        include_total: when True, also print the total memory of device 0.

    Prints a short notice instead of raising when no CUDA device is available.
    """
    print(title)
    if not torch.cuda.is_available():
        print("CUDA not available - running on CPU")
        return
    print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    if include_total:
        print(f"Total memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


# Check GPU memory before training
report_gpu_memory("GPU STATUS BEFORE TRAINING:", include_total=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./imdb-lora-model",    # Directory for checkpoints
    learning_rate=2e-4,                # Learning rate
    per_device_train_batch_size=8,     # Training batch size per device
    per_device_eval_batch_size=8,      # Evaluation batch size per device
    num_train_epochs=3,                # Number of training epochs
    weight_decay=0.01,                 # Regularization
    eval_strategy="epoch",             # Evaluate every epoch
    save_strategy="epoch",             # Save a checkpoint every epoch
    load_best_model_at_end=True,       # Reload the best checkpoint when training ends
    logging_steps=100,                 # Log every 100 steps
    fp16=use_cuda,                     # Mixed precision only when a CUDA device is present
    report_to="none"                   # Disable external logging integrations
)

# Create Trainer
# NOTE(review): the "test" split is used for per-epoch evaluation and, through
# load_best_model_at_end, for checkpoint selection; accuracy later reported on the
# same split is therefore optimistic. Consider a validation split carved from "train".
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favor of
# `processing_class=` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer
)

# Final GPU memory check
report_gpu_memory("GPU STATUS BEFORE STARTING TRAINING:")

In [None]:
# 6. Training

# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()
print("TRAINING COMPLETED!")  # Notify that training has finished

In [None]:
# 7. Accuracy

from sklearn.metrics import accuracy_score
import numpy as np

# Metric function handed to the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: pair that unpacks into (logits, labels).

    Returns:
        Dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}


# Rebuild the Trainer around the already-trained model, this time with a metric
# function, so evaluation reports accuracy in addition to the loss.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Evaluate on the test split and print the resulting accuracy.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(f"🎯 TEST ACCURACY: {test_results['eval_accuracy']:.4f}")

In [None]:
# 8. Save Model

# Write the trained model to the "imdb-lora-model" directory (the same directory
# the training arguments use for checkpoints).
# NOTE(review): `model` is PEFT-wrapped, so this presumably stores the LoRA adapter
# rather than a full DistilBERT checkpoint - verify the directory contents.
trainer.save_model("imdb-lora-model")

In [None]:
# 9. Test & Inference

from peft import AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

MODEL_DIR = "imdb-lora-model"
# IMDb label ids on the Hub: 0 = negative review, 1 = positive review.
ID2LABEL = {0: "NEGATIVE", 1: "POSITIVE"}
PREVIEW_CHARS = 30  # Maximum number of review characters echoed per result line

# MODEL_DIR was written from a PEFT-wrapped model, so load it explicitly through PEFT
# (base model + adapter) instead of relying on pipeline() to resolve the adapter,
# then merge the LoRA weights into the base model to get a plain classifier.
peft_model = AutoPeftModelForSequenceClassification.from_pretrained(MODEL_DIR)
inference_model = peft_model.merge_and_unload()

# Attach readable class names so results do not print as generic LABEL_0 / LABEL_1.
inference_model.config.id2label = ID2LABEL
inference_model.config.label2id = {name: idx for idx, name in ID2LABEL.items()}

# The Trainer was given the tokenizer, so save_model stored it next to the model.
classifier = pipeline(
    "text-classification",
    model=inference_model,
    tokenizer=AutoTokenizer.from_pretrained(MODEL_DIR),
)

# Sample reviews for a quick sanity check
test_texts = [
    "This movie was absolutely fantastic!",
    "Terrible acting and boring story.",
    "One of the best films I've ever seen!",
    "That was amazing",
    "Worst film ever made",
    "Brilliant cinematography and acting",
]

# Classify all texts in one call; the pipeline returns one result dict per input.
for text, result in zip(test_texts, classifier(test_texts)):
    # Only add an ellipsis when the text was actually shortened.
    preview = text if len(text) <= PREVIEW_CHARS else f"{text[:PREVIEW_CHARS]}..."
    print(f"🎬 '{preview}' → {result['label']} ({result['score'] * 100:.1f}%)")