# PEFT Fine-tuning Project: GPT-2 with LoRA for Sequence Classification

In [1]:
!pip install -q scikit-learn

In [2]:
import torch
import numpy as np
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, AutoPeftModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

## 1. PREPARE THE FOUNDATION MODEL

In [3]:
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("Memory:", torch.cuda.get_device_properties(0).total_memory // (1024 ** 2), "MB")
else:
    print("No GPU detected")

GPU Name: Tesla T4
Memory: 14917 MB


In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(f"Original model parameters: {model.num_parameters():,}")

Original model parameters: 124,441,344


In [8]:
from datasets import load_dataset, Dataset, concatenate_datasets

# Load full splits
full_train = load_dataset("imdb", split="train")
full_test = load_dataset("imdb", split="test")

# Select 2500 positive and 2500 negative for train
neg_train = full_train.filter(lambda x: x["label"] == 0).select(range(2500))
pos_train = full_train.filter(lambda x: x["label"] == 1).select(range(2500))
dataset = concatenate_datasets([neg_train, pos_train])

# Select 500 positive and 500 negative for test
neg_test = full_test.filter(lambda x: x["label"] == 0).select(range(500))
pos_test = full_test.filter(lambda x: x["label"] == 1).select(range(500))
test_dataset = concatenate_datasets([neg_test, pos_test])

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [10]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"], 
        truncation=True, 
        padding=True, 
        max_length=256
    )

In [11]:
# Tokenize datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
# Set format for PyTorch
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [13]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Define metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

## 2. EVALUATE PRETRAINED MODEL

In [15]:
# Training arguments for evaluation
eval_args = TrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    report_to=None
)

In [16]:
# Create trainer for original model
original_trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n=== Evaluating Original Model ===")
original_results = original_trainer.evaluate()
print(f"Original Model - Accuracy: {original_results['eval_accuracy']:.4f}, F1: {original_results['eval_f1']:.4f}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



=== Evaluating Original Model ===


Original Model - Accuracy: 0.5010, F1: 0.3442


## 3. PERFORM LIGHTWEIGHT FINE-TUNING

In [17]:
# Create LoRA config
lora_config = LoraConfig(
    r=16,  # rank
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # GPT-2 specific modules
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

In [18]:
# Create PEFT model
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()



trainable params: 1,625,088 || all params: 126,064,896 || trainable%: 1.2890884390211212


In [19]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./lora_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    report_to=None,
    learning_rate=2e-4,  # Higher LR for LoRA
)

In [20]:
# Create trainer for PEFT model
peft_trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n=== Training PEFT Model ===")
peft_trainer.train()


=== Training PEFT Model ===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4989,0.368763,0.884,0.883429,0.891677,0.884
2,0.4679,0.347817,0.914,0.914,0.914007,0.914
3,0.2779,0.419246,0.917,0.91699,0.917202,0.917


TrainOutput(global_step=3750, training_loss=0.4815634696960449, metrics={'train_runtime': 741.9001, 'train_samples_per_second': 20.218, 'train_steps_per_second': 5.055, 'total_flos': 1997132267520000.0, 'train_loss': 0.4815634696960449, 'epoch': 3.0})

In [21]:
# Save the PEFT model
peft_model.save_pretrained("./gpt2_lora_imdb")
tokenizer.save_pretrained("./gpt2_lora_imdb")

('./gpt2_lora_imdb/tokenizer_config.json',
 './gpt2_lora_imdb/special_tokens_map.json',
 './gpt2_lora_imdb/vocab.json',
 './gpt2_lora_imdb/merges.txt',
 './gpt2_lora_imdb/added_tokens.json',
 './gpt2_lora_imdb/tokenizer.json')

## 4. PERFORM INFERENCE USING FINE-TUNED MODEL

In [22]:
print("\n=== Loading Saved PEFT Model ===")

# Load the saved PEFT model
loaded_peft_model = AutoPeftModelForSequenceClassification.from_pretrained("./gpt2_lora_imdb")
loaded_tokenizer = AutoTokenizer.from_pretrained("./gpt2_lora_imdb")


=== Loading Saved PEFT Model ===


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
loaded_peft_model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Linear(
                in_features=768, out_features=2304, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddin

In [24]:
if loaded_tokenizer.pad_token is None:
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token  # or a suitable token
    loaded_tokenizer.pad_token_id = loaded_tokenizer.convert_tokens_to_ids(loaded_tokenizer.pad_token)

# Also set it in the model config
loaded_peft_model.config.pad_token_id = loaded_tokenizer.pad_token_id

In [25]:
# Create trainer for loaded PEFT model
loaded_trainer = Trainer(
    model=loaded_peft_model,
    args=eval_args,
    eval_dataset=tokenized_test,
    tokenizer=loaded_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n=== Evaluating Fine-tuned Model ===")
finetuned_results = loaded_trainer.evaluate()


=== Evaluating Fine-tuned Model ===


In [26]:
print("\n" + "="*50)
print("PERFORMANCE COMPARISON")
print("="*50)
print(f"Original Model:")
print(f"  Accuracy: {original_results['eval_accuracy']:.4f}")
print(f"  F1 Score: {original_results['eval_f1']:.4f}")
print(f"  Precision: {original_results['eval_precision']:.4f}")
print(f"  Recall: {original_results['eval_recall']:.4f}")

print(f"\nFine-tuned Model:")
print(f"  Accuracy: {finetuned_results['eval_accuracy']:.4f}")
print(f"  F1 Score: {finetuned_results['eval_f1']:.4f}")
print(f"  Precision: {finetuned_results['eval_precision']:.4f}")
print(f"  Recall: {finetuned_results['eval_recall']:.4f}")

print(f"\nImprovement:")
accuracy_diff = finetuned_results['eval_accuracy'] - original_results['eval_accuracy']
f1_diff = finetuned_results['eval_f1'] - original_results['eval_f1']
print(f"  Accuracy: {accuracy_diff:+.4f}")
print(f"  F1 Score: {f1_diff:+.4f}")


PERFORMANCE COMPARISON
Original Model:
  Accuracy: 0.5010
  F1 Score: 0.3442
  Precision: 0.5230
  Recall: 0.5010

Fine-tuned Model:
  Accuracy: 0.9170
  F1 Score: 0.9170
  Precision: 0.9172
  Recall: 0.9170

Improvement:
  Accuracy: +0.4160
  F1 Score: +0.5728


## EXAMPLE INFERENCE

In [27]:
# Test examples
test_texts = [
    "This movie was absolutely Great! Great acting and storyline.",  # Positive
    "Terrible film, waste of time. Poor acting and boring plot.",     # Negative
    "I loved the cinematography and the soundtrack was beautiful.",   # Positive
    "The plot was predictable and the characters were dull.",         # Negative
    "It was okay, not the best but not the worst either.",            # Neutral
    "What a masterpiece! I was on the edge of my seat the entire time.",  # Positive
    "Completely unwatchable. I walked out of the theater halfway through.",  # Negative
    "Decent movie for a rainy afternoon. Nothing too special though.",  # Neutral
    "Absolutely stunning visuals and a compelling story!",            # Positive
    "The jokes were flat and the pacing was off throughout.",         # Negative
    "The performances were strong but the plot lacked originality.",  # Neutral
    "Heartwarming and inspiring — definitely a must-watch!",          # Positive
]


for i, text in enumerate(test_texts):
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = loaded_peft_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
    
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    confidence = predictions[0][predicted_class].item()
    
    print(f"\nText {i+1}: {text[:50]}...")
    print(f"Prediction: {sentiment} ({confidence:.3f})")

print("\n=== Project Complete ===")
print(f"Saved files in './gpt2_lora_imdb':")
for file in os.listdir("./gpt2_lora_imdb"):
    print(f"  - {file}")


Text 1: This movie was absolutely Great! Great acting and ...
Prediction: Positive (1.000)

Text 2: Terrible film, waste of time. Poor acting and bori...
Prediction: Negative (1.000)

Text 3: I loved the cinematography and the soundtrack was ...
Prediction: Positive (1.000)

Text 4: The plot was predictable and the characters were d...
Prediction: Negative (0.999)

Text 5: It was okay, not the best but not the worst either...
Prediction: Negative (0.738)

Text 6: What a masterpiece! I was on the edge of my seat t...
Prediction: Positive (0.999)

Text 7: Completely unwatchable. I walked out of the theate...
Prediction: Negative (0.998)

Text 8: Decent movie for a rainy afternoon. Nothing too sp...
Prediction: Negative (0.592)

Text 9: Absolutely stunning visuals and a compelling story...
Prediction: Positive (1.000)

Text 10: The jokes were flat and the pacing was off through...
Prediction: Negative (0.999)

Text 11: The performances were strong but the plot lacked o...
Prediction: Neg