# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
import os
import json
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    PeftConfig,
    prepare_model_for_kbit_training
)

In [2]:

# Load the dataset and split it
dataset = load_dataset("sms_spam", split="train").train_test_split(test_size=0.2, shuffle=True, seed=42)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["sms"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [3]:
# Load the base model
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.eos_token_id
model.resize_token_embeddings(len(tokenizer))

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank adaptation matrices
    lora_alpha=16,
    target_modules=["c_proj", "c_attn"],  # Fine-tune attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"  # Sequence classification task
)


# Prepare the model for LoRA fine-tuning and apply the LoRA configuration to the model

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Print trainable parameters info

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 812,544 || all params: 125,253,888 || trainable%: 0.6487
PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
        



In [4]:
# Define compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (preds == labels).mean()}

# Initialize the trainer with training arguments
trainer = Trainer(
    model= model,
    args = TrainingArguments(
        output_dir = "./lora_spam_model",
        evaluation_strategy ="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        load_best_model_at_end=True,
        save_total_limit=2,
        fp16=True,
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()
results = trainer.evaluate()
print(results)

  trainer = Trainer(


  0%|          | 0/3345 [00:00<?, ?it/s]

{'loss': 0.867, 'grad_norm': 1.545792579650879, 'learning_rate': 1.705829596412556e-05, 'epoch': 0.45}
{'loss': 0.2549, 'grad_norm': 1.6139965057373047, 'learning_rate': 1.4068759342301946e-05, 'epoch': 0.9}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.10456153005361557, 'eval_accuracy': 0.9811659192825112, 'eval_runtime': 50.2, 'eval_samples_per_second': 22.211, 'eval_steps_per_second': 5.558, 'epoch': 1.0}




{'loss': 0.1313, 'grad_norm': 0.002746493322774768, 'learning_rate': 1.1079222720478326e-05, 'epoch': 1.35}
{'loss': 0.1145, 'grad_norm': 0.00021145293430890888, 'learning_rate': 8.089686098654708e-06, 'epoch': 1.79}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.11024250835180283, 'eval_accuracy': 0.9865470852017937, 'eval_runtime': 28.7975, 'eval_samples_per_second': 38.719, 'eval_steps_per_second': 9.688, 'epoch': 2.0}
{'loss': 0.0693, 'grad_norm': 0.006196494214236736, 'learning_rate': 5.100149476831091e-06, 'epoch': 2.24}
{'loss': 0.1468, 'grad_norm': 0.010995774529874325, 'learning_rate': 2.110612855007474e-06, 'epoch': 2.69}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.11637648195028305, 'eval_accuracy': 0.9865470852017937, 'eval_runtime': 28.9992, 'eval_samples_per_second': 38.449, 'eval_steps_per_second': 9.621, 'epoch': 3.0}
{'train_runtime': 1114.6726, 'train_samples_per_second': 12.001, 'train_steps_per_second': 3.001, 'train_loss': 0.24462810054488246, 'epoch': 3.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.10456153005361557, 'eval_accuracy': 0.9811659192825112, 'eval_runtime': 28.9574, 'eval_samples_per_second': 38.505, 'eval_steps_per_second': 9.635, 'epoch': 3.0}


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [5]:
# Save the LoRA fine-tuned model
model_path = os.path.join(os.getcwd(), "lora_finetuned_model")
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

#Select random samples for evaluation
random_indices = np.random.choice(len(dataset["test"]), size=20, replace=False)
items_for_manual_review = dataset["test"].select(random_indices)

In [6]:
# Create a dataset with the required format
def prepare_data_for_prediction(examples):
    tokenized = tokenizer(
        examples["sms"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors=None  # Important: return lists, not tensors
    )
    tokenized["labels"] = examples["label"]
    return tokenized

# Convert to proper dataset format
review_dataset = Dataset.from_dict({
    "sms": items_for_manual_review["sms"],
    "label": items_for_manual_review["label"]
})

# Tokenize and format
tokenized_review_dataset = review_dataset.map(
    prepare_data_for_prediction,
    batched=True,
    remove_columns=review_dataset.column_names
)

# Get predictions
results = trainer.predict(tokenized_review_dataset)

# Get evaluation results and prepare data for JSON
eval_results = trainer.evaluate()

# Convert predictions to DataFrame
predictions = np.argmax(results.predictions, axis=1)
df = pd.DataFrame({
    "sms": items_for_manual_review["sms"],
    "label": items_for_manual_review["label"],
    "prediction": predictions
})

# Create a dictionary with both evaluation and prediction results
final_results = {
    "model_evaluation": eval_results,
    "predictions": df.to_dict(orient='records')
}

# Save to JSON with indentation for readability
json_output_path = os.path.join(os.getcwd(), "lora_prediction_results.json")
with open(json_output_path, 'w') as f:
    json.dump(final_results, f, indent=4)
print(f"\nResults saved to: {json_output_path}")


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/279 [00:00<?, ?it/s]


Results saved to: /home/nav/Projects_1/GenAI/PEFT_gp2/lora_prediction_results.json
