<a href="https://colab.research.google.com/github/marsalan06/finetuning-llm/blob/main/lower_rank_adaption_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U peft transformers datasets evaluate

Collecting peft
  Downloading peft-0.16.0-py3-none-any.whl.metadata (14 kB)
Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from

In [2]:
import torch
import csv
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import evaluate

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
def load_data():
    """Loads the dataset from Hugging Face"""
    dataset = load_dataset("iamtarun/code_instructions_120k_alpaca")
    print(f"Dataset loaded: {dataset}")

    # Split the dataset into train and validation sets (80% train, 20% validation)
    dataset = dataset["train"].train_test_split(test_size=0.2)
    print(f"Dataset after splitting: {dataset}")

    return dataset

In [5]:
# Load and split the dataset
dataset = load_data()

# Step 3: Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Ensure a padding token exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/753 [00:00<?, ?B/s]

(…)-00000-of-00001-d9b93805488c263e.parquet:   0%|          | 0.00/72.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/121959 [00:00<?, ? examples/s]

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 121959
    })
})
Dataset after splitting: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 97567
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt'],
        num_rows: 24392
    })
})


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
def preprocess_data(dataset, tokenizer):
    """Preprocesses the dataset by combining instruction and input as input prompt."""
    def preprocess_function(examples):
        # Combine the instruction and input to form a prompt
        combined_input = [f"Instruction: {instruction}\nInput: {input_data}" for instruction, input_data in zip(examples['instruction'], examples['input'])]

        # Tokenize the combined input as prompt, and the output as the label
        tokenized_inputs = tokenizer(combined_input, padding="max_length", truncation=True, max_length=512)

        # Tokenize the output (target text) - this will be the label
        tokenized_labels = tokenizer(examples['output'], padding="max_length", truncation=True, max_length=512)

        # Return the tokenized inputs and outputs
        tokenized_inputs['labels'] = tokenized_labels['input_ids']
        return tokenized_inputs

    # Apply the tokenization function to the dataset
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    print(f"Dataset after tokenization: {tokenized_dataset}")
    return tokenized_dataset

# Preprocess the dataset
tokenized_dataset = preprocess_data(dataset, tokenizer)
print("----tokenized-dataset---")
print(tokenized_dataset)
print(tokenized_dataset['train'][2])

Map:   0%|          | 0/97567 [00:00<?, ? examples/s]

Map:   0%|          | 0/24392 [00:00<?, ? examples/s]

Dataset after tokenization: DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 97567
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 24392
    })
})
----tokenized-dataset---
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 97567
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 24392
    })
})
{'instruction': 'Create a Java program for the following scenario', 'input': 'A mobile app that allows users to check if a course syllabus is up to date.', 'output': 'import java.io.*;\nimport java.util.*;\n\npublic class CheckCourse {\n    public static void main(String[] args) {\n\n  

the first tokenizer operation adds input ids and attention mask for the instruction+ input , the second tokenizer goes to tokenized_labels with input_ids and attention_mask of labels that we add to the first variable under labels

In [7]:
def setup_model(model_checkpoint="distilgpt2"):
    """Set up the base model and tokenizer"""
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
    model.to(device)  # Move the model to the selected device (GPU or CPU)
    print(f"Base model {model_checkpoint} loaded successfully.")
    return model, tokenizer

In [8]:
#load base model and tokenizer
model, tokenizer = setup_model()


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Base model distilgpt2 loaded successfully.


In [9]:
#print(model) #to see the layers
def setup_lora(model, tokenizer):
    """Set up LoRA (Low-Rank Adaptation) for fine-tuning"""
    lora_config = LoraConfig(
        task_type="CAUSAL_LM", #for models that predict next words
        inference_mode=False, #inference mode false , train mode true
        r=8, #rank 8
        lora_alpha=32, #scaling factor, larger value indicates greater impact of rank during training
        lora_dropout=0.1, #drop out 10% of layers
        target_modules=["attn.c_attn",  # Attention layer: query, key, value weights
            "attn.c_proj",  # Attention layer: projection weights
            "mlp.c_fc",     # MLP layer: fully connected layer
            "mlp.c_proj"]   # MLP layer: projection layer] #attention layers to update
    )
    model = get_peft_model(model, lora_config) #apply lora_config to model
    print("LoRA configuration applied.")
    return model

In [10]:
print(model)
model = setup_lora(model, tokenizer)
print(model)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
LoRA configuration applied.
PeftModelForCausalLM(
  (ba



In [11]:
!pip install --upgrade transformers



In [12]:
from transformers import Trainer, TrainingArguments


In [13]:
#create trainer
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True,          # Ensure the best model is saved at the end
    metric_for_best_model="accuracy",    # Track best model based on accuracy
    report_to = "none"
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],   # Now using the 'test' split as validation
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Fine-tune the model
print("Training the model...")
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Fine-tuned model saved to './fine_tuned_model'")

Training the model...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


In [None]:
def generate_text(model, tokenizer, input_text, max_length=100):
    """Generate text from the model based on input text"""
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move input data to device

    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], max_length=max_length, num_return_sequences=1, do_sample=True)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


In [None]:
inputs = [
    "Write a Python function to reverse a string",
    "Write a function to check if a number is prime"
]

# Generate text from the base model
base_model_outputs = [generate_text(model, tokenizer, input_text) for input_text in inputs]

# Load the fine-tuned model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
fine_tuned_model.to(device)  # Move fine-tuned model to device
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")

# Generate text from the fine-tuned model
finetuned_model_outputs = [generate_text(fine_tuned_model, fine_tuned_tokenizer, input_text) for input_text in inputs]


In [None]:
# Print the outputs from both models
for input_text, base_output, finetuned_output in zip(inputs, base_model_outputs, finetuned_model_outputs):
    print(f"Input: {input_text}")
    print(f"Base Model Output: {base_output}")
    print(f"Fine-Tuned Model Output: {finetuned_output}")
    print("="*50)

# Step 9: Save Predictions to CSV for Manual Ranking
# Function to save predictions to CSV
def save_predictions_to_csv(inputs, base_model_outputs, finetuned_model_outputs):
    """Save the input and model outputs into a CSV file"""
    with open("generated_outputs.csv", mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Input", "Base Model Output", "Fine-Tuned Model Output"])
        for input_text, base_output, finetuned_output in zip(inputs, base_model_outputs, finetuned_model_outputs):
            writer.writerow([input_text, base_output, finetuned_output])

# Save predictions to CSV
save_predictions_to_csv(inputs, base_model_outputs, finetuned_model_outputs)

print("Generated outputs saved to 'generated_outputs.csv'")