<a href="https://colab.research.google.com/github/mabench-tuc/LoRA-of-LLMs/blob/main/LoRA_Gpt_2_NL_to_Code_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import nbformat, os, shutil

# drive.mount() above exposes Drive under /content/drive, so "My Drive" has to be
# addressed below that mount point.
FOLDER = "/content/drive/My Drive/ModSem/Github"
BACKUP_SUFFIX = ".backup.ipynb"


def _strip_volatile_state(nb):
    """Remove widget metadata, outputs and execution counts from `nb` in place.

    Returns:
        True when at least one field was modified, False when the notebook
        was already clean.
    """
    changed = False

    # Widget state can also be stored at notebook level, not only per cell.
    if "widgets" in nb.get("metadata", {}):
        nb["metadata"].pop("widgets", None)
        changed = True

    for cell in nb.cells:
        if "widgets" in cell.get("metadata", {}):
            cell["metadata"].pop("widgets", None)
            changed = True
        if cell.get("outputs"):
            cell["outputs"] = []
            changed = True
        if cell.get("execution_count") is not None:
            cell["execution_count"] = None
            changed = True
    return changed


for fname in sorted(os.listdir(FOLDER)):
    # Backups also end in ".ipynb"; skip them so a rerun does not clean the
    # backups and create backups of backups.
    if not fname.endswith(".ipynb") or fname.endswith(BACKUP_SUFFIX):
        continue

    path = os.path.join(FOLDER, fname)
    nb = nbformat.read(path, as_version=4)

    if _strip_volatile_state(nb):
        # Back up only right before the file is rewritten; an untouched notebook
        # must not overwrite an earlier backup that still holds its outputs.
        shutil.copy2(path, path + BACKUP_SUFFIX)
        nbformat.write(nb, path)
        print("Cleaned:", path)
    else:
        print("OK:", path)


## Setup Installation Process

In [None]:
#!pip install git+https://github.com/microsoft/LoRA
!pip install -qU bitsandbytes datasets accelerate loralib transformers peft trl
#!pip install -U datasets
!pip install -U sacrebleu evaluate rouge-score

In [None]:
import torch
torch.cuda.is_available()
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TrainingArguments
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification
from datasets import load_dataset
import bitsandbytes as bnb

## Model's loading
Here we load the model with its weights and the tokenizer

## Load the GPT-2 Large model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the GPT-2 Large model and tokenizer
print("Loading gpt2-large model...")
# The freshly loaded model is moved onto `device` in the same statement
gpt2_large_model = AutoModelForCausalLM.from_pretrained("gpt2-large").to(device)

gpt2_large_tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
print("Successfully loaded gpt2-large model.")


In [None]:
# Short aliases used by the remaining cells; they refer to the same objects loaded above.
model=gpt2_large_model
tokenizer= gpt2_large_tokenizer

## Post-processing on the model
### Freezing the original weights
Finally, we need to apply some post-processing to the loaded model to enable training: we freeze all of its layers and cast the layer-norm parameters to float32 for stability.

In [None]:
# Freeze the whole base network; only the adapters attached later are trained.
for param in model.parameters():
  param.requires_grad = False
  if param.ndim != 1:
    continue
  # 1-D tensors (e.g. layernorm) are kept in fp32 for stability
  param.data = param.data.to(torch.float32)

# Fewer activations are kept in memory during the forward pass
model.gradient_checkpointing_enable()
# NOTE(review): presumably required so gradients still reach the adapters once
# every base weight is frozen - confirm against the transformers docs.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose result is always converted to float32."""

  def forward(self, x):
    # Run the wrapped modules first, then cast whatever dtype they produced.
    wrapped_output = super().forward(x)
    return wrapped_output.to(torch.float32)
# Wrap the output head so the logits it produces are always returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

### Display Trainable Parameters

This is a function to print out how much LoRA reduces the number of trainable parameters.

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of `model` and their ratio in percent.

    Args:
        model: any object exposing `named_parameters()` whose parameters
            provide `numel()` and `requires_grad`.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

#### Model Architecture

It's important to observe the model's construction so you can ensure you know which modules you should apply LoRA to.

As per the paper, we're going to focus on the attention weights - so keep an eye out for modules like: `q_proj`, `v_proj`, `query_key_value`. This is model dependent - In our case (GPT-2), the target module is `attn.c_attn`

In [None]:
print(model)
#print_trainable_parameters(gpt2_large_model)

## Parameter Efficient Fine Tuning
### Set up the LoRA Adapter
Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using `get_peft_model` utility function from `peft`.

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings for GPT-2: adapters are attached to the attention projection only.
config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["attn.c_attn"],
    # Alternatives for other architectures, kept for reference:
    #target_modules=["query_key_value"],
    #target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Background on the module names above:
# - "v" / v_proj: the value projection. It turns input tokens into the value vectors
#   that are combined according to the attention scores computed from queries and keys.
# - "q" / q_proj: the query projection. It turns input tokens into the query vectors
#   used to attend to the other tokens of the sequence during self-attention.
# - c_attn: the layer that computes the query, key and value projections together,
#   so it takes the place of separate "q_proj"/"v_proj" modules here.

# Wrap the frozen base model with the LoRA adapter described by `config`.
model = get_peft_model(model, config)

### Display trainable parameters

In [None]:
print_trainable_parameters(model)

## Merge the CodeAlpaca and MBPP Datasets

In [None]:
# File: merge_mbpp_codealpaca_dataset.py

from datasets import load_dataset, Dataset
import random

# Load datasets
mbpp = load_dataset("mbpp")
# Load CodeAlpaca dataset using the correct identifier
codealpaca = load_dataset("sahil2801/CodeAlpaca-20k")


def _build_prompt(instruction, context, response):
    """Render one example in the shared Instruction / Input / Response layout."""
    return f"### Instruction\n{instruction}\n\n### Input\n{context}\n\n### Response:\n{response}"


merged_data = []

# Format MBPP in the same instruction-tuning layout as CodeAlpaca.
# The reference solution is the *response*; the test cases serve as the input
# context. (Previously the solution was placed under "### Input" and the prompt
# had no response section at all.)
for row in mbpp["train"]:
    instruction = row["text"].strip()
    solution = row["code"].strip()
    tests = "\n".join(row["test_list"]).strip()
    merged_data.append({
        "instruction": instruction,
        "input": tests,
        "output": solution,
        "source": "mbpp",
        "text": _build_prompt(instruction, tests, solution),
    })

# Format CodeAlpaca
for row in codealpaca["train"]:
    instruction = row["instruction"].strip()
    context = row["input"].strip()
    response = row["output"].strip()
    merged_data.append({
        "instruction": instruction,
        "input": context,
        "output": response,
        "source": "codealpaca",
        "text": _build_prompt(instruction, context, response),
    })


# Shuffle with a fixed seed so the merged order (and every split derived from
# it) is reproducible across kernel restarts.
random.Random(42).shuffle(merged_data)

# Save as HuggingFace Dataset
merged_dataset = Dataset.from_list(merged_data)
merged_dataset.save_to_disk("merged_mbpp_codealpaca")

print("✅ Merged dataset saved to: merged_mbpp_codealpaca")

In [None]:
# Tokenize the merged dataset.

# Reuse the end-of-sequence token as the padding token so that padding works.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
  """Tokenize the "text" column, truncating or padding every row to 512 tokens."""
  encoded = tokenizer(
    examples["text"],
    max_length=512,
    padding="max_length",
    truncation=True,
  )
  return encoded

tokenized_dataset = merged_dataset.map(tokenize_function, batched=True)
print("✅ Merged dataset tokenized.")

In [None]:
dataset = merged_dataset

In [None]:
tokenized_datasets=tokenized_dataset

In [None]:
# Split tokenized_datasets into "train", "test" and "validation" parts.

# Define split ratios
train_ratio = 0.8
test_ratio = 0.1
validation_ratio = 0.1

# First carve off the held-out share, then divide it into test and validation.
train_test_validation_split = tokenized_datasets.train_test_split(
    test_size=test_ratio + validation_ratio,
    seed=42,
)
test_validation_split = train_test_validation_split["test"].train_test_split(
    test_size=validation_ratio / (test_ratio + validation_ratio),
    seed=42,
)

train_dataset = train_test_validation_split["train"]
test_dataset, validation_dataset = (
    test_validation_split["train"],
    test_validation_split["test"],
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")

## Load CodeAlpaca Dataset

In [None]:
### Code Dataset

from datasets import load_dataset

ds = load_dataset("sahil2801/CodeAlpaca-20k")

### Dataset Splits

In [None]:
from datasets import DatasetDict

# Shuffle the CodeAlpaca training split loaded in the previous cell as `ds`.
# (`dataset` is the flat merged Dataset at this point and has no "train" split,
# so indexing it with 'train' fails.)
ds_all = ds['train'].shuffle(seed=42)

# Split into train (80%), val (10%), test (10%)
ds_split = ds_all.train_test_split(test_size=0.2, seed=42)
ds_temp = ds_split['test'].train_test_split(test_size=0.5, seed=42)
ds_dict = DatasetDict({
    'train': ds_split['train'],
    'validation': ds_temp['train'],
    'test': ds_temp['test']
})

In [None]:
tokenizer.pad_token = tokenizer.eos_token  # ensure padding token is set
# Define prompt template
def format_prompt(example):
    """Render one record as an Instruction / Input / Response prompt.

    Args:
        example: mapping with the keys "instruction", "input" and "output".

    Returns:
        A dict holding the rendered prompt under the key "text".
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return {"text": "\n\n".join(sections)}

# Apply prompt formatting
ds_dict = ds_dict.map(format_prompt)

# Tokenize function
def tokenize_function(examples):
    """Tokenize the "text" column to 512 tokens and copy the ids into "labels"."""
    tokens = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length")
    # For causal language modelling the targets are the inputs themselves.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize the dataset, dropping every pre-existing column
original_columns = ds_dict["train"].column_names
ds_tokenized = ds_dict.map(
    tokenize_function,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
tokenized_datasets=ds_tokenized
tokenized_datasets.keys()

In [None]:
# Extract a small, reproducibly shuffled portion of every split
def _take_shuffled(split_name, n_rows):
    """Return the first `n_rows` rows of the named split after a seeded shuffle."""
    return tokenized_datasets[split_name].shuffle(seed=42).select(range(n_rows))

small_train_dataset = _take_shuffled("train", 3000)
small_eval_dataset = _take_shuffled("test", 700)
small_val_dataset = _take_shuffled("validation", 700)

## Training Process

In [None]:
#Import the necessary modules from the transformers library
import transformers
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

### Train LoRA Adapter

The `Trainer` class contains all the usual hyper-parameters from traditional ML applications!

If you're running into CUDA memory issues - please modify both the `per_device_train_batch_size` to be lower, and also reduce `r` in your LoRAConfig.

In [None]:
# Hyperparameters follow the LoRA paper's GPT-2 Medium setup.
# NOTE(review): the model trained here is gpt2-large - confirm these values are intended.
# Training Arguments
training_args = TrainingArguments(
    output_dir="./output_lora_gpt2",  # Directory for saving the model
    eval_strategy="epoch",  # Evaluate once per epoch
    save_strategy="epoch",  # Checkpoint once per epoch (same cadence as evaluation)
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=500,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs_lora_gpt2",  # Directory for logging
    logging_steps=10,
    save_total_limit=2,  # Keep only 2 model checkpoints
    load_best_model_at_end=True,
    report_to="none",  # Disable reporting to WandB or other loggers
    fp16=True,  # Enable mixed precision training if you have a GPU
)

# Define a custom data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Causal LM does not use Masked Language Modeling (MLM)
)

In [None]:
# Initialize the Trainer.
# train_dataset / validation_dataset are the splits of the merged, tokenized
# MBPP + CodeAlpaca dataset created earlier in this notebook.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # Alternative: train on the CodeAlpaca-only tokenized splits instead
    #train_dataset=tokenized_datasets["train"],
    #eval_dataset=tokenized_datasets.get("validation", None),  # Use validation if available
    #tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

In [None]:
model.eval()

### Pushing the Model to the Hub

In [None]:
HUGGING_FACE_USER_NAME = "mabc-3"
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model_name = "gpt-2-LLoRA-NLP2Code"

# `use_auth_token` is deprecated in favour of `token`; token=True reuses the
# credential stored by notebook_login() in the previous cell.
model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", token=True)

### Load Adapters from the Hub

You can also directly load adapters from the Huggingface Hub using the commands below:

In [None]:
HUGGING_FACE_USER_NAME = "mabc-3"
model_name = "gpt-2-LLoRA-NLP2Code"

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# The adapter repository records which base checkpoint it was trained on.
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
config = PeftConfig.from_pretrained(peft_model_id)

base_checkpoint = config.base_model_name_or_path
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    return_dict=True,
    load_in_8bit=False,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

In [None]:
# Load the Lora model
lora_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
print(lora_model)

## GPU Check

In [None]:
!nvidia-smi

## Inference on Code Tasks

### LoRA Model vs Base Model

In [None]:
base_model_name = "gpt2-large" ##"gpt2"  # Or use gpt2-large

# Load tokenizer
gpt2_large_tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# ✅ STEP 4: Load Full Model (no LoRA)
def load_full_model():
    """Load the plain base checkpoint in float16 and return it in evaluation mode."""
    load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
    full = AutoModelForCausalLM.from_pretrained(base_model_name, **load_options)
    full.eval()
    return full

In [None]:
# ✅ Shared settings
#prompt = "Write a Python function to check if a number is prime"
#prompt ="Find the product of all digits in a string, ignoring non-digit characters."
#prompt="Return a list of all substrings of length 3 that are palindromes in the given string."
prompt= "write a function in python to check if a number is a power of two without using loops or recursion."
#prompt= "Sort a list of tuples by the second element in descending order, then by the first element ascending if tie. data = [(1, 3), (2, 3), (3, 2), (4, 4)"
#prompt= "Write a Python function that takes a list of integers and returns the list sorted in descending order."
#prompt= "Write a Python function that returns the factorial of a number using recursion."

max_new_tokens = 250

import time


# ✅ STEP 6: Benchmarking Function
def benchmark_model(model, tokenizer, label="Model", prompt_text=None, new_tokens=None):
    """Time a single generation and report the peak GPU memory it needed.

    Args:
        model: object exposing `.device` and `.generate(**inputs, max_new_tokens=...)`.
        tokenizer: tokenizer matching `model`; its pad token is set to its EOS token.
        label: name printed in the report.
        prompt_text: text to complete. Defaults to the notebook-level `prompt`.
        new_tokens: generation budget. Defaults to the notebook-level `max_new_tokens`.

    Returns:
        (latency_seconds, peak_memory_mb). The memory figure is 0.0 when CUDA
        is not available.
    """
    text = prompt if prompt_text is None else prompt_text
    budget = max_new_tokens if new_tokens is None else new_tokens

    # Ensure padding token is set for the tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)

    # The CUDA bookkeeping calls below fail on a CPU-only runtime, so guard them.
    use_cuda = torch.cuda.is_available()

    # Warm-up (no autograd bookkeeping needed for inference)
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=10)

    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        # Make sure the warm-up kernels are finished before the clock starts.
        torch.cuda.synchronize()

    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=budget)
    if use_cuda:
        # CUDA work is asynchronous; wait for it so the latency is not under-reported.
        torch.cuda.synchronize()
    end = time.perf_counter()

    latency = end - start
    peak_mem = torch.cuda.max_memory_allocated() / 1024**2 if use_cuda else 0.0
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n📌 {label} Result:")
    print("Generated Output:\n", decoded)
    print(f"⏱ Latency: {latency:.3f} seconds")
    print(f"💾 Peak GPU Memory: {peak_mem:.2f} MB")
    return latency, peak_mem


# ✅ STEP 7: Run Comparison
full_model = load_full_model()
# `lora_model` is the PeftModel created by PeftModel.from_pretrained(...) above.
# It must not be rebound to the bare `model`: that would discard the PEFT
# wrapper that the later add_adapter(...) call relies on.
# NOTE(review): the full model is loaded in float16 while the LoRA base is not,
# so the two measurements are not like-for-like - confirm this is intended.

print("🧪 Benchmarking FULL Model...")
latency_full, mem_full = benchmark_model(full_model, gpt2_large_tokenizer, label="Full GPT-2")

print("\n🧪 Benchmarking LoRA Model...")
latency_lora, mem_lora = benchmark_model(lora_model, tokenizer, label="LoRA GPT-2")

# ✅ STEP 8: Summary Comparison
print("\n📊 COMPARISON SUMMARY")
print(f"Latency: Full = {latency_full:.3f}s | LoRA = {latency_lora:.3f}s")
print(f"Memory:  Full = {mem_full:.2f} MB | LoRA = {mem_lora:.2f} MB")

### Python Programming Prompts

Practical coding prompts designed to illustrate and assess the fundamental programming capabilities of the fine-tuned LLM.

In [None]:
def factorial(n):
    """Return n! computed recursively.

    Args:
        n: a non-negative integer.

    Raises:
        ValueError: if `n` is negative (the recursion would never reach the base case).
    """
    if n < 0:
        raise ValueError("factorial() is not defined for negative numbers")
    if n == 0:
        return 1
    return n * factorial(n - 1)

print(factorial(5))

In [None]:
factorial(5)

In [None]:
def sort_list(numbers):
    """Return a new list with the items of `numbers` in ascending order."""
    ordered = list(numbers)
    ordered.sort()
    return ordered

numbers= [13, 2, 33, 43, 15]
print(sort_list(numbers))

In [None]:
def is_power_of_two_without_loops(num):
    """Return True when the integer `num` is a power of two (1, 2, 4, 8, ...).

    Uses no loops or recursion: a power of two has exactly one bit set, so
    clearing its lowest set bit with `num & (num - 1)` must leave zero.
    The previous parity test (`num % 2 == 0`) accepted every even number.
    """
    return num > 0 and (num & (num - 1)) == 0

In [None]:
def is_power_of_two(num):
    """Return True when the integer `num` is a power of two (1, 2, 4, 8, ...).

    A power of two has a single set bit, so `num & (num - 1)` is zero for it.
    The previous parity test (`num % 2 == 0`) accepted every even number.
    """
    return num > 0 and (num & (num - 1)) == 0

In [None]:
print(is_power_of_two_without_loops(16))
print(is_power_of_two_without_loops(18))

## Evaluation of the Model

### Evaluation on HumanEval Benchmark

This evaluation framework tests the fine-tuned GPT-2 model on HumanEval by generating code completions, executing them safely, and measuring correctness through Pass@k scores.

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

In [None]:
# File: evaluate_lora_gpt2_humaneval.py

import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from accelerate import init_empty_weights, infer_auto_device_map
from peft import PeftModel
from tqdm import tqdm

# Config
#MODEL_PATH = "path/to/your/lora/fine-tuned/gpt2"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_NEW_TOKENS = 170   # generation budget per completion (previously 128)
NUM_SAMPLES = 5        # completions sampled per problem; also the k reported as Pass@k
# Sampling settings forwarded unchanged to model.generate()
GENERATION_ARGS = dict(do_sample=True, temperature=0.8, max_new_tokens=MAX_NEW_TOKENS, num_return_sequences=NUM_SAMPLES)

# Load tokenizer and model
# (loading from a local path is disabled; the in-memory LoRA model is evaluated instead)
#tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
#model = GPT2LMHeadModel.from_pretrained(MODEL_PATH, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
model = lora_model
model.eval()
model.to(DEVICE)

# Load HumanEval dataset
dataset = load_dataset("openai_humaneval")

def evaluate_sample_k(prompt: str) -> list:
    """Sample completions for `prompt` and return only the newly generated text.

    Generation is driven by GENERATION_ARGS; every decoded sequence has its
    first `len(prompt)` characters removed before it is returned.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        generated = model.generate(**encoded, **GENERATION_ARGS)
    texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

    prefix_len = len(prompt)
    completions = []
    for text in texts:
        completions.append(text[prefix_len:])
    return completions

def _passes_humaneval_tests(program: str, test_code: str, entry_point: str) -> bool:
    """Return True when `program` defines `entry_point` and it survives the task's tests.

    HumanEval's "test" field is Python source that defines `check(candidate)`;
    the function under test passes when `check` returns without raising.
    """
    env = {}
    try:
        # WARNING: this executes model-generated code in-process, without a
        # sandbox or a timeout. Only run it in a disposable environment.
        exec(program, env)
        exec(test_code, env)
        env["check"](env[entry_point])
    except Exception:
        return False
    return True


def _first_line(text: str, width: int = 120) -> str:
    """Return the first non-blank line of `text` cut to `width` characters ('' if none)."""
    lines = text.strip().splitlines()
    return lines[0][:width] if lines else ""


def main():
    """Sample completions for every HumanEval task and report the share solved.

    A task counts as passed when at least one of its sampled completions
    passes the task's own `check` function.
    """
    passed = 0
    total = 0

    print("\n==================== Evaluation Start ====================\n")

    for idx, item in enumerate(tqdm(dataset["test"], desc="Evaluating")):
        prompt = item["prompt"]
        entry_point = item["entry_point"]
        canonical_solution = item["canonical_solution"]

        completions = evaluate_sample_k(prompt)

        # item["test"] is one source string, not a list of inputs: it has to be
        # executed as a whole and its check() called on the generated function.
        test_passed = any(
            _passes_humaneval_tests(prompt + code, item["test"], entry_point)
            for code in completions
        )

        status = "✅ PASS" if test_passed else "❌ FAIL"
        print(f"[{idx+1:03}] {entry_point:<30} | {status}")
        # Completions can be empty or whitespace-only; never index into them blindly.
        print("→ Generated: ", _first_line(completions[0]) if completions else "")
        print("→ Reference: ", _first_line(canonical_solution))
        print("------------------------------------------------------------")

        passed += int(test_passed)
        total += 1

    print("\n==================== Evaluation Summary ====================")
    print(f"Total Passed: {passed} / {total}")
    if total > 0:
        print(f"Pass@{NUM_SAMPLES} Score: {passed / total:.2%}")
    else:
        print("Pass@k Score: N/A (No samples evaluated)")
    print("===========================================================\n")

if __name__ == "__main__":
    main()

### Analysis:
LoRA fine-tuned GPT-2 Large on CodeAlpaca performs well on code-style prompts but fails on the HumanEval dataset.

*   CodeAlpaca focuses on instruction-style completions, mostly docstring-to-code or natural language to code.
*   These prompts are simplified and often lack complex logic, recursion, or rigorous test cases.

*   LoRA fine-tuning adapts your model well to generate syntactically-correct, readable code, especially if prompts resemble your training distribution.

### Failure on HumanEval:
HumanEval is a strict unit-test-driven benchmark. It requires:

*   Full correctness (not just plausible code)
*   Handling edge cases
*   Understanding nuanced logic (e.g., dynamic programming, recursion, math, string parsing)

▶ It fails to learn **deep functional reasoning**, which requires **diverse and structured training signals** — missing from CodeAlpaca.

## Switching between adapters
You can also combine two LoRA adapters by applying both of their updates to the same base model.


You can also reload GPT-2 and swap adapters anytime:

In [None]:
# AutoModelForCausalLM lives in transformers, not in peft.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load base GPT-2 again ("gpt2-large" is the checkpoint id used throughout this notebook)
base_model = AutoModelForCausalLM.from_pretrained("gpt2-large")

# Load CodeAlpaca LoRA under an explicit adapter name
codealpaca_model = PeftModel.from_pretrained(
    base_model, "./lora-codealpaca", adapter_name="codealpaca_adapter"
)

# Load MBPP LoRA into the SAME wrapper so both adapters share one base model.
# NOTE(review): PEFT writes adapters that are not named "default" into a
# sub-directory named after the adapter - confirm this path matches where the
# MBPP adapter was actually saved.
codealpaca_model.load_adapter("./lora-mbpp/mbpp_adapter", adapter_name="mbpp_adapter")
mbpp_model = codealpaca_model  # one wrapper, two adapters

# Switch between them
mbpp_model.set_adapter("mbpp_adapter")

## Further Fine-Tuning on MBPP Dataset

In [None]:
!pip install python-Levenshtein

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Load MBPP dataset
dataset = load_dataset("mbpp")

# 2. Reuse the tokenizer already in memory (GPT-2 in this notebook)
#tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

# 3. Preprocessing: join prompt + code solution
def preprocess(batch):
    """Render every problem/solution pair as one training text and tokenize it to 512 tokens."""
    # You can customize the prompt style here
    text = []
    for problem, solution in zip(batch["text"], batch["code"]):
        text.append(f"Problem: {problem}\nSolution:\n{solution}")
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

# 4. Apply preprocessing, dropping the raw columns
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=raw_columns)

print(tokenized_dataset)

In [None]:
# Add labels = input_ids to every split
def _copy_input_ids_to_labels(batch):
    """Return a "labels" column that mirrors the batch's "input_ids"."""
    return {"labels": batch["input_ids"]}

tokenized_dataset = tokenized_dataset.map(_copy_input_ids_to_labels, batched=True)

# Now you can safely pass train/val datasets
train_dataset, eval_dataset, test_dataset = (
    tokenized_dataset["train"],
    tokenized_dataset["validation"],
    tokenized_dataset["test"],
)

In [None]:
print(tokenized_dataset)

In [None]:
# metrics.py
import numpy as np
import evaluate
import Levenshtein
import contextlib
import io

# Load Hugging Face metrics once
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

def safe_exec(code, func_name, test_inputs, expected_outputs):
    """Execute `code`, then call the function it defines on every test input.

    Note that the code runs in-process in a fresh namespace; it is NOT
    sandboxed. Only output printed while `code` itself executes is suppressed.

    Args:
        code: Python source expected to define `func_name`.
        func_name: name of the function to test.
        test_inputs: inputs, one per case; a tuple is unpacked into positional arguments.
        expected_outputs: expected return values, aligned with `test_inputs`.

    Returns:
        (all_passed, pass_fraction). Any exception - including a missing
        function or an empty test list - yields (False, 0.0).
    """
    namespace = {}
    try:
        with contextlib.redirect_stdout(io.StringIO()):
            exec(code, namespace)

        candidate = namespace.get(func_name)
        if candidate is None:
            return False, 0.0

        n_cases = len(test_inputs)
        n_ok = 0
        for args, expected in zip(test_inputs, expected_outputs):
            if isinstance(args, tuple):
                result = candidate(*args)
            else:
                result = candidate(args)
            if result == expected:
                n_ok += 1
        return n_ok == n_cases, n_ok / n_cases
    except Exception:
        return False, 0.0


def pass_at_k(num_correct, num_samples, k):
    """Probability that at least one of `k` draws (without replacement) is correct.

    Args:
        num_correct: number of correct samples.
        num_samples: total number of samples.
        k: number of draws.

    Returns:
        0.0 when there are no samples or no correct ones, otherwise
        1 - prod_i (1 - num_correct / (num_samples - i)).
    """
    if num_correct == 0 or num_samples == 0:
        return 0.0

    # Once every incorrect sample has been drawn the next factor is zero, so
    # the product never needs more than (num_samples - num_correct + 1) terms.
    n_terms = min(k, num_samples - num_correct + 1)
    miss_factors = []
    for drawn in range(n_terms):
        miss_factors.append(1.0 - (num_correct / (num_samples - drawn)))
    return 1.0 - np.prod(miss_factors)


def build_compute_metrics(tokenizer, k=3, mbpp_tests=None):
    """
    Returns a compute_metrics function compatible with Hugging Face Trainer.

    Args:
        tokenizer: tokenizer used to decode token ids; its `pad_token_id`
            stands in for ignored (-100) positions.
        k: the k used for the pass@k figure.
        mbpp_tests: optional container indexed by evaluation-row position.
            Row `idx` is executed only when `idx in mbpp_tests`, and
            `mbpp_tests[idx]` must provide "func_name", "inputs" and "outputs".

    Returns:
        A function mapping `(predictions, labels)` to a dict with the keys
        bleu, rougeL, exact_match, avg_edit_distance, pass@1, pass@k and
        exec_accuracy (the last three stay None when nothing was executed).
    """
    def compute_metrics(eval_preds):
        preds, labels = eval_preds

        # ---- Prepare predictions and labels ----
        # If logits, take argmax over vocab
        if preds.ndim == 3:  # (batch, seq_len, vocab_size)
            preds = np.argmax(preds, axis=-1)

        preds = preds.astype(int)
        labels = labels.astype(int)

        # A causal LM's output at position t is its guess for token t + 1, so
        # align the two sequences before comparing them.
        preds = preds[:, :-1]
        labels = labels[:, 1:]

        # Positions marked -100 carry no target (padding): neutralise them in
        # BOTH arrays so that whatever the model emits there is not scored.
        ignored = labels == -100
        labels = np.where(ignored, tokenizer.pad_token_id, labels)
        preds = np.where(ignored, tokenizer.pad_token_id, preds)

        # Decode into strings
        decoded_preds = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]
        decoded_labels = [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

        # ---- Text-based metrics ----
        results = {
            "bleu": bleu.compute(
                predictions=decoded_preds, references=[[l] for l in decoded_labels]
            )["bleu"],
            "rougeL": rouge.compute(
                predictions=decoded_preds, references=decoded_labels
            )["rougeL"],
            "exact_match": float(np.mean([
                p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)
            ])),
            "avg_edit_distance": float(np.mean([
                Levenshtein.distance(p, l) for p, l in zip(decoded_preds, decoded_labels)
            ])),
            "pass@1": None,
            "pass@k": None,
            "exec_accuracy": None,
        }

        # ---- Execution-based metrics (optional) ----
        if mbpp_tests is not None:
            # Only rows that actually have a test specification are executed.
            eval_indices = [idx for idx in range(len(decoded_preds)) if idx in mbpp_tests]
            num_correct = 0
            exec_accs = []

            for idx in eval_indices:
                pred = decoded_preds[idx]
                tests = mbpp_tests[idx]
                func_name, inputs, outputs = tests["func_name"], tests["inputs"], tests["outputs"]
                passed_all, acc = safe_exec(pred, func_name, inputs, outputs)
                if passed_all:
                    num_correct += 1
                exec_accs.append(acc)

            if exec_accs:
                num_evaluated = len(exec_accs)
                results["exec_accuracy"] = float(np.mean(exec_accs))
                # normalize by the number actually evaluated (not the full batch size)
                results["pass@1"] = num_correct / num_evaluated
                # NOTE(review): this pools one prediction per problem, so it is not
                # the per-problem pass@k of HumanEval - confirm the intended meaning.
                results["pass@k"] = pass_at_k(num_correct, num_evaluated, k)

        return results

    return compute_metrics


In [None]:
# timing_callback.py
import time
import csv
import os
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

class TimingCallback(TrainerCallback):
    """
    Records the wall-clock time of every training step and appends one row per
    step to a CSV file (default `throughput_log.csv`) with the columns:
    global_step, epoch, step_time_s, samples_per_step, tokens_per_step, cumulative_time_s

    Args:
        out_path: CSV file to append to; the header is written only when the
            file does not exist yet, so an existing log is extended.
        seq_length: tokens assumed per sample. When None it is looked up on the
            training arguments and finally defaults to 512.
    """
    def __init__(self, out_path="throughput_log.csv", seq_length=None):
        self.start_step_time = None
        self.cumulative_time = 0.0
        self.out_path = out_path
        self.seq_length = seq_length
        # create header
        if not os.path.exists(self.out_path):
            with open(self.out_path, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(["global_step","epoch","step_time_s","samples_per_step","tokens_per_step","cumulative_time_s"])

    def on_step_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Remember when the step started."""
        # called at start of each optimizer step (approx)
        self.start_step_time = time.time()

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        """Compute the duration of the step that just finished and append a CSV row."""
        # called at end of step
        if self.start_step_time is None:
            return
        step_time = time.time() - self.start_step_time
        self.cumulative_time += step_time

        # derive samples_per_step:
        # Trainer stores per_device_train_batch_size and gradient_accumulation_steps in args
        per_device_bs = args.per_device_train_batch_size
        accum = getattr(args, "gradient_accumulation_steps", 1)
        # The number of participating processes is exposed on the training
        # arguments (`world_size`); `state.num_processes` is honoured first only
        # for backward compatibility with states that happen to provide it.
        world_size = getattr(state, "num_processes", None) or getattr(args, "world_size", 1) or 1
        samples_per_step = per_device_bs * accum * world_size

        if self.seq_length is None:
            # fallback - if seq_length unknown, you can set it when constructing the callback
            seq_length = getattr(args, "max_seq_length", None) or getattr(args, "max_length", None) or 512
        else:
            seq_length = self.seq_length

        # Assumes every sample occupies exactly `seq_length` tokens.
        tokens_per_step = samples_per_step * seq_length

        with open(self.out_path, "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([state.global_step, getattr(state, "epoch", None), round(step_time,6), samples_per_step, tokens_per_step, round(self.cumulative_time,6)])


In [None]:
from peft import LoraConfig, get_peft_model

# Same LoRA settings as the first adapter, kept in a separate config object
# so the MBPP adapter can be registered under its own name.
_mbpp_lora_settings = {
    "r": 4,
    "lora_alpha": 32,
    "target_modules": ["attn.c_attn"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
mbpp_lora_config = LoraConfig(**_mbpp_lora_settings)

In [None]:
# Add MBPP adapter without removing CodeAlpaca adapter
lora_model.add_adapter("mbpp_adapter", mbpp_lora_config)

# Adding an adapter does not select it. Switch the active adapter to MBPP so
# that training updates this one instead of the previously loaded adapter.
lora_model.set_adapter("mbpp_adapter")

In [None]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling

# Training configuration for the MBPP adapter (replaces the earlier `training_args`).
training_args = TrainingArguments(
    output_dir="./sft-mbpp-lora",       # where to save checkpoints
    num_train_epochs=5,                 # MBPP is small -> don’t need too many
    per_device_train_batch_size=2,      # keep small to fit on free Colab GPUs
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,      # effective batch size = 2 * 8 = 16 per device
    learning_rate=2e-5,                 # safe LR for LoRA fine-tuning
    weight_decay=0.01,
    warmup_ratio=0.05,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",              # evaluate once per epoch
    save_strategy="epoch",              # checkpoint once per epoch
    save_total_limit=2,                 # keep only the 2 most recent checkpoints
    fp16=False,
    gradient_checkpointing=False,
    report_to="none",                   # no external experiment tracker
    seed=42,
)
# Define a custom data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Causal LM does not use Masked Language Modeling (MLM)
)

In [None]:
# optional: pass seq_length if you know it (e.g., 512)
timing_cb = TimingCallback(out_path="throughput_log.csv", seq_length=512)

# NOTE(review): `mbpp_tests` receives the tokenized evaluation split here;
# compute_metrics expects entries with "func_name"/"inputs"/"outputs" - confirm
# that the execution-based metrics are really meant to be driven by it.
trainer = SFTTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[timing_cb], # Wrap the callback in a list
    compute_metrics=build_compute_metrics(tokenizer, k=7, mbpp_tests=eval_dataset)
)

trainer.train()

# Save only the MBPP LoRA adapter: without `selected_adapters` every adapter
# attached to the model is written out. Save through `lora_model`, the object
# that was actually trained.
lora_model.save_pretrained("./lora-mbpp", selected_adapters=["mbpp_adapter"])

In [None]:
model = lora_model
# Save only the MBPP LoRA adapter: without `selected_adapters` every adapter
# attached to the model is written out.
model.save_pretrained("./lora-mbpp", selected_adapters=["mbpp_adapter"])

In [None]:
# After training: pull the per-evaluation metrics out of the trainer's log
logs = trainer.state.log_history

# Only evaluation entries carry an "eval_loss" key
eval_history = []
for entry in logs:
    if "eval_loss" in entry:
        eval_history.append(entry)

# Step at which each evaluation happened
steps = [entry["step"] for entry in eval_history]

# One list per metric, aligned with `steps`; a metric missing from an entry becomes None
h_metrics = {
    metric_name: [entry.get("eval_" + metric_name) for entry in eval_history]
    for metric_name in (
        "bleu",
        "rougeL",
        "exact_match",
        "avg_edit_distance",
        "pass@1",
        "pass@k",
        "exec_accuracy",
    )
}


In [None]:
h_metrics

## Visualizing Throughput

In [None]:
# Academic-style plotting for throughput_log.csv (from TimingCallback)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# === User parameters ===
CSV_PATH = "throughput_log.csv"         # path you passed to TimingCallback
OUT_DIR = "./figures"
os.makedirs(OUT_DIR, exist_ok=True)

WARMUP_STEPS = 50       # number of initial optimizer steps to exclude as warm-up (tune as needed)
ROLL_WINDOW = 200       # rolling window (in steps) for smoothing; adjust to your log length
SAVE_DPI = 300

# === Read CSV ===
df = pd.read_csv(CSV_PATH)

# Fall back to the row index when the log has no "global_step" column.
if "global_step" not in df.columns:
    df["global_step"] = df.index

# Coerce the logged columns to numbers; unparsable cells become NaN.
for c in ["step_time_s", "samples_per_step", "tokens_per_step"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Throughput per step. Non-positive step times are masked to NaN so a single
# bad row cannot inject inf into the means/quantiles computed below.
step_time = df["step_time_s"].where(df["step_time_s"] > 0)
df["samples_per_sec"] = df["samples_per_step"] / step_time
df["tokens_per_sec"] = df["tokens_per_step"] / step_time

# rolling stats
roll = lambda s: s.rolling(window=ROLL_WINDOW, min_periods=1, center=True).mean()
df["tokens_per_sec_roll"] = roll(df["tokens_per_sec"])
df["samples_per_sec_roll"] = roll(df["samples_per_sec"])
# Use the same centred window as the rolling mean so the ±1 std band drawn in
# Figure 1 is aligned with the curve it surrounds.
df["tokens_per_sec_std"] = df["tokens_per_sec"].rolling(window=ROLL_WINDOW, min_periods=1, center=True).std()

# Steady-state selection (after warm-up). If the log is not longer than the
# warm-up, keep every row and remember that nothing was excluded so the plot
# does not claim otherwise.
warmup_excluded = 0 < WARMUP_STEPS < len(df)
steady = df.iloc[WARMUP_STEPS:].copy() if warmup_excluded else df.copy()

# summary statistics (steady-state)
summary = {
    "tokens_per_sec_mean": steady["tokens_per_sec"].mean(),
    "tokens_per_sec_median": steady["tokens_per_sec"].median(),
    "tokens_per_sec_p5": steady["tokens_per_sec"].quantile(0.05),
    "tokens_per_sec_p95": steady["tokens_per_sec"].quantile(0.95),
    "samples_per_sec_mean": steady["samples_per_sec"].mean(),
    "samples_per_sec_median": steady["samples_per_sec"].median()
}
print("Steady-state throughput summary (after warm-up):")
for k,v in summary.items():
    print(f"  {k}: {v:.2f}")

# === Plot settings for academic style ===
plt.rcParams.update({
    "figure.figsize": (9,4.5),
    "font.size": 12,
    "axes.titlesize": 13,
    "axes.labelsize": 12,
    "legend.fontsize": 11,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "savefig.dpi": SAVE_DPI,
    "lines.linewidth": 1.2
})

# ---- Figure 1: Tokens/sec over steps (with rolling avg + variability) ----
fig, ax = plt.subplots()
x = df["global_step"]  # guaranteed to exist by the fallback above
ax.plot(x, df["tokens_per_sec"], alpha=0.25, label="tokens/sec (per step)")
ax.plot(x, df["tokens_per_sec_roll"], label=f"tokens/sec (rolling, window={ROLL_WINDOW})")
# shaded ±1 std of rolling window (NaN std, e.g. a one-row window, is drawn as zero width)
std = df["tokens_per_sec_std"].fillna(0)
ax.fill_between(x, df["tokens_per_sec_roll"] - std, df["tokens_per_sec_roll"] + std, alpha=0.12)
# annotate warm-up region only when those rows were really left out of `steady`
if warmup_excluded:
    warmup_x = x.iloc[:WARMUP_STEPS]
    ax.axvspan(warmup_x.min(), warmup_x.max(), color='grey', alpha=0.08, label="warm-up (excluded)")

ax.set_xlabel("Global optimizer step")
ax.set_ylabel("Tokens / second")
ax.set_title("Training throughput — tokens per second")
ax.grid(True, linestyle="--", linewidth=0.4, alpha=0.7)
ax.legend()
plt.tight_layout()
fpath = os.path.join(OUT_DIR, "tokens_per_sec_over_steps.png")
plt.savefig(fpath, bbox_inches="tight")
print(f"Saved figure: {fpath}")
plt.show()

# ---- Figure 2: Samples/sec & per-epoch aggregation ----
fig, ax = plt.subplots()
ax.plot(x, df["samples_per_sec"], alpha=0.25, label="samples/sec (per step)")
ax.plot(x, df["samples_per_sec_roll"], label=f"samples/sec (rolling, window={ROLL_WINDOW})")

# If epoch column exists, draw vertical boundaries and annotate epoch averages
if "epoch" in df.columns:
    # Whole-number epoch id per row: floor is a no-op for integer epoch ids and
    # groups rows when the logged value is fractional; gaps are forward-filled.
    # The same filled ids are used for both the boundaries and the means.
    epoch_id = np.floor(pd.to_numeric(df["epoch"], errors="coerce")).ffill()
    epochs = epoch_id.dropna().unique()
    for e in epochs:
        in_epoch = epoch_id == e
        # Boundary in the plotted x coordinate (global_step), not the row index.
        x_start = x[in_epoch].iloc[0]
        ax.axvline(x=x_start, linestyle=":", linewidth=0.7)
        # epoch mean
        e_mean = df.loc[in_epoch, "samples_per_sec"].mean()
        ax.text(x_start+0.5, ax.get_ylim()[1]*0.95, f"Epoch {int(e)} mean: {e_mean:.1f}", fontsize=9, verticalalignment='top')

ax.set_xlabel("Global optimizer step")
ax.set_ylabel("Samples / second (examples/sec)")
ax.set_title("Training throughput — samples per second")
ax.grid(True, linestyle="--", linewidth=0.4, alpha=0.7)
ax.legend()
plt.tight_layout()
fpath = os.path.join(OUT_DIR, "samples_per_sec_over_steps.png")
plt.savefig(fpath, bbox_inches="tight")
print(f"Saved figure: {fpath}")
plt.show()

# ---- Optional: Small table of summary stats saved to CSV for the thesis ----
pd.Series(summary).to_csv(os.path.join(OUT_DIR, "throughput_summary_stats.csv"))
print("Summary statistics saved.")


## Visualizing LoRA Model

In [None]:
import pandas as pd
import numpy as np

# Data from the mbpp_adapter
# Per-epoch metrics from the mbpp_adapter run, written one row per epoch and
# then transposed into the column -> list-of-values dict later cells expect.
# NOTE(review): epoch-4 Validation Loss is 1.915481 here, but the `Data` table
# defined in a later cell has 1.915148 — one of the two has transposed digits;
# verify against the training log.
_COLUMNS = (
    "Epoch", "Training Loss", "Validation Loss", "Bleu", "RougeL",
    "Exact Match", "Avg Edit Distance", "Entropy", "Num Tokens",
    "Mean Token Accuracy",
)
_ROWS = (
    (1, 2.341400, 2.353944, 0.193154, 0.533877, 0.0, 1043.011111, 4.754701, 191488, 0.639754),
    (2, 2.157700, 2.135633, 0.194993, 0.536636, 0.0, 1047.111111, 4.759662, 382976, 0.650686),
    (3, 1.956200, 2.006576, 0.202520, 0.555202, 0.0, 1044.511111, 4.767116, 574464, 0.666477),
    (4, 1.832300, 1.915481, 0.201009, 0.553825, 0.0, 1045.600000, 4.765970, 765952, 0.667128),
    (5, 1.803000, 1.880655, 0.200844, 0.553992, 0.0, 1046.400000, 4.765502, 957440, 0.667628),
)
data = {name: list(values) for name, values in zip(_COLUMNS, zip(*_ROWS))}

df = pd.DataFrame(data)

# Show six decimals, then print the table as plain text.
pd.set_option("display.precision", 6)
print(df)

# Persist the table for plotting / thesis figures.
df.to_csv("training_metrics_table.csv", index=False)
print("\nSaved CSV -> training_metrics_table.csv")


### Evaluation Metrics – BLEU, ROUGE-L, Exact Match
Shows how the model's generation quality improves (or stabilizes) across epochs. BLEU, ROUGE-L, and Exact Match are standard text-generation metrics.

In [None]:
# File: analysis_plots For GPT-2 LoRA on MBPP and Codealpaca Adapters.py

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Seaborn theme for the figures that follow.
sns.set(style="whitegrid", context="paper", font_scale=1.1)

# Rebuild the metrics frame from the `data` dict defined in an earlier cell
# (column names follow that dict's keys, e.g. "Bleu" and "RougeL").
df = pd.DataFrame(data)

In [None]:
# Plot the three text-similarity metrics against epoch on one axes.
# `df` is built from the `data` dict, whose keys are "Bleu" and "RougeL";
# asking seaborn for "BLEU" / "ROUGE-L" columns fails because they do not
# exist, so map each real column name to the label shown in the legend.
plt.figure(figsize=(8,5))
for column, legend_label in (
    ("Bleu", "BLEU"),
    ("RougeL", "ROUGE-L"),
    ("Exact Match", "Exact Match"),
):
    sns.lineplot(data=df, x="Epoch", y=column, marker="o", label=legend_label)

plt.title("Evaluation Metrics Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Score")  # overrides the per-column y label set by the last lineplot
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Output folder for figures (created on first run, reused afterwards).
OUTDIR = "figures"
os.makedirs(OUTDIR, exist_ok=True)

# Per-epoch metrics table, keyed by column name.
# NOTE(review): epoch-4 Validation Loss is 1.915148 here, but the `data` dict
# in the earlier metrics-table cell has 1.915481 — one of the two has
# transposed digits; verify against the training log.
# NOTE(review): this table names the ROUGE column "ROUGE-L", whereas the
# earlier `data` dict uses "RougeL"; the two frames are not interchangeable.
_EPOCH_METRICS = {
    "Epoch": [1, 2, 3, 4, 5],
    "Training Loss": [2.341400, 2.157700, 1.956200, 1.832300, 1.803000],
    "Validation Loss": [2.353944, 2.135633, 2.006576, 1.915148, 1.880655],
    "Bleu": [0.193154, 0.194993, 0.202520, 0.201009, 0.200844],
    "ROUGE-L": [0.533877, 0.536636, 0.555202, 0.553825, 0.553992],
    # constant zero - not informative
    "Exact Match": [0.0, 0.0, 0.0, 0.0, 0.0],
    "Avg Edit Distance": [1043.011111, 1047.111111, 1044.511111, 1045.600000, 1046.400000],
    "Entropy": [4.754701, 4.759662, 4.767116, 4.765970, 4.765502],
    "Num Tokens": [191488.0, 382976.0, 574464.0, 765952.0, 957440.0],
    "Mean Token Accuracy": [0.639754, 0.650686, 0.666477, 0.667128, 0.667628],
}
Data = pd.DataFrame.from_dict(_EPOCH_METRICS)


In [None]:
# Training & Validation Loss vs Epoch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

sns.set(context="paper", style="whitegrid", font_scale=1.1)

# Both loss curves come from the previously defined `Data` frame
# (columns: Epoch, Training Loss, Validation Loss) and share one axes.
plt.figure(figsize=(9,5))
ax = plt.gca()
for loss_column, marker_shape in (("Training Loss", "o"), ("Validation Loss", "s")):
    sns.lineplot(data=Data, x="Epoch", y=loss_column, marker=marker_shape, label=loss_column, ax=ax)

# Print each validation-loss value just above its marker.
for epoch_no, loss_value in zip(Data["Epoch"], Data["Validation Loss"]):
    ax.annotate(
        f"{loss_value:.3f}",
        (epoch_no, loss_value),
        textcoords="offset points",
        xytext=(0,6),
        ha='center',
        fontsize=9,
    )

ax.set(
    xlabel="Epoch",
    ylabel="Loss",
    xticks=Data["Epoch"],
    title="Training and Validation Loss vs Epoch (GPT-2 LoRA)",
)
ax.grid(True, linestyle="--", linewidth=0.45, alpha=0.9)
plt.tight_layout()
plt.savefig("loss_vs_epoch.png", dpi=300)
plt.show()


In [None]:
# Evaluation Metrics (BLEU, ROUGE-L, Exact Match)
import matplotlib.ticker as ticker
plt.figure(figsize=(9,5))

# Metrics drawn from `Data`. BLEU and ROUGE-L are scaled by 100 before
# plotting; Exact Match is plotted as stored.
metrics_to_plot = ["Bleu", "ROUGE-L", "Exact Match"]
x_last = Data["Epoch"].iloc[-1]

for metric in metrics_to_plot:
    if metric == "Exact Match":
        plotted = Data[metric]
    else:
        plotted = Data[metric]*100
    plt.plot(Data["Epoch"], plotted, marker="o", label=metric)
    # Label the final point of the curve with its value.
    y_last = plotted.iloc[-1]
    plt.annotate(
        f"{y_last:.3f}",
        (x_last, y_last),
        textcoords="offset points",
        xytext=(6,0),
        ha='left',
        fontsize=9,
    )

plt.title("Evaluation Metrics over Epochs (BLEU, ROUGE-L, Exact Match)")
plt.xlabel("Epoch")
plt.ylabel("Score (%)")
plt.xticks(Data["Epoch"])
plt.grid(True, linestyle="--", linewidth=0.45, alpha=0.9)
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig("eval_metrics_over_epochs.png", dpi=300)
plt.show()


In [None]:
# Display the current `df` frame (whichever cell assigned it last) before plotting from it.
df

In [None]:
# -------------------------------------------------------------------
# Twin-axis figure: Mean Token Accuracy on the left y-axis, Entropy on the
# right y-axis, both against epoch (read from `df`).
# -------------------------------------------------------------------
plt.figure(figsize=(8,5))
ax1 = plt.gca()
ax2 = ax1.twinx()

sns.lineplot(
    data=df, x="Epoch", y="Mean Token Accuracy",
    marker="o", color="tab:blue", label="Mean Token Accuracy", ax=ax1,
)
sns.lineplot(
    data=df, x="Epoch", y="Entropy",
    marker="s", color="tab:orange", label="Entropy", ax=ax2,
)

# Left axis: fixed accuracy window; right axis: 0.5% padding around the data.
ax1.set(
    xlabel="Epoch",
    ylabel="Mean Token Accuracy",
    xticks=df["Epoch"],
    ylim=(0.60, 0.70),
    title="Token Accuracy vs Entropy",
)
entropy_low = df['Entropy'].min()*0.995
entropy_high = df['Entropy'].max()*1.005
ax2.set_ylabel("Entropy (per-token)")
ax2.set_ylim(entropy_low, entropy_high)

# One legend per axes; the right-hand one uses a plain orange proxy line.
ax1.legend(loc="upper left")
ax2.legend([plt.Line2D([0],[0], color='tab:orange')], ["Entropy"], loc="upper right")

plt.tight_layout()
plt.savefig("token_acc_entropy.png", dpi=300)
plt.show()

## Visualizing LoRA Model with MBPP Adapters

In [None]:
import matplotlib.pyplot as plt


# Per-epoch metrics for this run (epochs 1-5).
epochs = [1, 2, 3, 4, 5]
train_loss = [3.0928, 2.9043, 2.5556, 2.4169, 2.4171]
val_loss   = [2.8876, 2.6947, 2.5502, 2.4668, 2.4399]
bleu       = [0.1255, 0.1192, 0.1156, 0.1129, 0.1115]
rougeL     = [0.4431, 0.4469, 0.4680, 0.4768, 0.4793]
accuracy   = [0.5760, 0.5791, 0.5871, 0.5900, 0.5914]
entropy    = [4.2323, 4.3061, 4.3552, 4.3781, 4.3863]


def _save_epoch_figure(stem, curves, ylabel, title):
    """Plot one or more per-epoch curves and save the figure as PDF and PNG.

    Replaces five copy-pasted plotting stanzas that differed only in data,
    colour, labels and file name.

    Args:
        stem: Output file name without extension; ``<stem>.pdf`` and
            ``<stem>.png`` are written to the working directory.
        curves: Iterable of ``(values, plot_kwargs)`` pairs. Each ``values``
            list is plotted against the module-level ``epochs`` with circle
            markers; ``plot_kwargs`` carries the per-curve ``color``/``label``.
        ylabel: Y-axis label.
        title: Figure title.

    The figure is left open (not closed).
    """
    plt.figure(figsize=(6,4))
    any_label = False
    for values, plot_kwargs in curves:
        plt.plot(epochs, values, marker='o', **plot_kwargs)
        any_label = any_label or "label" in plot_kwargs
    plt.xlabel("Epochs"); plt.ylabel(ylabel)
    plt.title(title)
    if any_label:
        # Only draw a legend when at least one curve is named.
        plt.legend()
    plt.grid(True, linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig(f"{stem}.pdf", dpi=300)   # save as PDF
    plt.savefig(f"{stem}.png", dpi=300)   # optional PNG


# === 1. Training and Validation Loss ===
_save_epoch_figure(
    "loss_over_epochs",
    [(train_loss, {"label": "Train Loss"}), (val_loss, {"label": "Validation Loss"})],
    ylabel="Loss",
    title="Training and Validation Loss Over Epochs",
)

# === 2. BLEU Score ===
_save_epoch_figure(
    "bleu_over_epochs", [(bleu, {"color": "C2"})],
    ylabel="BLEU Score", title="BLEU Score Over Epochs",
)

# === 3. ROUGE-L Score ===
_save_epoch_figure(
    "rougeL_over_epochs", [(rougeL, {"color": "C3"})],
    ylabel="ROUGE-L Score", title="ROUGE-L Score Over Epochs",
)

# === 4. Token Accuracy ===
_save_epoch_figure(
    "accuracy_over_epochs", [(accuracy, {"color": "C4"})],
    ylabel="Mean Token Accuracy", title="Token Prediction Accuracy Over Epochs",
)

# === 5. Entropy ===
_save_epoch_figure(
    "entropy_over_epochs", [(entropy, {"color": "C5"})],
    ylabel="Entropy", title="Entropy of Predictions Over Epochs",
)


In [None]:
from google.colab import files
# Trigger a browser download for each saved PDF figure, in this order.
for figure_stem in ("loss", "rougeL", "bleu", "accuracy", "entropy"):
    files.download(f"{figure_stem}_over_epochs.pdf")

### Training and Validation Loss Over Epochs

In [None]:
import matplotlib.pyplot as plt

# Plot training and validation loss curves
# Per-epoch training and validation loss (epochs 1-5).
epochs = [1, 2, 3, 4, 5]
train_loss = [3.0928, 2.9043, 2.5556, 2.4169, 2.4171]
val_loss   = [2.8876, 2.6947, 2.5502, 2.4668, 2.4399]

plt.figure(figsize=(6,4))
# Train loss in C0, validation loss in C1, both with circle markers.
for loss_values, line_colour, legend_name in (
    (train_loss, 'C0', 'Train Loss'),
    (val_loss, 'C1', 'Validation Loss'),
):
    plt.plot(epochs, loss_values, marker='o', color=line_colour, label=legend_name)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend()
plt.tight_layout()
plt.show()


### BLEU Score Over Epochs

In [None]:
# BLEU per epoch (epochs 1–5), drawn against the `epochs` list from an earlier cell.
bleu = [0.1255, 0.1192, 0.1156, 0.1129, 0.1115]

plt.figure(figsize=(6,4))
plt.plot(epochs, bleu, marker='o', color='C2')
# Labels, fixed y-window and title applied to the current axes in one call.
plt.gca().set(
    xlabel='Epoch',
    ylabel='BLEU Score',
    ylim=(0.10, 0.13),
    title='BLEU Score During Training',
)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


### ROUGE-L Score Over Epochs

In [None]:
# ROUGE-L per epoch (epochs 1–5), drawn against the `epochs` list from an earlier cell.
rouge = [0.4431, 0.4469, 0.4680, 0.4768, 0.4793]

plt.figure(figsize=(6,4))
plt.plot(epochs, rouge, marker='o', color='C3')
# Labels, fixed y-window and title applied to the current axes in one call.
plt.gca().set(
    xlabel='Epoch',
    ylabel='ROUGE-L Score',
    ylim=(0.44, 0.50),
    title='ROUGE-L Score During Training',
)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt


# BLEU and ROUGE-L per epoch (epochs 1–5), shown together on one figure.
epochs = [1, 2, 3, 4, 5]
bleu = [0.1255, 0.1192, 0.1156, 0.1129, 0.1115]
rouge = [0.4431, 0.4469, 0.4680, 0.4768, 0.4793]

# Slightly larger canvas than the single-metric figures for readability.
plt.figure(figsize=(8, 5))

# BLEU in C2, ROUGE-L in C3; the legend labels tell the curves apart.
for scores, line_colour, legend_name in (
    (bleu, 'C2', 'BLEU Score'),
    (rouge, 'C3', 'ROUGE-L Score'),
):
    plt.plot(epochs, scores, marker='o', color=line_colour, label=legend_name)

plt.title('BLEU and ROUGE-L Scores Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend()
plt.tight_layout()  # keeps labels from overlapping
plt.show()

### Mean Token Accuracy Over Epochs

In [None]:
# Mean token accuracy per epoch (epochs 1–5), drawn against the `epochs` list
# from an earlier cell.
acc = [0.5760, 0.5791, 0.5871, 0.5900, 0.5914]

plt.figure(figsize=(6,4))
plt.plot(epochs, acc, marker='o', color='C4')
# Labels, fixed y-window and title applied to the current axes in one call.
plt.gca().set(
    xlabel='Epoch',
    ylabel='Mean Token Accuracy',
    ylim=(0.57, 0.60),
    title='Token Prediction Accuracy During Training',
)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


### Entropy of Model Predictions Over Epochs

In [None]:
# Prediction entropy per epoch (epochs 1–5), drawn against the `epochs` list
# from an earlier cell. No fixed y-window here: the axis autoscales.
entropy = [4.2323, 4.3061, 4.3552, 4.3781, 4.3863]

plt.figure(figsize=(6,4))
plt.plot(epochs, entropy, marker='o', color='C5')
plt.gca().set(
    xlabel='Epoch',
    ylabel='Entropy',
    title='Entropy of Predictions Over Epochs',
)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


### Trainable Parameter Count: Full Model vs LoRA Adapter

In [None]:
# Bar chart of trainable parameters (example values)
import numpy as np

# Example parameter counts in millions (illustrative values, not measured here).
params = {
    'Full Model': 124.0,          # e.g., 124M total parameters
    'LoRA (r=4) Modules': 0.15,   # e.g., ~0.15M from LoRA layers
}
labels, vals = list(params), list(params.values())

plt.figure(figsize=(6,4))
bars = plt.bar(labels, vals, color=['#ff8080','#80b3ff'])
# Log axis so the small LoRA bar stays visible next to the full model.
plt.yscale('log')
plt.title('Trainable Parameters: Full Model vs LoRA (r=4)')
plt.ylabel('Trainable Parameters (millions, log scale)')

# Write each absolute value above its bar; every bar has its own vertical
# lift factor and number format.
for bar_index, (value, lift, number_format) in enumerate(zip(vals, (1.1, 8), (".1f", ".2f"))):
    plt.text(bar_index, value*lift, f"{value:{number_format}}M", ha='center')

plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()


## Inference for NLG

In [None]:
# Load the pretrained "gpt2" tokenizer
tokenizergpt2 = GPT2Tokenizer.from_pretrained("gpt2")
# Load the pretrained "gpt2" language-model-head model
modelgpt2 = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
# Alias the stock GPT-2 tokenizer/model under `base_*` names.
# NOTE(review): presumably kept as the un-adapted baseline for comparison — confirm.
base_tokenizer=tokenizergpt2
base_model= modelgpt2

In [None]:
# Point `model` at the LoRA-wrapped model so the generation cell below uses it.
model= lora_model

In [None]:
# Natural-language prompt to feed the model (alternative prompt kept commented out)
#prompt = "Once upon a time"
prompt= "Write a Python function to check if a number is prime."

# Tokenize the prompt into PyTorch tensors (return_tensors="pt"); the result's
# input_ids and attention_mask are read by the generation cell.
inputs = tokenizer(prompt, return_tensors="pt")
# Display the tokenized inputs
inputs

In [None]:
# Generate text
# Put the model in evaluation mode first so dropout layers are disabled no
# matter which mode the earlier training cells left it in; otherwise the
# output depends on hidden notebook state.
model.eval()
output_ids = model.generate(
    inputs.input_ids.to(model.device),  # move prompt ids to the model's device
    attention_mask=inputs.attention_mask.to(model.device),
    pad_token_id=tokenizer.eos_token_id,  # reuse the EOS id as the padding id
    # NOTE(review): per the transformers docs max_length counts the prompt
    # tokens as well; consider max_new_tokens if longer completions are wanted.
    max_length=50,
    num_return_sequences=1
)

# Display the generated token ids
output_ids

In [None]:
# Decode the first (only) generated sequence back to text, dropping special tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the decoded text
print(generated_text)