**Clone the GitHub repository in Colab**

In [1]:
from getpass import getpass
token = getpass()

# Build the authenticated HTTPS clone URL for the repository
# 'llmed_certification_FineTuneFlow' of GitHub user 'micag2025'.
# NOTE(review): the token is embedded in this URL; `git clone` is expected to
# record the URL (token included) as the remote in the clone's .git/config.
# Confirm, and consider resetting the remote URL after cloning.
git_url = f"https://micag2025:{token}@github.com/micag2025/llmed_certification_FineTuneFlow.git"

# Install git (if not already installed)
!apt-get install -y git

# Clone the repository; "$git_url" is filled in from the Python variable above
!git clone "$git_url"

¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Cloning into 'llmed_certification_FineTuneFlow'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 31 (delta 3), reused 16 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (31/31), 469.79 KiB | 1.97 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [None]:
# =========================================================
# NOTEBOOK D — Auto Fine-Tuning Recommendation & Plan
# =========================================================

!pip install -q pandas matplotlib

import os, json
import pandas as pd
from pathlib import Path
from textwrap import dedent

# -------------------------
# Locations: ranking produced by Notebook C (input), Notebook D artefacts (output)
# -------------------------
MERGED_DIR = "llmed_certification_FineTuneFlow/Notebook_C.ipynb"  # not read anywhere in this cell
_BENCHMARKS = "/content/llmed_certification_FineTuneFlow/outputs/benchmarks"
FINAL_CSV = _BENCHMARKS + "/notebook_C/final_ranking.csv"
OUT_DIR = _BENCHMARKS + "/notebook_D"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# Fail early, with a pointer to the producing notebook, when the ranking is missing.
if not Path(FINAL_CSV).exists():
    raise FileNotFoundError(f"Cannot find merged ranking CSV at {FINAL_CSV}. Run Notebook C first.")

# The first CSV column holds the model names and becomes the index.
df = pd.read_csv(FINAL_CSV, index_col=0)
print("Loaded final ranking:\n")
display(df)

# -------------------------
# Model size / architecture hints
# -------------------------
# Substring -> approximate parameter count. Specific keys ("llama-1b") coexist
# with generic family fallbacks ("llama"); infer_model_size picks the longest
# matching key, so the order of entries here does not matter.
size_hints = {
    "bart-large": "0.4B",
    "bart": "0.4B",
    "t5-large": "0.8B",
    "t5": "0.8B",
    "llama-1b": "1B",
    "llama-3b": "3B",
    "llama": "3B",
    "phi-3-mini": "3B",
    "phi": "3B",
}

def infer_model_size(model_name):
    """Return a coarse size label (e.g. "3B") for a model name.

    The name is lower-cased and searched for each key of ``size_hints``.

    Args:
        model_name: Model identifier; non-string values are converted with str().

    Returns:
        The size string of the longest matching key, or "unknown" when no key
        occurs in the name.
    """
    key = str(model_name).lower()
    # Longest key first: a specific hint must win over a generic family
    # fallback regardless of dict insertion order (sorted() is stable, so
    # equal-length keys keep their original relative order).
    for hint in sorted(size_hints, key=len, reverse=True):
        if hint in key:
            return size_hints[hint]
    return "unknown"

# One size label per ranked model, in index order.
df["size_hint"] = [infer_model_size(label) for label in df.index]

# -------------------------
# Recommendation logic
# -------------------------
def recommend_method(model_name, size_hint, gpu_mem_gb=None):
    """Suggest a fine-tuning strategy for a model.

    Args:
        model_name: Model identifier; matched case-insensitively.
        size_hint: Size label such as "3B". A value that cannot be parsed as a
            number (e.g. "unknown" or None) yields the manual-check fallback.
        gpu_mem_gb: Accepted for interface compatibility; not used by the
            current rules.

    Returns:
        A human-readable recommendation string.
    """
    ln = model_name.lower()
    # Encoder-decoder models: LoRA preferred, independent of size
    if "bart" in ln or "t5" in ln:
        return "LoRA (PEFT) ‚Äî encoder‚Äìdecoder friendly"

    # Decoder models (LLaMA / Phi): choose by parameter count in billions
    try:
        gb = float(size_hint.replace("B", ""))
    except (AttributeError, TypeError, ValueError):
        # Only parse failures are treated as "size unknown"; a bare except
        # would also swallow KeyboardInterrupt / SystemExit.
        return "QLoRA (recommended) / manual check"

    if gb <= 1.5:
        return "LoRA or full fine-tune"
    if gb <= 3.5:
        return "QLoRA (4-bit)"
    if gb <= 8:
        return "QLoRA (4-bit) ‚Äî use A100 / L4 recommended"
    return "Hosted fine-tuning / QLoRA on A100/H100"

def hyperparams_suggestion(size_hint):
    """Return starter hyperparameters for a model of the given size.

    Args:
        size_hint: Size label such as "1B". When it cannot be parsed as a
            number (e.g. "unknown" or None) a 3B-sized model is assumed.

    Returns:
        Dict with the keys "epochs", "micro_batch_size" and "lr".
    """
    try:
        gb = float(size_hint.replace("B", ""))
    except (AttributeError, TypeError, ValueError):
        # Parse failures only; a bare except would also hide KeyboardInterrupt.
        gb = 3.0
    # Larger models get smaller micro-batches and, at the top end, a lower learning rate.
    if gb <= 1.5:
        return {"epochs": 3, "micro_batch_size": 8, "lr": 2e-4}
    if gb <= 3.5:
        return {"epochs": 3, "micro_batch_size": 4, "lr": 1e-4}
    if gb <= 8:
        return {"epochs": 3, "micro_batch_size": 1, "lr": 1e-4}
    return {"epochs": 2, "micro_batch_size": 1, "lr": 5e-5}

# Keep the TOP_K best models by composite score (highest first).
TOP_K = 2
top_models = df.sort_values(by="composite_score", ascending=False).iloc[:TOP_K]
print("\nTop selected models:")
display(top_models)

# model label -> inferred size, suggested method and starter hyperparameters
recommendations = {}
for model in top_models.index:
    size_hint = infer_model_size(model)
    recommendations[model] = dict(
        size_hint=size_hint,
        method=recommend_method(model, size_hint),
        recommended_hyperparams=hyperparams_suggestion(size_hint),
    )

# -------------------------
# Generate finetune_plan.md
# -------------------------
# Markdown lines of the plan; joined with newlines when written.
plan = [
    "# Fine-tuning Plan (Auto-Generated)\n",
    "This document summarizes top fine-tuning choices based on Notebook C.\n",
]

# One numbered section per selected model, in ranking order.
for rank, (model_label, scores_row) in enumerate(top_models.iterrows(), start=1):
    rec = recommendations[model_label]
    plan.extend([
        f"### {rank}. {model_label}",
        f"- Composite score: {scores_row['composite_score']:.4f}",
        f"- ROUGE-L: {scores_row['rougeL']:.2f}%",
        f"- Inferred size: {rec['size_hint']}",
        f"- Recommended method: **{rec['method']}**",
        f"- Hyperparameters: `{rec['recommended_hyperparams']}`\n",
    ])

with open(os.path.join(OUT_DIR, "finetune_plan.md"), "w") as f:
    f.write("\n".join(plan))

print("\n‚úñ Fine-tuning plan written to finetune_plan.md")

# -------------------------
# Auto-generate training scripts
# -------------------------
# The best-ranked model decides which training script is generated below.
top_model = top_models.index[0]
method = recommendations[top_model]["method"]
print(f"\nüî• Selected #1 model for fine-tuning: {top_model}")
print(f"‚Üí Training strategy: {method}")

# ---- create LoRA script (encoder-decoder models)
# Template for a standalone training script. It is rendered with str.format():
# the upper-case fields in single braces are substituted, and doubled braces
# produce literal braces in the generated file.
# Fixes relative to the previous template:
#   * LoRA target modules follow the architecture ("q"/"v" only exist in
#     T5-style models; BART names them "q_proj"/"v_proj"), and task_type is set.
#   * a DataCollatorForSeq2Seq pads inputs and labels, so batches of
#     variable-length examples can be collated.
#   * fp16 is enabled only when CUDA is available.
train_lora_script = dedent("""
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

MODEL = "{MODEL}"
DATASET_PATH = "{DATASET_PATH}"
OUTPUT_DIR = "{OUTPUT_DIR}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token

# T5-family attention projections are named "q"/"v"; BART-style models use "q_proj"/"v_proj".
target_modules = ["q", "v"] if "t5" in model.config.model_type else ["q_proj", "v_proj"]
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
model = get_peft_model(model, lora_cfg)

ds = load_dataset("json", data_files={{"train": DATASET_PATH}})["train"]

def tokenize_fn(example):
    out = tokenizer(example["dialogue"], truncation=True, max_length=1024)
    labels = tokenizer(example["summary"], truncation=True, max_length=128).input_ids
    out["labels"] = labels
    return out

train_ds = ds.map(tokenize_fn, remove_columns=ds.column_names)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size={BATCH},
    num_train_epochs={EPOCHS},
    learning_rate={LR},
    fp16=torch.cuda.is_available(),
    save_strategy="no",
)

# Pads inputs and labels per batch (labels with -100 so padding is ignored by the loss).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained(OUTPUT_DIR)
""")

# ---- create QLoRA script (decoder-only models)
# Template for a standalone training script, rendered with str.format(): the
# upper-case fields in single braces are substituted, and doubled braces
# produce literal braces in the generated file.
# Fixes relative to the previous template:
#   * the braces of the prompt f-string are doubled (str.format() used to
#     raise KeyError on them) and its newlines are written as escaped "\\n"
#     (a raw newline inside the generated string literal is a SyntaxError);
#   * 4-bit loading is requested through quantization_config only, because
#     from_pretrained rejects the same option given twice;
#   * labels are aligned with input_ids (prompt positions masked with -100)
#     and a padding collator is used so examples can be batched.
train_q_lora_script = dedent("""
import torch, transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset

MODEL = "{MODEL}"
DATASET_PATH = "{DATASET_PATH}"
OUTPUT_DIR = "{OUTPUT_DIR}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    quantization_config=transformers.BitsAndBytesConfig(load_in_4bit=True),
)
model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)

ds = load_dataset("json", data_files={{"train": DATASET_PATH}})["train"]

def tokenize_fn(example):
    prompt = f"<|system|>Summarize the conversation.<|end|>\\n{{example['dialogue']}}\\n<|assistant|>"
    prompt_ids = tokenizer(prompt, truncation=True, max_length=1024).input_ids
    summary_ids = tokenizer(
        example["summary"] + tokenizer.eos_token,
        truncation=True,
        max_length=128,
        add_special_tokens=False,
    ).input_ids
    input_ids = prompt_ids + summary_ids
    # The loss is computed on the summary tokens only; prompt positions are masked with -100.
    return {{
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": [-100] * len(prompt_ids) + summary_ids,
    }}

train_ds = ds.map(tokenize_fn, remove_columns=ds.column_names)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size={BATCH},
    num_train_epochs={EPOCHS},
    learning_rate={LR},
    fp16=True,
    save_strategy="no",
)

# Pads input_ids / attention_mask with the tokenizer and labels with -100.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained(OUTPUT_DIR)
""")

# Values substituted into the chosen training-script template.
# NOTE(review): top_model is the ranking label (e.g. "BART-large"); it is written
# verbatim as MODEL in the generated script. Confirm it is a loadable model id.
DATASET_PATH = "./highlightsum_train.jsonl"
OUTPUT_DIR = "./ft_outputs/" + top_model.replace("/", "_")
hps = recommendations[top_model]["recommended_hyperparams"]

# The encoder-decoder recommendation maps to the LoRA template; every other
# recommendation maps to the QLoRA template.
script, filename = (
    (train_lora_script, "train_LoRA.py")
    if ("LoRA" in method and "encoder" in method)
    else (train_q_lora_script, "train_QLoRA.py")
)

script = script.format(
    MODEL=top_model,
    DATASET_PATH=DATASET_PATH,
    OUTPUT_DIR=OUTPUT_DIR,
    BATCH=hps["micro_batch_size"],
    EPOCHS=hps["epochs"],
    LR=hps["lr"],
)

script_path = os.path.join(OUT_DIR, filename)
with open(script_path, "w") as f:
    f.write(script)

# Persist the recommendations for all selected models as JSON.
json_path = os.path.join(OUT_DIR, "recommendations.json")
with open(json_path, "w") as f:
    json.dump(recommendations, f, indent=2)

print("\nüìÅ Outputs written to:", OUT_DIR)
print("Files:\n ", "\n ".join(os.listdir(OUT_DIR)))

Loaded final ranking:



Unnamed: 0,rouge1,rouge2,rougeL,time,throughput,efficiency,composite_score
BART-large,28.107296,9.229025,21.038022,214.299854,0.933272,0.098171,1.0
LLaMA-1B,22.225994,9.552435,16.049638,479.141066,0.417414,0.033497,0.451424
Phi-3-Mini,24.155261,10.463062,17.668568,1280.113872,0.156236,0.013802,0.42735
LLaMA-3B,22.273046,9.871545,16.004961,968.169577,0.206575,0.016531,0.358622
T5-large,10.756401,1.884288,9.492152,731.983771,0.27323,0.012968,0.022585



Top selected models:


Unnamed: 0,rouge1,rouge2,rougeL,time,throughput,efficiency,composite_score,size_hint
BART-large,28.107296,9.229025,21.038022,214.299854,0.933272,0.098171,1.0,0.4B
LLaMA-1B,22.225994,9.552435,16.049638,479.141066,0.417414,0.033497,0.451424,1B



‚úñ Fine-tuning plan written to finetune_plan.md

üî• Selected #1 model for fine-tuning: BART-large
‚Üí Training strategy: LoRA (PEFT) ‚Äî encoder‚Äìdecoder friendly

üìÅ Outputs written to: /content/llmed_certification_FineTuneFlow/outputs/benchmarks/notebook_D
Files:
  recommendations.json
 finetune_plan.md
 train_LoRA.py


In [6]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch

# -------------------------
# Run configuration
# -------------------------
MODEL_NAME = "facebook/bart-large-cnn"
OUTPUT_DIR = "./ft_outputs/bart_lora_highlightsum"
N_SAMPLES = 2000          # number of leading training examples used
EPOCHS = 1
MICRO_BATCH_SIZE = 4      # per-device batch size
LEARNING_RATE = 2e-4
MAX_INPUT_LENGTH = 1024   # token cap for dialogues
MAX_TARGET_LENGTH = 128   # token cap for summaries

os.makedirs(OUTPUT_DIR, exist_ok=True)
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"üî• Using device: {device}")

# -------------------------
# Data: first N_SAMPLES rows of the train split
# -------------------------
dataset = load_dataset("knkarthick/highlightsum")["train"].select(range(N_SAMPLES))
print(f"Loaded {len(dataset)} samples for training.")

# -------------------------
# Tokenizer (EOS doubles as padding token when none is defined)
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# Base model wrapped with LoRA adapters on the q_proj / v_proj modules
# -------------------------
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME), lora_config)
model.to(device)

# -------------------------
# Tokenization function
# -------------------------
def tokenize_fn(example):
    """Tokenize one record: the dialogue becomes the model inputs, the summary the labels.

    Both texts are truncated, to MAX_INPUT_LENGTH and MAX_TARGET_LENGTH tokens
    respectively; no padding is applied here.
    """
    encoded = tokenizer(example["dialogue"], max_length=MAX_INPUT_LENGTH, truncation=True)
    target = tokenizer(example["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
    encoded["labels"] = target.input_ids
    return encoded

# Replace the raw text columns with token ids and labels.
tokenized_dataset = dataset.map(tokenize_fn, remove_columns=dataset.column_names)

# -------------------------
# Training Arguments
# -------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    # Mixed precision requires a GPU; fall back to fp32 when `device` is "cpu"
    # so the cell still runs on a CPU-only runtime.
    fp16=(device == "cuda"),
    logging_steps=50,
    save_strategy="no",  # no intermediate checkpoints; the model is saved once below
    gradient_accumulation_steps=1,
)

# -------------------------
# Data Collator
# -------------------------
# Pads inputs and labels per batch (tokenize_fn leaves sequences unpadded).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# -------------------------
# Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator, # Use the specialized data collator
)

# -------------------------
# Run training
# -------------------------
trainer.train()

# -------------------------
# Save model & tokenizer
# -------------------------
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"\n‚ÑπÔ∏è Fine-tuned model saved to {OUTPUT_DIR}")

üî• Using device: cuda
Loaded 2000 samples for training.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss
50,2.2513
100,2.2971
150,1.8459
200,1.9525
250,2.1228
300,1.8205
350,1.7701
400,1.8539
450,1.7009
500,1.84



‚ÑπÔ∏è Fine-tuned model saved to ./ft_outputs/bart_lora_highlightsum


In [9]:
%%writefile train_bart_lora.py
# =====================================================
!pip install -q datasets transformers peft wandb

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch
import wandb

# -------------------------
# Config
# -------------------------
MODEL_NAME = "facebook/bart-large-cnn"
OUTPUT_DIR = "./ft_outputs/bart_lora_highlightsum"
N_SAMPLES = 2000   # first 2k samples for fine-tuning
EPOCHS = 1         # set 1 epoch for debugging
MICRO_BATCH_SIZE = 4
LEARNING_RATE = 2e-4
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128
WANDB_PROJECT = "highlightsum_bart_lora"

os.makedirs(OUTPUT_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üî• Using device: {device}")

# -------------------------
# W&B login
# -------------------------
# The Trainer's W&B integration takes the project name from the WANDB_PROJECT
# environment variable; without this the constant above has no effect.
# setdefault keeps a project already chosen in the environment.
os.environ.setdefault("WANDB_PROJECT", WANDB_PROJECT)
wandb.login()  # prompts for API key in Colab

# -------------------------
# Load dataset
# -------------------------
dataset = load_dataset("knkarthick/highlightsum")["train"].select(range(N_SAMPLES))
print(f"Loaded {len(dataset)} samples for training.")

# -------------------------
# Tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Ensure pad_token is set (fall back to EOS when the tokenizer defines none)
tokenizer.pad_token = tokenizer.eos_token if tokenizer.pad_token is None else tokenizer.pad_token

# -------------------------
# Model + LoRA
# -------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # attention proj layers
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)
model.to(device)

# -------------------------
# Tokenization function
# -------------------------
def tokenize_fn(example):
    """Tokenize one record: the dialogue becomes the model inputs, the summary the labels.

    Both texts are truncated, to MAX_INPUT_LENGTH and MAX_TARGET_LENGTH tokens
    respectively; no padding is applied here.
    """
    encoded = tokenizer(example["dialogue"], max_length=MAX_INPUT_LENGTH, truncation=True)
    target = tokenizer(example["summary"], max_length=MAX_TARGET_LENGTH, truncation=True)
    encoded["labels"] = target.input_ids
    return encoded

# Replace the raw text columns with token ids and labels.
tokenized_dataset = dataset.map(tokenize_fn, remove_columns=dataset.column_names)

# -------------------------
# Training Arguments
# -------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    # Mixed precision requires a GPU; fall back to fp32 when `device` is "cpu"
    # so the script still runs on a CPU-only machine.
    fp16=(device == "cuda"),
    logging_steps=50,
    save_strategy="no",  # no intermediate checkpoints; the model is saved once below
    gradient_accumulation_steps=1,
    report_to="wandb",   # enables W&B logging
    run_name="bart_lora_highlightsum"
)

# -------------------------
# Data Collator
# -------------------------
# Pads inputs and labels per batch (tokenize_fn leaves sequences unpadded).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# -------------------------
# Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# -------------------------
# Run training
# -------------------------
trainer.train()

# -------------------------
# Save model & tokenizer
# -------------------------
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"\n‚ÑπÔ∏è Fine-tuned model saved to {OUTPUT_DIR}")

Writing train_bart_lora.py


In [10]:
# =====================================================
# Evaluate fine-tuned BART LoRA model on HighlightSUM
# with sample predictions
# =====================================================

!pip install -q datasets transformers rouge-score torch

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer

# -------------------------
# Evaluation settings
# -------------------------
MODEL_DIR = "./ft_outputs/bart_lora_highlightsum"   # directory written by the fine-tuning cell
N_VAL = 200                   # number of examples scored
MAX_INPUT_LENGTH = 1024       # token cap for dialogues
MAX_TARGET_LENGTH = 128       # cap on newly generated tokens
NUM_SAMPLES_TO_DISPLAY = 5    # predictions printed at the end
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"üî• Using device: {device}")

# -------------------------
# Evaluation data
# NOTE(review): the "test" split is loaded although the messages call it
# "validation" - confirm which split is intended.
# -------------------------
dataset = load_dataset("knkarthick/highlightsum")["test"].select(range(N_VAL))
print(f"Loaded {len(dataset)} validation samples.")

# -------------------------
# Fine-tuned model & tokenizer, both read from MODEL_DIR
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).to(device)

# -------------------------
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
# -------------------------
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def compute_rouge(preds, refs):
    """Average ROUGE F-measures over prediction/reference pairs, as percentages.

    Uses the module-level ``scorer``.

    Args:
        preds: Sequence of predicted summaries.
        refs: Sequence of reference summaries, aligned with ``preds``.

    Returns:
        Dict with the keys "rouge1", "rouge2" and "rougeL"; all values are 0.0
        when there are no pairs to score.

    Raises:
        ValueError: If ``preds`` and ``refs`` differ in length (zip would
            silently drop the surplus items and skew the average).
    """
    n = len(preds)
    if n != len(refs):
        raise ValueError(
            f"preds and refs must have the same length (got {n} and {len(refs)})"
        )
    agg = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    if n == 0:
        # Nothing to average; avoid dividing by zero.
        return agg
    for pred, ref in zip(preds, refs):
        # RougeScorer.score takes (target, prediction)
        scores = scorer.score(ref, pred)
        for k in agg:
            agg[k] += scores[k].fmeasure
    return {k: v / n * 100 for k, v in agg.items()}

# -------------------------
# Generate summaries (one dialogue at a time)
# -------------------------
preds = []
refs = dataset["summary"]

for text in dataset["dialogue"]:
    inputs = tokenizer(
        text,
        truncation=True,
        padding="longest",
        max_length=MAX_INPUT_LENGTH,
        return_tensors="pt"
    ).to(device)
    output_ids = model.generate(**inputs, max_new_tokens=MAX_TARGET_LENGTH)
    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    preds.append(pred)

# -------------------------
# Compute ROUGE
# -------------------------
scores = compute_rouge(preds, refs)
print("\nüìä ROUGE scores on validation set:")
for k, v in scores.items():
    print(f"{k}: {v:.2f}")

# -------------------------
# Display some sample predictions
# -------------------------
# Never index past the available predictions (e.g. when N_VAL is smaller than
# NUM_SAMPLES_TO_DISPLAY); the header reports the number actually shown.
n_shown = min(NUM_SAMPLES_TO_DISPLAY, len(preds))
print(f"\nüìù Sample predictions (showing {n_shown} examples):\n")
for i in range(n_shown):
    print(f"--- Example {i+1} ---")
    print("Dialogue:\n", dataset[i]["dialogue"])
    print("\nReference Summary:\n", dataset[i]["summary"])
    print("\nPredicted Summary:\n", preds[i])
    print("------------------------------\n")


üî• Using device: cuda
Loaded 200 validation samples.

üìä ROUGE scores on validation set:
rouge1: 36.46
rouge2: 17.38
rougeL: 27.12

üìù Sample predictions (showing 5 examples):

--- Example 1 ---
Dialogue:
 Speaker A: Yes. Okay, when we talk about uh components design, um it's really about the material and the and uh uh really the stuff we build uh the remote controls of. Um, a remote control consist of uh components and the components of a remote control consist of uh properties and material. We have to choose th uh these uh wisely and it could affect uh uh a kind of grow of in uh in buying uh the remote controls. Um, the components of a remote control are of course uh the case. Uh the properties of the case, um it has to be solid uh in hard material like uh hard plastic uh with soft rubber for uh falling and and uh uh yeah, it feels uh good in your hand. Mm the buttons has to be uh solid too, and the material is soft rubber. Uh I've got a uh email from the possibilities of Real 