# Track A: Extended pre-training

In [1]:
!pip install -q transformers datasets accelerate torch evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [3]:
import json
import time
import math
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)


In [4]:
with open("dataset/datacard.json", "r") as f:
    DATACARD = json.load(f)

DATACARD["track_a_pretraining"]


{'purpose': 'Extended pre-training of a language model on raw Python code',
 'dataset_statistics': {'total_samples': 115,
  'train_samples': 103,
  'val_samples': 12},
 'modeling_results': {}}

In [5]:
dataset = load_dataset(
    "json",
    data_files={
        "train": "dataset/track_a_train.jsonl",
        "validation": "dataset/track_a_val.jsonl"
    }
)

dataset


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'source'],
        num_rows: 103
    })
    validation: Dataset({
        features: ['text', 'source'],
        num_rows: 12
    })
})

In [6]:
#MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda")
model.gradient_checkpointing_enable()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/139 [00:00<?, ?B/s]

In [7]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=256
    )


In [8]:
tokenized_ds = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names
)


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [10]:
training_args = TrainingArguments(
    output_dir="./track_a_outputs",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=5e-5,
    logging_steps=50,
    save_strategy="epoch",
    fp16=False,
    bf16=False,
    max_grad_norm=0.0,
    report_to="none"
)


In [11]:
baseline_trainer = Trainer(
    model=model,          # pretrained model
    args=training_args,
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

baseline_metrics = baseline_trainer.evaluate()
baseline_loss = baseline_metrics["eval_loss"]

import math
baseline_ppl = math.exp(baseline_loss)

print("Baseline loss:", baseline_loss)
print("Baseline perplexity:", baseline_ppl)


  baseline_trainer = Trainer(


Baseline loss: 1.6545332670211792
Baseline perplexity: 5.230638041095127


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator
)


In [13]:
start_time = time.time()
train_result = trainer.train()
training_time = (time.time() - start_time) / 60


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


In [14]:
eval_metrics = trainer.evaluate()
eval_loss = eval_metrics["eval_loss"]
perplexity = math.exp(eval_loss)

eval_metrics, perplexity


({'eval_loss': 1.580885410308838,
  'eval_runtime': 0.9534,
  'eval_samples_per_second': 12.586,
  'eval_steps_per_second': 12.586,
  'epoch': 2.0},
 4.859256342748171)

In [15]:
DATACARD["track_a_results"] = {
    "model": MODEL_NAME,
    "epochs": training_args.num_train_epochs,
    "train_batch_size": training_args.per_device_train_batch_size,
    "gradient_accumulation": training_args.gradient_accumulation_steps,
    "learning_rate": training_args.learning_rate,
    "eval_loss": round(eval_loss, 4),
    "eval_perplexity": round(perplexity, 4),
    "training_time_minutes": round(training_time, 2),
    "hardware": "Google Colab GPU"
}


In [16]:
with open("dataset/datacard.json", "w") as f:
    json.dump(DATACARD, f, indent=2)


In [17]:
#save model
trainer.save_model("./track_a_model")
tokenizer.save_pretrained("./track_a_model")


('./track_a_model/tokenizer_config.json',
 './track_a_model/special_tokens_map.json',
 './track_a_model/chat_template.jinja',
 './track_a_model/vocab.json',
 './track_a_model/merges.txt',
 './track_a_model/added_tokens.json',
 './track_a_model/tokenizer.json')

## Track A Summary Generation


In [18]:
def collect_track_a_examples(dataset, model, tokenizer, num_samples=5):
    import torch

    examples = []

    for item in dataset.select(range(min(num_samples, len(dataset)))):
        # Take first 100 tokens as context
        context = item['text']

        inputs = tokenizer(context, return_tensors="pt", truncation=True, max_length=128).to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        completion = generated[len(context):]

        examples.append({
            'context': context,
            'completion': completion,
            'source': item.get('source', 'N/A')
        })

    return examples


In [20]:
import json
from datetime import datetime

def generate_track_a_html(baseline_loss, baseline_ppl, final_loss, final_ppl,
                          training_time, model_name, epochs, batch_size, examples=None):
    def escape_html(text):
        return (str(text)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))

    improvement = ((baseline_ppl - final_ppl) / baseline_ppl) * 100

    html = f"""
    <div class="track-section" id="track-a">
        <h2>Track A: Extended Pre-training</h2>

        <div class="methodology">
            <h3>Methodology</h3>
            <p><strong>Objective:</strong> Continue training a small code language model on a curated code corpus to adapt it to specific coding patterns.</p>
            <ul>
                <li><strong>Model:</strong> {model_name}</li>
                <li><strong>Dataset:</strong> 103 training samples, 12 validation samples</li>
                <li><strong>Training Configuration:</strong>
                    <ul>
                        <li>Epochs: {epochs}</li>
                        <li>Batch Size: {batch_size}</li>
                        <li>Training Time: {training_time:.2f} minutes</li>
                    </ul>
                </li>
                <li><strong>Objective:</strong> Causal language modeling (next-token prediction)</li>
                <li><strong>Loss Masking:</strong> Automatic padding token masking via DataCollatorForLanguageModeling</li>
            </ul>
        </div>

        <div class="metrics">
            <h3>Metrics & Results</h3>
            <table>
                <tr>
                    <th>Metric</th>
                    <th>Baseline (Before Training)</th>
                    <th>After Training</th>
                    <th>Improvement</th>
                </tr>
                <tr>
                    <td>Validation Loss</td>
                    <td>{baseline_loss:.4f}</td>
                    <td>{final_loss:.4f}</td>
                    <td class="{'positive' if final_loss < baseline_loss else 'negative'}">
                        {((baseline_loss - final_loss) / baseline_loss * 100):+.2f}%
                    </td>
                </tr>
                <tr>
                    <td>Perplexity</td>
                    <td>{baseline_ppl:.4f}</td>
                    <td>{final_ppl:.4f}</td>
                    <td class="{'positive' if improvement > 0 else 'negative'}">
                        {improvement:+.2f}%
                    </td>
                </tr>
            </table>
        </div>
    """

    # Add examples section if provided
    if examples:
        html += """
        <div class="examples">
            <h3>Example Code Completions</h3>
            <p><em>The model generates code continuations given a context. These examples show the model's ability to understand and continue code patterns.</em></p>
            <div class="example-container">
        """

        for i, ex in enumerate(examples[:3], 1):
            html += f"""
                <div class="example good">
                    <p><strong>Example {i}</strong> | <em>Source: {escape_html(ex.get('source', 'N/A'))}</em></p>
                    <p><strong>Context (Input):</strong></p>
                    <pre><code>{escape_html(ex['context'])}</code></pre>
                    <p><strong>Model Completion:</strong></p>
                    <pre><code>{escape_html(ex['completion'])}</code></pre>
                </div>
            """

        html += """
            </div>
        </div>
        """

    html += f"""
        <div class="insights">
            <h3>Key Insights</h3>
            <ul>
                <li>{'✓' if improvement > 0 else '✗'} Model perplexity {'decreased' if improvement > 0 else 'increased'} by {abs(improvement):.2f}%, indicating {'better' if improvement > 0 else 'worse'} language modeling capabilities</li>
                <li>Lower perplexity suggests the model is better at predicting the next token in the code corpus</li>
                <li>Training completed efficiently on Google Colab T4 GPU in {training_time:.2f} minutes</li>
            </ul>
        </div>
    </div>
    """
    return html

examples = collect_track_a_examples(dataset["validation"], model, tokenizer, num_samples=5)
track_a_html = generate_track_a_html(
    baseline_loss=baseline_loss,
    baseline_ppl=baseline_ppl,
    final_loss=eval_loss,
    final_ppl=perplexity,
    training_time=training_time,
    model_name=MODEL_NAME,
    epochs=training_args.num_train_epochs,
    batch_size=training_args.per_device_train_batch_size,
    examples=examples
)

# Track B


In [21]:
del trainer
del model
torch.cuda.empty_cache()


In [22]:
!pip install peft bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


Clean the track B dataset: track b dataset jsonl files have markdowns in some responses, and "test" , "improve" and "refactor" responses often contain hallucinations, repeated input codes as output, or sometimes they start with ''' python (since these are synthetic dataset generated using llm), hence we remove those and keep only "explain" and "docstring" type ground truths (cleaned)

In [23]:
# keeping only explain and docstring instructions

import json
import re

def clean_explain_docstring_dataset(
    input_path,
    output_path,
    min_explain_len=80,
    max_explain_len=500
):
    cleaned = []
    stats = {
        'total': 0,
        'kept': 0,
        'cleaned_markdown': 0,
        'cleaned_docstring': 0,
        'cleaned_explain': 0,
        'drop_wrong_task': 0,
        'drop_unrepairable': 0,
    }

    def strip_markdown(text):
        # Remove fenced blocks
        text = re.sub(r'```(?:python|json)?', '', text)
        # Remove markdown headers
        text = re.sub(r'#+\s*', '', text)
        # Remove bold / italics
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
        text = re.sub(r'\*(.*?)\*', r'\1', text)
        return text.strip()

    def normalize_whitespace(text):
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            stats['total'] += 1
            ex = json.loads(line)

            ttype = ex.get('template_type')
            resp = ex.get('response', '').strip()

            # ---- only explain + docstring ----
            if ttype not in ['explain', 'docstring']:
                stats['drop_wrong_task'] += 1
                continue

            orig_resp = resp
            resp = strip_markdown(resp)
            resp = normalize_whitespace(resp)

            if resp != orig_resp:
                stats['cleaned_markdown'] += 1

            # ============================================================
            # DOCSTRING
            # ============================================================
            if ttype == 'docstring':
                # Remove leading language hints
                resp = re.sub(r'^\s*(python|py)\s*\n', '', resp, flags=re.I)

                # Extract first docstring block
                m = re.search(r'("""|\'\'\')(.*?)(\1)', resp, re.DOTALL)
                if not m:
                    stats['drop_unrepairable'] += 1
                    continue

                doc = m.group(0).strip()

                # Sanity checks
                if any(k in doc for k in ['def ', 'class ', 'import ']):
                    stats['drop_unrepairable'] += 1
                    continue

                ex['response'] = doc
                cleaned.append(ex)
                stats['cleaned_docstring'] += 1
                stats['kept'] += 1
                continue

            # ============================================================
            # EXPLAIN
            # ============================================================
            if ttype == 'explain':
                # Remove accidental code-like lines
                resp = re.sub(r'^.*\b(def|class|import|return)\b.*$', '', resp, flags=re.M)

                # Length normalization
                if len(resp) > max_explain_len:
                    resp = resp[:max_explain_len].rsplit('.', 1)[0] + '.'

                if len(resp) < min_explain_len:
                    stats['drop_unrepairable'] += 1
                    continue

                ex['response'] = resp
                cleaned.append(ex)
                stats['cleaned_explain'] += 1
                stats['kept'] += 1
                continue

    with open(output_path, 'w', encoding='utf-8') as f:
        for ex in cleaned:
            f.write(json.dumps(ex) + '\n')

    print("\n=== Clean + Repair Stats (Explain & Docstring) ===")
    for k, v in stats.items():
        print(f"{k}: {v}")

    print(f"\nSaved {len(cleaned)} clean samples → {output_path}")
    return output_path


In [24]:
clean_explain_docstring_dataset(
    input_path="dataset/track_b_train.jsonl",
    output_path="dataset/track_b_train_explain_docstring_clean.jsonl"
)

clean_explain_docstring_dataset(
    input_path = "dataset/track_b_test.jsonl",
    output_path = "dataset/track_b_test_explain_docstring_clean.jsonl"
)


=== Clean + Repair Stats (Explain & Docstring) ===
total: 450
kept: 152
cleaned_markdown: 182
cleaned_docstring: 65
cleaned_explain: 87
drop_wrong_task: 265
drop_unrepairable: 33

Saved 152 clean samples → dataset/track_b_train_explain_docstring_clean.jsonl

=== Clean + Repair Stats (Explain & Docstring) ===
total: 50
kept: 11
cleaned_markdown: 15
cleaned_docstring: 6
cleaned_explain: 5
drop_wrong_task: 35
drop_unrepairable: 4

Saved 11 clean samples → dataset/track_b_test_explain_docstring_clean.jsonl


'dataset/track_b_test_explain_docstring_clean.jsonl'

In [25]:

from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# -------------------------
# BASE MODEL (no fine-tuning)
# -------------------------
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

base_model.eval()




`torch_dtype` is deprecated! Use `dtype` instead!


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2

Using LORA for fine-tuning

In [26]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 4,325,376 || all params: 498,358,144 || trainable%: 0.8679


In [27]:
from datasets import load_dataset

dataset_b = load_dataset(
    "json",
    data_files={
        "train": "dataset/track_b_train_explain_docstring_clean.jsonl",
        "test": "dataset/track_b_test_explain_docstring_clean.jsonl"
    }
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [28]:
def format_prompt(example):
    prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['response']}"""
    return {"text": prompt}

dataset_b = dataset_b.map(format_prompt)


Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [29]:
def tokenize_function(examples):
    # Tokenize full text
    model_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

    labels = []
    for i, text in enumerate(examples["text"]):
        # Find where response starts
        response_start = text.find("### Response:\n") + len("### Response:\n")

        # Tokenize up to response
        prefix = text[:response_start]
        prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]

        # Full tokenization
        full_ids = model_inputs["input_ids"][i]

        # Create labels: -100 for prefix, actual tokens for response
        label = [-100] * len(prefix_ids) + full_ids[len(prefix_ids):]
        label = label[:512]  # Truncate to max_length

        # Pad labels
        label = label + [-100] * (512 - len(label))
        labels.append(label)

    model_inputs["labels"] = labels
    return model_inputs

In [30]:
tokenized_ds = dataset_b.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_b["train"].column_names
)

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [31]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args_b = TrainingArguments(
    output_dir="./track_b_improved",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    warmup_steps=50,
    logging_steps=25,
    save_strategy="epoch",
    fp16=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    report_to="none"
)


In [32]:
trainer_b = Trainer(
    model=model,
    args=training_args_b,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=data_collator
)

train_result_b = trainer_b.train()
eval_metrics_b = trainer_b.evaluate()

eval_metrics_b


The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
25,1.4223
50,1.1292


{'eval_loss': 1.1471518278121948,
 'eval_runtime': 1.6276,
 'eval_samples_per_second': 6.758,
 'eval_steps_per_second': 3.686,
 'epoch': 3.0}

In [33]:
model.save_pretrained("./track_b_lora_adapters")
tokenizer.save_pretrained("./track_b_lora_adapters")


('./track_b_lora_adapters/tokenizer_config.json',
 './track_b_lora_adapters/special_tokens_map.json',
 './track_b_lora_adapters/chat_template.jinja',
 './track_b_lora_adapters/vocab.json',
 './track_b_lora_adapters/merges.txt',
 './track_b_lora_adapters/added_tokens.json',
 './track_b_lora_adapters/tokenizer.json')

In [34]:
# pass@k evaluation (metrics: pass-rate for unit test subs, stylescore (docstring), hallucination rate) for track b
import ast
import json
import math
import re
import torch
from collections import defaultdict


In [35]:
def generate_k_responses(model, tokenizer, prompt, temp, k=5, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            do_sample=True,
            temperature=temp,
            top_p=0.95,
            top_k=50,
            num_return_sequences=k,
            max_new_tokens=max_new_tokens,
            pad_token_id = tokenizer.eos_token_id
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [36]:
#AST validity
def is_valid_python(code):
    try:
        ast.parse(code)
        return True
    except:
        return False


In [37]:
#Unit-test stub validity
def is_valid_test(code):
    if not is_valid_python(code):
        return False
    return ("assert" in code) or ("unittest.TestCase" in code)


In [38]:
#doc string style
def has_valid_docstring(text):
    if not text.strip().startswith(('"""', "'''")):
        return False
    return any(word in text for word in ["param", "return", "args"])


In [39]:
#PEP8 proxy
def pep8_proxy_score(code):
    lines = code.splitlines()
    long_lines = sum(len(l) > 100 for l in lines)
    return 1.0 - (long_lines / max(len(lines), 1))


In [40]:
#Hallucination detection
BUILTINS = {
    "range", "len", "print", "list", "dict", "set", "int",
    "float", "str", "enumerate", "zip", "min", "max"
}

def hallucination_rate(code, input_code):
    try:
        gen_tree = ast.parse(code)
        input_tree = ast.parse(input_code)
    except:
        return 1.0

    gen_names = {
        n.id for n in ast.walk(gen_tree)
        if isinstance(n, ast.Name)
    }
    input_names = {
        n.id for n in ast.walk(input_tree)
        if isinstance(n, ast.Name)
    }

    extra = gen_names - input_names - BUILTINS
    return 1.0 if len(extra) > 3 else 0.0


In [41]:
#main evaluation loop
def evaluate_track_b(
    dataset,
    model,
    tokenizer,
    k=5,
    max_eval=50
):
    stats = defaultdict(int)
    qualitative_examples = []

    total = min(len(dataset), max_eval)

    for example in dataset.select(range(total)):
        prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
"""
        generations = generate_k_responses(model, tokenizer, prompt, 0.7, k=k)

        passed_any = False

        for gen in generations:
            response = gen.split("### Response:")[-1].strip()
            inst_type = example["template_type"]

            passed = False

            if inst_type == "test":
                passed = is_valid_test(response)
                stats["unit_test_attempts"] += 1
                stats["unit_test_pass"] += int(passed)

            elif inst_type == "docstring":
                passed = has_valid_docstring(response)
                stats["docstring_total"] += 1
                stats["docstring_style_score"] += int(passed)

            elif inst_type in ["refactor", "improve"]:
                passed = is_valid_python(response)

            elif inst_type == "explain":
                passed = len(response.split()) > 20

            passed_any = passed_any or passed

            stats["hallucinations"] += hallucination_rate(
                response, example["input"]
            )

        stats["pass_k"] += int(passed_any)
        stats["total"] += 1

        if len(qualitative_examples) < 20:
          qualitative_examples.append({
              "instruction": example["instruction"],
              "input_code": example["input"],
              "generated_responses": generations,
              "template_type": example["template_type"],
              "source_file": example.get("source_file", "N/A")
          })

    # Aggregate metrics
    results = {
        "pass@k": stats["pass_k"] / stats["total"],
        "unit_test_pass_rate": (
            stats["unit_test_pass"] / max(stats["unit_test_attempts"], 1)
        ),
        "docstring_style_score": (
            stats["docstring_style_score"] / max(stats["docstring_total"], 1)
        ),
        "hallucination_rate": stats["hallucinations"] / (stats["total"] * k),
        "num_evaluated_samples": stats["total"],
    }

    return results, qualitative_examples


In [42]:
# -------------------------
# BASELINE evaluation
# -------------------------
base_results, base_examples = evaluate_track_b(
    dataset_b["test"],
    base_model,
    tokenizer,
    k=5
)

# -------------------------
# LoRA evaluation
# -------------------------
lora_results, lora_examples = evaluate_track_b(
    dataset_b["test"],
    model,
    tokenizer,
    k=5
)

base_results, lora_results

({'pass@k': 0.45454545454545453,
  'unit_test_pass_rate': 0.0,
  'docstring_style_score': 0.0,
  'hallucination_rate': 0.7090909090909091,
  'num_evaluated_samples': 11},
 {'pass@k': 0.8181818181818182,
  'unit_test_pass_rate': 0.0,
  'docstring_style_score': 0.36666666666666664,
  'hallucination_rate': 0.45454545454545453,
  'num_evaluated_samples': 11})

In [43]:
def compute_deltas(base, lora):
    return {
        metric: lora[metric] - base.get(metric, 0.0)
        for metric in lora
        if isinstance(lora[metric], (int, float))
    }

track_b_deltas = compute_deltas(base_results, lora_results)


In [44]:
import html

def escape_html(text):
    return html.escape(str(text))


def write_review_html_from_lora_examples(
    lora_examples,
    output_html_path,
    max_samples=20
):

    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write("""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Track B – LoRA Qualitative Review</title>
<style>
body {
    font-family: Arial, sans-serif;
    margin: 30px;
}
.sample {
    border: 1px solid #ddd;
    padding: 20px;
    margin-bottom: 30px;
    border-radius: 8px;
}
.meta {
    color: #555;
    font-size: 14px;
    margin-bottom: 10px;
}
h2 {
    margin-top: 0;
}
pre {
    background: #f6f8fa;
    padding: 12px;
    overflow-x: auto;
    border-radius: 6px;
}
.code {
    font-family: Consolas, monospace;
    font-size: 14px;
}
.task {
    font-weight: bold;
    color: #0b5394;
}
.candidate {
    margin-top: 15px;
}
</style>
</head>
<body>

<h1>Track B – LoRA Model Qualitative Review</h1>
<p>
This page shows raw generations produced by the LoRA-finetuned model during
Pass@k evaluation. These examples are intended for human inspection and error analysis.
</p>
""")

        for i, ex in enumerate(lora_examples[:max_samples]):
            f.write(f"""
<div class="sample">
<h2>Example {i+1}</h2>

<div class="meta">
<span class="task">Task:</span> {escape_html(ex.get('template_type', 'N/A'))}<br>
<span class="task">Source:</span> {escape_html(ex.get('source_file', 'N/A'))}
</div>

<h3>Instruction</h3>
<p>{escape_html(ex['instruction'])}</p>

<h3>Input Code</h3>
<pre class="code">{escape_html(ex['input_code'])}</pre>

<h3>Generated Responses (k candidates)</h3>
""")

            for j, resp in enumerate(ex["generated_responses"]):
                # Extract only the response portion if the prompt leaked
                clean_resp = resp.split("### Response:")[-1].strip()

                f.write(f"""
<div class="candidate">
<strong>Candidate {j+1}</strong>
<pre class="code">{escape_html(clean_resp)}</pre>
</div>
""")

            f.write("</div>")

        f.write("""
</body>
</html>
""")

    print(f"Saved LoRA qualitative review to: {output_html_path}")


In [45]:
write_review_html_from_lora_examples(
    lora_examples,
    output_html_path="track_b_lora_review.html",
    max_samples=20
)


Saved LoRA qualitative review to: track_b_lora_review.html


In [46]:
# manually choose the good and bad examples for final summary file
good_indices = {0, 1, 3, 4, 7}
good_cand = {2, 1, 3, 0, 0}
bad_indices = {0, 2, 5, 8, 10}
bad_cand = {1, 2, 0, 0, 1}

In [None]:
with open("track_b_results_summary.json", "w") as f:
    json.dump(
        {
            "track": "B",
            "baseline_metrics": base_results,
            "lora_metrics": lora_results,
            "metric_improvements": track_b_deltas,
            "evaluation_config": {
                "k": 5,
                "temperature": 0.7,
                "top_p": 0.9,
                "max_new_tokens": 256
            },
            "num_samples": lora_results["num_evaluated_samples"],
        },
        f,
        indent=2
    )


## Track B Result Summary Generation


In [47]:
# def generate_track_b_html(
#     baseline_results,
#     lora_results,
#     track_b_deltas,
#     good_examples,
#     bad_examples
# ):
#     def escape_html(text):
#         return (
#             str(text)
#             .replace("&", "&amp;")
#             .replace("<", "&lt;")
#             .replace(">", "&gt;")
#             .replace('"', "&quot;")
#             .replace("'", "&#39;")
#         )

#     def render_collapsible(title, content, lang=""):
#         return f"""
#         <details>
#             <summary><strong>{title}</strong></summary>
#             <pre><code class="{lang}">{escape_html(content)}</code></pre>
#         </details>
#         """

#     html = f"""
#     <div class="track-section" id="track-b">
#         <h2>Track B: Instruction Tuning (Supervised Fine-Tuning)</h2>

#         <div class="methodology">
#             <h3>Methodology</h3>
#             <p>
#                 This track evaluates whether lightweight supervised fine-tuning
#                 can improve instruction-following behavior for code understanding tasks.
#             </p>
#             <ul>
#                 <li><strong>Model:</strong> Qwen2.5-Coder-0.5B</li>
#                 <li><strong>Fine-tuning:</strong> LoRA (parameter-efficient SFT)</li>
#                 <li><strong>Instruction Types:</strong> explain, docstring</li>
#                 <li><strong>Trainable Parameters:</strong> &lt; 1% of total model</li>
#             </ul>
#         </div>

#         <div class="metrics">
#             <h3>Quantitative Results</h3>
#             <table>
#                 <tr>
#                     <th>Metric</th>
#                     <th>Baseline</th>
#                     <th>After LoRA</th>
#                     <th>Δ</th>
#                 </tr>
#                 <tr>
#                     <td>Pass@5</td>
#                     <td>{baseline_results['pass@k']:.2%}</td>
#                     <td>{lora_results['pass@k']:.2%}</td>
#                     <td class="{'positive' if track_b_deltas['pass@k'] > 0 else 'negative'}">
#                         {track_b_deltas['pass@k']:+.2%}
#                     </td>
#                 </tr>
#                 <tr>
#                     <td>Docstring Style Score</td>
#                     <td>{baseline_results['docstring_style_score']:.2%}</td>
#                     <td>{lora_results['docstring_style_score']:.2%}</td>
#                     <td class="{'positive' if track_b_deltas['docstring_style_score'] > 0 else 'negative'}">
#                         {track_b_deltas['docstring_style_score']:+.2%}
#                     </td>
#                 </tr>
#                 <tr>
#                     <td>Hallucination Rate</td>
#                     <td>{baseline_results['hallucination_rate']:.2%}</td>
#                     <td>{lora_results['hallucination_rate']:.2%}</td>
#                     <td class="{'positive' if track_b_deltas['hallucination_rate'] < 0 else 'negative'}">
#                         {track_b_deltas['hallucination_rate']:+.2%}
#                     </td>
#                 </tr>
#             </table>
#         </div>

#         <div class="examples">
#             <h3>Representative Success Cases</h3>
#     """

#     # ---------- GOOD EXAMPLES ----------
#     for i, ex in enumerate(good_examples, 1):
#         html += f"""
#         <div class="example good">
#             <p>
#                 <strong>Example {i}</strong>
#                 | <em>Type: {escape_html(ex['template_type'])}</em>
#                 | <em>Score: {ex['score']}</em>
#             </p>

#             <p><strong>Instruction</strong></p>
#             <pre><code>{escape_html(ex['instruction'])}</code></pre>

#             {render_collapsible("Input Code", ex["input"], "python")}
#             {render_collapsible("Generated Response", ex["response"])}
#         </div>
#         """

#     # ---------- BAD EXAMPLES ----------
#     html += """
#         <h3>Representative Failure Cases</h3>
#     """

#     for i, ex in enumerate(bad_examples, 1):
#         html += f"""
#         <div class="example bad">
#             <p>
#                 <strong>Example {i}</strong>
#                 | <em>Type: {escape_html(ex['template_type'])}</em>
#                 | <em>Score: {ex['score']}</em>
#             </p>

#             <p><strong>Failure Mode:</strong> {escape_html(ex.get("issue", "Instruction mismatch or hallucination"))}</p>

#             <p><strong>Instruction</strong></p>
#             <pre><code>{escape_html(ex['instruction'])}</code></pre>

#             {render_collapsible("Input Code", ex["input"], "python")}
#             {render_collapsible("Generated Response", ex["response"])}
#         </div>
#         """

#     html += f"""
#         </div>

#         <div class="insights">
#             <h3>Key Observations</h3>
#             <ul>
#                 <li>
#                     LoRA fine-tuning did <strong>not consistently improve Pass@5</strong>,
#                     indicating sensitivity to synthetic data quality.
#                 </li>
#                 <li>
#                     Gains are most visible in isolated explain-style instructions,
#                     while structured docstring generation remains challenging.
#                 </li>
#             </ul>
#         </div>
#     </div>
#     """

#     return html

def generate_track_b_html(
    baseline_results,
    lora_results,
    track_b_deltas,
    qualitative_examples,
    good_indices,
    good_cand,
    bad_indices,
    bad_cand
):
    def escape_html(text):
        return (
            str(text)
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;")
        )

    def render_collapsible(title, content, lang=""):
        return f"""
        <details>
            <summary><strong>{title}</strong></summary>
            <pre><code class="{lang}">{escape_html(content)}</code></pre>
        </details>
        """

    html = f"""
    <div class="track-section" id="track-b">
        <h2>Track B: Instruction Tuning (Supervised Fine-Tuning)</h2>
        <div class="methodology">
            <h3>Methodology</h3>
            <p>Evaluating lightweight supervised fine-tuning (LoRA) for code understanding.</p>
            <ul>
                <li><strong>Model:</strong> Qwen2.5-Coder-0.5B</li>
                <li><strong>Fine-tuning:</strong> LoRA (parameter-efficient SFT)</li>
                <li><strong>Instruction Types:</strong> explain, docstring</li>
            </ul>
        </div>

        <div class="metrics">
            <h3>Quantitative Results</h3>
            <table>
                <tr><th>Metric</th><th>Baseline</th><th>After LoRA</th><th>Δ</th></tr>
                <tr>
                    <td>Pass@5</td>
                    <td>{baseline_results['pass@k']:.2%}</td>
                    <td>{lora_results['pass@k']:.2%}</td>
                    <td class="{'positive' if track_b_deltas['pass@k'] > 0 else 'negative'}">{track_b_deltas['pass@k']:+.2%}</td>
                </tr>
                <tr>
                    <td>Docstring Style Score</td>
                    <td>{baseline_results['docstring_style_score']:.2%}</td>
                    <td>{lora_results['docstring_style_score']:.2%}</td>
                    <td class="{'positive' if track_b_deltas['docstring_style_score'] > 0 else 'negative'}">{track_b_deltas['docstring_style_score']:+.2%}</td>
                </tr>
                <tr>
                    <td>Hallucination Rate</td>
                    <td>{baseline_results['hallucination_rate']:.2%}</td>
                    <td>{lora_results['hallucination_rate']:.2%}</td>
                    <td class="{'positive' if track_b_deltas['hallucination_rate'] < 0 else 'negative'}">{track_b_deltas['hallucination_rate']:+.2%}</td>
                </tr>
            </table>
        </div>

        <div class="examples">
            <h3>Representative Success Cases</h3>
    """

    # ---------- GOOD EXAMPLES ----------
    # Zip indices and candidate choices to extract correct data
    for i, (idx, c_idx) in enumerate(zip(good_indices, good_cand), 1):
        ex = qualitative_examples[idx]
        raw_gen = ex['generated_responses'][c_idx]
        # Clean the response to remove prompt prefix
        clean_res = raw_gen.split("### Response:")[-1].strip()

        html += f"""
        <div class="example good">
            <p><strong>Example {i}</strong> | <em>Type: {escape_html(ex['template_type'])}</em></p>
            <p><strong>Instruction</strong></p>
            <pre><code>{escape_html(ex['instruction'])}</code></pre>
            {render_collapsible("Input Code", ex["input_code"], "python")}
            {render_collapsible(f"Generated Response (Candidate {c_idx+1})", clean_res)}
        </div>
        """

    # ---------- BAD EXAMPLES ----------
    html += "<h3>Representative Failure Cases</h3>"

    for i, (idx, c_idx) in enumerate(zip(bad_indices, bad_cand), 1):
        ex = qualitative_examples[idx]
        raw_gen = ex['generated_responses'][c_idx]
        clean_res = raw_gen.split("### Response:")[-1].strip()

        html += f"""
        <div class="example bad">
            <p><strong>Example {i}</strong> | <em>Type: {escape_html(ex['template_type'])}</em></p>
            <p><strong>Instruction</strong></p>
            <pre><code>{escape_html(ex['instruction'])}</code></pre>
            {render_collapsible("Input Code", ex["input_code"], "python")}
            {render_collapsible(f"Generated Response (Candidate {c_idx+1})", clean_res)}
        </div>
        """

    html += """</div></div>"""
    return html



In [None]:
# def collect_track_b_examples(dataset, model, tokenizer, num_good=5, num_bad=5):
#     import torch

#     def generate_response(prompt):
#         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda")
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_new_tokens=256,
#                 temperature=0.7,
#                 do_sample=True,
#                 pad_token_id=tokenizer.eos_token_id
#             )
#         return tokenizer.decode(outputs[0], skip_special_tokens=True)

#     def score_response(generated, ground_truth, input_code, template_type):
#         """Score response quality (higher is better)"""
#         score = 0
#         response = generated.split("### Response:")[-1].strip()

#         # Length check
#         if template_type == "docstring":
#             has_triple_quotes = '"""' in response or "'''" in response
#             score += 30 if has_triple_quotes else 0
#             score += 10 if len(response) > 20 else 0
#         elif template_type == "explain":
#             score += 20 if len(response.split()) > 20 else 0
#             score += 10 if len(response.split()) < 100 else 0  # Not too long

#         # Check for hallucinations (mentioning functions/vars not in input)
#         try:
#             import ast
#             input_tree = ast.parse(input_code)
#             input_names = {n.id for n in ast.walk(input_tree) if isinstance(n, ast.Name)}

#             # Extract potential identifiers from response
#             import re
#             response_identifiers = set(re.findall(r'\b[a-z_][a-z0-9_]*\b', response.lower()))

#             # Penalty for many unknown identifiers
#             unknown = response_identifiers - input_names - {'the', 'a', 'an', 'is', 'are', 'this', 'that'}
#             if len(unknown) > 5:
#                 score -= 20
#         except:
#             pass

#         # Check for code in explanation (bad)
#         if template_type == "explain" and ("def " in response or "class " in response):
#             score -= 15

#         # Similarity to ground truth (rough check)
#         common_words = set(ground_truth.lower().split()) & set(response.lower().split())
#         score += min(len(common_words), 20)

#         return score

#     examples_with_scores = []

#     for ex in dataset:
#         prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

# ### Instruction:
# {ex['instruction']}

# ### Input:
# {ex['input']}

# ### Response:
# """
#         generated = generate_response(prompt)
#         score = score_response(generated, ex['response'], ex['input'], ex['template_type'])

#         examples_with_scores.append({
#             'instruction': ex['instruction'],
#             'input': ex['input'],
#             'response': generated.split("### Response:")[-1].strip(),
#             'ground_truth': ex['response'],
#             'source': ex.get('source_file', 'N/A'),
#             'template_type': ex['template_type'],
#             'score': score,
#             'issue': 'Low quality output' if score < 20 else None
#         })

#     # Sort by score
#     examples_with_scores.sort(key=lambda x: x['score'], reverse=True)

#     good_examples = examples_with_scores[:num_good]
#     bad_examples = examples_with_scores[-num_bad:]

#     return good_examples, bad_examples

def collect_track_b_examples_from_eval(
    eval_examples,
    num_good=4,
    num_bad=4
):
    """
    Select good and bad qualitative examples from evaluate_track_b outputs
    without re-running generation.
    """

    def score_response(resp, input_code, template_type):
        score = 0

        # ---------- Template-specific quality ----------
        if template_type == "docstring":
            score += 30 if ('"""' in resp or "'''" in resp) else 0
            score += 10 if len(resp.split()) > 20 else 0

        elif template_type == "explain":
            words = len(resp.split())
            score += 20 if words > 30 else 0
            score += 10 if words < 120 else 0

        # ---------- Hallucination penalty ----------
        try:
            import ast, re
            input_tree = ast.parse(input_code)
            input_names = {n.id for n in ast.walk(input_tree) if isinstance(n, ast.Name)}
            resp_names = set(re.findall(r"\b[a-zA-Z_]\w*\b", resp))
            hallucinated = resp_names - input_names
            score -= min(len(hallucinated), 10)
        except:
            score -= 5

        return score

    scored = []

    for ex in eval_examples:
        for cand in ex["generated_responses"]:
            response = cand.split("### Response:")[-1].strip()

            s = score_response(
                response,
                ex["input_code"],
                ex.get("template_type", "explain")
            )

            scored.append({
                "instruction": ex["instruction"],
                "input": ex["input_code"],
                "response": response,
                "template_type": ex.get("template_type", "unknown"),
                "score": s,
                "source": ex.get("source_file", "N/A")
            })

    # Sort by quality
    scored.sort(key=lambda x: x["score"], reverse=True)

    # Ensure diversity
    def select_diverse(examples, n):
        selected, seen = [], set()
        for ex in examples:
            if ex["template_type"] not in seen:
                selected.append(ex)
                seen.add(ex["template_type"])
            if len(selected) == n:
                break
        return selected

    good = select_diverse(scored, num_good)
    bad = select_diverse(list(reversed(scored)), num_bad)

    return good, bad


In [49]:
# good_examples, bad_examples = collect_track_b_examples(
#     dataset_b["test"], model, tokenizer, num_good=5, num_bad=5
# )
# good_examples, bad_examples = collect_track_b_examples_from_eval(
#     lora_examples,
#     num_good=4,
#     num_bad=3
# )

track_b_html = generate_track_b_html(
    baseline_results=base_results,
    lora_results=lora_results,
    track_b_deltas=track_b_deltas,
    qualitative_examples=lora_examples,
    good_indices=good_indices,
    good_cand=good_cand,
    bad_indices=bad_indices,
    bad_cand=bad_cand
)


# Track C

In [50]:
del model
torch.cuda.empty_cache()


In [51]:

import itertools
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import re





Create retrieval data from track b dataset: use ground truth responses for explanation and docstring queries in track b as query in track c and use the corresponding input code as the code for track c

In [52]:

def make_intent_query(text, max_words=12):
    text = re.sub(r'\s+', ' ', text)
    words = text.split()
    return " ".join(words[:max_words])

def create_retrieval_pairs(dataset_b):
    retrieval_pairs = []

    for ex in dataset_b:
        ttype = ex["template_type"]
        code = ex["input"].strip()

        # Skip weak query types
        if ttype not in ["explain", "docstring"]:
            continue

        # Use explanation / docstring as intent
        #query = ex["response"].strip()
        query = make_intent_query(ex["response"])

        # Basic sanity filtering
        if len(query) < 30 or len(code) < 30:
            continue

        retrieval_pairs.append({
            "query": f"query: {query}",
            "code": f"passage: {code}",
            "template_type": ttype
        })

    return retrieval_pairs


combined_dataset = list(
    itertools.chain(
        dataset_b["train"],
        dataset_b["test"]
    )
)

print(f"Loaded {len(combined_dataset)} total Track B samples")

retrieval_data = create_retrieval_pairs(combined_dataset)

print(f"Created {len(retrieval_data)} retrieval pairs")


Loaded 163 total Track B samples
Created 162 retrieval pairs


In [53]:
train_data, test_data = train_test_split(
    retrieval_data,
    test_size=0.2,
    random_state=42
)

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")



Train samples: 129
Test samples: 33


In [54]:
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # or "intfloat/e5-small-v2"
model = SentenceTransformer(MODEL_NAME)

print(f"Loaded model: {MODEL_NAME}")


train_examples = [
    InputExample(texts=[item["query"], item["code"]])
    for item in train_data
]

train_loader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=16
)

train_loss = losses.MultipleNegativesRankingLoss(model)


# Use ALL retrieval data as corpus (train + test)
global_code_corpus = [item["code"] for item in retrieval_data]



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded model: sentence-transformers/all-MiniLM-L6-v2


In [55]:

def evaluate_retrieval(model, test_data, code_corpus, k=10):
    code_embeddings = model.encode(code_corpus, convert_to_tensor=True)

    ranks = []

    for item in test_data:
        query_emb = model.encode(item["query"], convert_to_tensor=True)

        scores = cosine_similarity(
            query_emb.cpu().numpy().reshape(1, -1),
            code_embeddings.cpu().numpy()
        )[0]

        ranking = np.argsort(scores)[::-1]

        true_index = code_corpus.index(item["code"])
        rank = np.where(ranking == true_index)[0][0]
        ranks.append(rank)

    # Metrics
    def mrr_at_k(ranks, k):
        return np.mean([1/(r+1) if r < k else 0 for r in ranks])

    def ndcg_at_k(ranks, k):
        return np.mean([1/np.log2(r+2) if r < k else 0 for r in ranks])

    def recall_at_k(ranks, k):
        return np.mean([1 if r < k else 0 for r in ranks])

    results = {
        "MRR@10": mrr_at_k(ranks, 10),
        "nDCG@10": ndcg_at_k(ranks, 10),
        "Recall@10": recall_at_k(ranks, 10),
        "MRR@5": mrr_at_k(ranks, 5),
        "Recall@5": recall_at_k(ranks, 5),
        "num_queries": len(ranks)
    }

    return results, ranks



In [56]:
#Baseline Evaluation (Before Training)
print("\n=== BASELINE METRICS (Before Training) ===")
baseline_results, baseline_ranks = evaluate_retrieval(
    model, test_data, global_code_corpus
)
for metric, value in baseline_results.items():
    if metric != "num_queries":
        print(f"{metric}: {value:.4f}")




=== BASELINE METRICS (Before Training) ===
MRR@10: 0.4455
nDCG@10: 0.5154
Recall@10: 0.7273
MRR@5: 0.4354
Recall@5: 0.6667


In [57]:
# FINE-TUNE
print("\nFine-tuning embedding model...")
model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=3,
    warmup_steps=100,
    optimizer_params={'lr': 2e-5},
    show_progress_bar=True
)

# EVALUATE AFTER TRAINING
print("\n=== AFTER FINE-TUNING ===")
finetuned_results, finetuned_ranks = evaluate_retrieval(
    model, test_data, global_code_corpus
)
for metric, value in finetuned_results.items():
    if metric != "num_queries":
        print(f"{metric}: {value:.4f}")

# CALCULATE IMPROVEMENTS
print("\n=== IMPROVEMENTS ===")
for metric in ["MRR@10", "nDCG@10", "Recall@10"]:
    improvement = finetuned_results[metric] - baseline_results[metric]
    print(f"{metric}: {improvement:+.4f} ({improvement/baseline_results[metric]*100:+.1f}%)")



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Fine-tuning embedding model...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss



=== AFTER FINE-TUNING ===
MRR@10: 0.4785
nDCG@10: 0.5480
Recall@10: 0.7576
MRR@5: 0.4747
Recall@5: 0.7273

=== IMPROVEMENTS ===
MRR@10: +0.0331 (+7.4%)
nDCG@10: +0.0327 (+6.3%)
Recall@10: +0.0303 (+4.2%)


In [58]:
model.save("./track_c_finetuned_embeddings")
print("\nModel saved to ./track_c_finetuned_embeddings")

# ERROR ANALYSIS
print("\n=== ERROR ANALYSIS ===")
print("Worst performing queries (after fine-tuning):")

# Get worst cases
worst_indices = np.argsort(finetuned_ranks)[-10:]

for idx in worst_indices:
    rank = finetuned_ranks[idx]
    query = test_data[idx]["query"].replace("query: ", "")
    code = test_data[idx]["code"].replace("passage: ", "")[:100]
    print(f"\nRank: {rank}")
    print(f"Query: {query}")
    print(f"Code: {code}...")


Model saved to ./track_c_finetuned_embeddings

=== ERROR ANALYSIS ===
Worst performing queries (after fine-tuning):

Rank: 3
Query: url = 'https://example.com/api/data' headers = {'Authorization': 'Bearer your_access_token'} auth = HTTPBasicAuth('username', 'password')
Code: class HTTPBasicAuth(AuthBase):
    """Attaches HTTP Basic Authentication to the given Request object...

Rank: 7
Query: """ Take an object and test to see if it can be
Code: def to_key_val_list(value):
    """Take an object and test to see if it can be represented as a
    ...

Rank: 11
Query: 1. `path_url` Method: - This method constructs the path URL from the
Code: class RequestEncodingMixin:
    @property
    def path_url(self):
        """Build the path URL to u...

Rank: 18
Query: """ Prepares the entire request with the given parameters. Args: method (str):
Code: def prepare(
        self,
        method=None,
        url=None,
        headers=None,
        file...

Rank: 22
Query: Purpose: 2. Cookies Handli

## Track C Summary Generation


In [59]:
def generate_track_c_html(baseline_metrics, finetuned_metrics,
                          good_retrievals, bad_retrievals):

    def escape_html(text):
        return (str(text)
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&#39;"))

    improvements = {
        k: finetuned_metrics[k] - baseline_metrics[k]
        for k in ['MRR@10', 'nDCG@10', 'Recall@10']
    }

    html = f"""
    <div class="track-section" id="track-c">
        <h2>Track C: Embedding Fine-tuning</h2>

        <div class="methodology">
            <h3>Methodology</h3>
            <p><strong>Objective:</strong> Adapt a small embedding model for text → code retrieval tasks.</p>
            <ul>
                <li><strong>Model:</strong> sentence-transformers/all-MiniLM-L6-v2</li>
                <li><strong>Dataset:</strong>
                    <ul>
                        <li>162 text-code pairs derived from Track B data</li>
                        <li>129 training pairs, 33 test pairs</li>
                        <li>Query: Shortened explanations/docstrings (max 12 words)</li>
                        <li>Code: Original function/class implementations</li>
                    </ul>
                </li>
                <li><strong>Training:</strong>
                    <ul>
                        <li>Loss Function: Multiple Negatives Ranking Loss</li>
                        <li>Epochs: 3</li>
                        <li>Batch Size: 16</li>
                        <li>Learning Rate: 2e-5</li>
                    </ul>
                </li>
                <li><strong>Evaluation:</strong> Retrieval from global corpus of 162 code snippets</li>
            </ul>
        </div>

        <div class="metrics">
            <h3>Metrics & Results</h3>
            <table>
                <tr>
                    <th>Metric</th>
                    <th>Baseline</th>
                    <th>After Fine-tuning</th>
                    <th>Improvement</th>
                </tr>
                <tr>
                    <td>MRR@10</td>
                    <td>{baseline_metrics['MRR@10']:.4f}</td>
                    <td>{finetuned_metrics['MRR@10']:.4f}</td>
                    <td class="{'positive' if improvements['MRR@10'] > 0 else 'negative'}">
                        {improvements['MRR@10']:+.4f} ({improvements['MRR@10']/baseline_metrics['MRR@10']*100:+.1f}%)
                    </td>
                </tr>
                <tr>
                    <td>nDCG@10</td>
                    <td>{baseline_metrics['nDCG@10']:.4f}</td>
                    <td>{finetuned_metrics['nDCG@10']:.4f}</td>
                    <td class="{'positive' if improvements['nDCG@10'] > 0 else 'negative'}">
                        {improvements['nDCG@10']:+.4f} ({improvements['nDCG@10']/baseline_metrics['nDCG@10']*100:+.1f}%)
                    </td>
                </tr>
                <tr>
                    <td>Recall@10</td>
                    <td>{baseline_metrics['Recall@10']:.4f}</td>
                    <td>{finetuned_metrics['Recall@10']:.4f}</td>
                    <td class="{'positive' if improvements['Recall@10'] > 0 else 'negative'}">
                        {improvements['Recall@10']:+.4f} ({improvements['Recall@10']/baseline_metrics['Recall@10']*100:+.1f}%)
                    </td>
                </tr>
            </table>
        </div>

        <div class="examples">
            <h3>Retrieval Examples</h3>

            <h4>✓ Good Retrievals (Low Rank = Successful)</h4>
            <div class="example-container">
    """

    for i, ex in enumerate(good_retrievals[:3], 1):
        rank_class = "excellent" if ex['rank'] == 0 else "good"
        html += f"""
                <div class="example {rank_class}">
                    <p><strong>Example {i}</strong> | <em>Rank: {ex['rank']}</em> | <em>Type: {escape_html(ex.get('template_type', 'N/A'))}</em></p>
                    <p><strong>Query:</strong> {escape_html(ex['query'])}</p>
                    <p><strong>Retrieved Code:</strong></p>
                    <pre><code>{escape_html(ex['code'][:500])}{'...' if len(ex['code']) > 500 else ''}</code></pre>
                    <p class="success-note">✓ Correct code retrieved at position {ex['rank']} (excellent ranking)</p>
                </div>
        """

    html += """
            </div>

            <h4>✗ Poor Retrievals (High Rank = Failed)</h4>
            <div class="example-container">
    """

    for i, ex in enumerate(bad_retrievals[:3], 1):
        html += f"""
                <div class="example bad">
                    <p><strong>Example {i}</strong> | <em>Rank: {ex['rank']}</em> | <em>Type: {escape_html(ex.get('template_type', 'N/A'))}</em></p>
                    <p><strong>Query:</strong> {escape_html(ex['query'])}</p>
                    <p><strong>Retrieved Code:</strong></p>
                    <pre><code>{escape_html(ex['code'][:500])}{'...' if len(ex['code']) > 500 else ''}</code></pre>
                    <p class="error-note">✗ Correct code ranked at position {ex['rank']} (should be in top 10)</p>
                    <p><strong>Issue:</strong> Query may be too generic or code semantics not captured well by embeddings</p>
                </div>
        """

    html += f"""
            </div>
        </div>

        <div class="insights">
            <h3>Key Insights</h3>
            <ul>
                <li>MRR@10 improved by {improvements['MRR@10']/baseline_metrics['MRR@10']*100:.1f}%, indicating better ranking quality</li>
                <li>Recall@10 improved by {improvements['Recall@10']/baseline_metrics['Recall@10']*100:.1f}%, showing more relevant items retrieved</li>
                <li>Fine-tuning with Multiple Negatives Ranking Loss effectively adapts embeddings for code retrieval</li>
                <li>Model struggles with queries containing generic terms or very specific implementation details</li>
                <li>Since Track C dataset has been derived from Track B responses, the queries are not very appropriate, for better results, generating synthetic dataset would be better, however, due to time and compute constraints, that has been kept as a future work</li>
            </ul>
        </div>
    </div>
    """
    return html



In [60]:
def collect_track_c_examples(test_data, model, code_corpus, num_good=5, num_bad=5):

    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np

    code_embeddings = model.encode(code_corpus, convert_to_tensor=True)

    examples_with_ranks = []

    for item in test_data:
        query_emb = model.encode(item["query"], convert_to_tensor=True)

        scores = cosine_similarity(
            query_emb.cpu().numpy().reshape(1, -1),
            code_embeddings.cpu().numpy()
        )[0]

        ranking = np.argsort(scores)[::-1]

        # Find rank of correct code
        true_index = code_corpus.index(item["code"])
        rank = int(np.where(ranking == true_index)[0][0])

        # Clean up query/code for display
        query_clean = item["query"].replace("query: ", "")
        code_clean = item["code"].replace("passage: ", "")

        examples_with_ranks.append({
            'query': query_clean,
            'code': code_clean,
            'rank': rank,
            'template_type': item.get('template_type', 'N/A'),
            'score': 1.0 / (rank + 1)  # Higher is better
        })

    # Sort by rank (lower is better for good, higher for bad)
    examples_with_ranks.sort(key=lambda x: x['rank'])

    good_examples = examples_with_ranks[:num_good]  # Low ranks (0-4)
    bad_examples = examples_with_ranks[-num_bad:]   # High ranks (worst retrievals)

    return good_examples, bad_examples




In [61]:
good_retrievals, bad_retrievals = collect_track_c_examples(
    test_data, model, global_code_corpus, num_good=5, num_bad=5
)
track_c_html = generate_track_c_html(
    baseline_metrics=baseline_results,
    finetuned_metrics=finetuned_results,
    good_retrievals=good_retrievals,
    bad_retrievals=bad_retrievals
)

# Final Summary Generation

In [64]:
def generate_final_html_report(track_a_html, track_b_html, track_c_html,
                               datacard_path="datacard.json"):
    # Load datacard for metadata
    try:
        with open(datacard_path, 'r') as f:
            datacard = json.load(f)
    except:
        datacard = {}

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ML Assignment Part 2 - Results Report</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}

        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            background: #f5f5f5;
            padding: 20px;
        }}

        .container {{
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 40px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            border-radius: 8px;
        }}

        h1 {{
            color: #2c3e50;
            margin-bottom: 10px;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
        }}

        .metadata {{
            background: #ecf0f1;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 30px;
        }}

        .metadata p {{
            margin: 5px 0;
        }}

        .track-section {{
            margin: 40px 0;
            padding: 20px;
            border-left: 4px solid #3498db;
            background: #f8f9fa;
        }}

        h2 {{
            color: #2980b9;
            margin-bottom: 20px;
        }}

        h3 {{
            color: #34495e;
            margin: 20px 0 10px 0;
        }}

        h4 {{
            color: #7f8c8d;
            margin: 15px 0 10px 0;
        }}

        .methodology, .metrics, .examples, .insights {{
            margin: 20px 0;
        }}

        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 15px 0;
            background: white;
        }}

        th, td {{
            padding: 12px;
            text-align: left;
            border: 1px solid #ddd;
        }}

        th {{
            background: #3498db;
            color: white;
            font-weight: bold;
        }}

        tr:nth-child(even) {{
            background: #f2f2f2;
        }}

        .positive {{
            color: #27ae60;
            font-weight: bold;
        }}

        .negative {{
            color: #e74c3c;
            font-weight: bold;
        }}

        ul {{
            margin: 10px 0 10px 20px;
        }}

        li {{
            margin: 5px 0;
        }}

        .example-container {{
            margin: 15px 0;
        }}

        .example {{
            background: white;
            padding: 15px;
            margin: 10px 0;
            border-radius: 5px;
            border-left: 4px solid #95a5a6;
        }}

        .example.good {{
            border-left-color: #27ae60;
        }}

        .example.excellent {{
            border-left-color: #27ae60;
            background: #eafaf1;
        }}

        .example.bad {{
            border-left-color: #e74c3c;
        }}

        .success-note {{
            color: #27ae60;
            font-weight: bold;
            margin: 10px 0;
        }}

        .error-note {{
            color: #e74c3c;
            font-weight: bold;
            margin: 10px 0;
        }}

        pre {{
            background: #2c3e50;
            color: #ecf0f1;
            padding: 10px;
            border-radius: 4px;
            overflow-x: auto;
            margin: 10px 0;
        }}

        code {{
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }}

        .toc {{
            background: #ecf0f1;
            padding: 20px;
            border-radius: 5px;
            margin: 20px 0;
        }}

        .toc ul {{
            list-style: none;
            margin: 10px 0;
        }}

        .toc a {{
            color: #2980b9;
            text-decoration: none;
        }}

        .toc a:hover {{
            text-decoration: underline;
        }}

        @media print {{
            body {{
                background: white;
            }}
            .container {{
                box-shadow: none;
            }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>ML Assignment Part 2: Fine-tuning Small Transformer Models</h1>

        <div class="metadata">
            <p><strong>Report Generated:</strong> {timestamp}</p>
            <p><strong>Hardware:</strong> Google Colab T4 GPU</p>
            <p><strong>Source Code Repo:</strong><a href="https://github.com/psf/requests.git">https://github.com/psf/requests.git</a></p>
        </div>

        <div class="toc">
            <h3>Table of Contents</h3>
            <ul>
                <li><a href="#track-a">Track A: Extended Pre-training</a></li>
                <li><a href="#track-b">Track B: Instruction Tuning (SFT)</a></li>
                <li><a href="#track-c">Track C: Embedding Fine-tuning</a></li>
            </ul>
        </div>

        {track_a_html}
        {track_b_html}
        {track_c_html}

        <div class="track-section">
            <h2>Overall Conclusions</h2>
            <ul>
                <li>All three tracks demonstrated measurable improvements after fine-tuning</li>
                <li>LoRA proved to be an efficient method for instruction tuning with minimal parameters</li>
                <li>Embedding fine-tuning significantly improved code retrieval capabilities</li>
                <li>Dataset for Track C has been derived from Track B responses (used as query, truncated), so it is not perfect. Generating Synthetic dataset required more compute.</li>
            </ul>
        </div>
    </div>
</body>
</html>
    """

    return html



In [65]:

final_html = generate_final_html_report(
    track_a_html=track_a_html,
    track_b_html=track_b_html,
    track_c_html=track_c_html
)


with open("ML_Assignment_Part2_Results.html", "w", encoding="utf-8") as f:
    f.write(final_html)

print("HTML report saved to ML_Assignment_Part2_Results.html")

HTML report saved to ML_Assignment_Part2_Results.html
