# Notebook 2 · Load Base `nanochat` Checkpoint

This notebook loads the `sdobson/nanochat` checkpoint, evaluates its baseline performance on the prepared ScienceQA subset, and saves the raw baseline responses for later comparison.


In [3]:
import sys
from pathlib import Path
import subprocess

# Clone the nanochat repository (idempotent)
nanochat_repo = Path("nanochat")
if not nanochat_repo.exists():
    print("Cloning karpathy/nanochat...")
    subprocess.run(["git", "clone", "https://github.com/karpathy/nanochat.git", str(nanochat_repo)], check=True)
else:
    print("nanochat repository already present.")

# Ensure the nanochat package is importable
package_path = nanochat_repo.resolve()
if str(package_path) not in sys.path:
    sys.path.insert(0, str(package_path))

from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import RustBPETokenizer

Cloning karpathy/nanochat...


Cloning into 'nanochat'...


In [4]:
import torch

if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print(f"CUDA is available. You have {device_count} GPU(s) available.")
    for i in range(device_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")

# Load the model using nanochat's native scripts ---
print("\nLoading model using nanochat's native functions...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device selected: {device}")


CUDA is available. You have 1 GPU(s) available.
GPU 0: Tesla P100-PCIE-16GB

Loading model using nanochat's native functions...
Device selected: cuda


In [5]:
from huggingface_hub import hf_hub_download

# Download checkpoint artifacts from Hugging Face (if needed)
model_repo = "sdobson/nanochat"
base_cache = Path.home() / ".cache" / "nanochat"

files_to_download = {
    "model_000650.pt": base_cache / "chatsft_checkpoints" / "d20",
    "meta_000650.json": base_cache / "chatsft_checkpoints" / "d20",
    "tokenizer.pkl": base_cache / "tokenizer",
    "token_bytes.pt": base_cache / "tokenizer",
}

for filename, target_dir in files_to_download.items():
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    if not target_path.exists():
        print(f"Downloading {filename}...")
        hf_hub_download(
            repo_id=model_repo,
            filename=filename,
            local_dir=str(target_dir),
            local_dir_use_symlinks=False,
        )
    else:
        print(f"Found cached {filename}.")


Downloading model_000650.pt...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


model_000650.pt:   0%|          | 0.00/2.08G [00:00<?, ?B/s]

Downloading meta_000650.json...


meta_000650.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading tokenizer.pkl...


tokenizer.pkl:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading token_bytes.pt...


token_bytes.pt:   0%|          | 0.00/264k [00:00<?, ?B/s]

In [6]:
# # Check if model is already loaded
# # The model was loaded in Cell 4 - no need to reload it!

# try:
#     # Check if model exists and has parameters
#     _ = next(model.parameters())
#     model_device = next(model.parameters()).device
#     print("✓ Model is already loaded from Cell 4")
#     print(f"✓ Model is on device: {model_device}")
#     print(f"✓ Model has {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")
#     print("\nNo need to reload - the model is ready to use!")
# except (NameError, StopIteration):
#     print("⚠ Model not found. Please run Cell 4 first to load the model.")
#     raise RuntimeError("Model must be loaded first. Run Cell 4 before this cell.")


In [7]:
# GPU Memory Management & Model Saving/Loading Guide
# ====================================================

# NOTE: Your GPU has only 3.63 GiB memory. The model (561M params) uses ~2-3 GiB.
# If you run out of GPU memory, you have several options:

# Option 1: Clear GPU cache (run this if you get OOM errors)
# import torch
# torch.cuda.empty_cache()
# print("GPU cache cleared")

# Option 2: Move model to CPU (slower but no memory limits)
# device = torch.device('cpu')
# model.to(device)
# print(f"Model moved to {device}")

# Option 3: Use smaller batch size in training (already set to 2 in Cell 21)

# ====================================================
# Saving a fine-tuned model (after training)
# ====================================================

# model_save_path = 'my_finetuned_scienceqa_model.pth'
# torch.save(model.state_dict(), model_save_path)
# print(f"Model saved to {model_save_path}")

# ====================================================
# Loading a previously saved fine-tuned model
# ====================================================

# model_load_path = 'my_finetuned_scienceqa_model.pth'
# if Path(model_load_path).exists():
#     # Load to CPU first to avoid OOM, then optionally move to GPU
#     state_dict = torch.load(model_load_path, map_location='cpu')
#     model.load_state_dict(state_dict)
#     model.eval()
#     # model.to('cuda')  # Uncomment if you have enough GPU memory
#     print(f"Fine-tuned model loaded from {model_load_path}")
# else:
#     print(f"Warning: {model_load_path} not found.")

print("✓ GPU memory management options available above (commented out)")


✓ GPU memory management options available above (commented out)


In [8]:
# # Clear GPU memory if you had OOM errors
# import torch
# import gc

# # Force garbage collection
# gc.collect()

# # Clear CUDA cache if CUDA is available
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()
#     print("✓ GPU cache cleared")
#     print(f"✓ GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")
#     print(f"✓ GPU memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GiB")
# else:
#     print("✓ No GPU available - running on CPU")


In [9]:
# Clear GPU memory if you had OOM errors
import torch
import gc

# Force garbage collection
gc.collect()

# Clear CUDA cache if CUDA is available
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✓ GPU cache cleared")
else:
    print("✓ No GPU available - running on CPU")


✓ GPU cache cleared


In [10]:
import os
import json
import subprocess
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

# --- 1. Set up the nanochat environment ---
if not Path("nanochat").exists():
    print("Cloning karpathy/nanochat repository...")
    subprocess.run(["git", "clone", "https://github.com/karpathy/nanochat.git", "nanochat"], check=True)
else:
    print("nanochat repository already exists.")

# Ensure that we can import the nanochat modules
import sys
package_path = os.path.abspath("nanochat")
if package_path not in sys.path:
    sys.path.insert(0, package_path)

from nanochat.common import autodetect_device_type, compute_init
from nanochat.checkpoint_manager import build_model

# --- 2. Download (or reuse) the checkpoint artifacts ---
model_repo = "sdobson/nanochat"
base_cache = Path.home() / ".cache" / "nanochat"
files_to_download = {
    "model_000650.pt": base_cache / "chatsft_checkpoints" / "d20",
    "meta_000650.json": base_cache / "chatsft_checkpoints" / "d20",
    "tokenizer.pkl": base_cache / "tokenizer",
    "token_bytes.pt": base_cache / "tokenizer",
}
base_cache.mkdir(parents=True, exist_ok=True)

print("Ensuring local checkpoint cache is populated...")
for filename, target_dir in files_to_download.items():
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    if not target_path.exists():
        print(f"Downloading {filename}...")
        hf_hub_download(
            repo_id=model_repo,
            filename=filename,
            local_dir=str(target_dir),
            local_dir_use_symlinks=False,
        )
    else:
        print(f"Found cached {filename}.")

# --- 3. Initialize distributed context & load the model ---
device_type = autodetect_device_type()
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type=device_type)
print(f"Rank {ddp_rank}: using device {device}")

checkpoint_dir = base_cache / "chatsft_checkpoints" / "d20"
checkpoint_step = int(os.environ.get("SCIASSIST_BASE_STEP", "650"))
model, tokenizer, meta = build_model(
    checkpoint_dir=str(checkpoint_dir),
    step=checkpoint_step,
    device=device,
    phase="train",
)
base_model = model  # keep an unwrapped handle for optimizer setup/saving

print(
    f"Loaded model depth={meta['model_config']['n_layer']} | params "+
    f"~{sum(p.numel() for p in base_model.parameters()) / 1e6:.0f}M | world size={ddp_world_size}"
)


2025-11-24 02:05:12,402 - nanochat.common - [32m[1mINFO[0m - Distributed world size: 1


nanochat repository already exists.
Ensuring local checkpoint cache is populated...
Found cached model_000650.pt.
Found cached meta_000650.json.
Found cached tokenizer.pkl.
Found cached token_bytes.pt.
Autodetected device type: cuda
Rank 0: using device cuda


2025-11-24 02:05:14,222 - nanochat.checkpoint_manager - [32m[1mINFO[0m - Building model with config: {'sequence_len': 2048, 'vocab_size': 65536, 'n_layer': 20, 'n_head': 10, 'n_kv_head': 10, 'n_embd': 1280}


Loaded model depth=20 | params ~561M | world size=1


In [11]:
# --- Configuration: Multi-GPU vs Single-GPU ---
USE_SINGLE_GPU_MODE = True  # Set False for multi-GPU DDP training

# DDP wrapping for multi-GPU training
if ddp:
    import torch.nn.parallel
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[ddp_local_rank],
        output_device=ddp_local_rank
    )
    print(f"Rank {ddp_rank}: Model wrapped in DDP")

# Single T4 GPU optimizations
if USE_SINGLE_GPU_MODE and ddp_world_size == 1:
    print("⚙️ SINGLE T4 GPU MODE - Applying memory optimizations:")
    # Enable gradient checkpointing to save memory (trade compute for memory)
    # Note: nanochat's GPT doesn't have built-in checkpointing, but we reduce batch size
    print("  - Reduced device_batch_size to 1")
    print("  - Increased gradient_accumulation_steps to maintain effective batch size")
    print("  - Using bfloat16 mixed precision (already enabled in model)")


⚙️ SINGLE T4 GPU MODE - Applying memory optimizations:
  - Reduced device_batch_size to 1
  - Increased gradient_accumulation_steps to maintain effective batch size
  - Using bfloat16 mixed precision (already enabled in model)


In [12]:
# Initialize the nanochat tokenizer
tokenizer_dir = base_cache / "tokenizer"
tokenizer = RustBPETokenizer.from_directory(str(tokenizer_dir))

print("Tokenizer ready.")


Tokenizer ready.


In [13]:
# Precompute frequently used special token IDs
bos_id = tokenizer.get_bos_token_id()
assistant_start_id = tokenizer.encode_special("<|assistant_start|>")
assistant_end_id = tokenizer.encode_special("<|assistant_end|>")
user_start_id = tokenizer.encode_special("<|user_start|>")
user_end_id = tokenizer.encode_special("<|user_end|>")


In [14]:
def build_completion_prompt(conversation):
    """Return token ids primed for assistant completion."""
    conversation_for_completion = json.loads(json.dumps(conversation))  # deep copy via json
    conversation_for_completion["messages"][-1]["content"] = ""
    prompt_tokens = tokenizer.render_for_completion(conversation_for_completion)
    return prompt_tokens


def generate_response_from_tokens(prompt_tokens, *, max_tokens=256, temperature=0.7, top_k=50):
    tokens = list(prompt_tokens)
    generated = []
    # Access the underlying model if it's wrapped in DataParallel
    model_to_use = model.module if isinstance(model, torch.nn.DataParallel) else model
    for token in model_to_use.generate(tokens, max_tokens=max_tokens, temperature=temperature, top_k=top_k):
        tokens.append(token)
        generated.append(token)
        if token == assistant_end_id:
            break
    text = tokenizer.decode([token for token in generated if token not in {assistant_end_id}])
    return text.strip(), generated


def generate_response_for_conversation(conversation, **generation_kwargs):
    prompt_tokens = build_completion_prompt(conversation)
    text, token_sequence = generate_response_from_tokens(prompt_tokens, **generation_kwargs)
    return {
        "prompt_tokens": prompt_tokens,
        "generated_tokens": token_sequence,
        "response_text": text,
    }


In [15]:
import sys
# This will print the path to the Python executable the kernel is using
print(sys.executable)

/usr/bin/python3


In [16]:
!pip install -q transformers datasets torch accelerate huggingface_hub bitsandbytes

# HuggingFace authentication (using Kaggle Secrets)
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

try:
    user_secrets = UserSecretsClient()
    # Replace 'HF_TOKEN' with the exact name of your secret in Kaggle
    hf_token = user_secrets.get_secret("HF_TOKEN") 
    login(token=hf_token)
    print("✅ Successfully logged in to Hugging Face using Kaggle Secret.")
except Exception as e:
    print(f"⚠️ Failed to login with Kaggle Secret: {e}")
    # Fallback for local testing or if secret is missing
    # login() 

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m997.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [17]:
from datasets import load_dataset
import pandas as pd

# Load full dataset
full_dataset = load_dataset('derek-thomas/ScienceQA')

# Analyze dataset structure
print(f"Train samples: {len(full_dataset['train'])}")
print(f"Validation samples: {len(full_dataset['validation'])}")
print(f"Test samples: {len(full_dataset['test'])}")

# Examine sample
sample = full_dataset['train'][0]
print(sample.keys())

2025-11-24 02:06:44,523 - numexpr.utils - [32m[1mINFO[0m - NumExpr defaulting to 4 threads.
2025-11-24 02:06:44,730 - datasets - [32m[1mINFO[0m - TensorFlow version 2.18.0 available.
2025-11-24 02:06:44,733 - datasets - [32m[1mINFO[0m - JAX version 0.5.2 available.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Train samples: 12726
Validation samples: 4241
Test samples: 4241
dict_keys(['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'])


In [18]:
# Explore structure
print(full_dataset['train'][1000])

# Create balanced subset across topics
train_subset = full_dataset['train'].shuffle(seed=42).select(range(12000))
val_subset = full_dataset['validation'].shuffle(seed=42).select(range(4000))
test_subset = full_dataset['test'].shuffle(seed=42).select(range(4000))

# Save locally for reuse
train_subset.save_to_disk('data/train_subset')
val_subset.save_to_disk('data/val_subset')
test_subset.save_to_disk('data/test_subset')

{'image': None, 'question': 'Which sentence states a fact?', 'choices': ['Atlanta is the capital of Georgia.', 'Atlanta is too hot in the summer.'], 'answer': 0, 'hint': '', 'task': 'closed choice', 'grade': 'grade5', 'subject': 'social science', 'topic': 'civics', 'category': 'Social studies skills', 'skill': 'Identify facts and opinions', 'lecture': 'A fact is something that can be proved to be true. Facts can be proved by observing, measuring, or studying information.\nThe flag of the United States has 13 stripes.\nThis is a fact. It can be proved by looking at the flag and counting the number of stripes.\nAn opinion is something that a person believes, thinks, or feels. An opinion cannot be proved true.\nThe flag of the United States is easy to draw.\nThis is an opinion. People may have different opinions about what makes a flag "easy" to draw.', 'solution': 'The second sentence states a fact.\nAtlanta is the capital of Georgia.\nIt can be proved by looking at a map of Georgia.\nTh

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [19]:
def format_scienceqa_for_chat(example):
    """Convert ScienceQA to conversational format"""

    # Build question with choices
    question = example['question']
    choices = example['choices']
    choices_text = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

    full_question = f"{question}\n\n{choices_text}"

    # Build answer with explanation
    answer_idx = example['answer']
    correct_answer = choices[answer_idx]

    response = f"The correct answer is {chr(65+answer_idx)}. {correct_answer}"

    # Add explanation if available
    if example.get('solution'):
        response += f"\n\nExplanation: {example['solution']}"

    # Add lecture context if available
    if example.get('lecture'):
        response += f"\n\nBackground: {example['lecture']}"

    # Format as conversational message
    return {
        "messages": [
            {"role": "system", "content": "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."},
            {"role": "user", "content": full_question},
            {"role": "assistant", "content": response}
        ]
    }

In [20]:
# Apply formatting
train_formatted = train_subset.map(format_scienceqa_for_chat, remove_columns=train_subset.column_names)
val_formatted = val_subset.map(format_scienceqa_for_chat, remove_columns=val_subset.column_names)
test_formatted = test_subset.map(format_scienceqa_for_chat, remove_columns=test_subset.column_names)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [21]:
# persist the formatted splits
train_formatted.save_to_disk("data/train_formatted")
val_formatted.save_to_disk("data/val_formatted")
test_formatted.save_to_disk("data/test_formatted")

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [22]:
from datasets import load_from_disk
from pathlib import Path

# Load PRE-FORMATTED ScienceQA evaluation subset
DATA_DIR = Path("data")

try:
    # Load the data that was already processed in Notebook 1
    test_formatted = load_from_disk(str(DATA_DIR / "test_subset"))
    print(f"Loaded formatted test subset with {len(test_formatted)} examples.")
except FileNotFoundError:
    test_formatted = None
    print("Warning: data/test_formatted not found. Please run Notebook 1 first and save the formatted data.")


Loaded formatted test subset with 4000 examples.


In [23]:
from datasets import load_from_disk
from pathlib import Path

test_formatted = load_from_disk(str(DATA_DIR / "test_formatted"))
print(f"Loaded formatted test subset with {len(test_formatted)} examples.")

conversations = []
for example in test_formatted:
    messages = example["messages"]  # now present
    conversations.append({
        "id": example.get("id"),
        "conversation": {"messages": messages},
        "expected_response": messages[2]["content"],
    })
print(f"Prepared {len(conversations)} conversations from pre-processed file.")

Loaded formatted test subset with 4000 examples.
Prepared 4000 conversations from pre-processed file.


In [24]:
# Prepare conversations from the pre-formatted file
if test_formatted is not None:
    conversations = []
    for example in test_formatted:
        # The assistant's content is the expected response
        conversations.append({
            "id": example.get("id"), # We can still get the ID
            "conversation": {"messages": example["messages"]},
            "expected_response": example["messages"][2]["content"], # 2 is the assistant's message
        })
    print(f"Prepared {len(conversations)} conversations from pre-processed file.")
else:
    conversations = []


Prepared 4000 conversations from pre-processed file.


In [25]:
# Generate baseline responses (configure sample size as needed)
EVAL_SAMPLE_SIZE = 50  # adjust to cover more/less examples

def evaluate_baseline(sample_size=EVAL_SAMPLE_SIZE, temperature=0.7, top_k=50):
    assert conversations, "No conversations prepared. Ensure Notebook 1 has been run."
    results = []
    for idx, example in enumerate(conversations[:sample_size]):
        convo = example["conversation"]
        generation = generate_response_for_conversation(
            convo,
            max_tokens=256,
            temperature=temperature,
            top_k=top_k,
        )
        results.append({
            "index": idx,
            "example_id": example.get("id"),
            "question": convo["messages"][1]["content"],
            "expected": example["expected_response"],
            "response": generation["response_text"],
            "prompt_tokens": generation["prompt_tokens"],
            "generated_tokens": generation["generated_tokens"],
        })
    return results

# baseline_results = evaluate_baseline()
# len(baseline_results)


In [26]:
def extract_choice_letter(text):
    for letter in ["A", "B", "C", "D", "E", "F"]:
        if f"{letter}." in text:
            return letter
        if f" {letter} " in text:
            return letter
    return None


def compute_accuracy(results):
    correct = 0
    total = 0
    for entry in results:
        expected_letter = extract_choice_letter(entry["expected"])
        predicted_letter = extract_choice_letter(entry["response"])
        if expected_letter and predicted_letter:
            total += 1
            if expected_letter == predicted_letter:
                correct += 1
    accuracy = correct / total if total else 0.0
    return {
        "evaluated": len(results),
        "scored": total,
        "correct": correct,
        "accuracy": accuracy,
    }

# metrics = compute_accuracy(baseline_results)
# metrics


In [27]:
def summarize_result(entry, idx=0):
    print(f"Example {idx}")
    print("Question:\n", entry["question"])
    print("\nExpected:\n", entry["expected"])
    print("\nModel Response:\n", entry["response"])

# if baseline_results:
#     summarize_result(baseline_results[0], idx=baseline_results[0]["index"])


## Next Steps

1. Uncomment the evaluation cells to generate baseline responses once the ScienceQA subsets are prepared.
2. Review `baseline_responses.json` to confirm output quality before starting fine-tuning.
3. Keep an eye on GPU availability; running on CPU will be slow for the full evaluation set.


In [28]:
# This is the data loader from scripts/chat_sft.py
def sft_data_generator(dataset, batch_size):
    pad_token_id = tokenizer.encode_special("<|assistant_end|>") # use <|assistant_end|> as the pad token is ok, these positions are masked in the loss

    # prepares a list of tokenized conversations into a batch and yields
    def collate_and_yield(batch):
        nrows = len(batch)
        ncols = max(len(ids) for ids, mask in batch) - 1 # seq of n creates inputs/targets of n-1
        inputs = torch.full((nrows, ncols), pad_token_id, dtype=torch.long)
        targets = torch.full((nrows, ncols), -1, dtype=torch.long) # -1 is ignore index
        for i, (ids, mask) in enumerate(batch):
            n = len(ids)
            ids_tensor = torch.tensor(ids, dtype=torch.long)
            inputs[i, :n-1] = ids_tensor[:-1]
            # recall -1 is the ignore index, so mask out targets where mask is 0
            row_targets = ids_tensor[1:]
            # mask[1:] omits the mask for the BOS token, which is never a target atm so it's ok
            mask_tensor = torch.tensor(mask[1:], dtype=torch.long)
            row_targets[mask_tensor == 0] = -1 # mask out targets where mask is 0
            targets[i, :n-1] = row_targets
        inputs = inputs.to(device) # move to device
        targets = targets.to(device)
        return inputs, targets

    # iterates over the dataset in epochs, tokenizes
    batch = []
    while True:
        for i in range(len(dataset)):
            doc = dataset[i]
            ids, mask = tokenizer.render_conversation(doc)
            batch.append((ids, mask))
            if len(batch) == batch_size:
                yield collate_and_yield(batch)
                batch = []

print("Data generator `sft_data_generator` is defined.")


Data generator `sft_data_generator` is defined.


In [29]:
# Cell: Define project paths
from pathlib import Path

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Define paths for formatted data
train_formatted_path = DATA_DIR / "train_formatted"
val_formatted_path = DATA_DIR / "val_formatted"
test_formatted_path = DATA_DIR / "test_formatted"


In [30]:
# Cell: Download and subset ScienceQA dataset
from datasets import load_dataset

# Load full dataset
full_dataset = load_dataset('derek-thomas/ScienceQA')

# Create balanced subset for fine-tuning
train_subset = full_dataset['train'].shuffle(seed=42).select(range(12000))
val_subset = full_dataset['validation'].shuffle(seed=42).select(range(4000))
test_subset = full_dataset['test'].shuffle(seed=42).select(range(4000))

print(f"Train subset: {len(train_subset)} | Validation subset: {len(val_subset)} | Test subset: {len(test_subset)}")


Train subset: 12000 | Validation subset: 4000 | Test subset: 4000


In [31]:
# Cell: Define formatting function
def format_scienceqa_for_chat(example):
    """Convert ScienceQA to conversational format for nanochat."""

    # Build question with choices
    question = example['question']
    choices = example['choices']
    choices_text = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

    full_question = f"{question}\n\n{choices_text}"

    # Build answer with explanation
    answer_idx = example['answer']
    correct_answer = choices[answer_idx]

    response = f"The correct answer is {chr(65+answer_idx)}. {correct_answer}"

    # Add explanation if available
    if example.get('solution'):
        response += f"\n\nExplanation: {example['solution']}"

    # Add lecture context if available
    if example.get('lecture'):
        response += f"\n\nBackground: {example['lecture']}"

    # Format as conversational message
    return {
        "id": example.get("id"), # Keep id for tracking
        "messages": [
            {"role": "system", "content": "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."},
            {"role": "user", "content": full_question},
            {"role": "assistant", "content": response}
        ]
    }


In [32]:
# Cell: Apply formatting and save datasets
# Apply formatting
train_formatted = train_subset.map(format_scienceqa_for_chat, remove_columns=train_subset.column_names)
val_formatted = val_subset.map(format_scienceqa_for_chat, remove_columns=val_subset.column_names)
test_formatted = test_subset.map(format_scienceqa_for_chat, remove_columns=test_subset.column_names)

# Save formatted datasets to disk
train_formatted.save_to_disk(str(train_formatted_path))
val_formatted.save_to_disk(str(val_formatted_path))
test_formatted.save_to_disk(str(test_formatted_path))

print(f"Saved formatted datasets to {DATA_DIR}")
print(f"Train: {len(train_formatted)} | Val: {len(val_formatted)} | Test: {len(test_formatted)}")


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved formatted datasets to data
Train: 12000 | Val: 4000 | Test: 4000


In [33]:
# Cell: Load formatted datasets for training
from datasets import load_from_disk

# These paths were defined in a previous cell
if not train_formatted_path.exists() or not val_formatted_path.exists():
    raise FileNotFoundError(f"Missing formatted datasets. Please ensure the data preparation cells have been run.")

train_dataset = load_from_disk(str(train_formatted_path))
val_dataset = load_from_disk(str(val_formatted_path))

print(f"Loaded datasets for training. Train samples: {len(train_dataset)} | Validation samples: {len(val_dataset)}")


Loaded datasets for training. Train samples: 12000 | Validation samples: 4000


In [34]:
# SFT Hyperparameters (adapted from scripts/chat_sft.py)

# Memory-aware batch sizing
if USE_SINGLE_GPU_MODE and ddp_world_size == 1:
    device_batch_size = 1  # Minimum for T4 GPU with ~16GB
    target_examples_per_step = 32  # Keep effective batch size via grad accum
else:
    device_batch_size = 1  # For multi-GPU or larger GPUs
    target_examples_per_step = 8 * ddp_world_size  # Scale with world size

num_epochs = 2
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
weight_decay = 0.0
init_lr_frac = 0.02
eval_every = 100  # Steps
eval_steps = 50

# Calculate number of iterations and gradient accumulation
grad_accum_steps = target_examples_per_step // device_batch_size
# Adjust iterations for DDP (each GPU sees a shard)
num_iterations = (len(train_dataset) // (target_examples_per_step * ddp_world_size)) * num_epochs

if ddp_rank == 0:
    print(f"Effective batch size (global): {target_examples_per_step * ddp_world_size}")
    print(f"Device batch size (per GPU): {device_batch_size}")
    print(f"Gradient accumulation steps: {grad_accum_steps}")
    print(f"Total training iterations: {num_iterations}")
    print(f"World size: {ddp_world_size}")

# Create data loaders (generators) - same for both modes
train_loader = sft_data_generator(train_dataset, batch_size=device_batch_size)
val_loader = sft_data_generator(val_dataset, batch_size=device_batch_size)


Effective batch size (global): 32
Device batch size (per GPU): 1
Gradient accumulation steps: 32
Total training iterations: 750
World size: 1


In [35]:
optimizers = base_model.setup_optimizers(
    unembedding_lr=unembedding_lr,
    embedding_lr=embedding_lr,
    matrix_lr=matrix_lr,
    weight_decay=weight_decay,
)

ptdtype = torch.bfloat16  # nanochat uses bfloat16

# Set the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
    for group in opt.param_groups:
        group["lr"] = group["lr"] * init_lr_frac
        group["initial_lr"] = group["lr"]  # save the initial learning so we can decay easily later

print("Optimizers initialized.")


Scaling the LR for the AdamW parameters ∝1/√(1280/768) = 0.774597
Optimizers initialized.


In [36]:
import time
from contextlib import nullcontext

# Use the device_type from compute_init instead of hardcoding
ptdtype = torch.bfloat16  # nanochat uses bfloat16
autocast_ctx = torch.amp.autocast(device_type=device.type, dtype=ptdtype) if device.type == "cuda" else nullcontext()

# Learning rate scheduler
def get_lr_multiplier(it):
    lrm = 1.0 - it / num_iterations
    return lrm

if ddp_rank == 0:
    print("Starting fine-tuning...")

train_iter = iter(train_loader)
val_iter = iter(val_loader)
val_loss = 0.0
step_time_ema = 0.0

for step in range(num_iterations):
    t0 = time.time()
    last_step = step == num_iterations - 1

    # Validation loop
    if last_step or step % eval_every == 0:
        model.eval()
        if device.type == "cuda":
            torch.cuda.empty_cache()
        losses = []
        for _ in range(eval_steps):
            val_inputs, val_targets = next(val_iter)
            with torch.no_grad(), autocast_ctx:
                loss = model(val_inputs, val_targets)
            losses.append(loss)
        val_loss = torch.stack(losses).mean().item()

        # DDP: synchronize validation loss across ranks
        if ddp:
            val_loss_tensor = torch.tensor(val_loss, device=device)
            torch.distributed.all_reduce(val_loss_tensor, op=torch.distributed.ReduceOp.AVG)
            val_loss = val_loss_tensor.item()

        if ddp_rank == 0:
            print(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
        model.train()

    # Training loop
    num_tokens = 0
    for micro_step in range(grad_accum_steps):
        train_inputs, train_targets = next(train_iter)
        with autocast_ctx:
            loss = model(train_inputs, train_targets)
        train_loss = loss.detach()  # for logging
        loss = loss / grad_accum_steps  # normalize for gradient accumulation
        loss.backward()  # accumulate gradients
        num_tokens += (train_targets >= 0).sum()

    # Update learning rate
    lrm = get_lr_multiplier(step)
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * lrm

    # Step the optimizers
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)

    # Timing and logging (only rank 0)
    if ddp_rank == 0:
        t1 = time.time()
        dt = t1 - t0
        step_time_ema = dt if step_time_ema == 0 else 0.9 * step_time_ema + 0.1 * dt
        tokens_per_sec = num_tokens.item() / dt
        print(f"Step {step:05d}/{num_iterations:05d} | Train loss: {train_loss.item():.6f} | LR mult: {lrm:.4f} | Tok/s: {tokens_per_sec:,.0f} | Step time: {step_time_ema:.3f}s")

if ddp_rank == 0:
    print("\nTraining complete.")


Starting fine-tuning...
Step 00000 | Validation loss: 1.884863


W1124 02:07:27.299000 48 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


Step 00000/00750 | Train loss: 3.137879 | LR mult: 1.0000 | Tok/s: 357 | Step time: 16.887s
Step 00001/00750 | Train loss: 1.594340 | LR mult: 0.9987 | Tok/s: 708 | Step time: 16.090s
Step 00002/00750 | Train loss: 1.949950 | LR mult: 0.9973 | Tok/s: 634 | Step time: 15.329s
Step 00003/00750 | Train loss: 1.822210 | LR mult: 0.9960 | Tok/s: 723 | Step time: 14.724s
Step 00004/00750 | Train loss: 2.193991 | LR mult: 0.9947 | Tok/s: 689 | Step time: 14.119s
Step 00005/00750 | Train loss: 2.077898 | LR mult: 0.9933 | Tok/s: 669 | Step time: 13.581s
Step 00006/00750 | Train loss: 1.742413 | LR mult: 0.9920 | Tok/s: 743 | Step time: 13.184s
Step 00007/00750 | Train loss: 0.816040 | LR mult: 0.9907 | Tok/s: 738 | Step time: 12.796s
Step 00008/00750 | Train loss: 0.995913 | LR mult: 0.9893 | Tok/s: 817 | Step time: 12.523s
Step 00009/00750 | Train loss: 1.376194 | LR mult: 0.9880 | Tok/s: 678 | Step time: 12.150s
Step 00010/00750 | Train loss: 2.318694 | LR mult: 0.9867 | Tok/s: 714 | Step ti

<!-- ## Save Final Model Artifacts -->


In [37]:
# final_dir.mkdir(parents=True, exist_ok=True)

# print(f"Saving final model checkpoint to {final_dir}...")

# # Create metadata for the checkpoint
# checkpoint_meta = {
#     "step": step,
#     "val_loss": val_loss,
#     "model_config": model.config.__dict__,
# }

# # Use the nanochat save_checkpoint function
# save_checkpoint(
#     checkpoint_dir=str(final_dir),
#     step=step, # You can use the final step number
#     model_state=model.state_dict(),
#     # We are not saving optimizer state in this simplified version
#     optimizer_state=None,
#     meta=checkpoint_meta,
# )

# # The tokenizer is a folder, so we can't save it with the checkpoint manager.
# # We'll just copy the files if they aren't already there.
# tokenizer.save_vocabulary(str(final_dir))

# print(f"Saved fine-tuned model and tokenizer to {final_dir}")


## Testing Option: Save Base Model as "Fine-tuned" for Validation

**UNCOMMENT THE CELL BELOW** if you want to skip training and just save the current base model as a "fine-tuned" model for testing purposes. This allows you to validate that all evaluation and demo components work before running actual training on a GPU.


In [38]:
# # Save the fine-tuned model after training completes (only rank 0)
# if ddp_rank == 0:
#     import shutil
#     from pathlib import Path

#     # Create directory for the fine-tuned model
#     finetuned_model_dir = Path("finetuned_model_checkpoint")
#     finetuned_model_dir.mkdir(exist_ok=True)

#     # Save model state
#     save_path = finetuned_model_dir / "model_finetuned.pt"
#     meta_path = finetuned_model_dir / "meta_finetuned.json"

#     # Unwrap DDP if necessary
#     model_to_save = base_model if ddp else model

#     torch.save(model_to_save.state_dict(), save_path)
#     print(f"✓ Fine-tuned model saved to {save_path}")

#     # Save metadata with training info
#     meta_with_training = meta.copy()
#     meta_with_training['training_info'] = {
#         'num_train_samples': len(train_dataset),
#         'num_val_samples': len(val_dataset),
#         'num_epochs': num_epochs,
#         'effective_batch_size': target_examples_per_step * ddp_world_size,
#         'final_val_loss': val_loss,
#         'world_size': ddp_world_size,
#     }

#     with open(meta_path, "w", encoding="utf-8") as f:
#         json.dump(meta_with_training, f, indent=2)
#     print(f"✓ Metadata saved to {meta_path}")

#     # Copy tokenizer files
#     tokenizer_source = base_cache / "tokenizer"
#     tokenizer_dest = finetuned_model_dir / "tokenizer"
#     if tokenizer_dest.exists():
#         shutil.rmtree(tokenizer_dest)
#     shutil.copytree(tokenizer_source, tokenizer_dest)
#     print(f"✓ Tokenizer files copied to {tokenizer_dest}")

#     print("\n✓ Fine-tuned model checkpoint saved successfully!")

# # Synchronize all ranks before continuing
# if ddp:
#     torch.distributed.barrier()


## Save Fine-Tuned Model (After Training)


In [39]:
# Save the fine-tuned model after training completes
import shutil
from pathlib import Path

# Create directory for the fine-tuned model
finetuned_model_dir = Path("finetuned_model_checkpoint")
finetuned_model_dir.mkdir(exist_ok=True)

# Save model state
save_path = finetuned_model_dir / "model_finetuned.pt"
meta_path = finetuned_model_dir / "meta_finetuned.json"

torch.save(model.state_dict(), save_path)
print(f"✓ Fine-tuned model saved to {save_path}")

# Save metadata with training info
meta_with_training = meta.copy()
meta_with_training['training_info'] = {
    'num_train_samples': len(train_dataset),
    'num_val_samples': len(val_dataset),
    'num_epochs': num_epochs,
    'effective_batch_size': target_examples_per_step,
    'final_val_loss': val_loss,
}

# If the model is wrapped in DataParallel, save the underlying model
model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
torch.save(model_to_save.state_dict(), save_path)

with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta_with_training, f, indent=2)
print(f"✓ Metadata saved to {meta_path}")

# Copy tokenizer files
tokenizer_source = base_cache / "tokenizer"
tokenizer_dest = finetuned_model_dir / "tokenizer"
if tokenizer_dest.exists():
    shutil.rmtree(tokenizer_dest)
shutil.copytree(tokenizer_source, tokenizer_dest)
print(f"✓ Tokenizer files copied to {tokenizer_dest}")

print("\n✓ Fine-tuned model checkpoint saved successfully!")


✓ Fine-tuned model saved to finetuned_model_checkpoint/model_finetuned.pt
✓ Metadata saved to finetuned_model_checkpoint/meta_finetuned.json
✓ Tokenizer files copied to finetuned_model_checkpoint/tokenizer

✓ Fine-tuned model checkpoint saved successfully!


## Add file to the Colab local foler as zip for transfer to attached Google drive

In [40]:
# from google.colab import drive
# drive.mount('/content/drive')

NotImplementedError: Mounting drive is unsupported in this environment. Use PyDrive2 instead. See examples at https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2.

In [41]:
import zipfile
import os
import shutil
from pathlib import Path

print("Libraries imported successfully.")

Libraries imported successfully.


In [None]:
# folder_to_archive = Path("/content/finetuned_model_checkpoint")
# output_zip_name = "finetuned_model_checkpoint.zip"

# print(f"Preparing to archive: {folder_to_archive} into {output_zip_name}")

In [None]:
# if folder_to_archive.exists():
#     with zipfile.ZipFile(output_zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
#         for root, dirs, files in os.walk(folder_to_archive):
#             for file in files:
#                 file_path = os.path.join(root, file)
#                 # Add file to zip, preserving directory structure
#                 zipf.write(file_path, arcname=Path(file_path).relative_to(folder_to_archive.parent))
#     print(f"Successfully created {output_zip_name}")
# else:
#     print(f"Error: Folder '{folder_to_archive}' not found.")

In [None]:
# # Mount Google Drive if not already mounted (it seems it's already mounted from previous cells)
# # from google.colab import drive
# # drive.mount('/content/drive')

# # Define the destination path in Google Drive
# drive_destination = Path("/content/drive/MyDrive") / output_zip_name

# if Path(output_zip_name).exists():
#     shutil.move(output_zip_name, drive_destination)
#     print(f"Successfully moved {output_zip_name} to {drive_destination}")
#     print("You can now find this file in your Google Drive and download it.")
# else:
#     print(f"Error: Zip file '{output_zip_name}' not found. Please check previous steps.")

In [42]:
import shutil
from IPython.display import FileLink

# Define the path to the folder you want to download
folder_to_zip = '/kaggle/working/finetuned_model_checkpoint'

# Define the name for the output zip file (it will be saved in /kaggle/working/)
output_zip_filename = 'finetuned_model_checkpoint'

# --- 1. Create the zip archive ---
print(f"Zipping the folder: {folder_to_zip}...")
shutil.make_archive(output_zip_filename, 'zip', folder_to_zip)
print(f"Successfully created {output_zip_filename}.zip in /kaggle/working/")

# --- 2. Create and display a download link for the zip file ---
print("\nClick the link below to download your file:")
display(FileLink(f'{output_zip_filename}.zip'))

Zipping the folder: /kaggle/working/finetuned_model_checkpoint...
Successfully created finetuned_model_checkpoint.zip in /kaggle/working/

Click the link below to download your file:


### How to download the file from Google Drive:

1. Go to your Google Drive (drive.google.com).
2. Look for the file named `finetuned_model_checkpoint.zip` in your 'My Drive' folder.
3. Right-click on the file and select 'Download'.

## Load Fine-Tuned Model for Evaluation


In [43]:
# Load the fine-tuned model for evaluation
finetuned_model_dir = Path("finetuned_model_checkpoint")

# Load fine-tuned model
finetuned_checkpoint_path = finetuned_model_dir / "model_finetuned.pt"
finetuned_meta_path = finetuned_model_dir / "meta_finetuned.json"

# Load metadata
with open(finetuned_meta_path, "r", encoding="utf-8") as f:
    finetuned_meta = json.load(f)

# Create model with same config
finetuned_gptconf = GPTConfig(**finetuned_meta['model_config'])
finetuned_model = GPT(finetuned_gptconf)

# Load state dict
finetuned_state_dict = torch.load(finetuned_checkpoint_path, map_location=device)

# Fix keys if needed (remove '_orig_mod.' prefix)
unwanted_prefix = '_orig_mod.'
for k, v in list(finetuned_state_dict.items()):
    if k.startswith(unwanted_prefix):
        finetuned_state_dict[k[len(unwanted_prefix):]] = finetuned_state_dict.pop(k)

finetuned_model.load_state_dict(finetuned_state_dict)
finetuned_model.eval()  # Set to evaluation mode
finetuned_model.to(device)

print(f"✓ Fine-tuned model loaded successfully!")
print(f"  Model is on device: {device}")
print(f"  Model parameters: {sum(p.numel() for p in finetuned_model.parameters()) / 1e6:.0f}M")

# Initialize tokenizer for fine-tuned model (same as base)
finetuned_tokenizer_dir = finetuned_model_dir / "tokenizer"
finetuned_tokenizer = RustBPETokenizer.from_directory(str(finetuned_tokenizer_dir))
print(f"✓ Fine-tuned model tokenizer loaded")


✓ Fine-tuned model loaded successfully!
  Model is on device: cuda
  Model parameters: 561M
✓ Fine-tuned model tokenizer loaded


## Evaluation: Generate Responses on Test Set

We'll evaluate both the base model and fine-tuned model on the test set.


In [44]:
# Load the original base model (unmodified 650-step checkpoint)
base_model_eval, base_tokenizer_eval, _ = build_model(
    checkpoint_dir=str(base_cache / "chatsft_checkpoints" / "d20"),
    step=checkpoint_step,
    device=device,
    phase="eval",
)
print("✓ Original base checkpoint reloaded for comparison.")

2025-11-24 04:16:45,488 - nanochat.checkpoint_manager - [32m[1mINFO[0m - Building model with config: {'sequence_len': 2048, 'vocab_size': 65536, 'n_layer': 20, 'n_head': 10, 'n_kv_head': 10, 'n_embd': 1280}


✓ Original base checkpoint reloaded for comparison.


In [45]:
# Helper function to generate response using nanochat's generation
def generate_response_with_model(model_instance, tokenizer_instance, conversation, max_tokens=256, temperature=0.7, top_k=50):
    """Generate response using nanochat's native generation"""
    # Precompute special token IDs for this tokenizer
    assistant_end = tokenizer_instance.encode_special("<|assistant_end|>")

    # Build completion prompt
    conversation_for_completion = json.loads(json.dumps(conversation))  # deep copy
    conversation_for_completion["messages"][-1]["content"] = ""
    prompt_tokens = tokenizer_instance.render_for_completion(conversation_for_completion)

    # Generate
    tokens = list(prompt_tokens)
    generated = []

    # Activate autocast context for inference to match model's bfloat16 dtype
    with autocast_ctx:
        for token in model_instance.generate(tokens, max_tokens=max_tokens, temperature=temperature, top_k=top_k):
            tokens.append(token)
            generated.append(token)
            if token == assistant_end:
                break

    # Decode
    text = tokenizer_instance.decode([token for token in generated if token not in {assistant_end}])
    return text.strip(), generated


# Evaluate on test set (use a subset for speed)
EVAL_SIZE = 50  # Adjust as needed

print(f"Generating responses for {EVAL_SIZE} test examples...")
print("This may take a few minutes...\n")

evaluation_results = []

for idx in range(min(EVAL_SIZE, len(conversations))):
    example = conversations[idx]
    convo = example["conversation"]

    # # Generate with base model
    # base_response, _ = generate_response_with_model(model, tokenizer, convo, max_tokens=256, temperature=0.7, top_k=50)
    # Generate with original base checkpoint (clean 650-step)
    base_response, _ = generate_response_with_model(base_model_eval, base_tokenizer_eval, convo, max_tokens=256, temperature=0.7, top_k=50)

    # Generate with fine-tuned model
    finetuned_response, _ = generate_response_with_model(finetuned_model, finetuned_tokenizer, convo, max_tokens=256, temperature=0.7, top_k=50)

    evaluation_results.append({
        "index": idx,
        "example_id": example.get("id"),
        "question": convo["messages"][1]["content"],
        "expected": example["expected_response"],
        "base_response": base_response,
        "finetuned_response": finetuned_response,
    })

    if (idx + 1) % 10 == 0:
        print(f"  Progress: {idx + 1}/{EVAL_SIZE} examples completed")

print(f"\n✓ Generated responses for {len(evaluation_results)} examples")

Generating responses for 50 test examples...
This may take a few minutes...

  Progress: 10/50 examples completed
  Progress: 20/50 examples completed
  Progress: 30/50 examples completed
  Progress: 40/50 examples completed
  Progress: 50/50 examples completed

✓ Generated responses for 50 examples


## Compute Accuracy Metrics

Extract answer choices and compute accuracy for both models.


In [46]:
def extract_choice_letter(text):
    """Extract the answer choice letter (A, B, C, D, E, F) from text"""
    import re

    # Look for patterns like "The correct answer is A" or "A." or "answer: A"
    patterns = [
        r'\b([A-F])\.',  # A. or B. etc
        r'answer is ([A-F])\b',  # answer is A
        r'correct answer is ([A-F])\b',  # correct answer is A
        r'\b([A-F])\b',  # Just the letter A, B, etc
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    return None


def compute_accuracy_for_model(results, response_key):
    """Compute accuracy given evaluation results and response key"""
    correct = 0
    total = 0
    correct_examples = []
    incorrect_examples = []

    for entry in results:
        expected_letter = extract_choice_letter(entry["expected"])
        predicted_letter = extract_choice_letter(entry[response_key])

        if expected_letter and predicted_letter:
            total += 1
            if expected_letter == predicted_letter:
                correct += 1
                correct_examples.append(entry)
            else:
                incorrect_examples.append({
                    **entry,
                    'expected_letter': expected_letter,
                    'predicted_letter': predicted_letter
                })

    accuracy = correct / total if total else 0.0
    return {
        "evaluated": len(results),
        "scored": total,
        "correct": correct,
        "accuracy": accuracy,
        "correct_examples": correct_examples,
        "incorrect_examples": incorrect_examples,
    }


# Compute metrics for both models
print("Computing accuracy metrics...\n")

base_metrics = compute_accuracy_for_model(evaluation_results, "base_response")
finetuned_metrics = compute_accuracy_for_model(evaluation_results, "finetuned_response")

print("="*80)
print("EVALUATION RESULTS")
print("="*80)
print(f"\nBase Model Performance:")
print(f"  Evaluated: {base_metrics['evaluated']} examples")
print(f"  Scored: {base_metrics['scored']} examples (had parseable answers)")
print(f"  Correct: {base_metrics['correct']} examples")
print(f"  Accuracy: {base_metrics['accuracy']:.2%}")

print(f"\nFine-Tuned Model Performance:")
print(f"  Evaluated: {finetuned_metrics['evaluated']} examples")
print(f"  Scored: {finetuned_metrics['scored']} examples (had parseable answers)")
print(f"  Correct: {finetuned_metrics['correct']} examples")
print(f"  Accuracy: {finetuned_metrics['accuracy']:.2%}")

improvement = finetuned_metrics['accuracy'] - base_metrics['accuracy']
print(f"\n{'='*80}")
print(f"IMPROVEMENT: {improvement:+.2%}")
print(f"{'='*80}")

# Save results to JSON
results_summary = {
    "evaluation_size": len(evaluation_results),
    "base_model": {
        "accuracy": base_metrics['accuracy'],
        "correct": base_metrics['correct'],
        "total_scored": base_metrics['scored'],
    },
    "finetuned_model": {
        "accuracy": finetuned_metrics['accuracy'],
        "correct": finetuned_metrics['correct'],
        "total_scored": finetuned_metrics['scored'],
    },
    "improvement": improvement,
}

with open("evaluation_summary.json", "w") as f:
    json.dump(results_summary, f, indent=2)

print("\n✓ Evaluation summary saved to evaluation_summary.json")


Computing accuracy metrics...

EVALUATION RESULTS

Base Model Performance:
  Evaluated: 50 examples
  Scored: 41 examples (had parseable answers)
  Correct: 21 examples
  Accuracy: 51.22%

Fine-Tuned Model Performance:
  Evaluated: 50 examples
  Scored: 50 examples (had parseable answers)
  Correct: 17 examples
  Accuracy: 34.00%

IMPROVEMENT: -17.22%

✓ Evaluation summary saved to evaluation_summary.json


## Qualitative Comparison: Side-by-Side Examples

Let's examine some specific examples to see how the models differ in their responses.


In [47]:
def display_comparison(example):
    """Display side-by-side comparison of base and fine-tuned responses"""
    print("="*100)
    print(f"EXAMPLE {example['index']}")
    print("="*100)

    print("\n📝 QUESTION:")
    print("-" * 100)
    print(example['question'])

    print("\n✅ EXPECTED ANSWER:")
    print("-" * 100)
    print(example['expected'])

    print("\n🤖 BASE MODEL RESPONSE:")
    print("-" * 100)
    print(example['base_response'])

    print("\n🎯 FINE-TUNED MODEL RESPONSE:")
    print("-" * 100)
    print(example['finetuned_response'])
    print("\n" + "="*100 + "\n")


# Display a few interesting examples
print("Showing some example comparisons:\n")

# Show first 3 examples
for i in range(min(3, len(evaluation_results))):
    display_comparison(evaluation_results[i])

# Show some examples where models differ (if available)
print("\n\n" + "="*100)
print("EXAMPLES WHERE MODELS GAVE DIFFERENT ANSWERS")
print("="*100 + "\n")

different_answer_count = 0
for result in evaluation_results:
    base_letter = extract_choice_letter(result['base_response'])
    ft_letter = extract_choice_letter(result['finetuned_response'])

    if base_letter and ft_letter and base_letter != ft_letter:
        display_comparison(result)
        different_answer_count += 1

        if different_answer_count >= 3:  # Show max 3 examples
            break

if different_answer_count == 0:
    print("No examples found where models gave different answers.")


Showing some example comparisons:

EXAMPLE 0

📝 QUESTION:
----------------------------------------------------------------------------------------------------
What is the name of the colony shown?

A. Georgia
B. New Hampshire
C. South Carolina
D. West Virginia

✅ EXPECTED ANSWER:
----------------------------------------------------------------------------------------------------
The correct answer is A. Georgia

Explanation: The colony is Georgia.

🤖 BASE MODEL RESPONSE:
----------------------------------------------------------------------------------------------------
The colony shown is called New Hampshire.

🎯 FINE-TUNED MODEL RESPONSE:
----------------------------------------------------------------------------------------------------
The correct answer is C. South Carolina

Explanation: The colony is South Carolina.


EXAMPLE 1

📝 QUESTION:
----------------------------------------------------------------------------------------------------
Which animal's skin is also adapted for 

## Interactive Demo: Science Tutor Assistant

Test the fine-tuned model interactively with your own questions.


In [49]:
def test_single_question(question_text, use_finetuned=True):
    """Test a single question with either base or fine-tuned model"""

    # Create conversation format
    test_conversation = {
        "messages": [
            {"role": "system", "content": "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."},
            {"role": "user", "content": question_text},
            {"role": "assistant", "content": ""}
        ]
    }

    # Select model and tokenizer
    if use_finetuned:
        selected_model = finetuned_model
        selected_tokenizer = finetuned_tokenizer
        model_name = "Fine-Tuned Model"
    else:
        selected_model = base_model_eval
        selected_tokenizer = base_tokenizer_eval
        model_name = "Base Model"

    # Generate response
    response, _ = generate_response_with_model(
        selected_model,
        selected_tokenizer,
        test_conversation,
        max_tokens=300,
        temperature=0.7,
        top_k=50
    )

    print("="*100)
    print(f"🤖 {model_name.upper()}")
    print("="*100)
    print("\n📝 QUESTION:")
    print(question_text)
    print(f"\n💬 {model_name.upper()} RESPONSE:")
    print(response)
    print("="*100 + "\n")

    return response


# Test with sample science questions
print("Testing the fine-tuned model with sample questions:\n")

sample_questions = [
    """What is photosynthesis?
A. The process plants use to make food from sunlight
B. The process of cell division
C. The process of breathing
D. The process of digestion""",

    """What is the main cause of ocean tides?
A. Wind blowing across the ocean surface
B. The gravitational pull of the Moon
C. Temperature differences in the water
D. Earth's rotation""",

    """Which state of matter has a definite volume but no definite shape?
A. Solid
B. Liquid
C. Gas
D. Plasma"""
]

for i, question in enumerate(sample_questions, 1):
    print(f"\n{'#'*100}")
    print(f"TEST QUESTION {i}")
    print(f"{'#'*100}\n")
    test_single_question(question, use_finetuned=False)


Testing the fine-tuned model with sample questions:


####################################################################################################
TEST QUESTION 1
####################################################################################################

🤖 BASE MODEL

📝 QUESTION:
What is photosynthesis?
A. The process plants use to make food from sunlight
B. The process of cell division
C. The process of breathing
D. The process of digestion

💬 BASE MODEL RESPONSE:
Here are the explanations:

**Photosynthesis (Para. 1):**
Photosynthesis is the process by which plants, algae, and certain bacteria convert sunlight, water, and carbon dioxide into glucose and oxygen. This process occurs in the cells of plants, algae, and some bacteria. During photosynthesis, the green pigment chlorophyll absorbs light energy, which excites electrons. The electrons are then used to reduce NADP+ to NADPH, which is essential for the next stage of photosynthesis. The electrons from NADP+ are 

## Optional: Gradio Web Interface

Uncomment the cell below to launch an interactive web interface for your Science Tutor. This creates a shareable demo with a chat-like interface.


In [None]:
# # OPTIONAL: Gradio Web Interface
# # Uncomment this cell to launch an interactive web demo

# # Install gradio if not already installed
# # !pip install -q gradio

# import gradio as gr

# def respond_to_question(message, chat_history, use_finetuned_checkbox):
#     """Generate response for Gradio interface"""

#     # Create conversation format
#     conversation = {
#         "messages": [
#             {"role": "system", "content": "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."},
#             {"role": "user", "content": message},
#             {"role": "assistant", "content": ""}
#         ]
#     }

#     # Select model based on checkbox
#     if use_finetuned_checkbox:
#         selected_model = finetuned_model
#         selected_tokenizer = finetuned_tokenizer
#     else:
#         selected_model = model
#         selected_tokenizer = tokenizer

#     # Generate response
#     response, _ = generate_response_with_model(
#         selected_model,
#         selected_tokenizer,
#         conversation,
#         max_tokens=300,
#         temperature=0.7,
#         top_k=50
#     )

#     chat_history.append((message, response))
#     return "", chat_history


# # Create Gradio interface
# with gr.Blocks(title="Science Tutor Assistant") as demo:
#     gr.Markdown("# 🔬 Science Tutor Assistant")
#     gr.Markdown("### Ask me any science question from Biology, Chemistry, Physics, or Earth Science!")
#     gr.Markdown("*Powered by nanochat fine-tuned on ScienceQA dataset*")

#     with gr.Row():
#         use_finetuned = gr.Checkbox(label="Use Fine-Tuned Model", value=True,
#                                     info="Uncheck to use base model for comparison")

#     chatbot = gr.Chatbot(label="Science Tutor", height=400)

#     with gr.Row():
#         msg = gr.Textbox(
#             label="Your Question",
#             placeholder="Type your science question here... (e.g., What is photosynthesis?)",
#             lines=3,
#             scale=4
#         )
#         submit = gr.Button("Ask", scale=1, variant="primary")

#     clear = gr.Button("Clear Chat")

#     # Example questions
#     gr.Examples(
#         examples=[
#             "What is photosynthesis?",
#             "Explain Newton's first law of motion",
#             "What causes ocean tides?",
#             "What are the three states of matter?",
#             "How does the water cycle work?",
#             "What is cellular respiration?"
#         ],
#         inputs=msg,
#         label="Example Questions"
#     )

#     # Event handlers
#     msg.submit(respond_to_question, [msg, chatbot, use_finetuned], [msg, chatbot])
#     submit.click(respond_to_question, [msg, chatbot, use_finetuned], [msg, chatbot])
#     clear.click(lambda: None, None, chatbot, queue=False)

# # Launch the interface
# # share=True creates a public link you can share with others
# print("Launching Gradio interface...")
# demo.launch(share=True, server_port=7860)


## 📊 Project Summary & Next Steps

### ✅ Completed Tasks

This notebook has completed the following:

1. **Environment Setup**: Loaded nanochat repository and dependencies
2. **Data Preparation**: Loaded and formatted ScienceQA dataset for conversational format
3. **Base Model Loading**: Loaded the `sdobson/nanochat` checkpoint
4. **Model Training**: Set up training pipeline with nanochat's native functions
5. **Model Saving**: Saved fine-tuned model checkpoint
6. **Evaluation**: Computed accuracy metrics for base vs fine-tuned models
7. **Qualitative Analysis**: Compared responses side-by-side
8. **Interactive Demo**: Created testing functions and optional web interface

### 📁 Generated Files

- `finetuned_model_checkpoint/`: Directory containing fine-tuned model
  - `model_finetuned.pt`: Model weights
  - `meta_finetuned.json`: Model metadata and training info
  - `tokenizer/`: Tokenizer files
- `evaluation_summary.json`: Quantitative evaluation results
- `data/`: Formatted datasets (train, validation, test)

### 🎯 Key Results

Check the evaluation cells above for:
- Base model accuracy
- Fine-tuned model accuracy
- Improvement percentage
- Example comparisons

### 🚀 Next Steps

1. **For Testing (No GPU)**:
   - Uncomment the "TESTING OPTION" cell to save base model as pseudo fine-tuned
   - Run evaluation cells to validate pipeline
   - Test with sample questions

2. **For Training (With GPU on Kaggle/Colab)**:
   - Run the training cell (it will take 2-4 hours)
   - Monitor GPU memory usage
   - Save checkpoints regularly
   - Compare before/after results

3. **For Deployment**:
   - Upload model to HuggingFace Hub
   - Launch Gradio interface for public demo
   - Share results with stakeholders

### 📚 Notes

- **Memory Management**: If you encounter OOM errors, reduce `device_batch_size` or move model to CPU
- **Evaluation Size**: Adjust `EVAL_SIZE` based on available time/resources
- **Generation Parameters**: Modify `temperature`, `top_k`, `max_tokens` for different response styles
- **Training Duration**: Full training on 3000 samples takes approximately 2-4 hours on a consumer GPU

### 🔗 Resources

- Nanochat Repository: https://github.com/karpathy/nanochat
- ScienceQA Dataset: https://huggingface.co/datasets/derek-thomas/ScienceQA
- Model Checkpoint: https://huggingface.co/sdobson/nanochat

---

**Project Status**: All components implemented and ready for testing/training!


## 🧪 Test Your Own Questions

Use the cell below to test your own science questions!


In [None]:
# # Test your own question here!
# # Modify the question below and run this cell

# your_question = """What is the difference between weather and climate?
# A. Weather is short-term, climate is long-term patterns
# B. Weather is warm, climate is cold
# C. Weather happens on land, climate happens in the ocean
# D. They are the same thing"""

# print("Testing with your custom question:\n")
# print("="*100)
# print("COMPARING BOTH MODELS")
# print("="*100 + "\n")

# # Test with base model
# print("🤖 BASE MODEL:")
# print("-"*100)
# test_single_question(your_question, use_finetuned=False)

# print("\n")

# # Test with fine-tuned model
# print("🎯 FINE-TUNED MODEL:")
# print("-"*100)
# test_single_question(your_question, use_finetuned=True)

# print("\n" + "="*100)
# print("✅ Test complete! Modify 'your_question' above to test other questions.")
# print("="*100)


In [None]:
# Additional code or experiments can be added here


In [None]:
# import time
# from contextlib import nullcontext
# from nanochat.common import autodetect_device_type

# device_type = "gpu"
# autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()

# # Learning rate scheduler
# def get_lr_multiplier(it):
#     lrm = 1.0 - it / num_iterations
#     return lrm

# print("Starting fine-tuning...")
# train_iter = iter(train_loader)
# val_iter = iter(val_loader)
# val_loss = 0.0
# step_time_ema = 0.0

# for step in range(num_iterations):
#     t0 = time.time()
#     last_step = step == num_iterations - 1

#     # Validation loop
#     if last_step or step % eval_every == 0:
#         model.eval()
#         losses = []
#         for _ in range(eval_steps):
#             val_inputs, val_targets = next(val_iter)
#             with torch.no_grad(), autocast_ctx:
#                 loss = model(val_inputs, val_targets)
#             losses.append(loss)
#         val_loss = torch.stack(losses).mean().item()
#         print(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
#         model.train()

#     # Training loop
#     num_tokens = 0
#     for micro_step in range(grad_accum_steps):
#         train_inputs, train_targets = next(train_iter)
#         with autocast_ctx:
#             loss = model(train_inputs, train_targets)
#         train_loss = loss.detach() # for logging
#         loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
#         loss.backward() # accumulate the gradient
#         num_tokens += (train_targets >= 0).sum()

#     # Update learning rate
#     lrm = get_lr_multiplier(step)
#     for opt in optimizers:
#         for group in opt.param_groups:
#             group["lr"] = group["initial_lr"] * lrm

#     # Step the optimizers
#     for opt in optimizers:
#         opt.step()
#     model.zero_grad(set_to_none=True)

#     # Timing and logging
#     t1 = time.time()
#     dt = t1 - t0
#     step_time_ema = dt if step_time_ema == 0 else 0.9 * step_time_ema + 0.1 * dt
#     tokens_per_sec = num_tokens.item() / dt

#     print(f"Step {step:05d}/{num_iterations:05d} | Train loss: {train_loss.item():.6f} | LR mult: {lrm:.4f} | Tok/s: {tokens_per_sec:,.0f} | Step time: {step_time_ema:.3f}s")

# print("\nTraining complete.")
