# Notebook 2 · Load Base `nanochat` Checkpoint

This notebook loads the `sdobson/nanochat` checkpoint, evaluates its baseline performance on the prepared ScienceQA subset, and saves the raw baseline responses for later comparison.


In [24]:
import subprocess  # required by the `git clone` call below
import sys
from pathlib import Path

# Clone the nanochat repository (idempotent: skipped when the directory already exists)
nanochat_repo = Path("nanochat")
if not nanochat_repo.exists():
    print("Cloning karpathy/nanochat...")
    # check=True turns a failed clone into an exception instead of continuing silently
    subprocess.run(["git", "clone", "https://github.com/karpathy/nanochat.git", str(nanochat_repo)], check=True)
else:
    print("nanochat repository already present.")

# Put the cloned repository directory on sys.path so the `nanochat.*` imports below
# are resolved from it; the membership test keeps re-runs from adding duplicates.
package_path = nanochat_repo.resolve()
if str(package_path) not in sys.path:
    sys.path.insert(0, str(package_path))

from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import RustBPETokenizer


nanochat repository already present.


In [8]:
# Announce model loading and pick the compute device used by later cells.
print("\nLoading model using nanochat's native functions...")
# CUDA auto-detection is commented out; uncomment the next line (and drop the override) to use a GPU.
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The device is pinned to CPU, so every cell that reads `device` runs on the CPU.
device = 'cpu'


Loading model using nanochat's native functions...


In [35]:
# Download checkpoint artifacts from Hugging Face (if needed)
# Import here so this cell does not depend on a later cell having been executed first.
from pathlib import Path

from huggingface_hub import hf_hub_download

model_repo = "sdobson/nanochat"
base_cache = Path.home() / ".cache" / "nanochat"

# Maps each file in the Hugging Face repo to the local directory it is stored in.
files_to_download = {
    "model_000650.pt": base_cache / "chatsft_checkpoints" / "d20",
    "meta_000650.json": base_cache / "chatsft_checkpoints" / "d20",
    "tokenizer.pkl": base_cache / "tokenizer",
    "token_bytes.pt": base_cache / "tokenizer",
}

for filename, target_dir in files_to_download.items():
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    # Only hit the network when the file is not already present locally.
    if not target_path.exists():
        print(f"Downloading {filename}...")
        hf_hub_download(
            repo_id=model_repo,
            filename=filename,
            local_dir=str(target_dir),
            # NOTE(review): recent huggingface_hub releases deprecate this argument; kept for
            # compatibility with older installs - confirm against the pinned version.
            local_dir_use_symlinks=False,
        )
    else:
        print(f"Found cached {filename}.")


Found cached model_000650.pt.
Found cached meta_000650.json.
Found cached tokenizer.pkl.
Found cached token_bytes.pt.


In [None]:
import torch
import os
from pathlib import Path
import subprocess
import pickle
import json
from huggingface_hub import hf_hub_download

# --- 1. Set up the nanochat environment ---
if not Path('nanochat').exists():
    print("Cloning karpathy/nanochat repository...")
    subprocess.run(['git', 'clone', 'https://github.com/karpathy/nanochat.git'], check=True)
else:
    print("nanochat repository already exists.")

# Add the cloned repository directory to the Python path so that the
# `nanochat.gpt` import below is resolved from inside it.
import sys
package_path = os.path.abspath('nanochat')
if package_path not in sys.path:
    sys.path.insert(0, package_path)

# Import the model classes from the cloned package
from nanochat.gpt import GPT, GPTConfig

# --- 2. Manually download model and tokenizer files ---
print("Downloading nanochat model and tokenizer files...")
model_repo = "sdobson/nanochat"
cache_dir = Path.home() / '.cache' / 'nanochat'

# Define files and their target directories within the cache
files_to_download = {
    "model_000650.pt": "chatsft_checkpoints/d20/",
    "meta_000650.json": "chatsft_checkpoints/d20/",
    "tokenizer.pkl": "tokenizer/",
    "token_bytes.pt": "tokenizer/"
}

# Download all files to their respective cache directories (skipping files already present)
for filename, target_subdir in files_to_download.items():
    local_dir = cache_dir / target_subdir
    os.makedirs(local_dir, exist_ok=True)
    target_path = local_dir / filename
    if not target_path.exists():
        print(f"Downloading {filename}...")
        hf_hub_download(repo_id=model_repo, filename=filename, local_dir=local_dir, local_dir_use_symlinks=False)
    else:
        print(f"{filename} already exists. Skipping download.")

# --- 3. Load the model using nanochat's own scripts ---
print("\nLoading model using nanochat's native functions...")
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

# Locate the checkpoint weights and the metadata describing the model configuration
checkpoint_path = cache_dir / "chatsft_checkpoints" / "d20" / "model_000650.pt"
meta_path = cache_dir / "chatsft_checkpoints" / "d20" / "meta_000650.json"

# NOTE(review): torch.load unpickles the file; the checkpoint comes from a third-party
# Hugging Face repo, so consider passing weights_only=True if the installed torch supports it.
state_dict = torch.load(checkpoint_path, map_location=device)

with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

# Normalize the keys BEFORE loading: drop a leading '_orig_mod.' (presumably left by a
# compiled-model wrapper) so they match the parameter names of a plain GPT module.
# Previously a strict load ran before this step, so a prefixed checkpoint failed before
# the fix-up could help, and an unprefixed one was loaded twice.
unwanted_prefix = '_orig_mod.'
state_dict = {
    (key[len(unwanted_prefix):] if key.startswith(unwanted_prefix) else key): value
    for key, value in state_dict.items()
}

# Build the model from the stored configuration and load the weights exactly once;
# strict=True makes any missing or unexpected key an error.
gptconf = GPTConfig(**meta['model_config'])
model = GPT(gptconf)
model.load_state_dict(state_dict, strict=True)
model.eval()  # Set to evaluation mode
model.to(device)

print("\nModel loaded successfully!")
print(f"Model is on device: {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M")

# --- 4. Load the custom tokenizer ---
class NanoTokenizer:
    """Small adapter around nanochat's `RustBPETokenizer`.

    The wrapped tokenizer is stored on `self.tokenizer_model`; `encode` and
    `decode` simply forward to it.
    """

    def __init__(self, cache_dir):
        """Load the tokenizer from the `tokenizer` sub-directory of `cache_dir`."""
        # Imported lazily: the nanochat package is only importable after the repo is on sys.path.
        from nanochat.tokenizer import RustBPETokenizer
        self.tokenizer_model = RustBPETokenizer.from_directory(str(cache_dir / 'tokenizer'))

    def encode(self, text, bos=True, eos=True):
        """Forward `text` and the `bos`/`eos` flags to the wrapped tokenizer's `encode`."""
        # NOTE(review): assumes the wrapped `encode` accepts `bos`/`eos` keywords - confirm
        # against the installed nanochat version.
        backend = self.tokenizer_model
        return backend.encode(text, bos=bos, eos=eos)

    def decode(self, tokens):
        """Forward `tokens` to the wrapped tokenizer's `decode` and return its result."""
        backend = self.tokenizer_model
        return backend.decode(tokens)

# tokenizer = NanoTokenizer(cache_dir)
# print("Custom tokenizer loaded.")

nanochat repository already exists.
Downloading nanochat model and tokenizer files...
model_000650.pt already exists. Skipping download.
meta_000650.json already exists. Skipping download.
tokenizer.pkl already exists. Skipping download.
token_bytes.pt already exists. Skipping download.

Loading model using nanochat's native functions...

Model loaded successfully!
Model is on device: cpu
Model parameters: 561M
Custom tokenizer loaded.


In [38]:
# Initialize the nanochat tokenizer from the cached tokenizer directory.
# Relies on names defined by earlier cells: `base_cache` (checkpoint-download cell)
# and `RustBPETokenizer` (repository-setup cell).
tokenizer_dir = base_cache / "tokenizer"
tokenizer = RustBPETokenizer.from_directory(str(tokenizer_dir))

print("Tokenizer ready.")


Tokenizer ready.


In [39]:
# Precompute frequently used special token IDs so later cells do not look them up repeatedly.
bos_id = tokenizer.get_bos_token_id()
# Presumably the markers that open/close an assistant turn; `assistant_end_id` is the
# stop token checked by `generate_response_from_tokens`.
assistant_start_id = tokenizer.encode_special("<|assistant_start|>")
assistant_end_id = tokenizer.encode_special("<|assistant_end|>")
# Presumably the markers that open/close a user turn.
user_start_id = tokenizer.encode_special("<|user_start|>")
user_end_id = tokenizer.encode_special("<|user_end|>")


In [40]:
def build_completion_prompt(conversation):
    """Render `conversation` into token ids ready for the assistant to complete.

    The input is not modified: a deep copy is made, the content of its last
    message is blanked, and the copy is passed to the tokenizer's
    `render_for_completion`, whose result is returned.
    """
    # A JSON round-trip gives an independent deep copy of the (JSON-serializable) conversation.
    primed = json.loads(json.dumps(conversation))
    final_message = primed["messages"][-1]
    final_message["content"] = ""
    return tokenizer.render_for_completion(primed)


def generate_response_from_tokens(prompt_tokens, *, max_tokens=256, temperature=0.7, top_k=50):
    """Sample a completion for `prompt_tokens` and return `(text, generated_tokens)`.

    Tokens are pulled from `model.generate` until it is exhausted or the
    `<|assistant_end|>` token appears. `generated_tokens` includes that end
    token when it was produced; `text` is the decoded output without it,
    stripped of surrounding whitespace.
    """
    context = list(prompt_tokens)
    emitted = []
    stream = model.generate(context, max_tokens=max_tokens, temperature=temperature, top_k=top_k)
    for next_token in stream:
        context.append(next_token)
        emitted.append(next_token)
        if next_token == assistant_end_id:
            break
    # Drop the end-of-turn marker before decoding so it never shows up in the text.
    visible = [tok for tok in emitted if tok != assistant_end_id]
    return tokenizer.decode(visible).strip(), emitted


def generate_response_for_conversation(conversation, **generation_kwargs):
    """Build the completion prompt for `conversation`, generate a reply, and bundle the result.

    `generation_kwargs` are forwarded unchanged to `generate_response_from_tokens`.
    Returns a dict with keys `prompt_tokens`, `generated_tokens` and `response_text`.
    """
    prompt = build_completion_prompt(conversation)
    reply_text, reply_tokens = generate_response_from_tokens(prompt, **generation_kwargs)
    return dict(
        prompt_tokens=prompt,
        generated_tokens=reply_tokens,
        response_text=reply_text,
    )


In [41]:
from datasets import load_from_disk
from pathlib import Path

# Load PRE-FORMATTED ScienceQA evaluation subset
DATA_DIR = Path("data")

try:
    # Load the chat-formatted test split from `data/test_formatted`. This is the directory
    # the warning below refers to and the one whose rows carry the `messages` field that
    # the conversation-building cells read; the previous `test_subset` path did not match.
    test_formatted = load_from_disk(str(DATA_DIR / "test_formatted"))
    print(f"Loaded formatted test subset with {len(test_formatted)} examples.")
except FileNotFoundError:
    # Leave a sentinel so later cells can detect that the data is missing.
    test_formatted = None
    print("Warning: data/test_formatted not found. Please run Notebook 1 first and save the formatted data.")


Loaded formatted test subset with 500 examples.


In [42]:
from datasets import load_from_disk
from pathlib import Path

# Read the chat-formatted test split; each row is expected to carry a `messages` list.
test_formatted = load_from_disk(str(DATA_DIR / "test_formatted"))
print(f"Loaded formatted test subset with {len(test_formatted)} examples.")

# One evaluation record per example: the id, the full conversation, and the reference
# answer taken from the message at index 2.
conversations = [
    {
        "id": example.get("id"),
        "conversation": {"messages": example["messages"]},
        "expected_response": example["messages"][2]["content"],
    }
    for example in test_formatted
]
print(f"Prepared {len(conversations)} conversations from pre-processed file.")

Loaded formatted test subset with 500 examples.
Prepared 500 conversations from pre-processed file.


In [43]:
# Prepare conversations from the pre-formatted file.
# Start from an empty list so `conversations` is always defined, even when the
# formatted dataset could not be loaded (`test_formatted is None`).
conversations = []
if test_formatted is not None:
    conversations.extend(
        {
            "id": example.get("id"),  # We can still get the ID
            "conversation": {"messages": example["messages"]},
            # Index 2 is the assistant's message, i.e. the expected response
            "expected_response": example["messages"][2]["content"],
        }
        for example in test_formatted
    )
    print(f"Prepared {len(conversations)} conversations from pre-processed file.")


Prepared 500 conversations from pre-processed file.


In [44]:
# Generate baseline responses (configure sample size as needed)
EVAL_SAMPLE_SIZE = 50  # adjust to cover more/less examples


def evaluate_baseline(sample_size=EVAL_SAMPLE_SIZE, temperature=0.7, top_k=50):
    """Generate a response for the first `sample_size` prepared conversations.

    Returns a list with one dict per example holding its position (`index`),
    `example_id`, the user `question`, the `expected` reference answer, the
    model `response`, and the raw `prompt_tokens` / `generated_tokens`.

    Raises AssertionError when no conversations have been prepared.
    """
    assert conversations, "No conversations prepared. Ensure Notebook 1 has been run."

    def run_one(position, record):
        # Generate for one record and flatten everything needed for later scoring.
        dialogue = record["conversation"]
        output = generate_response_for_conversation(
            dialogue,
            max_tokens=256,
            temperature=temperature,
            top_k=top_k,
        )
        return {
            "index": position,
            "example_id": record.get("id"),
            "question": dialogue["messages"][1]["content"],
            "expected": record["expected_response"],
            "response": output["response_text"],
            "prompt_tokens": output["prompt_tokens"],
            "generated_tokens": output["generated_tokens"],
        }

    return [run_one(position, record) for position, record in enumerate(conversations[:sample_size])]

# baseline_results = evaluate_baseline()
# len(baseline_results)


In [45]:
import re

# An explicit statement of the answer, e.g. "The correct answer is B." or "answer is: (C)".
# Only the phrase is case-insensitive; the letter must be upper-case so that
# "the answer is a mammal" is not read as choice A.
_EXPLICIT_CHOICE_RE = re.compile(r"(?i:answer\s+is)\s*:?\s*\(?([A-F])(?![A-Za-z0-9])")
# Fallback: an upper-case A-F that is not part of a longer word or number
# (so the "A" in "DNA." or the "B" in "B12" does not count).
_STANDALONE_CHOICE_RE = re.compile(r"(?<![A-Za-z0-9])([A-F])(?![A-Za-z0-9])")


def extract_choice_letter(text):
    """Return the multiple-choice letter (A-F) stated in `text`, or None.

    An explicit "answer is X" statement wins; otherwise the earliest standalone
    upper-case letter A-F in the text is used. The previous implementation tried
    the letters in alphabetical order with plain substring tests, so it returned
    "A" for "The correct answer is B. A plant" and for any text containing a
    word ending in "A." such as "DNA.".

    The fallback is a heuristic: a sentence starting with the article "A" is
    still read as choice A.

    Args:
        text: The expected answer or model response; None or empty yields None.

    Returns:
        One of "A".."F", or None when no choice letter is found.
    """
    if not text:
        return None
    for pattern in (_EXPLICIT_CHOICE_RE, _STANDALONE_CHOICE_RE):
        found = pattern.search(text)
        if found:
            return found.group(1)
    return None


def compute_accuracy(results):
    """Score multiple-choice accuracy over `results`.

    Each entry contributes only when a choice letter can be extracted from both
    its `expected` and its `response` text; entries without both letters are
    left out of `scored` (and therefore out of the accuracy denominator).

    Returns a dict with `evaluated` (number of entries), `scored`, `correct`
    and `accuracy` (`correct / scored`, or 0.0 when nothing was scored).
    """
    scored = 0
    hits = 0
    for item in results:
        gold = extract_choice_letter(item["expected"])
        guess = extract_choice_letter(item["response"])
        if not (gold and guess):
            continue
        scored += 1
        hits += int(gold == guess)
    return {
        "evaluated": len(results),
        "scored": scored,
        "correct": hits,
        "accuracy": hits / scored if scored else 0.0,
    }

# metrics = compute_accuracy(baseline_results)
# metrics


In [47]:
def summarize_result(entry, idx=0):
    """Print one evaluation entry: its index, question, expected answer and model response."""
    print(f"Example {idx}")
    # (header, key) pairs in display order; each header is printed followed by the entry's value.
    sections = (
        ("Question:\n", "question"),
        ("\nExpected:\n", "expected"),
        ("\nModel Response:\n", "response"),
    )
    for header, key in sections:
        print(header, entry[key])

# if baseline_results:
#     summarize_result(baseline_results[0], idx=baseline_results[0]["index"])


## Next Steps

1. Uncomment the evaluation cells to generate baseline responses once the ScienceQA subsets are prepared.
2. Save the baseline results to `baseline_responses.json` (no cell shown above writes this file yet), then review it to confirm output quality before starting fine-tuning.
3. Keep an eye on GPU availability; running on CPU will be slow for the full evaluation set.


In [65]:
# This is the data loader from scripts/chat_sft.py
def sft_data_generator(dataset, batch_size):
    """Endlessly yield `(inputs, targets)` batches of tokenized conversations.

    Each conversation is rendered with `tokenizer.render_conversation`, which
    returns token ids and a per-token mask. Batches of `batch_size` rows are
    padded to the longest row. Targets are the ids shifted by one position,
    with -1 (the ignore index) wherever the mask is 0 or the row is padding.
    The generator loops over `dataset` forever; a partially filled batch at the
    end of one pass is carried into the next.
    """
    # <|assistant_end|> doubles as the pad token; padded positions are masked out of the loss
    pad_token_id = tokenizer.encode_special("<|assistant_end|>")

    def collate_and_yield(batch):
        # Pack a list of (ids, mask) pairs into two padded tensors on `device`.
        longest = max(len(ids) for ids, mask in batch)
        shape = (len(batch), longest - 1)  # a sequence of n tokens gives n-1 input/target pairs
        inputs = torch.full(shape, pad_token_id, dtype=torch.long)
        targets = torch.full(shape, -1, dtype=torch.long)  # -1 is the ignore index
        for row, (ids, mask) in enumerate(batch):
            width = len(ids) - 1
            ids_tensor = torch.tensor(ids, dtype=torch.long)
            inputs[row, :width] = ids_tensor[:-1]
            row_targets = ids_tensor[1:]
            # mask[1:] skips the entry for the first token, which is never a target
            keep = torch.tensor(mask[1:], dtype=torch.long)
            row_targets[keep == 0] = -1  # unsupervised positions are ignored by the loss
            targets[row, :width] = row_targets
        return inputs.to(device), targets.to(device)

    # Walk the dataset epoch after epoch, tokenizing on the fly
    pending = []
    while True:
        for position in range(len(dataset)):
            ids, mask = tokenizer.render_conversation(dataset[position])
            pending.append((ids, mask))
            if len(pending) == batch_size:
                yield collate_and_yield(pending)
                pending = []

print("Data generator `sft_data_generator` is defined.")


Data generator `sft_data_generator` is defined.


In [66]:
# Cell: Define project paths
from pathlib import Path

# Root directory for dataset artifacts (relative to the working directory);
# created if missing, and left alone if it already exists.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Define paths for formatted data (one directory per split; written by the
# "Apply formatting and save datasets" cell and read back before training)
train_formatted_path = DATA_DIR / "train_formatted"
val_formatted_path = DATA_DIR / "val_formatted"
test_formatted_path = DATA_DIR / "test_formatted"


In [67]:
# Cell: Download and subset ScienceQA dataset
from datasets import load_dataset

# Load full dataset (all splits) from the Hugging Face Hub
full_dataset = load_dataset('derek-thomas/ScienceQA')

# Create fixed-size random subsets for fine-tuning: shuffle each split with a fixed
# seed (reproducible selection), then keep the first N rows. No class/topic balancing
# is applied here.
train_subset = full_dataset['train'].shuffle(seed=42).select(range(3000))
val_subset = full_dataset['validation'].shuffle(seed=42).select(range(500))
test_subset = full_dataset['test'].shuffle(seed=42).select(range(500))

print(f"Train subset: {len(train_subset)} | Validation subset: {len(val_subset)} | Test subset: {len(test_subset)}")


Train subset: 3000 | Validation subset: 500 | Test subset: 500


In [69]:
# Cell: Define formatting function
def format_scienceqa_for_chat(example):
    """Turn one ScienceQA record into a system/user/assistant chat example.

    The user turn is the question followed by the lettered choices (A., B., ...).
    The assistant turn states the correct letter and choice text, followed by
    an "Explanation" section when `solution` is non-empty and a "Background"
    section when `lecture` is non-empty.

    Returns a dict with `id` (None when the record has none) and `messages`.
    """
    def label_for(position):
        # 0 -> "A", 1 -> "B", ...
        return chr(65 + position)

    # User turn: question plus one line per choice
    prompt_stem = example['question']
    options = example['choices']
    listing = "\n".join(f"{label_for(i)}. {option}" for i, option in enumerate(options))
    user_turn = f"{prompt_stem}\n\n{listing}"

    # Assistant turn: the answer line first, optional sections after it
    correct_idx = example['answer']
    correct_text = options[correct_idx]
    sections = [f"The correct answer is {label_for(correct_idx)}. {correct_text}"]
    for heading, field in (("Explanation", 'solution'), ("Background", 'lecture')):
        extra = example.get(field)
        if extra:
            sections.append(f"{heading}: {extra}")
    assistant_turn = "\n\n".join(sections)

    system_turn = "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."
    return {
        "id": example.get("id"),  # Keep id for tracking
        "messages": [
            {"role": "system", "content": system_turn},
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ],
    }


In [70]:
# Cell: Apply formatting and save datasets
# Apply formatting: every row is converted by `format_scienceqa_for_chat`, and all of the
# original columns are dropped so only the fields returned by that function remain.
train_formatted = train_subset.map(format_scienceqa_for_chat, remove_columns=train_subset.column_names)
val_formatted = val_subset.map(format_scienceqa_for_chat, remove_columns=val_subset.column_names)
test_formatted = test_subset.map(format_scienceqa_for_chat, remove_columns=test_subset.column_names)

# Save formatted datasets to disk (paths come from the "Define project paths" cell)
train_formatted.save_to_disk(str(train_formatted_path))
val_formatted.save_to_disk(str(val_formatted_path))
test_formatted.save_to_disk(str(test_formatted_path))

print(f"Saved formatted datasets to {DATA_DIR}")
print(f"Train: {len(train_formatted)} | Val: {len(val_formatted)} | Test: {len(test_formatted)}")


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saved formatted datasets to data
Train: 3000 | Val: 500 | Test: 500


In [72]:
# Cell: Load formatted datasets for training
from datasets import load_from_disk

# Both directories must exist before training can start; the path variables
# come from the "Define project paths" cell.
required_dirs = (train_formatted_path, val_formatted_path)
if not all(directory.exists() for directory in required_dirs):
    raise FileNotFoundError("Missing formatted datasets. Please ensure the data preparation cells have been run.")

train_dataset = load_from_disk(str(train_formatted_path))
val_dataset = load_from_disk(str(val_formatted_path))

print(f"Loaded datasets for training. Train samples: {len(train_dataset)} | Validation samples: {len(val_dataset)}")


Loaded datasets for training. Train samples: 3000 | Validation samples: 500


In [73]:
# SFT Hyperparameters (adapted from scripts/chat_sft.py)
device_batch_size = 2 # Max to avoid OOM on a consumer GPU
num_epochs = 2
target_examples_per_step = 16 # Effective batch size via gradient accumulation
# Per-parameter-group learning rates handed to `model.setup_optimizers`
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
weight_decay = 0.0
init_lr_frac = 0.02 # Initial LR as a fraction of each base learning rate above
eval_every = 100 # Steps
eval_steps = 50 # Validation batches averaged per evaluation

# Calculate number of iterations and gradient accumulation
# Micro-batches accumulated per optimizer step (integer division)
grad_accum_steps = target_examples_per_step // device_batch_size
# Integer division drops a trailing partial step in each epoch
num_iterations = (len(train_dataset) // target_examples_per_step) * num_epochs

print(f"Effective batch size: {target_examples_per_step}")
print(f"Device batch size: {device_batch_size}")
print(f"Gradient accumulation steps: {grad_accum_steps}")
print(f"Total training iterations: {num_iterations}")

# Create data loaders (generators); both loop over their dataset indefinitely
train_loader = sft_data_generator(train_dataset, batch_size=device_batch_size)
val_loader = sft_data_generator(val_dataset, batch_size=device_batch_size)


Effective batch size: 16
Device batch size: 2
Gradient accumulation steps: 8
Total training iterations: 374


In [None]:
# Build the optimizers through the model's own `setup_optimizers` helper, using the
# learning rates and weight decay from the hyperparameter cell. The result is iterated
# below, i.e. it is a collection of optimizers.
optimizers = model.setup_optimizers(
    unembedding_lr=unembedding_lr,
    embedding_lr=embedding_lr,
    matrix_lr=matrix_lr,
    weight_decay=weight_decay,
    )

# Set the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
    for group in opt.param_groups:
        group["lr"] = group["lr"] * init_lr_frac
        # `initial_lr` is the reference value the training loop multiplies by its LR schedule
        group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later

print("Optimizers initialized.")


Scaling the LR for the AdamW parameters ∝1/√(1280/768) = 0.774597
Optimizers initialized.


In [62]:
import time
from contextlib import nullcontext
from nanochat.common import autodetect_device_type

# Follow the `device` chosen when the model was loaded instead of hard-coding "cpu",
# so that the autocast context matches where the model and batches actually live.
device_type = torch.device(device).type
# Reduced-precision dtype used for autocast on CUDA. It was previously undefined, which
# raised NameError as soon as the CUDA branch below was taken.
ptdtype = torch.bfloat16
# Autocast only on CUDA; on any other device the forward pass runs in a no-op context.
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()

# Learning rate scheduler
def get_lr_multiplier(it):
    """Linear decay: 1.0 at iteration 0, shrinking by `1 / num_iterations` per iteration."""
    return 1.0 - it / num_iterations

# Fine-tuning driver: for each of `num_iterations` optimizer steps, optionally evaluate on
# the validation loader, accumulate gradients over `grad_accum_steps` micro-batches,
# apply the learning-rate schedule, step every optimizer, and log timing/throughput.
print("Starting fine-tuning...")
train_iter = iter(train_loader)
val_iter = iter(val_loader)
val_loss = 0.0
step_time_ema = 0.0  # 0.0 doubles as "not initialized yet" for the moving average below

for step in range(num_iterations):
    t0 = time.time()
    last_step = step == num_iterations - 1

    # Validation loop
    # Runs on step 0 (0 % eval_every == 0), every `eval_every` steps, and on the final step.
    if last_step or step % eval_every == 0:
        model.eval()
        losses = []
        for _ in range(eval_steps):
            val_inputs, val_targets = next(val_iter)
            with torch.no_grad(), autocast_ctx:
                loss = model(val_inputs, val_targets)
            losses.append(loss)
        # Average of the per-batch losses over `eval_steps` validation batches
        val_loss = torch.stack(losses).mean().item()
        print(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
        # Switch back to training mode; this is the only place train mode is enabled
        model.train()

    # Training loop
    num_tokens = 0
    for micro_step in range(grad_accum_steps):
        train_inputs, train_targets = next(train_iter)
        with autocast_ctx:
            loss = model(train_inputs, train_targets)
        # Only the last micro-batch's loss survives the loop and gets logged
        train_loss = loss.detach() # for logging
        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
        loss.backward() # accumulate the gradient
        # Count supervised positions: targets >= 0 excludes the -1 ignore index
        num_tokens += (train_targets >= 0).sum()
    
    # Update learning rate
    # Scale each group's saved `initial_lr` by the schedule multiplier for this step
    lrm = get_lr_multiplier(step)
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * lrm

    # Step the optimizers
    for opt in optimizers:
        opt.step()
    # Clear accumulated gradients before the next step
    model.zero_grad(set_to_none=True)

    # Timing and logging
    # Note: `dt` includes the validation pass on steps where it ran
    t1 = time.time()
    dt = t1 - t0
    # Exponential moving average of the step time, seeded with the first measurement
    step_time_ema = dt if step_time_ema == 0 else 0.9 * step_time_ema + 0.1 * dt
    tokens_per_sec = num_tokens.item() / dt
    
    print(f"Step {step:05d}/{num_iterations:05d} | Train loss: {train_loss.item():.6f} | LR mult: {lrm:.4f} | Tok/s: {tokens_per_sec:,.0f} | Step time: {step_time_ema:.3f}s")

print("\nTraining complete.")


Starting fine-tuning...
Step 00000 | Validation loss: 1.921232


KeyboardInterrupt: 