# Notebook 2 · Load Base `nanochat` Checkpoint

This notebook loads the `sdobson/nanochat` checkpoint, evaluates its baseline performance on the prepared ScienceQA subset, and saves the raw baseline responses for later comparison.


In [2]:
import json
import os
import re
import subprocess
import sys
from pathlib import Path

import torch
from datasets import load_from_disk
from huggingface_hub import hf_hub_download

# Device selection.
# FORCE_CPU keeps the notebook on the CPU even when CUDA is present, which is
# what the previous unconditional override did. Set it to False to let the
# CUDA branch below take effect.
FORCE_CPU = True

if torch.cuda.is_available() and not FORCE_CPU:
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name(0)
else:
    device = torch.device("cpu")
    device_name = "CPU"

print(f"Using device: {device} ({device_name})")


Using device: cpu (CPU)


In [4]:
!pip install uv
%cd nanochat
!uv sync
%cd ..
!pip install tiktoken

/home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project/nanochat
[2mResolved [1m91 packages[0m [2min 0.77ms[0m[0m
[2mAudited [1m71 packages[0m [2min 8ms[0m[0m
/home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project


In [5]:
# Fetch the nanochat sources once; later runs reuse the existing checkout.
nanochat_repo = Path("nanochat")
if nanochat_repo.exists():
    print("nanochat repository already present.")
else:
    print("Cloning karpathy/nanochat...")
    clone_cmd = ["git", "clone", "https://github.com/karpathy/nanochat.git", str(nanochat_repo)]
    subprocess.run(clone_cmd, check=True)

# Put the checkout at the front of sys.path so the imports below resolve to it.
package_path = nanochat_repo.resolve()
package_dir = str(package_path)
if package_dir not in sys.path:
    sys.path.insert(0, package_dir)

from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import RustBPETokenizer


nanochat repository already present.


In [6]:
# Fetch the checkpoint and tokenizer artifacts from the Hugging Face Hub,
# skipping every file that is already present in the local cache directory.
model_repo = "sdobson/nanochat"
base_cache = Path.home() / ".cache" / "nanochat"

checkpoint_cache = base_cache / "chatsft_checkpoints" / "d20"
tokenizer_cache = base_cache / "tokenizer"

# Maps each remote filename to the local directory it should live in.
files_to_download = {
    "model_000650.pt": checkpoint_cache,
    "meta_000650.json": checkpoint_cache,
    "tokenizer.pkl": tokenizer_cache,
    "token_bytes.pt": tokenizer_cache,
}

for filename, target_dir in files_to_download.items():
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    if target_path.exists():
        print(f"Found cached {filename}.")
        continue

    print(f"Downloading {filename}...")
    # NOTE(review): `local_dir_use_symlinks` is believed to be deprecated in
    # recent huggingface_hub releases; confirm against the installed version.
    hf_hub_download(
        repo_id=model_repo,
        filename=filename,
        local_dir=str(target_dir),
        local_dir_use_symlinks=False,
    )


Found cached model_000650.pt.
Found cached meta_000650.json.
Found cached tokenizer.pkl.
Found cached token_bytes.pt.


In [8]:
# Load checkpoint weights and configuration
checkpoint_dir = base_cache / "chatsft_checkpoints" / "d20"
model_path = checkpoint_dir / "model_000650.pt"
meta_path = checkpoint_dir / "meta_000650.json"

# NOTE(review): torch_dtype is computed here but not applied anywhere in the
# visible cells; confirm whether half precision on CUDA was intended.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

# The checkpoint is downloaded from a third-party Hub repository, so restrict
# unpickling to tensors and plain containers instead of arbitrary objects.
state_dict = torch.load(model_path, map_location=device, weights_only=True)

with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

# The metadata file carries the keyword arguments for the model configuration.
model_config = GPTConfig(**meta["model_config"])
model = GPT(model_config)

# Strip a leading "_orig_mod." from parameter names (presumably left behind by
# torch.compile when the checkpoint was saved). removeprefix is a no-op for
# keys without the prefix, so no separate pre-check is needed.
state_dict = {k.removeprefix("_orig_mod."): v for k, v in state_dict.items()}

# Move model to device and load weights; strict=True fails loudly on any
# missing or unexpected key.
model.to(device)
model.load_state_dict(state_dict, strict=True)
model.eval()

print("Loaded nanochat base model.")
print(f"Model is on device: {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M")


Loaded nanochat base model.
Model is on device: cpu
Model parameters: 561M


In [9]:
# Initialize the nanochat tokenizer from the cache directory that the download
# cell fills with tokenizer.pkl and token_bytes.pt.
# NOTE(review): a .pkl file is presumably unpickled by from_directory; only use
# tokenizer files from a source you trust.
tokenizer_dir = base_cache / "tokenizer"
tokenizer = RustBPETokenizer.from_directory(str(tokenizer_dir))

print("Tokenizer ready.")


Tokenizer ready.


In [10]:
# Look up the ids of the chat-control tokens once so later cells can reuse them.
# Only assistant_end_id is referenced by the visible generation helpers.
bos_id = tokenizer.get_bos_token_id()
assistant_start_id, assistant_end_id, user_start_id, user_end_id = (
    tokenizer.encode_special(special_token)
    for special_token in (
        "<|assistant_start|>",
        "<|assistant_end|>",
        "<|user_start|>",
        "<|user_end|>",
    )
)


In [20]:
def build_completion_prompt(conversation):
    """Render a conversation into prompt token ids for assistant completion.

    The conversation is copied through a JSON round-trip so the caller's
    object is never modified, the content of the final message is blanked,
    and the copy is handed to ``tokenizer.render_for_completion``.

    Args:
        conversation: JSON-serialisable dict with a non-empty ``"messages"``
            list of dicts.

    Returns:
        Whatever ``tokenizer.render_for_completion`` returns for the copy.
    """
    primed = json.loads(json.dumps(conversation))
    final_message = primed["messages"][-1]
    final_message["content"] = ""
    return tokenizer.render_for_completion(primed)


def generate_response_from_tokens(prompt_tokens, *, max_tokens=256, temperature=0.7, top_k=50):
    """Sample a completion for ``prompt_tokens`` from the global ``model``.

    Tokens are pulled one at a time from ``model.generate`` and collection
    stops as soon as ``assistant_end_id`` is produced.

    Args:
        prompt_tokens: Iterable of prompt token ids; it is copied, not mutated.
        max_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.

    Returns:
        ``(text, generated)`` where ``text`` is the decoded, whitespace-stripped
        completion without the end marker and ``generated`` is the list of all
        produced token ids (including the end marker when it was emitted).
    """
    context = list(prompt_tokens)
    emitted = []
    stream = model.generate(context, max_tokens=max_tokens, temperature=temperature, top_k=top_k)
    for token_id in stream:
        # The running context is extended alongside the output list.
        context.append(token_id)
        emitted.append(token_id)
        if token_id == assistant_end_id:
            break
    visible_ids = [token_id for token_id in emitted if token_id != assistant_end_id]
    decoded = tokenizer.decode(visible_ids)
    return decoded.strip(), emitted


def generate_response_for_conversation(conversation, **generation_kwargs):
    """Build the completion prompt for a conversation and sample a reply.

    Args:
        conversation: Conversation dict accepted by ``build_completion_prompt``.
        **generation_kwargs: Passed through unchanged to
            ``generate_response_from_tokens``.

    Returns:
        Dict with ``"prompt_tokens"``, ``"generated_tokens"`` and
        ``"response_text"``.
    """
    prompt = build_completion_prompt(conversation)
    reply_text, reply_tokens = generate_response_from_tokens(prompt, **generation_kwargs)
    return dict(
        prompt_tokens=prompt,
        generated_tokens=reply_tokens,
        response_text=reply_text,
    )


In [22]:
# Load PRE-FORMATTED ScienceQA evaluation subset
DATA_DIR = Path("data")

# The chat-formatted split saved by Notebook 1. The load call and the warning
# below both use this one path so they cannot drift apart; previously the code
# read "test_subset" while the warning named "test_formatted".
TEST_FORMATTED_DIR = DATA_DIR / "test_formatted"

try:
    # Load the data that was already processed in Notebook 1
    test_formatted = load_from_disk(str(TEST_FORMATTED_DIR))
    print(f"Loaded formatted test subset with {len(test_formatted)} examples.")
except FileNotFoundError:
    # Keep the notebook running; later cells check for None.
    test_formatted = None
    print(f"Warning: {TEST_FORMATTED_DIR} not found. Please run Notebook 1 first and save the formatted data.")


Loaded formatted test subset with 500 examples.


In [None]:
from datasets import load_from_disk
from pathlib import Path

# Reload the formatted split and pair each conversation with its reference answer.
test_formatted = load_from_disk(str(DATA_DIR / "test_formatted"))
print(f"Loaded formatted test subset with {len(test_formatted)} examples.")

conversations = [
    {
        "id": example.get("id"),
        "conversation": {"messages": example["messages"]},
        # The message at index 2 is read as the reference answer.
        "expected_response": example["messages"][2]["content"],
    }
    for example in test_formatted
]
print(f"Prepared {len(conversations)} conversations from pre-processed file.")

Loaded formatted test subset with 500 examples.
Prepared 500 conversations from pre-processed file.


In [30]:
# Turn every pre-formatted example into an evaluation record.
# When nothing was loaded, `conversations` is simply left empty.
conversations = []
if test_formatted is not None:
    for example in test_formatted:
        turns = example["messages"]
        conversations.append(
            dict(
                id=example.get("id"),  # may be None when the example has no id
                conversation={"messages": turns},
                expected_response=turns[2]["content"],  # index 2 is the assistant's message
            )
        )
    print(f"Prepared {len(conversations)} conversations from pre-processed file.")


Prepared 500 conversations from pre-processed file.


In [26]:
# Generate baseline responses (configure sample size as needed)
EVAL_SAMPLE_SIZE = 50  # adjust to cover more/less examples

def evaluate_baseline(sample_size=EVAL_SAMPLE_SIZE, temperature=0.7, top_k=50, max_tokens=256):
    """Generate model responses for the first ``sample_size`` conversations.

    Args:
        sample_size: Number of leading entries of the global ``conversations``
            list to evaluate; ``None`` evaluates all of them. The default is
            captured when this cell runs, so re-run the cell after changing
            ``EVAL_SAMPLE_SIZE``.
        temperature: Sampling temperature forwarded to generation.
        top_k: Top-k cutoff forwarded to generation.
        max_tokens: Upper bound on generated tokens per example (previously
            fixed at 256 inside the function body).

    Returns:
        List of dicts, one per evaluated example, with the keys ``index``,
        ``example_id``, ``question``, ``expected``, ``response``,
        ``prompt_tokens`` and ``generated_tokens``.

    Raises:
        AssertionError: If ``conversations`` is empty.
    """
    assert conversations, "No conversations prepared. Ensure Notebook 1 has been run."
    results = []
    for idx, example in enumerate(conversations[:sample_size]):
        convo = example["conversation"]
        generation = generate_response_for_conversation(
            convo,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
        )
        results.append({
            "index": idx,
            "example_id": example.get("id"),
            # The message at index 1 is stored as the question text.
            "question": convo["messages"][1]["content"],
            "expected": example["expected_response"],
            "response": generation["response_text"],
            "prompt_tokens": generation["prompt_tokens"],
            "generated_tokens": generation["generated_tokens"],
        })
    return results

# baseline_results = evaluate_baseline()
# len(baseline_results)


In [27]:
# A choice letter directly followed by '.', ')' or ':' that is not the tail of
# a longer word or number (so "USA." does not count as choice "A").
_MARKED_CHOICE_RE = re.compile(r"(?<![A-Za-z0-9])([A-F])(?=[.):])")
# A choice letter standing alone between whitespace and/or the string edges.
_BARE_CHOICE_RE = re.compile(r"(?<!\S)([A-F])(?!\S)")


def extract_choice_letter(text):
    """Return the multiple-choice letter (A-F) that ``text`` answers with.

    Two patterns are tried in order of confidence, and within each pattern the
    earliest occurrence in the text wins:

    1. a letter followed by ``.``, ``)`` or ``:`` (e.g. ``"B."``, ``"(C)"``);
    2. a letter standing on its own (e.g. ``"the answer is D"`` or ``"B"``).

    Unlike a plain substring check, this neither matches letters that end a
    longer word nor prefers "A" just because it is tested first.

    Args:
        text: Text to inspect. Non-string or empty input yields ``None``.

    Returns:
        The matched uppercase letter, or ``None`` when no choice is found.
    """
    if not isinstance(text, str) or not text:
        return None
    for pattern in (_MARKED_CHOICE_RE, _BARE_CHOICE_RE):
        found = pattern.search(text)
        if found:
            return found.group(1)
    return None


def compute_accuracy(results):
    """Score results by comparing extracted choice letters.

    An entry is scored only when a letter can be extracted from both its
    ``"expected"`` and its ``"response"`` text. Entries where either extraction
    fails are left out of the denominator, so ``accuracy`` is relative to
    ``scored``, not to ``evaluated``.

    Args:
        results: Sequence of dicts with ``"expected"`` and ``"response"`` keys.

    Returns:
        Dict with ``evaluated`` (``len(results)``), ``scored``, ``correct`` and
        ``accuracy`` (``0.0`` when nothing could be scored).
    """
    scored = 0
    correct = 0
    for entry in results:
        expected_letter = extract_choice_letter(entry["expected"])
        predicted_letter = extract_choice_letter(entry["response"])
        if not (expected_letter and predicted_letter):
            continue
        scored += 1
        correct += expected_letter == predicted_letter
    return {
        "evaluated": len(results),
        "scored": scored,
        "correct": correct,
        "accuracy": correct / scored if scored else 0.0,
    }

# metrics = compute_accuracy(baseline_results)
# metrics


In [28]:
# Default location of the saved baseline generations.
BASELINE_OUTPUT_PATH = Path("baseline_responses.json")


def save_baseline_results(results, path=BASELINE_OUTPUT_PATH):
    """Write ``results`` to ``path`` as indented UTF-8 JSON and report it.

    Args:
        results: JSON-serialisable collection of result records.
        path: Destination file (string or ``Path``); overwritten if present.
    """
    destination = Path(path)
    with destination.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(results, handle, indent=2, ensure_ascii=False)
    print(f"Saved {len(results)} baseline responses to {destination}.")

# save_baseline_results(baseline_results)



In [29]:
def summarize_result(entry, idx=0):
    """Print the question, expected answer and model response of one result.

    Args:
        entry: Result dict with ``"question"``, ``"expected"`` and
            ``"response"`` keys.
        idx: Number shown in the "Example" header line.
    """
    print(f"Example {idx}")
    sections = (
        ("Question:\n", "question"),
        ("\nExpected:\n", "expected"),
        ("\nModel Response:\n", "response"),
    )
    for heading, key in sections:
        print(heading, entry[key])

# if baseline_results:
#     summarize_result(baseline_results[0], idx=baseline_results[0]["index"])


## Next Steps

1. Uncomment the evaluation cells to generate baseline responses once the ScienceQA subsets are prepared.
2. Review `baseline_responses.json` to confirm output quality before starting fine-tuning.
3. Keep an eye on GPU availability; running on CPU will be slow for the full evaluation set.
