In [1]:
import torch

# Pick the compute device once: CUDA when an NVIDIA GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("GPU is available. Using CUDA.")
    # Report the name of the GPU at index 0
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU. This will be very slow.")

GPU is available. Using CUDA.
Device Name: Quadro T2000


In [None]:
# This cell is now empty. The logic has been moved to the next cell.

In [8]:
import torch
import os
from pathlib import Path
import subprocess
import pickle
import json
from huggingface_hub import hf_hub_download

# --- 1. Set up the nanochat environment ---
if not Path('nanochat').exists():
    print("Cloning karpathy/nanochat repository...")
    subprocess.run(['git', 'clone', 'https://github.com/karpathy/nanochat.git'], check=True)
else:
    print("nanochat repository already exists.")

# --- 1a. Install dependencies using uv ---
print("\nInstalling dependencies...")
!pip install -q uv
%cd nanochat
!uv sync
%cd ..

# Make the cloned repository importable: its root (./nanochat) goes on sys.path,
# and the package directory inside it (nanochat/nanochat) is then importable
# as `nanochat`.
import sys

package_path = os.path.abspath('nanochat')
if package_path not in sys.path:
    # Prepend so this entry is searched before everything already on the path
    sys.path[:0] = [package_path]

# Import from the correct module within the package
from nanochat.gpt import GPT, GPTConfig

# --- 2. Manually download model and tokenizer files ---
print("Downloading nanochat model and tokenizer files...")
model_repo = "sdobson/nanochat"
cache_dir = Path.home() / '.cache' / 'nanochat'

# Remote filename -> subdirectory of the cache it is stored under
files_to_download = {
    "model_000650.pt": "chatsft_checkpoints/d20/",
    "meta_000650.json": "chatsft_checkpoints/d20/",
    "tokenizer.pkl": "tokenizer/",
    "token_bytes.pt": "tokenizer/"
}

# Fetch only the files that are not already present in the cache
for filename, target_subdir in files_to_download.items():
    local_dir = cache_dir / target_subdir
    local_dir.mkdir(parents=True, exist_ok=True)
    target_path = local_dir / filename
    if target_path.exists():
        print(f"{filename} already exists. Skipping download.")
        continue
    print(f"Downloading {filename}...")
    hf_hub_download(repo_id=model_repo, filename=filename, local_dir=local_dir, local_dir_use_symlinks=False)

# --- 3. Load the model using nanochat's own scripts ---
print("\nLoading model using nanochat's native functions...")
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the device is pinned to CPU here, and this rebinds the `device`
# chosen by the first cell (a torch.device) to a plain string — confirm that
# both are intended before switching to the commented-out line above.
device = 'cpu'

# Locate the checkpoint weights and their metadata inside the cache
checkpoint_path = cache_dir / "chatsft_checkpoints" / "d20" / "model_000650.pt"
meta_path = cache_dir / "chatsft_checkpoints" / "d20" / "meta_000650.json"

# SECURITY: torch.load is pickle-based and can execute arbitrary code while
# unpickling. Only load checkpoints from a source you trust; consider passing
# weights_only=True if the installed PyTorch supports it for this file.
state_dict = torch.load(checkpoint_path, map_location=device)

with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

# Normalise the keys BEFORE loading: strip a leading '_orig_mod.' (the prefix
# torch.compile-wrapped modules add to their parameter names). Loading first
# with strict=True would raise on prefixed keys and never reach this fix.
# The dict is edited in place so any attributes attached to it are kept.
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

# Build the model from the saved configuration and load the weights exactly
# once; strict=True makes any missing or unexpected key an error.
gptconf = GPTConfig(**meta['model_config'])
model = GPT(gptconf)
model.load_state_dict(state_dict, strict=True)
model.eval()  # Set to evaluation mode
model.to(device)

print("\nModel loaded successfully!")
print(f"Model is on device: {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M")

# --- 4. Load the custom tokenizer ---
class NanoTokenizer:
    """Thin adapter that forwards encode/decode calls to the nanochat tokenizer."""

    def __init__(self, cache_dir):
        """Create the wrapped tokenizer from the 'tokenizer' folder under `cache_dir`.

        `cache_dir` must support the `/` path operator (e.g. a pathlib.Path).
        """
        # Imported at construction time, from the nanochat package.
        # NOTE(review): confirm that nanochat.tokenizer really exposes a
        # `Tokenizer` class accepting a directory path string.
        from nanochat.tokenizer import Tokenizer
        self.tokenizer_model = Tokenizer(str(cache_dir / 'tokenizer'))

    def encode(self, text, bos=True, eos=True):
        """Encode `text` with the wrapped tokenizer, passing `bos`/`eos` through as keywords."""
        encoded = self.tokenizer_model.encode(text, bos=bos, eos=eos)
        return encoded

    def decode(self, tokens):
        """Decode `tokens` with the wrapped tokenizer and return its result."""
        decoded = self.tokenizer_model.decode(tokens)
        return decoded

# Instantiate the tokenizer wrapper against the same cache directory used for the downloads
tokenizer = NanoTokenizer(cache_dir=cache_dir)
print("Custom tokenizer loaded.")