In [1]:
import sys
from pathlib import Path
import subprocess

# Fetch the nanochat sources once; a later run finds the directory and skips the clone.
nanochat_repo = Path("nanochat")
if nanochat_repo.exists():
    print("nanochat repository already present.")
else:
    print("Cloning karpathy/nanochat...")
    clone_command = ["git", "clone", "https://github.com/karpathy/nanochat.git", str(nanochat_repo)]
    # check=True turns a non-zero git exit status into CalledProcessError.
    subprocess.run(clone_command, check=True)

# Put the checkout at the front of sys.path so the nanochat imports that follow can find it.
package_path = nanochat_repo.resolve()
package_dir = str(package_path)
if package_dir not in sys.path:
    sys.path.insert(0, package_dir)

from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import RustBPETokenizer

Cloning karpathy/nanochat...


In [2]:
!pip install -e ./nanochat

Obtaining file:///content/nanochat
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting files-to-prompt>=0.6 (from nanochat==0.1.0)
  Downloading files_to_prompt-0.6-py3-none-any.whl.metadata (7.4 kB)
Collecting psutil>=7.1.0 (from nanochat==0.1.0)
  Downloading psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl.metadata (23 kB)
Collecting setuptools>=80.9.0 (from nanochat==0.1.0)
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading files_to_prompt-0.6-py3-none-any.whl (10 kB)
Downloading psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl (263 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.3/2

In [3]:
import torch

# Report which accelerators PyTorch can see (availability is queried once and reused).
cuda_available = torch.cuda.is_available()
if not cuda_available:
    print("CUDA is not available. PyTorch is using the CPU.")
else:
    device_count = torch.cuda.device_count()
    print(f"CUDA is available. You have {device_count} GPU(s) available.")
    for gpu_index in range(device_count):
        gpu_name = torch.cuda.get_device_name(gpu_index)
        print(f"GPU {gpu_index}: {gpu_name}")

# Choose the torch device: the GPU when one is available, otherwise the CPU.
print("\nLoading model using nanochat's native functions...")
device = torch.device('cuda') if cuda_available else torch.device('cpu')
print(f"Device selected: {device}")


CUDA is available. You have 1 GPU(s) available.
GPU 0: NVIDIA A100-SXM4-40GB

Loading model using nanochat's native functions...
Device selected: cuda


In [4]:
from huggingface_hub import hf_hub_download

# Download checkpoint artifacts from Hugging Face (if needed)
model_repo = "sdobson/nanochat"
base_cache = Path.home() / ".cache" / "nanochat"

# Maps each file name in the Hub repo to the local directory it is stored in.
files_to_download = {
    "model_000650.pt": base_cache / "chatsft_checkpoints" / "d20",
    "meta_000650.json": base_cache / "chatsft_checkpoints" / "d20",
    "tokenizer.pkl": base_cache / "tokenizer",
    "token_bytes.pt": base_cache / "tokenizer",
}

for filename, target_dir in files_to_download.items():
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    if target_path.exists():
        # Already on disk from a previous run; skip the network round-trip.
        print(f"Found cached {filename}.")
        continue

    print(f"Downloading {filename}...")
    # `local_dir_use_symlinks` is deprecated and ignored by hf_hub_download (it only
    # triggered a warning), so it is no longer passed. With `local_dir` set, the file
    # is written as a regular file at target_dir / filename.
    hf_hub_download(
        repo_id=model_repo,
        filename=filename,
        local_dir=str(target_dir),
    )


Downloading model_000650.pt...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


model_000650.pt:   0%|          | 0.00/2.08G [00:00<?, ?B/s]

Downloading meta_000650.json...


meta_000650.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading tokenizer.pkl...


tokenizer.pkl:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading token_bytes.pt...


token_bytes.pt:   0%|          | 0.00/264k [00:00<?, ?B/s]

In [5]:
# Build the tokenizer from the files stored in the cache's "tokenizer" folder.
tokenizer_dir = base_cache.joinpath("tokenizer")
tokenizer = RustBPETokenizer.from_directory(f"{tokenizer_dir}")

print("Tokenizer ready.")


Tokenizer ready.


In [6]:
# Look up the special token IDs once so later cells can reuse them.
bos_id = tokenizer.get_bos_token_id()
(
    assistant_start_id,
    assistant_end_id,
    user_start_id,
    user_end_id,
) = map(
    tokenizer.encode_special,
    (
        "<|assistant_start|>",
        "<|assistant_end|>",
        "<|user_start|>",
        "<|user_end|>",
    ),
)

In [7]:
!pip install -q transformers datasets torch accelerate huggingface_hub bitsandbytes

from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub using the HF_TOKEN entry read through
# google.colab.userdata (Colab secrets, not Kaggle secrets).
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("✅ Successfully logged in to Hugging Face using Colab Secret.")
except Exception as e:
    # Deliberately best-effort: report the failure and let the notebook continue.
    print(f"⚠️ Failed to login with Colab Secret: {e}")
    # Fallback for local testing or if secret is missing
    # login()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Successfully logged in to Hugging Face using Kaggle Secret.


In [8]:
# Cell: Define project paths
from pathlib import Path

# Root folder for everything this notebook writes; created on first run.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# One output directory per formatted split: data/<split>_formatted
train_formatted_path, val_formatted_path, test_formatted_path = (
    DATA_DIR / "train_formatted",
    DATA_DIR / "val_formatted",
    DATA_DIR / "test_formatted",
)

In [9]:
# Cell: Download and subset ScienceQA dataset
from datasets import load_dataset

# Load every split of ScienceQA from the Hugging Face Hub.
full_dataset = load_dataset('derek-thomas/ScienceQA')

# Sampling configuration: one seed for all splits, and the requested row count per split.
SUBSET_SEED = 42
SUBSET_SIZES = {"train": 12000, "validation": 4000, "test": 4000}


def _random_subset(split, size, seed=SUBSET_SEED):
    """Return up to `size` rows of `split`, chosen by a seeded shuffle.

    The requested size is clamped to the split length so that `select` is never
    asked for indices beyond the end of a smaller split.
    """
    row_count = min(size, len(split))
    return split.shuffle(seed=seed).select(range(row_count))


# Seeded random subsets for fine-tuning (a plain random sample, not class-balanced).
train_subset = _random_subset(full_dataset['train'], SUBSET_SIZES['train'])
val_subset = _random_subset(full_dataset['validation'], SUBSET_SIZES['validation'])
test_subset = _random_subset(full_dataset['test'], SUBSET_SIZES['test'])

print(f"Train subset: {len(train_subset)} | Validation subset: {len(val_subset)} | Test subset: {len(test_subset)}")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Train subset: 12000 | Validation subset: 4000 | Test subset: 4000


In [10]:
# Cell: Define formatting function
def format_scienceqa_for_chat(example):
    """Turn one ScienceQA record into a system/user/assistant chat example.

    Args:
        example: Mapping with 'question', 'choices' (a sequence) and 'answer'
            (an index into 'choices'). 'solution', 'lecture' and 'id' are optional.

    Returns:
        A dict with the record's 'id' (None when absent) and a 'messages' list of
        three role/content dicts: a fixed system prompt, the question followed by
        lettered choices, and the answer followed by any explanation/background.
    """
    # User turn: the question, a blank line, then one "A. ..." line per choice.
    prompt_head = example['question']
    options = example['choices']
    lettered = []
    for position, option in enumerate(options):
        lettered.append(f"{chr(ord('A') + position)}. {option}")
    options_block = "\n".join(lettered)
    user_turn = f"{prompt_head}\n\n{options_block}"

    # Assistant turn: the answer letter and text, then the optional sections.
    picked = example['answer']
    picked_text = options[picked]
    sections = [f"The correct answer is {chr(ord('A') + picked)}. {picked_text}"]
    for heading, field in (("Explanation", "solution"), ("Background", "lecture")):
        extra = example.get(field)
        if extra:  # missing, None and empty values are all skipped
            sections.append(f"{heading}: {extra}")
    assistant_turn = "\n\n".join(sections)

    system_turn = "You are a helpful science tutor for elementary through high school students. Explain concepts clearly with examples."
    return {
        "id": example.get("id"),  # kept so examples can be traced back to the source row
        "messages": [
            {"role": "system", "content": system_turn},
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ],
    }

In [11]:
# Cell: Apply formatting and save datasets
# Convert every subset to chat format; the original ScienceQA columns are dropped.
train_formatted, val_formatted, test_formatted = [
    subset.map(format_scienceqa_for_chat, remove_columns=subset.column_names)
    for subset in (train_subset, val_subset, test_subset)
]

# Write each formatted split to its own directory.
for formatted_split, split_path in (
    (train_formatted, train_formatted_path),
    (val_formatted, val_formatted_path),
    (test_formatted, test_formatted_path),
):
    formatted_split.save_to_disk(str(split_path))

print(f"Saved formatted datasets to {DATA_DIR}")
print(f"Train: {len(train_formatted)} | Val: {len(val_formatted)} | Test: {len(test_formatted)}")


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saved formatted datasets to data
Train: 12000 | Val: 4000 | Test: 4000


In [1]:
# !python /content/test_train.py

SciAssist Training Test

✓ All prerequisites check passed

Starting training script...
--------------------------------------------------------------------------------
SciAssist Fine-Tuning
Autodetected device type: cuda
Overriding device type to 'cuda' via SCIASSIST_FORCE_DEVICE
  _C._set_float32_matmul_precision(precision)
2025-11-25 17:22:32,146 - nanochat.common - [32m[1mINFO[0m - Distributed world size: 1

Device: cuda
World size: 1

⚠️  TEST MODE ENABLED - Will run only 3 training steps
   Set TEST_MODE = False in script for full training


Loading base model...
2025-11-25 17:22:33,939 - nanochat.checkpoint_manager - [32m[1mINFO[0m - Building model with config: {'sequence_len': 2048, 'vocab_size': 65536, 'n_layer': 20, 'n_head': 10, 'n_kv_head': 10, 'n_embd': 1280}
Model loaded: 20 layers, ~561M parameters

Loading datasets...
Train: 12000, Val: 4000

Training configuration:
  Batch size (per device): 1
  Gradient accumulation steps: 64
  Total iterations: 3

Initializing o

In [15]:
from google.colab import drive
# Mount Google Drive at the path a later cell copies the checkpoint archive into.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import os
# Both flags are set to "0": test mode off (full training loop) and training not skipped.
os.environ.update(
    SCIASSIST_TEST_MODE="0",
    SCIASSIST_SKIP_TRAINING="0",
)

In [18]:
# Run the fine-tuning script as a shell command.
# NOTE(review): presumably the script reads the SCIASSIST_* environment variables set
# in the kernel (the script itself is not shown here) — confirm against sciassist_train.py.
!python sciassist_train.py

SciAssist Fine-Tuning
Autodetected device type: cuda
  _C._set_float32_matmul_precision(precision)
2025-11-25 18:26:03,731 - nanochat.common - [32m[1mINFO[0m - Distributed world size: 1

Device: cuda
World size: 1

Loading base model...
2025-11-25 18:26:05,422 - nanochat.checkpoint_manager - [32m[1mINFO[0m - Building model with config: {'sequence_len': 2048, 'vocab_size': 65536, 'n_layer': 20, 'n_head': 10, 'n_kv_head': 10, 'n_embd': 1280}
Model loaded: 20 layers, ~561M parameters

Loading datasets...
Train: 12000, Val: 4000

Training configuration:
  Batch size (per device): 8
  Gradient accumulation steps: 8
  Total iterations: 748

Initializing optimizers...
Scaling the LR for the AdamW parameters ∝1/√(1280/768) = 0.774597
✓ Tokenizer copied to finetuned_model_checkpoint/tokenizer

Starting training...
--------------------------------------------------------------------------------
Step 00000 | Val loss: 1.967224
  → New best! (was inf)
Step 00000/00748 | Train loss: 1.868409 

In [19]:
import shutil
from google.colab import files

# Folder to archive, and the archive's base name (no extension).
folder_to_zip = '/content/finetuned_model_checkpoint'
output_filename = 'finetuned_model_checkpoint'

# --- 1. Create the zip archive ---
print(f"Zipping the folder: {folder_to_zip}...")
# make_archive appends ".zip" to base_name on its own, so the same suffix is added
# below when building the path for the message.
shutil.make_archive(base_name=output_filename, format='zip', root_dir=folder_to_zip)
zip_path = output_filename + ".zip"
print(f"Successfully created {zip_path}")

Zipping the folder: /content/finetuned_model_checkpoint...
Successfully created finetuned_model_checkpoint.zip
Downloading finetuned_model_checkpoint.zip...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
import shutil
import os

source_path = '/content/finetuned_model_checkpoint.zip'
destination_dir = '/content/drive/My Drive/Colab Notebooks' # Default destination

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(destination_dir, exist_ok=True)

# Copying into a directory keeps the source file's base name.
copied_name = os.path.basename(source_path)
shutil.copy(src=source_path, dst=destination_dir)
print(f"File '{copied_name}' copied to '{destination_dir}'")

File 'finetuned_model_checkpoint.zip' copied to '/content/drive/My Drive/Colab Notebooks'
