# 03 Â· Fine-Tune Nanochat Science QA Model (Native Trainer)

This notebook configures and launches supervised fine-tuning for the Nanochat model on the ScienceQA conversational dataset, using the native `nanochat` training loop.


## Prerequisites

- Run `nanochat-QA_finetune.ipynb` to generate the `data/*_formatted` artifacts.
- Run `02_load_base_model.ipynb` to cache the base `sdobson/nanochat` weights.
- Ensure the required Python packages from `nanochat/pyproject.toml` are installed.


In [1]:
import subprocess

print("Checking GPU status with nvidia-smi (ignore errors if no NVIDIA GPU is available)...")
try:
    gpu_info = subprocess.check_output(["nvidia-smi"])  # noqa: S606 (intentional command execution)
    print(gpu_info.decode())
except (FileNotFoundError, subprocess.CalledProcessError) as exc:
    print(f"nvidia-smi unavailable: {exc}")


Checking GPU status with nvidia-smi (ignore errors if no NVIDIA GPU is available)...
Sun Nov  9 22:33:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.65.06              Driver Version: 580.65.06      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro T2000                   Off |   00000000:01:00.0 Off |                  N/A |
| N/A   52C    P8              2W /   40W |       3MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------

In [2]:
from pathlib import Path
import json
import torch
from contextlib import nullcontext

project_dir = Path.cwd()
data_dir = project_dir / "data"
output_dir = project_dir / "nanochat-science-finetuned"
final_dir = project_dir / "nanochat-science-final"

print(f"Project directory: {project_dir}")
print(f"Data directory: {data_dir}")
print(f"Output directory: {output_dir}")


Project directory: /home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project
Data directory: /home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project/data
Output directory: /home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project/nanochat-science-finetuned


In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA device detected: {torch.cuda.get_device_name(0)}")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    ptdtype = torch.bfloat16
    autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=ptdtype)
else:
    device = torch.device("cpu")
    ptdtype = torch.float32
    autocast_ctx = nullcontext()
    print("Warning: Training on CPU will be extremely slow.")


CUDA device detected: Quadro T2000


  self.setter(val)


## Load Prepared Datasets


In [16]:
from datasets import load_from_disk

train_dataset_path = data_dir / "train_formatted"
val_dataset_path = data_dir / "val_formatted"

if not train_dataset_path.exists():
    raise FileNotFoundError(f"Missing dataset: {train_dataset_path}. Run nanochat-QA_finetune.ipynb first.")
if not val_dataset_path.exists():
    raise FileNotFoundError(f"Missing dataset: {val_dataset_path}. Run nanochat-QA_finetune.ipynb first.")

train_dataset = load_from_disk(str(train_dataset_path))
val_dataset = load_from_disk(str(val_dataset_path))

print(f"Train samples: {len(train_dataset)} | Validation samples: {len(val_dataset)}")


Train samples: 3000 | Validation samples: 500


## Load Base Model and Tokenizer


In [19]:
import sys
from pathlib import Path
import pprint

# The 'nanochat' git repository is a subdirectory of our project.
# To resolve the module import, we must add its parent directory to Python's path.
# project_dir is defined in a previous cell.
nanochat_repo_path = project_dir / "nanochat"
resolved_repo_path = nanochat_repo_path.resolve()

# Ensure the repo directory is searched first when importing packages named "nanochat".
if str(resolved_repo_path) not in sys.path:
    sys.path.insert(0, str(resolved_repo_path))
    print(f"Added '{resolved_repo_path}' to sys.path")
else:
    print(f"'{resolved_repo_path}' already present in sys.path")

# If a different 'nanochat' package was previously imported, remove it so our local copy is used.
if "nanochat" in sys.modules:
    removed = sys.modules.pop("nanochat")
    print(f"Removed cached module: {removed}")

print("Top 3 entries in sys.path:")
pprint.pprint(sys.path[:3])


'/home/nidhinninan/Desktop/UCMO Classes/Natural Language Processing/Project/nanochat' already present in sys.path
Removed cached module: <module 'nanochat' (<_frozen_importlib_external.NamespaceLoader object at 0x73c8b01a4c90>)>
Top 3 entries in sys.path:
['/home/nidhinninan/Desktop/UCMO Classes/Natural Language '
 'Processing/Project/nanochat',
 '/home/nidhinninan/Desktop/UCMO Classes/Natural Language '
 'Processing/Project/nanochat',
 '/home/nidhinninan/anaconda3/envs/CSCE5720/lib/python311.zip']


Loading model and tokenizer using native nanochat functions...


FileNotFoundError: [Errno 2] No such file or directory: '/home/nidhinninan/.cache/nanochat/base_checkpoints'

## Create Data Loader


In [None]:
# This is the data loader from scripts/chat_sft.py
def sft_data_generator(dataset, batch_size):
    pad_token_id = tokenizer.encode_special("<|assistant_end|>") # use <|assistant_end|> as the pad token is ok, these positions are masked in the loss
    
    # prepares a list of tokenized conversations into a batch and yields
    def collate_and_yield(batch):
        nrows = len(batch)
        ncols = max(len(ids) for ids, mask in batch) - 1 # seq of n creates inputs/targets of n-1
        inputs = torch.full((nrows, ncols), pad_token_id, dtype=torch.long)
        targets = torch.full((nrows, ncols), -1, dtype=torch.long) # -1 is ignore index
        for i, (ids, mask) in enumerate(batch):
            n = len(ids)
            ids_tensor = torch.tensor(ids, dtype=torch.long)
            inputs[i, :n-1] = ids_tensor[:-1]
            # recall -1 is the ignore index, so mask out targets where mask is 0
            row_targets = ids_tensor[1:]
            # mask[1:] omits the mask for the BOS token, which is never a target atm so it's ok
            mask_tensor = torch.tensor(mask[1:], dtype=torch.long)
            row_targets[mask_tensor == 0] = -1 # mask out targets where mask is 0
            targets[i, :n-1] = row_targets
        inputs = inputs.to(device) # move to device
        targets = targets.to(device)
        return inputs, targets
    
    # iterates over the dataset in epochs, tokenizes
    batch = []
    while True:
        for i in range(len(dataset)):
            doc = dataset[i]
            ids, mask = tokenizer.render_conversation(doc)
            batch.append((ids, mask))
            if len(batch) == batch_size:
                yield collate_and_yield(batch)
                batch = []

print("Data generator `sft_data_generator` is defined.")


## Configure Training


In [None]:
# SFT Hyperparameters (adapted from scripts/chat_sft.py)
device_batch_size = 2 # Max to avoid OOM on a consumer GPU
num_epochs = 2
target_examples_per_step = 16 # Effective batch size via gradient accumulation
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
weight_decay = 0.0
init_lr_frac = 0.02
eval_every = 100 # Steps
eval_steps = 50

# Calculate number of iterations and gradient accumulation
grad_accum_steps = target_examples_per_step // device_batch_size
num_iterations = (len(train_dataset) // target_examples_per_step) * num_epochs

print(f"Effective batch size: {target_examples_per_step}")
print(f"Device batch size: {device_batch_size}")
print(f"Gradient accumulation steps: {grad_accum_steps}")
print(f"Total training iterations: {num_iterations}")

# Create data loaders (generators)
train_loader = sft_data_generator(train_dataset, batch_size=device_batch_size)
val_loader = sft_data_generator(val_dataset, batch_size=device_batch_size)


## Initialize Optimizer


In [None]:
optimizers = model.setup_optimizers(
    unembedding_lr=unembedding_lr,
    embedding_lr=embedding_lr,
    matrix_lr=matrix_lr,
    weight_decay=weight_decay,
    )

# Set the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
    for group in opt.param_groups:
        group["lr"] = group["lr"] * init_lr_frac
        group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later

print("Optimizers initialized.")


## Run Fine-Tuning


In [None]:
import time

# Learning rate scheduler
def get_lr_multiplier(it):
    lrm = 1.0 - it / num_iterations
    return lrm

print("Starting fine-tuning...")
train_iter = iter(train_loader)
val_iter = iter(val_loader)
val_loss = 0.0
step_time_ema = 0.0

for step in range(num_iterations):
    t0 = time.time()
    last_step = step == num_iterations - 1

    # Validation loop
    if last_step or step % eval_every == 0:
        model.eval()
        losses = []
        for _ in range(eval_steps):
            val_inputs, val_targets = next(val_iter)
            with torch.no_grad(), autocast_ctx:
                loss = model(val_inputs, val_targets)
            losses.append(loss)
        val_loss = torch.stack(losses).mean().item()
        print(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
        model.train()

    # Training loop
    num_tokens = 0
    for micro_step in range(grad_accum_steps):
        train_inputs, train_targets = next(train_iter)
        with autocast_ctx:
            loss = model(train_inputs, train_targets)
        train_loss = loss.detach() # for logging
        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
        loss.backward() # accumulate the gradient
        num_tokens += (train_targets >= 0).sum()
    
    # Update learning rate
    lrm = get_lr_multiplier(step)
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * lrm

    # Step the optimizers
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)

    # Timing and logging
    t1 = time.time()
    dt = t1 - t0
    step_time_ema = dt if step_time_ema == 0 else 0.9 * step_time_ema + 0.1 * dt
    tokens_per_sec = num_tokens.item() / dt
    
    print(f"Step {step:05d}/{num_iterations:05d} | Train loss: {train_loss.item():.6f} | LR mult: {lrm:.4f} | Tok/s: {tokens_per_sec:,.0f} | Step time: {step_time_ema:.3f}s")

print("\nTraining complete.")


## Save Final Model Artifacts


In [None]:
final_dir.mkdir(parents=True, exist_ok=True)

print(f"Saving final model checkpoint to {final_dir}...")

# Create metadata for the checkpoint
checkpoint_meta = {
    "step": step,
    "val_loss": val_loss,
    "model_config": model.config.__dict__,
}

# Use the nanochat save_checkpoint function
save_checkpoint(
    checkpoint_dir=str(final_dir),
    step=step, # You can use the final step number
    model_state=model.state_dict(),
    # We are not saving optimizer state in this simplified version
    optimizer_state=None, 
    meta=checkpoint_meta,
)

# The tokenizer is a folder, so we can't save it with the checkpoint manager.
# We'll just copy the files if they aren't already there.
tokenizer.save_vocabulary(str(final_dir))

print(f"Saved fine-tuned model and tokenizer to {final_dir}")


### Next Steps

- Validate the fine-tuned model in `04_evaluation.ipynb`.
- Build interactive demos in `05_interactive_demo.ipynb`.
- Document cost, runtime, and findings for the final report.
