# 03 · Fine-Tune Nanochat Science QA Model (Clean Version)

This notebook configures and launches supervised fine-tuning for the Nanochat model on the ScienceQA conversational dataset prepared in the earlier steps.


## Prerequisites

- Run `nanochat-QA_finetune.ipynb` to generate the `data/*_formatted` artifacts.
- Run `02_load_base_model.ipynb` to cache the base `sdobson/nanochat` weights.
- Ensure the required Python packages are installed: `torch`, `transformers`, `datasets`, and `accelerate` (needed by `device_map="auto"`).


### Cleanup Summary

- Removed the standalone GPU status check cell from the earlier draft because GPU verification already occurs in Notebook 2 (`02_load_base_model.ipynb`) and the fine-tuning flow here relies on `device_map="auto"`. A lightweight CUDA detection step remains below; it only enables TF32 and warns when no GPU is available.
- No other structural changes were required; remaining cells continue the Task 3 logic for configuring training, running fine-tuning, and saving checkpoints.


In [None]:
# Removed GPU status check; see Cleanup Summary above for rationale.

In [None]:
from pathlib import Path
import json
import torch

# Every working location is derived from the directory the notebook runs in.
project_dir = Path.cwd()
data_dir = project_dir.joinpath("data")
output_dir = project_dir.joinpath("nanochat-science-finetuned")  # passed to TrainingArguments as output_dir
final_dir = project_dir.joinpath("nanochat-science-final")  # destination of the final save_model call

# Echo the resolved locations so a wrong working directory is obvious early.
for _label, _location in (
    ("Project", project_dir),
    ("Data", data_dir),
    ("Output", output_dir),
):
    print(f"{_label} directory: {_location}")


In [None]:
# Choose the compute device once, based on CUDA availability.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")

if not cuda_ready:
    print("Warning: Training on CPU will be extremely slow.")
else:
    print(f"CUDA device detected: {torch.cuda.get_device_name(0)}")
    # Opt in to TF32 math for both CUDA matmuls and cuDNN ops.
    for _backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        _backend.allow_tf32 = True


## Load Prepared Datasets


In [None]:
from datasets import load_from_disk

# Locations of the formatted splits expected under the data directory.
train_dataset_path = data_dir / "train_formatted"
val_dataset_path = data_dir / "val_formatted"

# Fail fast (train first, then validation) before attempting to load anything.
for _required_path in (train_dataset_path, val_dataset_path):
    if not _required_path.exists():
        raise FileNotFoundError(f"Missing dataset: {_required_path}. Run nanochat-QA_finetune.ipynb first.")

# Load both splits in the same order they were checked.
train_dataset, val_dataset = [
    load_from_disk(str(_split_path))
    for _split_path in (train_dataset_path, val_dataset_path)
]

for _loaded_split in (train_dataset, val_dataset):
    print(_loaded_split)
print(f"Train samples: {len(train_dataset)} | Validation samples: {len(val_dataset)}")


## Load Base Model and Tokenizer

This cell loads the model using the `transformers` library, which is required for compatibility with the `Trainer` API used for fine-tuning.


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_name = "sdobson/nanochat"

print(f"Loading tokenizer and model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right for training regardless of whether a pad token already existed.
tokenizer.padding_side = "right"

# Keep the trainable weights in float32. The Trainer applies fp16/bf16 autocast
# itself; loading the weights directly in float16 and then training with
# fp16=True fails in the gradient scaler ("Attempting to unscale FP16 gradients").
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="auto",
)

# Only grow the embedding matrix. Aliasing pad to EOS adds no tokens, and
# shrinking a matrix that is larger than the tokenizer vocabulary would discard rows.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))
model.config.use_cache = False  # Required when using gradient checkpointing

print("Model and tokenizer loaded successfully.")


## Tokenize Conversations for Causal LM Training


In [None]:
# Token budget per example: the tokenizer call below truncates longer conversations to this length.
max_length = 512


def conversation_to_text(messages):
    """Flatten a chat conversation into a single training string.

    Args:
        messages: Iterable of message mappings, each optionally carrying
            ``"role"`` and ``"content"`` keys. ``None`` or an empty
            iterable yields an empty string.

    Returns:
        One ``"<role>: <content>"`` line per message, joined with newlines.
        A missing or ``None`` role falls back to ``"user"``; a missing or
        ``None`` content is treated as empty text. Content is stripped of
        surrounding whitespace.
    """
    lines = []
    for message in messages or ():
        role = message.get("role")
        if role is None:
            role = "user"
        content = message.get("content")
        # A key that is present but null must not crash on .strip().
        text = "" if content is None else content.strip()
        lines.append(f"{role}: {text}")
    return "\n".join(lines)


def tokenize_conversation(example):
    """Tokenize one dataset row for causal-LM training.

    Reads the row's ``"messages"`` field, flattens it with
    ``conversation_to_text`` and encodes the result with the notebook-level
    ``tokenizer``. Sequences are truncated to ``max_length`` tokens and left
    unpadded; an attention mask is requested. Returns whatever the tokenizer
    call returns.
    """
    flattened = conversation_to_text(example["messages"])
    encode_options = dict(
        truncation=True,
        max_length=max_length,
        padding=False,
        return_attention_mask=True,
    )
    return tokenizer(flattened, **encode_options)


def _tokenize_split(split, progress_label):
    """Run ``tokenize_conversation`` over every row of ``split``, dropping its original columns."""
    return split.map(
        tokenize_conversation,
        remove_columns=split.column_names,
        desc=progress_label,
    )


train_tokenized = _tokenize_split(train_dataset, "Tokenizing train dataset")
val_tokenized = _tokenize_split(val_dataset, "Tokenizing validation dataset")

# Show the resulting dataset summaries, train first.
for _tokenized_split in (train_tokenized, val_tokenized):
    print(_tokenized_split)


## Configure Data Collator


In [None]:
# mlm=False switches the collator away from masked-LM mode (causal LM training).
# NOTE(review): when pad_token is aliased to eos_token, this collator presumably
# excludes EOS positions from the loss together with padding — confirm whether
# the model still learns to terminate its answers.
collator_options = {"tokenizer": tokenizer, "mlm": False}
data_collator = DataCollatorForLanguageModeling(**collator_options)

print("Data collator initialized (causal LM mode).")


## Set Training Arguments


In [None]:
import inspect
from datetime import datetime, timezone

output_dir.mkdir(parents=True, exist_ok=True)

# Use bf16 only on GPUs with compute capability major version 8 or newer;
# otherwise fall back to fp16 on CUDA, and to full precision on CPU.
supports_bf16 = False
if torch.cuda.is_available():
    compute_capability = torch.cuda.get_device_capability()
    supports_bf16 = compute_capability[0] >= 8

# datetime.utcnow() is deprecated; an aware UTC timestamp formats identically.
run_name = f"nanochat-science-ft-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed version accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir=str(output_dir),
    run_name=run_name,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch of 16 sequences per device
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir=str(output_dir / "logs"),
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,  # kept equal to eval_steps so the best checkpoint can be reloaded
    save_total_limit=2,
    eval_steps=200,
    metric_for_best_model="loss",
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available() and not supports_bf16,
    bf16=supports_bf16,
    gradient_checkpointing=True,
    report_to="none",
    dataloader_num_workers=2,
    optim="adamw_torch",
    **{eval_strategy_key: "steps"},
)

print(training_args)


## Initialize Trainer


In [None]:
import inspect

if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()  # Required for gradient checkpointing in newer HF versions

# Newer transformers releases replaced the Trainer's `tokenizer=` keyword with
# `processing_class=`. Use whichever name the installed version accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    **{tokenizer_key: tokenizer},
)

print("Trainer initialized.")


## Run Fine-Tuning


In [None]:
# None starts a fresh run; point this at a checkpoint directory to continue one.
resume_checkpoint = None  # Set to checkpoint path to resume training if interrupted.

print("Starting fine-tuning — this may take several hours depending on GPU availability...")
train_result = trainer.train(resume_from_checkpoint=resume_checkpoint)
trainer.save_state()

# Log the training metrics first, then persist them.
for _record_metrics in (trainer.log_metrics, trainer.save_metrics):
    _record_metrics("train", train_result.metrics)

print("Training complete.")


## Evaluate Best Checkpoint


In [None]:
# Evaluate on the validation split the Trainer was constructed with.
eval_metrics = trainer.evaluate()

# Log the evaluation metrics first, then persist them.
for _record_metrics in (trainer.log_metrics, trainer.save_metrics):
    _record_metrics("eval", eval_metrics)

print(eval_metrics)


## Save Final Model Artifacts


In [None]:
final_dir.mkdir(parents=True, exist_ok=True)

# Model weights and tokenizer files are written to the same directory.
export_target = str(final_dir)
trainer.save_model(export_target)
tokenizer.save_pretrained(export_target)

# The evaluation entry is None when the evaluation cell was skipped.
metrics_summary = dict(
    train=train_result.metrics,
    eval=locals().get("eval_metrics"),
)

metrics_path = final_dir / "training_metrics.json"
metrics_path.write_text(json.dumps(metrics_summary, indent=2))

print(f"Saved fine-tuned model and tokenizer to {final_dir}")
print(f"Metrics written to {metrics_path}")


## (Optional) Push to Hugging Face Hub


In [None]:
# Uploading is opt-in. Authenticate first (e.g. `huggingface-cli login`), set
# `hub_repo_id` to your own namespace, then flip `push_to_hub` to True.
push_to_hub = False
hub_repo_id = "your-username/nanochat-science-qa"

if push_to_hub:
    # Push the fine-tuned weights and tokenizer to the same explicit repository.
    # The earlier draft called trainer.push_to_hub(), which does not take the
    # repo id chosen here.
    trainer.model.push_to_hub(hub_repo_id)
    tokenizer.push_to_hub(hub_repo_id)
    print(f"Pushed model and tokenizer to https://huggingface.co/{hub_repo_id}")
else:
    print("Set push_to_hub = True and hub_repo_id in this cell to push the model to the Hugging Face Hub.")


### Next Steps

- Validate the fine-tuned model in `04_evaluation.ipynb`.
- Build interactive demos in `05_interactive_demo.ipynb`.
- Document cost, runtime, and findings for the final report.
