# Mason Transformer — Colab Training
**125M-parameter GPT-style model for construction AI**

### Before you run:
1. **Runtime → Change runtime type → T4 GPU → Save**
2. Then **Runtime → Run all** (or run cells one by one with Shift+Enter)

Cells 1–4 take about 5 minutes in total. Cell 5 (training) runs for 3–5 hours on a T4, or roughly 45–90 minutes on an A100.

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 1 — Clone repo and install dependencies
#
# Clones the contech1 repo into /content (or pulls if it is already
# there), makes it the working directory, installs the Python
# dependencies, and fails fast if any required file is absent.
# ══════════════════════════════════════════════════════════════
import os
import subprocess
import sys

REPO = '/content/contech1'

if not os.path.exists(REPO):
    print('Cloning contech1...')
    subprocess.run(['git', 'clone', 'https://github.com/masonearl/contech1.git', REPO], check=True)
else:
    print('Repo exists — pulling latest fixes...')
    subprocess.run(['git', '-C', REPO, 'pull'], check=True)

# Every path used from here on is relative to the repo root.
os.chdir(REPO)
print(f'Working directory: {os.getcwd()}')

# Run pip through the kernel's own interpreter so the packages are installed
# into the environment this notebook imports from; a bare `pip` found on PATH
# is not guaranteed to belong to the same Python.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-q', 'torch', 'openpyxl'],
    check=True,
)

# Verify all required files are present (paths are relative to REPO).
required = [
    'build_corpus.py', 'config.py', 'tokenizer.py', 'data.py',
    'generate.py', 'model.py', 'train.py',
    'data/materials.json', 'data/labor.json', 'data/equipment.json',
    'data/production_rates.json', 'data/terms.json',
]
missing = [f for f in required if not os.path.exists(f)]
if missing:
    raise RuntimeError(f'MISSING FILES: {missing}')
print('\nAll required files present. Ready to train.')

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 2 — Check GPU
#
# Stops with an error when no CUDA device is visible; otherwise
# prints the device name, memory, CUDA/PyTorch versions, bfloat16
# support, and the batch size that matches the memory tier.
# ══════════════════════════════════════════════════════════════
import torch

has_gpu = torch.cuda.is_available()
if not has_gpu:
    raise RuntimeError(
        'NO GPU DETECTED.\nGo to Runtime → Change runtime type → T4 GPU, then restart.'
    )

name = torch.cuda.get_device_name(0)
props = torch.cuda.get_device_properties(0)
mem = props.total_memory / 1e9  # bytes -> decimal gigabytes

# Labels are padded so that, with print's default single-space separator,
# the values line up in one column.
print('GPU:     ', name)
print('VRAM:    ', f'{mem:.1f} GB')
print('CUDA:    ', torch.version.cuda)
print('PyTorch: ', torch.__version__)
print('bfloat16:', torch.cuda.is_bf16_supported())
print()

# 35 GB is the cut-off between the two batch-size tiers reported below.
print(
    'A100 detected — will use batch_size=8  (~45-90 min)'
    if mem >= 35
    else 'T4 detected  — will use batch_size=4  (~3-5 hours)'
)

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 3 — Build training corpus
# Combines JSON rate files, construction knowledge, real Tempest
# project data, and synthetic conversations into one corpus.
# Takes ~1-2 minutes.
# ══════════════════════════════════════════════════════════════
import os
import subprocess

CORPUS_PATH = 'corpus/full_corpus.txt'
REAL_PROJECTS_PATH = 'corpus/real_projects.txt'

# Run the builder as a child process; its output is not captured.
exit_code = subprocess.run(['python', 'build_corpus.py']).returncode
if exit_code:
    raise RuntimeError('build_corpus.py failed — check output above')

# A zero exit code alone is not trusted: the corpus file must exist too.
if not os.path.exists(CORPUS_PATH):
    raise RuntimeError('corpus/full_corpus.txt not found — build_corpus.py did not complete')

corpus_mb = os.path.getsize(CORPUS_PATH) / 1e6
print(f'\nCorpus built: {corpus_mb:.1f} MB')

# The real-project file is optional; report it only when it is present.
if os.path.exists(REAL_PROJECTS_PATH):
    real_kb = os.path.getsize(REAL_PROJECTS_PATH) / 1024
    print(f'Real Tempest project data: {real_kb:.0f} KB included')

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 4 — Train tokenizer
# BPE tokenizer trained on the corpus. Takes ~30 seconds.
# ══════════════════════════════════════════════════════════════
import os
import subprocess

TOKENIZER_PATH = 'tokenizer.json'

# Run the tokenizer script in training mode; its output is not captured.
exit_code = subprocess.run(['python', 'tokenizer.py', '--train']).returncode
if exit_code:
    raise RuntimeError('tokenizer.py failed — check output above')

# Confirm the expected artifact was actually written.
if not os.path.exists(TOKENIZER_PATH):
    raise RuntimeError('tokenizer.json not created')

tokenizer_kb = os.path.getsize(TOKENIZER_PATH) / 1024
print(f'\nTokenizer saved: {tokenizer_kb:.0f} KB')

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 5 — Train the model
#
# LEAVE THIS RUNNING — do NOT close the tab.
# Checkpoints save every 500 steps so you won't lose progress.
#
# What you should see:
#   step    50/40000 | loss 9.xx   <-- starting high, normal
#   step   500/40000 | loss 4-5    <-- dropping, good
#   step  2000/40000 | loss 2-3    <-- below 3.0 = working
#
# If loss is still >7 at step 500, something is wrong.
# If you get OOM, set BATCH_SIZE = 2 below and re-run this cell.
# ══════════════════════════════════════════════════════════════
import torch, os

STEPS      = 40000
BATCH_SIZE = 4       # Change to 2 if OOM, 8 on A100
RESUME     = False   # Change to True to resume from checkpoint

if torch.cuda.is_available():
    mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    if mem >= 35:
        BATCH_SIZE = 8
    print(f'GPU: {torch.cuda.get_device_name(0)} — batch_size={BATCH_SIZE}')

os.makedirs('checkpoints', exist_ok=True)

cmd = f'python train.py --steps {STEPS} --batch-size {BATCH_SIZE}'
if RESUME:
    cmd += ' --resume'

print(f'Running: {cmd}')
print('=' * 60)
!PYTHONUNBUFFERED=1 {cmd}

In [None]:
# ══════════════════════════════════════════════════════════════
# CELL 6 — Download the trained model
#
# Run this AFTER Cell 5 finishes.
# Downloads one checkpoint (best.pt, or latest.pt when best.pt is
# missing) and tokenizer.json to your Mac.
#
# Then on your Mac, move both files to:
#   pages/contech/estimator/model/transformer/checkpoints/
# ══════════════════════════════════════════════════════════════
from google.colab import files
import os

print('Looking for checkpoint...')
downloaded = []

# Candidates in priority order; only the first one that exists is downloaded.
candidates = ('checkpoints/best.pt', 'checkpoints/latest.pt')
checkpoint = next((p for p in candidates if os.path.exists(p)), None)
if checkpoint is not None:
    size_mb = os.path.getsize(checkpoint) / 1e6
    print(f'Downloading {checkpoint} ({size_mb:.0f} MB)...')
    files.download(checkpoint)
    downloaded.append(checkpoint)

if os.path.exists('tokenizer.json'):
    print('Downloading tokenizer.json...')
    files.download('tokenizer.json')
    downloaded.append('tokenizer.json')

if not downloaded:
    print('No checkpoints found.')
    print('Make sure Cell 5 ran to completion before running this.')
else:
    # Summarize what was fetched and where it has to go next.
    for line in (
        f'\nDownloaded: {downloaded}',
        '\nNext steps on your Mac:',
        '  Move both files to:',
        '  pages/contech/estimator/model/transformer/checkpoints/',
        '  Then tell Mason to deploy them.',
    ):
        print(line)