# Mason Transformer — Colab Training
**125M-parameter GPT-style model for construction AI**

### Before running:
1. Go to **Runtime → Change runtime type → T4 GPU → Save**
2. Then run cells top to bottom with Shift+Enter

---

In [None]:
# ── Cell 1: Clone repo and install ─────────────────────────────────────────
import os

if not os.path.exists('/content/contech1'):
    !git clone https://github.com/masonearl/contech1.git /content/contech1
else:
    print('Already cloned — pulling latest...')
    !git -C /content/contech1 pull

os.chdir('/content/contech1')
print(f'Working directory: {os.getcwd()}')

!pip install -q torch openpyxl

# Verify all required files are present
required = ['build_corpus.py','config.py','tokenizer.py','data.py',
            'generate.py','model.py','train.py',
            'data/materials.json','data/labor.json','data/equipment.json',
            'data/production_rates.json','data/terms.json']
missing = [f for f in required if not os.path.exists(f)]
if missing:
    print(f'MISSING FILES: {missing}')
else:
    print('All required files present.')

In [None]:
# ── Cell 2: Check GPU ───────────────────────────────────────────────────────
# Prints the GPU model, its memory, and the CUDA / PyTorch versions, then the
# batch size that the 35 GB memory threshold maps to (8 at or above, 4 below).
import torch


def recommended_batch_size(mem_gb):
    """Return the training batch size for a GPU with `mem_gb` GB of memory.

    GPUs with at least 35 GB get 8; anything smaller gets 4.
    """
    return 8 if mem_gb >= 35 else 4


if not torch.cuda.is_available():
    print('NO GPU DETECTED.')
    print('Go to Runtime → Change runtime type → T4 GPU, then restart.')
else:
    name = torch.cuda.get_device_name(0)
    # total_memory is in bytes; convert to decimal gigabytes.
    mem  = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU:      {name}')
    print(f'VRAM:     {mem:.1f} GB')
    print(f'CUDA:     {torch.version.cuda}')
    print(f'PyTorch:  {torch.__version__}')
    print(f'bfloat16: {torch.cuda.is_bf16_supported()}')
    print()
    # Report the real device name: the memory threshold alone cannot tell
    # which GPU model this is, so do not label it "A100" or "T4" from it.
    print(f'{name} detected — will use batch_size={recommended_batch_size(mem)}')

In [None]:
# ── Cell 3: Build training corpus ──────────────────────────────────────────
# Builds all training text from JSON rate files, conversations,
# construction industry knowledge, and real Tempest project data.
# Takes ~1-2 minutes.

!python build_corpus.py

import os
if os.path.exists('corpus/real_projects.txt'):
    kb = os.path.getsize('corpus/real_projects.txt') / 1024
    print(f'Real project data loaded ({kb:.0f} KB)')

if os.path.exists('corpus/full_corpus.txt'):
    mb = os.path.getsize('corpus/full_corpus.txt') / 1e6
    print(f'Full corpus: {mb:.1f} MB')
else:
    print('ERROR: full_corpus.txt not created — check errors above')

In [None]:
# ── Cell 4: Train tokenizer ─────────────────────────────────────────────────
# BPE tokenizer trained on the corpus. Takes ~30 seconds.

!python tokenizer.py --train

import os
if os.path.exists('tokenizer.json'):
    kb = os.path.getsize('tokenizer.json') / 1024
    print(f'Tokenizer saved ({kb:.0f} KB)')
else:
    print('ERROR: tokenizer.json not created')

In [None]:
# ── Cell 5: Train the model ─────────────────────────────────────────────────
# This is the long cell. Leave it running — do NOT close the tab.
#
# What to expect:
#   step    50/40000 | loss 9.xx | lr 1.2e-04 | 3,000 tok/s
#   step   500/40000 | loss 4.xx | ...
#   step  2000/40000 | loss 2.xx | ...
#
# Loss MUST drop below 3.0 by step 2000 or something is wrong.
# If you see OOM (out of memory), change BATCH_SIZE = 2 below.
#
# T4 GPU:  ~3-5 hours for 40k steps
# A100:    ~45-90 minutes for 40k steps

import torch

STEPS      = 40000
BATCH_SIZE = 4      # Change to 2 if you get OOM errors
RESUME     = False  # Change to True to resume from a previous checkpoint

if torch.cuda.is_available():
    mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    if mem >= 35:  # A100
        BATCH_SIZE = 8
        print(f'A100 ({mem:.0f} GB) — using batch_size={BATCH_SIZE}')
    else:
        print(f'T4 ({mem:.0f} GB) — using batch_size={BATCH_SIZE}')

cmd = f'python train.py --steps {STEPS} --batch-size {BATCH_SIZE}'
if RESUME:
    cmd += ' --resume'

print(f'Command: {cmd}')
print('=' * 60)
!PYTHONUNBUFFERED=1 {cmd}

In [None]:
# ── Cell 6: Download the trained model ─────────────────────────────────────
# Run this AFTER Cell 5 finishes.
# Downloads one checkpoint (best.pt, or latest.pt if best.pt does not exist)
# and tokenizer.json to your Mac.
#
# Then on your Mac, move them to:
#   pages/contech/estimator/model/transformer/checkpoints/

from google.colab import files
import os

# Candidates in order of preference; only the first one that exists is downloaded.
checkpoint_candidates = ['checkpoints/best.pt', 'checkpoints/latest.pt']
checkpoint = next((p for p in checkpoint_candidates if os.path.exists(p)), None)
tokenizer_found = os.path.exists('tokenizer.json')

downloaded = []

if checkpoint is not None:
    if checkpoint != checkpoint_candidates[0]:
        # Make the fallback visible so the user knows which file they got.
        print(f'NOTE: {checkpoint_candidates[0]} not found — falling back to {checkpoint}')
    mb = os.path.getsize(checkpoint) / 1e6
    print(f'Downloading {checkpoint} ({mb:.0f} MB)...')
    files.download(checkpoint)
    downloaded.append(checkpoint)

if tokenizer_found:
    files.download('tokenizer.json')
    downloaded.append('tokenizer.json')

if downloaded:
    print(f'Downloaded: {downloaded}')

# Report each missing artifact on its own, so a tokenizer-only download is
# not mistaken for a complete one.
if checkpoint is None:
    print('No checkpoints found. Make sure Cell 5 completed successfully.')
if not tokenizer_found:
    print('tokenizer.json not found. Make sure Cell 4 completed successfully.')

# The "move both files" instructions only apply when both files were downloaded.
if checkpoint is not None and tokenizer_found:
    print()
    print('Next steps:')
    print('  1. Move both files to: pages/contech/estimator/model/transformer/checkpoints/')
    print('  2. Tell Mason to deploy them to the live site')