# Train Transformer (3-epoch run with checkpoints every 3 epochs)
This notebook runs `train_main.py` with the configuration defined in the cells below. Checkpoints are saved every 3 epochs, and `best_model.pth` is overwritten at each save.

## Download data and checkpoint from shared Drive
Download the shared Drive folder (containing `data.zip` and `best_model.pth`) and use its contents to populate `data/processed/` and `training/checkpoints/` before training.

In [None]:
import os
import pathlib
import shutil
import subprocess
import sys
import zipfile

# Make sure gdown is installed for this interpreter. The module itself is not
# used directly: the download step runs it as `python -m gdown`, so it only
# needs to be importable.
try:
    import gdown  # noqa: F401  (imported only to probe availability)
except ImportError:
    pip_install = [sys.executable, "-m", "pip", "install", "gdown"]
    subprocess.check_call(pip_install)

# Id of the shared Google Drive folder to download.
folder_id = "1b2Wn_I3m2-2WBPI3H1TtDiR0URazIOsg"
workdir = pathlib.Path('.')

# Destinations for the extracted dataset and the copied checkpoint.
data_dir = workdir / 'data' / 'processed'
ckpt_dir = workdir / 'training' / 'checkpoints'
for target_dir in (data_dir, ckpt_dir):
    target_dir.mkdir(parents=True, exist_ok=True)

# Staging area that receives the raw Drive download.
download_dir = workdir / 'downloads'
download_dir.mkdir(exist_ok=True)

print('Downloading folder from Drive...')

# Invoke gdown through the current interpreter; check_call raises
# CalledProcessError if the download exits with a non-zero status.
gdown_cmd = [
    sys.executable, '-m', 'gdown',
    '--folder', '--id', folder_id,
    '-O', str(download_dir),
]
subprocess.check_call(gdown_cmd)

# Extract the dataset archive with the standard library so this cell does not
# depend on an external `unzip` binary being available on PATH. Files that
# already exist in data/processed are overwritten by the archive contents.
zip_path = download_dir / 'data.zip'
if zip_path.exists():
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(data_dir)
    print('Unzipped data.zip into data/processed')
else:
    print('data.zip not found in downloaded folder; please check the shared Drive contents')

# Copy the downloaded checkpoint into the training checkpoint directory.
# The checkpoint is optional: when it is absent the cell just reports it.
best_ckpt = download_dir / 'best_model.pth'
if best_ckpt.exists():
    target_ckpt = ckpt_dir / 'best_model.pth'
    # copyfile streams the file in chunks; read_bytes()/write_bytes() would
    # hold the entire checkpoint in memory at once.
    shutil.copyfile(best_ckpt, target_ckpt)
    print('best_model.pth copied to training/checkpoints')
else:
    print('best_model.pth not found in downloaded folder; continuing without it')

print('Done downloading assets.')

In [None]:
import os, sys, pathlib

# Sanity check: show where the notebook is running and whether the two
# training arrays are present relative to that directory.
root = pathlib.Path('.')
print('Current working dir:', root.resolve())

source_path = pathlib.Path('data', 'processed', 'train_source_ids.npy')
target_path = pathlib.Path('data', 'processed', 'train_target_ids.npy')
for label, array_path in (('Source', source_path), ('Target', target_path)):
    print(f'{label} exists:', array_path.exists())


In [None]:
# --- Data ---------------------------------------------------------------
# Paths are relative to the notebook's working directory.
data_root = 'data/processed'
source_ids = f'{data_root}/train_source_ids.npy'
target_ids = f'{data_root}/train_target_ids.npy'
pad_token_id = 0

# --- Training loop ------------------------------------------------------
batch_size, epochs = 32, 3

# --- Model size ---------------------------------------------------------
d_model, d_ff = 256, 1024
num_heads = 8
num_encoder_layers = num_decoder_layers = 4

print('Config loaded.')

In [None]:
import subprocess, textwrap, shlex

import sys

# Options forwarded to train_main.py as `--name value` pairs, in this order.
train_options = (
    ('pad_token_id', pad_token_id),
    ('source_ids', source_ids),
    ('target_ids', target_ids),
    ('batch_size', batch_size),
    ('epochs', epochs),
    ('d_model', d_model),
    ('num_encoder_layers', num_encoder_layers),
    ('num_decoder_layers', num_decoder_layers),
    ('d_ff', d_ff),
    ('num_heads', num_heads),
)

# Build an argument list instead of a shell string: no shell is involved, so
# values containing spaces are passed through intact. sys.executable makes the
# training script run under the same interpreter as this notebook rather than
# whatever `python` happens to resolve to on PATH.
cmd = [sys.executable, 'train_main.py']
for option, value in train_options:
    cmd += [f'--{option}', str(value)]

# Show a shell-quoted rendering of the command for the log.
print('Running command:\n', ' '.join(shlex.quote(part) for part in cmd))

# Blocks until training finishes. A non-zero exit code is reported below
# rather than raised, so the rest of the cell still runs.
process = subprocess.run(cmd)
print('Exit code:', process.returncode)

import pathlib

# List every *.pth file in the checkpoint directory, sorted by path.
ckpt_dir = pathlib.Path('training/checkpoints')
if not ckpt_dir.exists():
    print('No checkpoints directory found')
else:
    print('Checkpoints:')
    for ckpt_file in sorted(ckpt_dir.glob('*.pth')):
        print(' -', ckpt_file)

In [None]:
import pathlib, json
def parse_metrics(text):
    """Decode the text of a metrics file.

    Blank or whitespace-only text yields an empty list, so an empty file is
    reported as empty instead of raising from json.loads. Any other text is
    decoded as JSON; malformed JSON still raises json.JSONDecodeError.
    """
    if not text.strip():
        return []
    return json.loads(text)


# Report the most recent metrics entry, if the metrics file exists.
metrics_path = pathlib.Path('training/metrics.json')
if metrics_path.exists():
    data = parse_metrics(metrics_path.read_text())
    if data:
        print('Last metrics entry:')
        # Indexing with -1 assumes the file holds a JSON array of entries.
        print(data[-1])
    else:
        print('Metrics file empty')
else:
    print('metrics.json not found')

# Report whether the best-model checkpoint is present.
best_model = pathlib.Path('training/checkpoints/best_model.pth')
print('Best model exists:', best_model.exists())