## Colab: T5-Nano (Python → C++) End-to-End Pipeline

This notebook is **Google Colab friendly**. It starts by cloning your GitHub repo, then runs:

- Data prep (XLCoST)
- Tokenizer training
- T5-Nano init (random weights)
- Training
- Inference demo

Tip: In Colab, enable a GPU: **Runtime → Change runtime type → GPU**.

In [16]:
# --- Clone repo (idempotent) ---
# Always anchor at /content so we don't accidentally clone into /content/NMT/NMT
%cd /content

REPO_URL = "https://github.com/ns-1456/NMT.git"
REPO_DIR = "NMT"

import os

if not os.path.isdir(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}

%cd /content/{REPO_DIR}
!git status -sb || true


/content
/content/NMT
## [32mmain[m...[31morigin/main[m
[31m??[m NMT/


In [17]:
# Install dependencies (avoid reinstalling torch in Colab)
!pip -q install -U pip
!pip -q install transformers datasets tokenizers pandas scikit-learn accelerate gdown tqdm matplotlib


In [18]:
from __future__ import annotations

import json
import os
import subprocess
from pathlib import Path

os.environ["TOKENIZERS_PARALLELISM"] = "false"

REPO_ROOT = Path.cwd()
DATA_PROCESSED = REPO_ROOT / "data" / "processed"
TOKENIZER_DIR = REPO_ROOT / "custom_tokenizer"
CHECKPOINT_DIR = REPO_ROOT / "t5_nano_checkpoints"
FINAL_MODEL_DIR = REPO_ROOT / "final_model"

QUICK_RUN = True
MAX_SAMPLES = 2000 if QUICK_RUN else None
EPOCHS = 1 if QUICK_RUN else 30
BATCH_SIZE = 8 if QUICK_RUN else 32

print("repo:", REPO_ROOT)
print("quick_run:", QUICK_RUN)


repo: /content/NMT
quick_run: True


## 1) Data prep (XLCoST → `data/processed/`)

In [27]:
!python - <<'PY'
from pathlib import Path
import zipfile

zip_path = Path("data/raw/XLCoST_data.zip")
print("zip exists:", zip_path.exists(), "size:", zip_path.stat().st_size if zip_path.exists() else None)

with zipfile.ZipFile(zip_path) as z:
    names = z.namelist()
    print("total entries:", len(names))

    print("\nfirst 50 entries:")
    for n in names[:50]:
        print(n)

    hits = [n for n in names if "pair_data_tok_1" in n]
    print("\nentries containing pair_data_tok_1:", len(hits))
    for n in hits[:20]:
        print(n)
PY

zip exists: True size: 297821023
total entries: 1938

first 50 entries:
XLCoST_data/
__MACOSX/._XLCoST_data
XLCoST_data/retrieval/
__MACOSX/XLCoST_data/._retrieval
XLCoST_data/generation/
__MACOSX/XLCoST_data/._generation
XLCoST_data/retrieval/nl2code_search/
__MACOSX/XLCoST_data/retrieval/._nl2code_search
XLCoST_data/retrieval/code2code_search/
__MACOSX/XLCoST_data/retrieval/._code2code_search
XLCoST_data/retrieval/.ipynb_checkpoints/
__MACOSX/XLCoST_data/retrieval/._.ipynb_checkpoints
XLCoST_data/generation/pair_data_tok_full_desc_comment/
__MACOSX/XLCoST_data/generation/._pair_data_tok_full_desc_comment
XLCoST_data/generation/pair_data_tok_1_comment/
__MACOSX/XLCoST_data/generation/._pair_data_tok_1_comment
XLCoST_data/generation/pair_data_tok_full_desc/
__MACOSX/XLCoST_data/generation/._pair_data_tok_full_desc
XLCoST_data/generation/pair_data_tok_full/
__MACOSX/XLCoST_data/generation/._pair_data_tok_full
XLCoST_data/generation/pair_data_tok_1/
__MACOSX/XLCoST_data/generation/._pair

NameError: name 'PY' is not defined

In [23]:
# Clean any stale XLCoST artifacts (zip + extracted folders)
subprocess.run(["rm", "-rf", "data/raw/XLCoST_data"], check=False)
subprocess.run(["rm", "-rf", "data/raw/__MACOSX"], check=False)
subprocess.run(["rm", "-f",  "data/raw/XLCoST_data.zip"], check=False)

cmd = ["python", "-u", "data_prep.py"]
if MAX_SAMPLES is not None:
    cmd += ["--max_samples", str(MAX_SAMPLES)]

print("Running:", " ".join(cmd))
proc = subprocess.run(cmd, text=True, capture_output=True)

print("\n--- data_prep.py stdout ---\n")
print(proc.stdout)

if proc.returncode != 0:
    print("\n--- data_prep.py stderr ---\n")
    print(proc.stderr)
    raise RuntimeError(f"data_prep.py failed with exit code {proc.returncode}")

print("\nProduced:")
for p in sorted(DATA_PROCESSED.glob("*")):
    print("-", p)


Running: python -u data_prep.py --max_samples 2000

--- data_prep.py stdout ---

[data_prep] Downloading XLCoST zip to data/raw/XLCoST_data.zip
[data_prep] Extracting data/raw/XLCoST_data.zip into data/raw


--- data_prep.py stderr ---

Downloading...
From (original): https://drive.google.com/uc?id=1Cp3vFITRaUEJwPoeI_uv0cC6KVyvDc4F
From (redirected): https://drive.google.com/uc?id=1Cp3vFITRaUEJwPoeI_uv0cC6KVyvDc4F&confirm=t&uuid=0dcf5a2c-22b7-4b76-8160-4650807533e2
To: /content/NMT/data/raw/XLCoST_data.zip

  0%|          | 0.00/298M [00:00<?, ?B/s]
  6%|▌         | 16.8M/298M [00:00<00:01, 162MB/s]
 12%|█▏        | 35.1M/298M [00:00<00:01, 170MB/s]
 19%|█▉        | 57.1M/298M [00:00<00:01, 191MB/s]
 26%|██▋       | 78.6M/298M [00:00<00:01, 200MB/s]
 33%|███▎      | 99.1M/298M [00:00<00:00, 199MB/s]
 40%|████      | 120M/298M [00:00<00:00, 199MB/s] 
 47%|████▋     | 141M/298M [00:00<00:00, 196MB/s]
 54%|█████▍    | 160M/298M [00:01<00:01, 112MB/s]
 59%|█████▉    | 176M/298M [00:01<00:0

RuntimeError: data_prep.py failed with exit code 1

In [None]:
# Basic dataset inspection + visualization (Arrow dataset preferred; JSONL fallback)
import pandas as pd
import matplotlib.pyplot as plt

arrow_dir = DATA_PROCESSED / "xlcost_py_cpp_snippet"
if arrow_dir.exists():
    from datasets import load_from_disk

    ds = load_from_disk(str(arrow_dir))
    train_df = pd.DataFrame(ds["train"])
else:
    train_df = pd.read_json(DATA_PROCESSED / "train.jsonl", lines=True)

print("train rows:", len(train_df))

train_df["source_len"] = train_df["source"].astype(str).map(len)
train_df["target_len"] = train_df["target"].astype(str).map(len)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].hist(train_df["source_len"], bins=50)
ax[0].set_title("Train source char length")
ax[1].hist(train_df["target_len"], bins=50)
ax[1].set_title("Train target char length")
plt.tight_layout()
plt.show()

train_df.head(3)


## 2) Train tokenizer (Byte-Level BPE)

In [None]:
subprocess.run(["python", "train_tokenizer.py"], check=True)
print("Tokenizer dir:", TOKENIZER_DIR)
!ls -la custom_tokenizer | head


## 3) Verify model config

In [None]:
import model_config

tok = model_config.load_tokenizer()
model = model_config.build_t5_nano(tok)
params = model_config.count_parameters(model)
print(f"T5-Nano parameter count: {params:,}")


## 4) Train

In [None]:
import torch

if not torch.cuda.is_available():
    raise RuntimeError(
        "CUDA GPU not available. In Colab: Runtime → Change runtime type → GPU. "
        "(train.py uses fp16=True by default.)"
    )

cmd = [
    "python",
    "train.py",
    "--per_device_batch_size",
    str(BATCH_SIZE),
    "--num_train_epochs",
    str(EPOCHS),
]
print("Running:", " ".join(cmd))
subprocess.run(cmd, check=True)
print("Final model dir exists:", FINAL_MODEL_DIR.exists())


In [None]:
# Plot training curves (train/eval loss)
import matplotlib.pyplot as plt

trainer_states = list(CHECKPOINT_DIR.glob("checkpoint-*/trainer_state.json"))
if not trainer_states:
    root_state = CHECKPOINT_DIR / "trainer_state.json"
    trainer_states = [root_state] if root_state.exists() else []

if not trainer_states:
    print("No trainer_state.json found yet")
else:
    state_path = max(trainer_states, key=lambda p: p.stat().st_mtime)
    state = json.loads(state_path.read_text())
    logs = state.get("log_history", [])

    steps, train_losses = [], []
    eval_steps, eval_losses = [], []
    for item in logs:
        if "loss" in item and "eval_loss" not in item:
            steps.append(item.get("step"))
            train_losses.append(item["loss"])
        if "eval_loss" in item:
            eval_steps.append(item.get("step"))
            eval_losses.append(item["eval_loss"])

    plt.figure(figsize=(10, 4))
    if train_losses:
        plt.plot(steps, train_losses, label="train_loss")
    if eval_losses:
        plt.plot(eval_steps, eval_losses, label="eval_loss")
    plt.title("Training curves")
    plt.xlabel("step")
    plt.ylabel("loss")
    plt.legend()
    plt.grid(True, alpha=0.2)
    plt.show()


## 5) Inference demo

In [None]:
import inference

sample_python = """\
def sum_upto(n):
    s = 0
    for i in range(n + 1):
        s += i
    return s
"""

print("=== Python ===")
print(sample_python)
print("=== C++ (generated) ===")
inference.translate(sample_python)
