# Phase 2–3: Train DeepPTX (Colab Pro, A100)

Load Parquet dataset, build tokenizers, create DataLoaders with curriculum sampling, and train the Pointer-Generator Transformer.

In [None]:
# Install runtime dependencies with the %pip magic so they land in the
# environment of the running kernel (a `!pip` shell escape may resolve to a
# different interpreter's pip).
%pip install -q torch pyarrow pandas wandb

import sys, os
# Make the project package importable: use the Colab checkout when it exists,
# otherwise fall back to the parent of the current working directory.
_colab_checkout = "/content/Neural PTX Decompiler"
_project_root = _colab_checkout if os.path.exists(_colab_checkout) else ".."
sys.path.insert(0, os.path.abspath(_project_root))

import torch
from pathlib import Path
import pandas as pd

# Input dataset and checkpoint output locations.
# NOTE(review): both paths sit under /content/drive, but no cell shown in this
# notebook mounts Google Drive -- confirm it is mounted before running.
DATA_PATH = "/content/drive/MyDrive/NeuralPTX/dataset_100k.parquet"  # or local path
SAVE_DIR = Path("/content/drive/MyDrive/NeuralPTX/checkpoints")

# Train on the GPU when CUDA is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
from ptx_decompiler.data.dataset import load_parquet_for_training, collate_pad_batch, CurriculumSampler, PTXASTDataset
from ptx_decompiler.tokenizer import PTXTokenizer, ASTTokenizer

# --- Tokenizers -------------------------------------------------------------
# The PTX vocabulary is built from the `ptx_normalized` column of the full
# frame; the AST tokenizer is constructed with its defaults.
# NOTE(review): the vocab is fitted before load_parquet_for_training performs
# the train/val split, so validation rows presumably contribute to it --
# confirm this is intended.
df = pd.read_parquet(DATA_PATH)
ptx_tokenizer = PTXTokenizer(max_vocab_size=2000)
ptx_tokenizer.build_vocab(df["ptx_normalized"].tolist())
ast_tokenizer = ASTTokenizer()

# --- Datasets and sampler ---------------------------------------------------
# 90/10 split with a fixed seed; the sampler wraps the training split only.
train_ds, val_ds = load_parquet_for_training(
    DATA_PATH,
    ptx_tokenizer,
    ast_tokenizer,
    train_ratio=0.9,
    seed=42,
)
curriculum_sampler = CurriculumSampler(train_ds, shuffle=True, seed=42)


def _pad_collate(batch):
    """Collate one batch with collate_pad_batch using both tokenizers' pad ids."""
    return collate_pad_batch(batch, ptx_tokenizer.pad_id, ast_tokenizer.pad_id)


# --- DataLoaders ------------------------------------------------------------
# Settings common to both loaders; they differ only in how samples are ordered.
_loader_kwargs = dict(batch_size=128, collate_fn=_pad_collate, num_workers=0)

# Training order is delegated to the curriculum sampler.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    sampler=curriculum_sampler,
    **_loader_kwargs,
)
# Validation is iterated in dataset order.
val_loader = torch.utils.data.DataLoader(
    val_ds,
    shuffle=False,
    **_loader_kwargs,
)

In [None]:
from ptx_decompiler.model import PTXDecompilerModel
from ptx_decompiler.training import Trainer, get_cosine_schedule_with_warmup

# Lookup tensor indexed by PTX token id. Each slot holds the AST-vocab id of
# the same token string, or -1 when the token is absent from the AST vocab.
ptx_to_ast = torch.full((len(ptx_tokenizer),), -1, dtype=torch.long)
for token, ptx_index in ptx_tokenizer.vocab.items():
    if token not in ast_tokenizer.vocab:
        continue  # no AST counterpart: leave the -1 sentinel in place
    ptx_to_ast[ptx_index] = ast_tokenizer.vocab[token]

# Model hyperparameters gathered in one mapping, then unpacked into the
# constructor. Vocabulary sizes come from the tokenizers built earlier.
# NOTE(review): use_copy=True together with ptx_to_ast_map presumably enables
# the pointer/copy path mentioned in the notebook intro -- confirm in
# PTXDecompilerModel.
model_kwargs = dict(
    ptx_vocab_size=len(ptx_tokenizer),
    ast_vocab_size=len(ast_tokenizer),
    d_model=256,
    n_heads=8,
    d_ff=1024,
    encoder_layers=6,
    decoder_layers=6,
    dropout=0.1,
    use_copy=True,
    ptx_to_ast_map=ptx_to_ast,
)
model = PTXDecompilerModel(**model_kwargs).to(DEVICE)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# Schedule horizon = batches per epoch x 30. The 30 mirrors the num_epochs=30
# passed to trainer.train in the final cell; change both together.
# NOTE(review): this assumes the scheduler is stepped once per batch and that
# len(train_loader) is the same every epoch under the curriculum sampler --
# confirm against Trainer / CurriculumSampler.
steps_per_epoch = len(train_loader)
num_steps = steps_per_epoch * 30
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_steps,
)

# Wire everything into the Trainer; keyword arguments are grouped by concern.
trainer = Trainer(
    # model and data
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    curriculum_sampler=curriculum_sampler,
    # optimisation
    optimizer=optimizer,
    scheduler=scheduler,
    label_smoothing=0.1,
    # target-side special token ids
    pad_id_ast=ast_tokenizer.pad_id,
    eos_id_ast=ast_tokenizer.eos_id,
    # runtime: mixed precision is requested only when CUDA is available
    device=DEVICE,
    use_amp=torch.cuda.is_available(),
    # outputs / logging
    save_dir=SAVE_DIR,
    use_wandb=False,
)

# Report the model size as computed by the model's own count_parameters().
print(f"Model parameters: {model.count_parameters()}")

In [None]:
# Run the full training loop. The epoch count must stay in sync with the
# `len(train_loader) * 30` used to size the LR schedule in the previous cell.
NUM_EPOCHS = 30
trainer.train(num_epochs=NUM_EPOCHS)