# Phase 1: Data Factory — Generate 100k+ (PTX, AST, CUDA) Pairs

Run this notebook in **Google Colab (Free tier)**. It uses the 7-tier grammar to generate random ASTs, renders them to CUDA, compiles with `nvcc` to PTX, and normalizes the PTX. Output is saved as Parquet for training.

## Setup: Install deps and mount Drive (optional)

In Colab, a CPU runtime is enough (Runtime → Change runtime type → CPU): compiling CUDA to PTX with `nvcc` does not require a GPU. Install `pyarrow` for Parquet support. Optionally, mount Google Drive to save the dataset there.

In [None]:
!pip install -q pyarrow

# Optional: mount Google Drive to save dataset
# from google.colab import drive
# drive.mount("/content/drive")
# OUTPUT_DIR = "/content/drive/MyDrive/NeuralPTX"

OUTPUT_DIR = "."  # local; use OUTPUT_DIR above if using Drive
TARGET_PAIRS = 100_000
BATCH_SIZE = 5000  # save progress every N pairs

## Add project root and import

In [None]:
import sys
import os
# Prepend the project root to sys.path: use "/content/Neural PTX Decompiler"
# when that directory exists, otherwise fall back to the parent of the current
# working directory ("..", made absolute).
sys.path.insert(0, os.path.abspath("/content/Neural PTX Decompiler" if os.path.exists("/content/Neural PTX Decompiler") else ".."))

import random
import pandas as pd
from pathlib import Path

from ptx_decompiler.data import (
    get_tier_generator,
    TIER_WEIGHTS,
    Tier1SimpleBinary,
    Tier2NestedArithmetic,
    Tier3UnaryMath,
    Tier4Ternary,
    Tier5TypeDiversity,
    Tier6MultiStatement,
    Tier7SharedMemory,
    ast_to_cuda,
    compile_cuda_to_ptx_silent,
    normalize_ptx,
)
from ptx_decompiler.data.grammar import TIER_CLASSES, sample_tier

## Generation loop

In [None]:
def generate_one():
    """Build one dataset record, or return None if compilation yields no PTX.

    Steps: draw a (tier id, generator) pair with ``sample_tier``, generate an
    AST and serialize it to an S-expression, render that to CUDA source with
    ``ast_to_cuda``, compile it with ``compile_cuda_to_ptx_silent`` and
    normalize the resulting PTX with ``normalize_ptx``.

    Returns:
        dict with keys ``ptx_normalized``, ``ast_sexp``, ``cuda_source``,
        ``tier`` and ``complexity_score`` (in that order), or ``None`` when
        ``compile_cuda_to_ptx_silent`` returns ``None``.
    """
    tier_id, generator = sample_tier()
    sexp = generator.generate().to_sexp()
    cuda_src = ast_to_cuda(sexp)
    raw_ptx = compile_cuda_to_ptx_silent(cuda_src)
    if raw_ptx is not None:
        # Keyword order fixes the key order of the record (and thus the
        # column order of a DataFrame built from these records).
        return dict(
            ptx_normalized=normalize_ptx(raw_ptx),
            ast_sexp=sexp,
            cuda_source=cuda_src,
            tier=tier_id,
            complexity_score=generator.complexity_score,
        )
    return None

# Seed Python's global RNG before generating.
# NOTE(review): this only makes runs repeatable if the tier generators draw
# from the `random` module — confirm.
random.seed(42)
data = []
attempts = 0
max_attempts = TARGET_PAIRS * 3  # avoid infinite loop if compile fails often

while len(data) < TARGET_PAIRS and attempts < max_attempts:
    row = generate_one()
    attempts += 1
    if row is None:
        # Compile failure: nothing was collected, so there is nothing to report.
        continue
    data.append(row)
    # Report only right after a successful append. Checking on every iteration
    # would re-print the same count for each failed attempt that follows a
    # multiple of BATCH_SIZE.
    if len(data) % BATCH_SIZE == 0:
        print(f"Generated {len(data)} pairs...")

print(f"Done. Total pairs: {len(data)} (attempts: {attempts})")
if len(data) < TARGET_PAIRS:
    # The attempt budget ran out before the target was reached.
    print(f"Warning: stopped after {attempts} attempts with {len(data)}/{TARGET_PAIRS} pairs.")

## Save to Parquet and validate

In [None]:
# Collect the generated records into a DataFrame and persist it as Parquet.
df = pd.DataFrame(data)
out_path = Path(OUTPUT_DIR) / "dataset_100k.parquet"
# OUTPUT_DIR may name a folder that does not exist yet (e.g. a new Drive
# folder). Create it first so the write does not fail after the long
# generation run.
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(out_path, index=False)
print(f"Saved to {out_path}")
# Number of rows per tier, ordered by tier id.
print(df["tier"].value_counts().sort_index())
df.head(2)

In [None]:
# Sanity check on the first row: parse the stored S-expression back into a
# tree, render it to CUDA again, and require that it matches the stored CUDA
# source (leading/trailing whitespace ignored).
from ptx_decompiler.data import parse_sexp
from ptx_decompiler.data.renderer import CUDARenderer

sample_row = df.iloc[0]
reparsed_tree = parse_sexp(sample_row["ast_sexp"])
rerendered_cuda = CUDARenderer().kernel_source(reparsed_tree)
stored_cuda = sample_row["cuda_source"].strip()
assert stored_cuda == rerendered_cuda.strip(), "Round-trip mismatch"
print("Round-trip OK.")