# Phase 1: Data Factory — Generate 100k+ (PTX, AST, CUDA) Pairs

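Run this notebook in **Google Colab (free tier)**. It uses the 7-tier grammar to generate random ASTs, renders each AST to CUDA source, compiles that source to PTX with `nvcc`, and normalizes the PTX. Samples that fail to compile are skipped. Each record stores the normalized PTX, the AST S-expression, the CUDA source, the tier, and a complexity score; the resulting dataset is saved as Parquet for training.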

In [None]:
# --- Run this cell first on Google Colab to clone the repo ---
import os
# Clone only when the checkout is missing, so re-running this cell does not clone again.
if not os.path.exists("/content/DeepPTX"):
    !git clone https://github.com/ns-1456/DeepPTX.git /content/DeepPTX
# Make the checkout the working directory for the cells that follow.
%cd /content/DeepPTX

## Setup: Install deps and mount Drive (optional)

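In Colab, a CPU runtime is enough (Runtime → Change runtime type → CPU). The cell below installs `pyarrow` (needed for Parquet output) and `tqdm` (progress bar), then sets the output directory, the target number of pairs, and the batch size. Optionally, mount Google Drive so the dataset is saved persistently.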

In [1]:
!pip install -q pyarrow tqdm

# Optional: mount Google Drive to save the dataset persistently
# from google.colab import drive
# drive.mount("/content/drive")
# OUTPUT_DIR = "/content/drive/MyDrive/NeuralPTX"

OUTPUT_DIR = "."  # saves to repo root; uncomment above for Drive
TARGET_PAIRS = 100_000
BATCH_SIZE = 5000  # save progress every N pairs

## Add project root and import

In [2]:
import sys, os
# Locate the repository root: prefer the Colab checkout when it exists,
# otherwise fall back to the parent of the current directory (local run from notebooks/).
if os.path.exists("/content/DeepPTX"):
    REPO_ROOT = "/content/DeepPTX"
else:
    REPO_ROOT = os.path.abspath("..")

# Put the root at the front of the import path, but only once.
if sys.path.count(REPO_ROOT) == 0:
    sys.path.insert(0, REPO_ROOT)

import random
import pandas as pd
from pathlib import Path

from ptx_decompiler.data import (
    get_tier_generator,
    TIER_WEIGHTS,
    Tier1SimpleBinary,
    Tier2NestedArithmetic,
    Tier3UnaryMath,
    Tier4Ternary,
    Tier5TypeDiversity,
    Tier6MultiStatement,
    Tier7SharedMemory,
    ast_to_cuda,
    compile_cuda_to_ptx_silent,
    normalize_ptx,
)
from ptx_decompiler.data.grammar import TIER_CLASSES, sample_tier
# Sanity marker: this line is only reached when every import above succeeded.
print("Imports OK")

## Generation loop

In [3]:
from tqdm.auto import tqdm

def generate_one():
    """Build one dataset record, or return None when compilation fails.

    Pipeline: draw a ``(tier_id, generator)`` pair from ``sample_tier``,
    generate an AST, serialise it with ``to_sexp``, render CUDA source with
    ``ast_to_cuda``, compile it with ``compile_cuda_to_ptx_silent`` and pass
    the resulting PTX through ``normalize_ptx``.

    Returns:
        dict | None: ``None`` if the compile step returned ``None``; otherwise
        a dict with the keys ``ptx_normalized``, ``ast_sexp``, ``cuda_source``,
        ``tier`` and ``complexity_score`` (the latter read from the generator).
    """
    tier_id, generator = sample_tier()
    sexp = generator.generate().to_sexp()
    source = ast_to_cuda(sexp)
    raw_ptx = compile_cuda_to_ptx_silent(source)

    if raw_ptx is not None:
        # Only successfully compiled samples are normalised and recorded.
        return {
            "ptx_normalized": normalize_ptx(raw_ptx),
            "ast_sexp": sexp,
            "cuda_source": source,
            "tier": tier_id,
            "complexity_score": generator.complexity_score,
        }
    return None

# Seed Python's global `random` module for repeatable sampling.
# NOTE(review): this only covers code that draws from `random`; confirm the
# tier generators do not use another RNG.
random.seed(42)
data = []
attempts = 0
failures = 0
# Hard cap so a systematically failing compile step cannot loop forever.
max_attempts = TARGET_PAIRS * 3

# Partial results are written here every BATCH_SIZE pairs and once more when the
# loop exits (normally or through an interrupt), so a stopped run keeps its work.
checkpoint_path = Path(OUTPUT_DIR) / "dataset_partial.parquet"
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

# Default bar layout; the failure count is shown through set_postfix as "failures=N".
pbar = tqdm(total=TARGET_PAIRS, desc="Generating pairs", unit="pair")
pbar.set_postfix(failures=failures)

try:
    while len(data) < TARGET_PAIRS and attempts < max_attempts:
        row = generate_one()
        attempts += 1
        if row is None:
            # Compile failure: count it and try again.
            failures += 1
            continue
        data.append(row)
        pbar.update(1)
        if len(data) % 500 == 0:
            # Refresh the failure counter only occasionally to keep the loop cheap.
            pbar.set_postfix(failures=failures)
        if len(data) % BATCH_SIZE == 0:
            pd.DataFrame(data).to_parquet(checkpoint_path, index=False)
finally:
    # Runs on normal completion and on KeyboardInterrupt (which still propagates).
    pbar.set_postfix(failures=failures)
    pbar.close()
    if data and len(data) % BATCH_SIZE != 0:
        # Rows collected since the last periodic checkpoint are not on disk yet.
        pd.DataFrame(data).to_parquet(checkpoint_path, index=False)
    print(f"\nDone. Total pairs: {len(data)} | Attempts: {attempts} | Compile failures: {failures} ({failures/max(attempts,1)*100:.1f}%)")

if len(data) < TARGET_PAIRS:
    print(f"WARNING: stopped after {attempts} attempts with only {len(data)} of {TARGET_PAIRS} pairs.")

KeyboardInterrupt: 

## Save to Parquet and validate

In [None]:
# Fail with a clear message instead of a KeyError on df["tier"] further down
# when the generation cell produced no rows.
if not data:
    raise RuntimeError("No pairs were generated; run the generation cell and check that compilation succeeds.")

df = pd.DataFrame(data)
out_path = Path(OUTPUT_DIR) / "dataset_100k.parquet"
# OUTPUT_DIR may point at a folder that does not exist yet (e.g. a new Drive directory).
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(out_path, index=False)
print(f"Saved to {out_path}")
# Number of saved pairs per tier, ordered by tier id.
print(df["tier"].value_counts().sort_index())
df.head(2)

In [None]:
# Quick validation: round-trip one row
from ptx_decompiler.data import parse_sexp
from ptx_decompiler.data.renderer import CUDARenderer

# Take the first saved pair and rebuild its CUDA source from the stored `ast_sexp` string.
r = df.iloc[0]
tree = parse_sexp(r["ast_sexp"])
rendered = CUDARenderer().kernel_source(tree)

# The stored and the re-rendered source must agree once surrounding whitespace is ignored.
sources_match = rendered.strip() == r["cuda_source"].strip()
assert sources_match, "Round-trip mismatch"
print("Round-trip OK.")