# RFT Codebase Functional Check Notebook
This notebook validates the core *task → prompt → dataset preview → registry* components you uploaded.

It is designed to run **even before** the full `RFT/src/rft/...` package layout is in place, by loading modules directly from file paths.

## Files under test
- `pyproject.toml`
- `spec.py`, `registry.py`, `io.py`
- `templates.py`, `render.py`

## What this notebook checks
1. Packaging metadata sanity (Python version constraints, scripts)
2. `TaskSpec` validation logic
3. `TaskRegistry` loads an annotation table and constructs `TaskSpec` objects (only the CSV path is exercised below; JSONL loading is not covered yet)
4. Dataset preview token wrapping and model-visible input
5. Prompt rendering into ShareGPT-style messages


In [None]:
import os
from pathlib import Path

# Report the resolved working directory and the entries directly inside it,
# so it is obvious whether the files under test sit next to the notebook.
cwd = Path(".")
print("CWD =", cwd.resolve())
print("Files here =", list(cwd.iterdir()))


In [None]:

# --- Paths to uploaded files (adjust if you moved them) ---
from pathlib import Path

# Every file under test is expected to sit directly inside the resolved
# working directory.
ROOT = Path(".").resolve()

PYPROJECT = ROOT.joinpath("pyproject.toml")
SPEC_PY = ROOT.joinpath("spec.py")
REG_PY = ROOT.joinpath("registry.py")
IO_PY = ROOT.joinpath("io.py")
TPL_PY = ROOT.joinpath("templates.py")
RENDER_PY = ROOT.joinpath("render.py")

# Print one line per expected file stating whether it is present on disk.
for candidate in (PYPROJECT, SPEC_PY, REG_PY, IO_PY, TPL_PY, RENDER_PY):
    found = candidate.exists()
    print(candidate, "exists =", found)


In [None]:

# --- Utility: import a module from a file path ---
import importlib.util
from types import ModuleType

def load_module(name: str, path: Path) -> ModuleType:
    """Import the Python source file at *path* as a module named *name*.

    The module is registered in ``sys.modules`` under *name* before its body
    is executed, following the importlib "import a source file directly"
    recipe. Code inside the loaded file (or library code acting on its
    classes) that resolves the module via ``sys.modules[__name__]`` therefore
    works the same way it would under a normal import.

    Args:
        name: Name to give the module; also the key used in ``sys.modules``.
        path: Location of the ``.py`` file to execute.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for *path*.
        Exception: Anything raised while executing the module body is
            propagated unchanged; ``sys.modules`` is restored first.
    """
    import sys  # local import keeps this helper self-contained in its cell

    spec = importlib.util.spec_from_file_location(name, str(path))
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {name} from {path}")
    mod = importlib.util.module_from_spec(spec)

    # Remember any earlier entry (e.g. from re-running the cell) so a failed
    # reload does not discard a previously working module.
    previous = sys.modules.get(name)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
    except BaseException:
        # Never leave a half-initialised module registered.
        if previous is not None:
            sys.modules[name] = previous
        else:
            sys.modules.pop(name, None)
        raise
    return mod

# Load each file under test under a distinct module name, in the same order
# as before: spec, registry, io, templates, render.
spec_mod = load_module(name="rft_tasks_spec", path=SPEC_PY)
registry_mod = load_module(name="rft_tasks_registry", path=REG_PY)
io_mod = load_module(name="rft_tasks_io", path=IO_PY)
tpl_mod = load_module(name="rft_prompt_templates", path=TPL_PY)
render_mod = load_module(name="rft_prompt_render", path=RENDER_PY)

# Echo the module objects so the source paths they were loaded from are visible.
loaded_modules = (spec_mod, registry_mod, io_mod, tpl_mod, render_mod)
print("Loaded:", *loaded_modules)


In [None]:

# --- 1) Packaging sanity checks ---
import tomllib

# Parse pyproject.toml once and read the [project] table through a single
# tolerant lookup, so a missing table or key shows up as None / empty in the
# report instead of raising part-way through the cell.
data = tomllib.loads(PYPROJECT.read_text(encoding="utf-8"))
project = data.get("project", {})
scripts = project.get("scripts", {})

print("project.name:", project.get("name"))
print("project.requires-python:", project.get("requires-python"))
print("project.scripts:", list(scripts.keys()))

# Check that the core CLI entrypoints exist (they may not yet, but we flag it)
expected_scripts = {"rft-generate", "rft-verify", "rft-build", "rft-train", "rft-eval"}
missing = expected_scripts - set(scripts)
# Sorted so the printed result is stable from run to run; raw set order
# depends on string hashing.
print("Missing scripts:", sorted(missing))


In [None]:

# --- 2) TaskSpec unit checks ---
TaskSpec = spec_mod.TaskSpec

# Happy path: a spec carrying only an id, a split and an eval entrypoint
# is expected to pass validate() without raising.
t = TaskSpec(
    task_id="demo_task",
    split="sab",
    eval_entrypoint="python -m benchmark.eval_programs.eval_demo",
)
t.validate()
print("TaskSpec.validate OK:", t.short_name())

# Negative path: an empty eval_entrypoint is expected to be rejected with
# ValueError; reaching the else branch means nothing was raised.
try:
    TaskSpec(task_id="bad", split="sab", eval_entrypoint="").validate()
except ValueError as err:
    print("Expected failure:", err)
else:
    raise AssertionError("Expected validation failure but got success")


In [None]:

# --- 3) Create a tiny annotation table and test TaskRegistry ---
import tempfile, csv, json
from pathlib import Path

TaskRegistry = registry_mod.TaskRegistry

# Column layout and the single row of the throwaway annotation table.
ann_columns = [
    "task_id",
    "task_inst",
    "eval_script_name",
    "dataset_preview",
    "domain",
    "subtask_categories",
]
toy_row = {
    "task_id": "toy_1",
    "task_inst": "Write a program that prints 'hello'.",
    "eval_script_name": "eval_toy_1.py",
    "dataset_preview": "a,b\n1,2\n",
    "domain": "toy",
    "subtask_categories": "io,printing",
}

with tempfile.TemporaryDirectory() as tmp_name:
    td = Path(tmp_name)
    ann = td / "ann.csv"

    # Write a header plus one row; the file is closed before the registry reads it.
    with ann.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=ann_columns)
        writer.writeheader()
        writer.writerow(toy_row)

    # Everything that touches the registry stays inside the temporary
    # directory's lifetime, because the CSV is deleted on exit.
    reg = TaskRegistry(annotation_path=ann, benchmark_root=Path("."), split="sab")
    tasks = reg.list_tasks()
    print("Num tasks:", len(tasks))
    print("First task:", tasks[0])

    ts = reg.get("toy_1")
    print("eval_entrypoint:", ts.eval_entrypoint)
    # The script name "eval_toy_1.py" should resolve to this dotted module path.
    expected_suffix = "benchmark.eval_programs.eval_toy_1"
    assert ts.eval_entrypoint.endswith(expected_suffix), "entrypoint resolution mismatch"


In [None]:

# --- 4) Dataset preview tokens + model-visible input ---
PREVIEW_START_TOKEN = io_mod.PREVIEW_START_TOKEN
PREVIEW_END_TOKEN = io_mod.PREVIEW_END_TOKEN

# A hand-built task with both an instruction and a small tabular preview.
toy_fields = {
    "task_id": "toy_2",
    "split": "sab",
    "instruction": "Do something.",
    "dataset_preview": "x,y\n3,4\n",
    "eval_entrypoint": "python -m benchmark.eval_programs.eval_toy_2",
}
ts = TaskSpec(**toy_fields)

# The preview must open with the start token and, ignoring surrounding
# whitespace, close with the end token.
preview = io_mod.get_dataset_preview(ts)
print(preview)
assert preview.startswith(PREVIEW_START_TOKEN)
trimmed_preview = preview.strip()
assert trimmed_preview.endswith(PREVIEW_END_TOKEN)

# The model-visible input must carry both the instruction text and the
# preview start token.
visible = io_mod.get_model_visible_input(ts)
print("\n--- model visible ---\n", visible)
assert "Do something." in visible
assert PREVIEW_START_TOKEN in visible


In [None]:

# --- 5) Prompt template and render() contract checks ---
# The system template must mention both thinking tags and a python code fence.
template = tpl_mod.THINKING_CODE_TEMPLATE
assert all(tag in template for tag in ("<thinking>", "</thinking>")), "Template missing thinking tags"
assert "```python" in template, "Template missing python fenced block instruction"

# render() is checked against a two-message contract: a system message holding
# the template verbatim, then a user message containing the task instruction.
msgs = render_mod.render(ts)
print(msgs)
assert isinstance(msgs, list) and len(msgs) == 2
system_msg, user_msg = msgs
assert system_msg["role"] == "system"
assert user_msg["role"] == "user"
assert system_msg["content"] == template
assert "Do something." in user_msg["content"]


## Next steps
If all cells above run without raising, the task spec, registry, dataset-preview, and prompt-rendering components agree with each other on the interfaces checked here.

To validate the full pipeline later, you will extend this notebook to:
1. Call a vLLM endpoint and write `candidates.jsonl`
2. Parse model output into `thinking` + `program`
3. Run your verifier entrypoints in a sandbox and emit `verdicts.jsonl`
4. Build SFT/DPO training JSONL files for LLaMA Factory
