# Synthetic Data Factory (SDF) — Colab
End-to-end: install → bring code → tests → API → generate → filter → dedupe → score → curate → train SFT → report.

In [None]:
#@title 📦 Install deps
# Upgrade pip first, then install the packages the rest of the notebook needs
# (web server, model/training libraries, test and lint tools). -q keeps pip quiet.
!pip -q install -U pip
!pip -q install fastapi uvicorn transformers peft pydantic requests numpy jsonschema pytest ruff black accelerate

## Bring in code
Upload the zip produced by ChatGPT, or a zip of your local repo (it must contain `pyproject.toml` and `sdf/`).

In [None]:
#@title ⬆️ Upload SDF zip
from google.colab import files
import zipfile, io, os, shutil
TARGET='/content/sdf-lab'
# Start from an empty directory so files from an earlier upload cannot linger.
if os.path.exists(TARGET): shutil.rmtree(TARGET)
os.makedirs(TARGET, exist_ok=True)
# `up` maps each uploaded file name to its raw bytes; it is empty when the
# upload dialog is cancelled.
up = files.upload()
if up:
  # Only the first uploaded file is used; any others are ignored.
  name = next(iter(up.keys()))
  with zipfile.ZipFile(io.BytesIO(up[name]), 'r') as zf:
    zf.extractall(TARGET)
  print('Extracted to', TARGET)
  # The later cells install and run the project from TARGET itself, so flag
  # archives whose contents ended up one level down (e.g. a wrapping folder).
  if not os.path.exists(os.path.join(TARGET, 'pyproject.toml')):
    print('Warning: no pyproject.toml directly under', TARGET,
          '- check that the zip has the project files at its top level.')
else:
  # Report the cancelled/empty upload instead of claiming a successful extract.
  print('No file uploaded; nothing was extracted to', TARGET)

In [None]:
#@title üèóÔ∏è Install project (editable)
%cd /content/sdf-lab
!pip -q install -e .[dev]

In [None]:
#@title ✅ Tests
# Run pytest from the project root; -q shortens pytest's output.
%cd /content/sdf-lab
!pytest -q

In [None]:
#@title üöÄ Launch API (background)
import subprocess, time, socket, os
!fuser -k 9000/tcp || true
p = subprocess.Popen(["uvicorn","server:app","--host","127.0.0.1","--port","9000"], cwd="/content/sdf-lab")
time.sleep(4)
s=socket.socket(); s.settimeout(2)
try:
  s.connect(("127.0.0.1",9000)); s.close(); print("API up at http://127.0.0.1:9000")
except Exception as e:
  print("API failed:", e)

In [None]:
#@title 🧪 Generate -> Filter -> Dedupe -> Score -> Curate (via Python API)
from sdf.generate.templates import generate_templates
from sdf.generate.code_math import generate_code_math
from sdf.filter.schema import filter_schema
from sdf.filter.pii import filter_pii
from sdf.filter.safety import filter_safety
from sdf.filter.format import filter_format
from sdf.dedupe.exact import dedupe_exact
from sdf.score.judge import score_items
from sdf.curate.mixture import curate_mixture
import json

# Candidate pool from two generators.
# NOTE(review): the positional arguments are presumably (count, seed) - confirm.
items = generate_templates(50, 1) + generate_code_math(50, 2)
# Each filter returns a (kept, dropped) pair; only the kept items move on to
# the next stage, in the order schema -> PII -> safety -> format.
k = items
for stage in (filter_schema, filter_pii, filter_safety, filter_format):
  k, _ = stage(k, {})
k = dedupe_exact(k)
k = score_items(k, {})
# `cur` is reused by the report cell further down.
cur = curate_mixture(k, 60, {})
print("curated:", len(cur))
if cur:
  # Preview the first curated item, capped at 1000 characters.
  print(json.dumps(cur[0], indent=2)[:1000])
else:
  # Avoid an IndexError when every item was filtered out or none was selected.
  print("curated set is empty; nothing to preview")

In [None]:
#@title 🧑‍🏫 Train tiny SFT (LoRA)
# Runs the project's SFT trainer module as a script for 1 epoch. The training
# data is the JSONL fixture under tests/ (not the curated items built in the
# pipeline cell), and /content/out/sft_adapter is passed as the output directory.
# NOTE(review): presumably --model_id is fetched from the Hugging Face Hub on
# first use - confirm against sdf.train.sft_trainer.
!python -m sdf.train.sft_trainer --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --data /content/sdf-lab/tests/testdata.sft.jsonl --out_dir /content/out/sft_adapter --epochs 1 --batch_size 2 --lr 5e-5 --max_seq 256 --lora_r 8 --lora_alpha 16

In [None]:
#@title 🧾 Report
import json, tempfile, os
from sdf.eval.report import build_report
# build_report is given a file path, so dump the curated items (`cur`, from the
# pipeline cell) to disk first: one JSON object per line.
cur_path = "/content/curated.jsonl"
with open(cur_path, 'w') as f:
  f.writelines(json.dumps(x) + '\n' for x in cur)
print(build_report(cur_path))

## 🔧 Tool-calling generation + schema validation

In [None]:
from sdf.generate.tools import generate_tools
from sdf.filter.schema import filter_schema
# Generate tool-calling samples, then split them into schema-valid (kept) and
# rejected (dropped) items.
tools_items = generate_tools(10, seed=123)
kept, dropped = filter_schema(tools_items, {})
print("kept:", len(kept), "dropped:", len(dropped))
if kept:
  # Show one surviving item as an example.
  print(kept[0])
else:
  # Avoid an IndexError when the schema filter rejected every item.
  print("no tool-calling items passed schema validation")

## 🔁 Build DPO pairs (grouped by instruction)

In [None]:
import json, os, tempfile
from sdf.tools.dpo_builder import build_pairs
from sdf.generate.templates import generate_templates

def _say_hi_answer(output, score):
  # One scored answer to the shared "Say hi." instruction; each call gets its own meta dict.
  return {"instruction":"Say hi.","input":"","output":output,"meta":{"domain":"general"},"score":score}

# Two answers to the same instruction with different scores, to simulate
# multiple answers per instruction.
a = _say_hi_answer("Hello!", 0.9)
b = _say_hi_answer("Hi.", 0.2)
pairs = build_pairs(
    [a, b],
    group_key='instruction',
    k_pairs_per_group=1,
)
print(pairs[0])

## 🧹 Semantic dedupe

In [None]:
from sdf.dedupe.semantic import dedupe_semantic
# Three samples: the first two share an instruction and have similar outputs;
# the third is unrelated.
near_dups = [
    {
        "instruction":"Write an email to Jordan about the Q3 report.",
        "input":"",
        "output":"Dear Jordan, The Q3 report is ready.",
        "meta":{"domain":"email"},
    },
    {
        "instruction":"Write an email to Jordan about the Q3 report.",
        "input":"",
        "output":"Dear Jordan, Q3 report attached.",
        "meta":{"domain":"email"},
    },
    {
        "instruction":"List 3 uses for a paperclip.",
        "input":"",
        "output":"- Use 1 for paperclip",
        "meta":{"domain":"list"},
    },
]
out = dedupe_semantic(
    near_dups,
    threshold=0.9,
    model_id="prajjwal1/bert-tiny",
)
print(f"{len(out)} after semantic dedupe")