In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

from dotenv import load_dotenv
import json, pathlib, sys, os
from openai import AsyncOpenAI
from openai.types.fine_tuning import ReinforcementHyperparameters
from openai.lib._pydantic import to_strict_json_schema 

In [None]:
# ---------------------------------------------------------------------------
# Shared-root discovery.
# Starting from the current working directory, visit it and then each of its
# ancestors in turn; the first one containing a `utils` entry becomes ROOT and
# is placed at the front of sys.path so that `utils` is importable.
# ---------------------------------------------------------------------------
HERE = pathlib.Path().resolve()

for ROOT in (HERE, *HERE.parents):
    if (ROOT / "utils").exists():
        break
else:
    # Reached the filesystem root without finding a `utils` entry anywhere.
    raise RuntimeError(
        f"Could not find 'utils' directory above {HERE}. "
        "Check your project structure or adjust the path resolution logic."
    )

if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
print(f"✅ Added to sys.path: {ROOT}")

# ---------------------------------------------------------------------------
# Project name = name of the directory one level above the working directory
# e.g., .../projects/<project>/notebooks/... -> <project>
# The PROJECT environment variable is only filled in when it is not set yet.
# ---------------------------------------------------------------------------
project_name = HERE.parent.name
if "PROJECT" not in os.environ:
    os.environ["PROJECT"] = project_name
print(f"✅ Project name set to: {project_name}")

# ---------------------------------------------------------------------------
# Project-specific environment variables
# Looks for a .env file next to the working directory's parent; values from
# the file take precedence over variables that are already set (override=True).
# ---------------------------------------------------------------------------
env_path = HERE.parent / ".env"
if not env_path.exists():
    print("⚠️ No .env file found, relying on existing environment variables.")
else:
    load_dotenv(env_path, override=True)
    print(f"✅ Loaded .env from: {env_path}")

# ---------------------------------------------------------------------------
# The OpenAI API key is mandatory: stop here when it is missing or empty
# ---------------------------------------------------------------------------
api_key = os.getenv("OPENAI_API_KEY")
if api_key:
    print("✅ OpenAI API key detected.")
else:
    raise RuntimeError(
        "OPENAI_API_KEY not found. Add it to your .env file or export it before running."
    )

In [None]:
# ---------------------------------------------------------------------------
# Project Imports (now should work everywhere)
# ---------------------------------------------------------------------------
from utils import build_rft_jsonl, get_or_upload_file, load_prompt, load_saved_grader, create_rft_job
from utils.project_paths import datasets_root, project_root
from utils.grader_utils import load_saved_grader

# Put the project root on sys.path (appended, so existing entries keep
# priority) so that the structured_outputs package can be imported.
_cust_root = project_root()
_cust_root_entry = str(_cust_root)
if _cust_root_entry not in sys.path:
    sys.path.append(_cust_root_entry)


print("✅ Utils imported successfully.")

In [None]:
# Run configuration ----------------------------------------------------------
# DATASET_NAME : dataset whose prompts are loaded (same as the project name)
# PROMPT_NAME  : which saved prompt to load
# USER_FIELD   : passed as `user_field` when the RFT files are built
DATASET_NAME = project_name
PROMPT_NAME = "v7"
USER_FIELD = "text_input"

# Fetch the developer prompt and fail fast when nothing usable comes back.
prompt_obj = load_prompt(DATASET_NAME, PROMPT_NAME, prompt_type="developer")
assert prompt_obj, "Prompt not found"

In [None]:
# Collect splits present in data/
client_kwargs = {"api_key": api_key}
proj_id = os.getenv("OPENAI_PROJECT_ID")
if proj_id:
    client_kwargs["project"] = proj_id

client = AsyncOpenAI(**client_kwargs)

splits = ["train", "val"]
train_file_id = None
val_file_id = None
for split in splits:
    data_path = next(datasets_root().glob(f"*_{split}.jsonl"))
    items = [json.loads(l) for l in data_path.read_text().splitlines()]
    rft_path = await build_rft_jsonl(prompt_obj.text, prompt_obj.id, items, split=split, user_field=USER_FIELD)
    file_id = await get_or_upload_file(client, rft_path, purpose="fine-tune")
    print(f"[RFT] Split {split}: file_id = {file_id}")
    if split == "train":
        train_file_id = file_id
    elif split in {"val", "valid", "validation"}:
        val_file_id = file_id

In [None]:
# Response format: strict JSON schema derived from the Level1Codes model.
# The import stays in this cell because it relies on the project root having
# been added to sys.path earlier in the notebook.
from structured_outputs.base_models import Level1Codes

schema = to_strict_json_schema(Level1Codes)
RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": Level1Codes.__name__,
        "schema": schema,
        "strict": True,
    },
}

In [None]:
# Load the saved grader named "level1_f1_multi" for this dataset. The same
# object is sent to the grader-validation endpoint and passed to the RFT job.
# NOTE(review): the original note says a "Param object is sufficient" —
# presumably load_saved_grader returns the grader in the form the API
# accepts; confirm against utils.grader_utils.
GRADER = load_saved_grader(DATASET_NAME, "level1_f1_multi")

In [None]:
import requests

API_KEY = os.environ["OPENAI_API_KEY"]
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

# Validate a grader configuration for fine-tuning.
# This is a best-effort check: failures are reported but do not stop the
# notebook.
payload = {"grader": GRADER}
try:
    response = requests.post(
        "https://api.openai.com/v1/fine_tuning/alpha/graders/validate",
        json=payload,
        headers=HEADERS,
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=30,
    )
    response.raise_for_status()
    print("Grader validated")
except requests.exceptions.RequestException as e:
    print(f"Error validating grader: {e}")
    # Use the response attached to the exception rather than a `response`
    # variable that may be left over from an earlier run of this cell.
    # Compare against None explicitly: an error Response is falsy.
    if e.response is not None:
        print(f"Response: {e.response.text}")

In [None]:
# Hyper-parameters for RFT fine-tuning -----------------------------
# Kept in a plain dict so the values are easy to inspect and tweak, then
# unpacked into the typed hyper-parameter object.
_hparam_values = {
    "n_epochs": 15,
    "batch_size": 16,
    "reasoning_effort": "low",
    "eval_samples": 3,
    "compute_multiplier": 1,
}
HPARAMS = ReinforcementHyperparameters(**_hparam_values)

In [None]:
# Create RFT job
job_id = await create_rft_job(
    client=client,
    train_file_id=train_file_id,
    val_file_id=val_file_id,
    grader=GRADER,
    base_model="o4-mini-2025-04-16",
    hp=HPARAMS,
    suffix=f"rft-{project_name}-prompt-{PROMPT_NAME}",
    seed=42,
    response_format=RESPONSE_FORMAT,
)