# Experiment: v5 Fine Tuning

This notebook runs the full v5 OpenAI supervised fine-tuning workflow for bot-or-not:

1. Sync Python dependencies
2. Prepare strict pair-aware data splits (`30+32` vs `31+33`)
3. Run pair-holdout CV fine-tuning
4. Train final model on all datasets
5. Evaluate and optionally emit a challenge-compatible run file


In [7]:
from __future__ import annotations

import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

from fine_tuning import prepare_data, run_cv, train_final, eval_model
from fine_tuning.constants import (
    ARTIFACTS_ROOT,
    PREPARED_DIR,
    PROJECT_ROOT,
    RUNS_DIR,
)

# Echo the resolved project locations so a reader can confirm which checkout is in use.
print(
    f'PROJECT_ROOT: {PROJECT_ROOT}',
    f'ARTIFACTS:    {ARTIFACTS_ROOT}',
    sep='\n',
)


PROJECT_ROOT: /Users/max/code/bot-or-not
ARTIFACTS:    /Users/max/code/bot-or-not/python/artifacts/fine_tuning


## Configuration

Set knobs here before running live jobs.

- Keep `RUN_LIVE_JOBS = False` for a dry run only (no OpenAI jobs are created)
- Set `RUN_LIVE_JOBS = True` to create OpenAI fine-tuning jobs


In [8]:
# --- Data and base model -----------------------------------------------------
DATASETS = (30, 31, 32, 33)
BASE_MODEL = 'gpt-4.1-mini-2025-04-14'
# Alternative base model: 'gpt-4.1-2025-04-14' (rejected: far too expensive).
VAL_FRACTION = 0.10
SEED = 20260214

# --- Fine-tuning hyperparameters ---------------------------------------------
# None means "let the provider pick its default (auto)" where that is supported.
FT_EPOCHS = 'auto'  # 'auto' or an integer such as 3
FT_BATCH_SIZE = None  # None, 'auto', or an integer such as 8
FT_LEARNING_RATE_MULTIPLIER = None  # None, 'auto', or a float such as 1.0

# --- Live execution controls -------------------------------------------------
RUN_LIVE_JOBS = True
NO_WAIT = False  # True: submit jobs and return immediately
POLL_SECONDS = 30
MAX_WAIT_MINUTES = 0  # 0 disables the timeout

# --- Evaluation controls -----------------------------------------------------
MAX_EVAL_SAMPLES = 0  # 0 evaluates every user
WRITE_RUN_FILE = True

# An unset or empty OPENAI_API_KEY both count as "not present".
openai_key_present = os.environ.get('OPENAI_API_KEY', '') != ''
print('OPENAI_API_KEY present:', openai_key_present)
if not openai_key_present:
    print('WARNING: OPENAI_API_KEY is not set. Live job submission/evaluation will fail until it is set.')

# Echo the hyperparameters so the cell output records what was configured.
print('Hyperparameters:')
for knob_name, knob_value in (
    ('  epochs:', FT_EPOCHS),
    ('  batch_size:', FT_BATCH_SIZE),
    ('  learning_rate_multiplier:', FT_LEARNING_RATE_MULTIPLIER),
):
    print(knob_name, knob_value)


OPENAI_API_KEY present: True


In [9]:
def run_shell(cmd: str, cwd: Path = PROJECT_ROOT, check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run a shell command (e.g. bun) and echo its output into the notebook.

    Args:
        cmd: Command line, executed through the shell exactly as given.
        cwd: Working directory for the command.
        check: When True, a non-zero exit status raises instead of returning.

    Returns:
        The completed process, with stdout/stderr captured as text.

    Raises:
        RuntimeError: If the command exits non-zero and ``check`` is True.
    """
    print(f"\n$ {cmd}")
    result = subprocess.run(cmd, cwd=str(cwd), shell=True, text=True, capture_output=True)
    if result.stdout:
        print(result.stdout)

    # Success path: stderr is intentionally not echoed.
    if result.returncode == 0:
        return result

    # Failure path: surface stderr, then raise or hand back the result.
    if result.stderr:
        print(result.stderr, file=sys.stderr)
    if check:
        raise RuntimeError(f'Command failed ({result.returncode}): {cmd}')
    return result


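## 1) Sync dependencies

No dependency-sync cell is included in this notebook; sync the project's Python dependencies before running the cells below.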


## 2) Prepare data (full-post inputs, strict pair holdout)


In [10]:
# Build the prepared splits from the constants set in the configuration cell.
summary = prepare_data(
    seed=SEED,
    val_fraction=VAL_FRACTION,
    prepared_dir=PREPARED_DIR,
    datasets=DATASETS,
)

integrity = summary['data_integrity']
print(json.dumps(integrity, indent=2))

# Pin the expected corpus size and label balance; any drift should stop the run here.
assert integrity['total_examples'] == 889
assert integrity['labels']['BOT'] == 184
assert integrity['labels']['HUMAN'] == 705
assert integrity['no_truncation_mismatches'] == 0

# No example may be shared between any two splits of a fold.
for fold in summary['pair_folds']:
    for overlap_key in ('train_test_overlap', 'val_test_overlap', 'train_val_overlap'):
        assert fold[overlap_key] == 0

print('Data integrity and split integrity checks passed.')


Prepared data written to: /Users/max/code/bot-or-not/python/artifacts/fine_tuning/prepared
Data integrity: total=889 BOT=184 HUMAN=705
No-truncation mismatches: 0
fold_a: train=491 val=55 test=343 | overlaps train/val=0 train/test=0 val/test=0
fold_b: train=308 val=35 test=546 | overlaps train/val=0 train/test=0 val/test=0
final: train=799 val=90 overlap=0
{
  "total_examples": 889,
  "labels": {
    "BOT": 184,
    "HUMAN": 705
  },
  "languages": {
    "en": 546,
    "fr": 343
  },
  "datasets": {
    "30": 275,
    "33": 172,
    "31": 171,
    "32": 271
  },
  "no_truncation_mismatches": 0
}
Data integrity and split integrity checks passed.


## 3) Run strict pair-holdout CV fine-tuning


In [None]:
# Gather the CV options in one place; every value comes from constants defined earlier.
cv_options = dict(
    prepared_dir=PREPARED_DIR,
    base_model=BASE_MODEL,
    # Fine-tuning hyperparameters
    n_epochs=FT_EPOCHS,
    batch_size=FT_BATCH_SIZE,
    learning_rate_multiplier=FT_LEARNING_RATE_MULTIPLIER,
    # Job polling / waiting
    poll_seconds=POLL_SECONDS,
    max_wait_minutes=MAX_WAIT_MINUTES,
    no_wait=NO_WAIT,
    # Evaluation size and live-vs-dry-run switch
    max_samples=MAX_EVAL_SAMPLES,
    dry_run=not RUN_LIVE_JOBS,
)
cv_summary = run_cv(**cv_options)


Submitting fold_a fine-tuning job...


## 4) Train final model on all datasets


In [None]:
# Gather the final-training options in one place; every value comes from constants defined earlier.
final_train_options = dict(
    prepared_dir=PREPARED_DIR,
    base_model=BASE_MODEL,
    # Fine-tuning hyperparameters
    n_epochs=FT_EPOCHS,
    batch_size=FT_BATCH_SIZE,
    learning_rate_multiplier=FT_LEARNING_RATE_MULTIPLIER,
    # Job polling / waiting
    poll_seconds=POLL_SECONDS,
    max_wait_minutes=MAX_WAIT_MINUTES,
    no_wait=NO_WAIT,
    # Live-vs-dry-run switch
    dry_run=not RUN_LIVE_JOBS,
)
train_result = train_final(**final_train_options)



$ python3 -m fine_tuning train-final --prepared-dir /Users/max/code/bot-or-not/python/artifacts/fine_tuning/prepared --base-model gpt-4.1-mini-2025-04-14 --poll-seconds 30 --max-wait-minutes 0 --dry-run
Would submit final train job using: /Users/max/code/bot-or-not/python/artifacts/fine_tuning/prepared/final/train.jsonl + /Users/max/code/bot-or-not/python/artifacts/fine_tuning/prepared/final/val.jsonl



## 5) Evaluate model and emit a run file


In [None]:
# Resolve which fine-tuned model to evaluate. The id stored in final_model.txt takes
# precedence; OPENAI_FT_MODEL_V5 is the fallback when that file is missing or blank.
final_model_path = ARTIFACTS_ROOT / 'final_model.txt'
model_id = ''

if final_model_path.exists():
    model_id = final_model_path.read_text(encoding='utf-8').strip()
if not model_id:
    # An existing-but-empty file must not mask a model id supplied via the environment.
    model_id = os.environ.get('OPENAI_FT_MODEL_V5', '').strip()

if not model_id:
    print('No final model id found yet. Train final model first or set OPENAI_FT_MODEL_V5.')
else:
    # A UTC timestamp in the file name keeps reports from separate runs apart.
    eval_output = ARTIFACTS_ROOT / f'eval-{datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")}.json'
    report = eval_model(
        model=model_id,
        datasets=DATASETS,
        output=eval_output,
        max_samples=MAX_EVAL_SAMPLES,
        write_run_file=WRITE_RUN_FILE,
        run_tag='v5',
        detector_name='v5',
    )
    print('Saved eval report:', eval_output)


No final model id found yet. Train final model first or set OPENAI_FT_MODEL_V5.


## 6) Evaluate On Unseen `final` Validation Split Only

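Use this for a model trained on `final/train` and scored on `final/val` (unseen examples only). Note that `final/val.jsonl` is also submitted alongside `final/train.jsonl` when the final training job is created, so it is held out from training but is not a fully independent test set.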


In [12]:
# Set the model ID to evaluate on final val only.
MODEL_ID_FINAL_ONLY = 'ft:gpt-4.1-mini-2025-04-14:personal:test:D91ewpzh'

final_dir = PREPARED_DIR / 'final'
val_sft_path = final_dir / 'val.jsonl'
val_meta_path = final_dir / 'val.meta.jsonl'
val_eval_path = final_dir / 'val.eval.jsonl'

assert val_sft_path.exists(), f'Missing: {val_sft_path}'
assert val_meta_path.exists(), f'Missing: {val_meta_path}'


def _read_jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of objects, skipping blank lines.

    The file is iterated line by line rather than split with ``str.splitlines()``:
    ``splitlines()`` also breaks on characters such as U+2028, U+2029 and U+0085,
    which may appear unescaped inside JSON strings and would cut a record in two.
    """
    with path.open('r', encoding='utf-8') as fh:
        return [json.loads(line) for line in fh if line.strip()]


# Build eval manifest from val SFT + metadata.
val_sft_rows = _read_jsonl(val_sft_path)
val_meta_rows = _read_jsonl(val_meta_path)

# The two files are paired positionally below, so their lengths must agree.
assert len(val_sft_rows) == len(val_meta_rows), 'val.jsonl and val.meta.jsonl row count mismatch'

# Pair each SFT example with its metadata row (same position in both lists) and
# turn it into an evaluation record: prompt messages plus the expected label.
eval_rows = []
for row_index, (sft_row, meta_row) in enumerate(zip(val_sft_rows, val_meta_rows)):
    messages = sft_row['messages']
    # Keep only system+user inputs for evaluation calls.
    eval_messages = [m for m in messages if m['role'] != 'assistant']
    if len(eval_messages) != 2:
        raise ValueError('Expected exactly 2 non-assistant messages (system + user)')

    label = meta_row.get('label')
    if label not in {'BOT', 'HUMAN'}:
        # Fallback: derive label from assistant message if metadata is missing label.
        assistant_msgs = [m for m in messages if m['role'] == 'assistant']
        if not assistant_msgs:
            raise ValueError('Missing label in metadata and no assistant message in SFT row')
        label = assistant_msgs[0]['content'].strip().upper()
        # The derived label becomes ground truth for scoring, so reject anything
        # that is not one of the two known classes instead of passing it through.
        if label not in {'BOT', 'HUMAN'}:
            raise ValueError(
                f'Row {row_index}: could not derive a BOT/HUMAN label from the assistant message (got {label!r})'
            )

    eval_rows.append({
        'user_id': meta_row['user_id'],
        'dataset_id': int(meta_row['dataset_id']),
        'lang': meta_row['lang'],
        'label': label,
        'full_post_count': int(meta_row['full_post_count']),
        'post_count_used': int(meta_row['post_count_used']),
        'messages': eval_messages,
    })

# Persist the manifest as JSON Lines: one record per line, non-ASCII text kept as-is.
with val_eval_path.open('w', encoding='utf-8') as f:
    f.writelines(json.dumps(row, ensure_ascii=False) + '\n' for row in eval_rows)

print(f'Wrote {len(eval_rows)} rows to {val_eval_path}')

# Score the chosen model against the manifest only; no challenge run file is produced.
report_path = ARTIFACTS_ROOT / f'final-val-only-eval-{datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")}.json'
report = eval_model(
    eval_file=val_eval_path,
    model=MODEL_ID_FINAL_ONLY,
    write_run_file=False,
    output=report_path,
)

print('Saved report:', report_path)
metrics_json = json.dumps(report['metrics'], indent=2)
print('Metrics:', metrics_json)


Wrote 90 rows to /Users/max/code/bot-or-not/python/artifacts/fine_tuning/prepared/final/val.eval.jsonl
combined: total=90 bots=19 humans=71 | TP=19 TN=71 FP=0 FN=0 | acc=100.00% score=76/76 (100.0%)
Invalid outputs: 0
Saved eval report: /Users/max/code/bot-or-not/python/artifacts/fine_tuning/final-val-only-eval-2026-02-14T04-58-10Z.json
Saved report: /Users/max/code/bot-or-not/python/artifacts/fine_tuning/final-val-only-eval-2026-02-14T04-58-10Z.json
Metrics: {
  "total": 90,
  "bots": 19,
  "humans": 71,
  "tp": 19,
  "tn": 71,
  "fp": 0,
  "fn": 0,
  "accuracy": 100.0,
  "score": 76,
  "max_score": 76,
  "pct_max": 100.0
}


## 7) Optional: analyze latest v5 run with JS analyzer


In [None]:
import shlex

# Run files are ordered by name; the last one after sorting is treated as the latest.
run_files = sorted(RUNS_DIR.glob('v5-*.txt'))
if run_files:
    latest_run = run_files[-1]
    print('Latest run file:', latest_run)
    # Quote the path so spaces or shell metacharacters cannot break the command line.
    run_shell(f'bun run js/analysis.ts {shlex.quote(str(latest_run))}')
else:
    print('No v5 run file found yet.')


No v5 run file found yet.


## Notes

- `run-cv` uses strict pair holdout (`30+32` train vs `31+33` test, and the inverse).
- Training/eval prompts include **all posts** per user (no cap).
- JS runtime can use the final model via `OPENAI_FT_MODEL_V5`.
