# 🧠 NPC AI — Complete Training & Integration Pipeline

**BD-NSCA: Behavior-Driven Neuro-Symbolic Cognitive Architecture**

| Step | Description |
|------|-------------|
| 1 | Environment Setup |
| 2 | Training Data Generation |
| 3 | QLoRA Fine-Tuning (checkpoint/resume) |
| 4 | GGUF Export |
| 5 | Ollama Serving |
| 6 | Integrated Demo |
| 7 | Quality Evaluation |
| 8 | C++ Engine Compilation |

> **Checkpoint/Resume**: Training auto-detects and resumes from existing checkpoints.


---
## 1. 🔧 Environment Setup & Dependencies


In [None]:
# ============================================================
# Cell 1: Environment Setup (accelerator-aware: TPU/GPU/CPU)
# ============================================================
import os
import subprocess
import sys
from pathlib import Path

# Hosting platform flags: Kaggle is recognised by the presence of /kaggle,
# Colab by google.colab already being loaded into the interpreter.
IN_KAGGLE = Path('/kaggle').exists()
IN_COLAB = 'google.colab' in sys.modules
if IN_KAGGLE:
    ENV_NAME = 'Kaggle'
elif IN_COLAB:
    ENV_NAME = 'Colab'
else:
    ENV_NAME = 'Local'


def run_cmd(cmd, allow_fail=False):
    """Run *cmd* through ``subprocess.check_call`` and report whether it worked.

    Returns ``True`` when the call completes without raising. If it raises and
    ``allow_fail`` is true, the command and the exception are printed as a
    warning and ``False`` is returned; otherwise the exception propagates.
    """
    try:
        subprocess.check_call(cmd)
    except Exception as exc:
        if allow_fail:
            print(f'Warning: command failed: {cmd}')
            print(f'  -> {exc}')
            return False
        raise
    return True


def pip_install(packages, allow_fail=False, extra_args=None):
    """Install *packages* quietly with the current interpreter's pip.

    ``extra_args`` (optional) are placed between ``pip install -q`` and the
    package list. The return value and failure handling are those of
    ``run_cmd`` called with the same ``allow_fail``.
    """
    command = [sys.executable, '-m', 'pip', 'install', '-q']
    command.extend(extra_args or [])
    command.extend(packages)
    return run_cmd(command, allow_fail=allow_fail)


def detect_runtime():
    """Choose the training accelerator: ``'tpu'``, ``'cuda'`` or ``'cpu'``.

    Checks are made in this order, the first match wins:

    1. ``NPC_ACCELERATOR`` set to tpu/cuda/cpu (case-insensitive override).
    2. ``PJRT_DEVICE`` equal to ``TPU``.
    3. ``KAGGLE_ACCELERATOR_TYPE`` starting with ``TPU``.
    4. A TPU hint variable is set and ``torch_xla`` can create a device.
    5. ``torch_xla`` yields a device whose string form contains ``xla``.
    6. ``torch.cuda.is_available()`` is true.

    Any import or probing error is treated as "not available".
    """
    env = os.environ

    override = env.get('NPC_ACCELERATOR', 'auto').strip().lower()
    if override in ('tpu', 'cuda', 'cpu'):
        return override

    if env.get('PJRT_DEVICE', '').strip().upper() == 'TPU':
        return 'tpu'
    if env.get('KAGGLE_ACCELERATOR_TYPE', '').strip().upper().startswith('TPU'):
        return 'tpu'

    hinted = False
    for var_name in ('TPU_NAME', 'COLAB_TPU_ADDR', 'TPU_WORKER_ID'):
        if env.get(var_name):
            hinted = True
            break
    if hinted:
        try:
            import torch_xla.core.xla_model as xm  # type: ignore
            xm.xla_device()
        except Exception:
            pass
        else:
            return 'tpu'

    # Probe again without relying on hint variables; here the device string
    # itself must mention "xla".
    try:
        import torch_xla.core.xla_model as xm  # type: ignore
        if 'xla' in str(xm.xla_device()).lower():
            return 'tpu'
    except Exception:
        pass

    try:
        import torch
        cuda_ok = torch.cuda.is_available()
    except Exception:
        cuda_ok = False
    return 'cuda' if cuda_ok else 'cpu'


# Resolve the accelerator once; later cells look up TRAIN_ACCELERATOR via globals().
TRAIN_ACCELERATOR = detect_runtime()
if TRAIN_ACCELERATOR == 'tpu':
    os.environ.setdefault('PJRT_DEVICE', 'TPU')

print(f'Environment: {ENV_NAME}')
print(f'Training accelerator: {TRAIN_ACCELERATOR}')

# Packages needed on every accelerator; a failed install aborts the cell.
base_deps = [
    'transformers>=4.45.0',
    'datasets>=2.20.0',
    'peft>=0.11.0',
    'accelerate>=0.30.0',
    'sentencepiece',
    'protobuf',
    'requests',
]
pip_install(base_deps, allow_fail=False)

# Accelerator-specific extras are best-effort (allow_fail=True).
if TRAIN_ACCELERATOR == 'cuda':
    pip_install(['bitsandbytes>=0.43.0'], allow_fail=True)
    # Optional, only for legacy Unsloth path.
    pip_install(
        ['unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git'],
        allow_fail=True,
    )
elif TRAIN_ACCELERATOR == 'tpu':
    try:
        import torch_xla  # type: ignore # noqa: F401
    except Exception:
        print('torch_xla not found. Attempting install for TPU runtime...')
        pip_install(
            ['torch_xla[tpu]>=2.2'],
            allow_fail=True,
            extra_args=['-f', 'https://storage.googleapis.com/libtpu-releases/index.html'],
        )
    else:
        print('torch_xla already available.')

# Report visible CUDA devices, if torch can be imported at all.
try:
    import torch
    if torch.cuda.is_available():
        print(f'CUDA GPU count: {torch.cuda.device_count()}')
        print(f'Primary GPU: {torch.cuda.get_device_name(0)}')
except Exception as exc:
    print(f'Warning: torch check failed: {exc}')

# On TPU, confirm that torch_xla can hand out a device.
if TRAIN_ACCELERATOR == 'tpu':
    try:
        import torch_xla.core.xla_model as xm  # type: ignore
        print(f'TPU device: {xm.xla_device()}')
    except Exception as exc:
        print(f'Warning: TPU selected but torch_xla check failed: {exc}')

print('Environment setup done.')



In [None]:
# ============================================================
# Step 1.5: Validate C++ Engine Layout (Non-destructive)
# ============================================================
import os
from pathlib import Path

print('Validating C++ engine files for Kaggle build...')

# Files whose presence is checked; nothing is created or modified.
required_files = [
    'cpp/CMakeLists.txt',
    'cpp/src/NPCInference.cpp',
    'cpp/src/PromptBuilder.cpp',
    'cpp/include/ModelLoader.h',
]
missing = []
for rel_path in required_files:
    if not os.path.exists(rel_path):
        missing.append(rel_path)

if not missing:
    print('All required C++ files are present.')
else:
    print('Warning: missing required C++ files:')
    for m in missing:
        print(f'  - {m}')

# Check that PromptBuilder.cpp mentions the markers used by the prompt format.
pb_path = Path('cpp/src/PromptBuilder.cpp')
if not pb_path.exists():
    print('Warning: PromptBuilder.cpp not found.')
else:
    pb_text = pb_path.read_text(encoding='utf-8', errors='ignore')
    format_markers = ('BuildAdvanced', '[CONTEXT]', '[PLAYER]')
    if all(marker in pb_text for marker in format_markers):
        print('PromptBuilder format check passed.')
    else:
        print('Warning: PromptBuilder format markers were not detected.')

print('C++ validation complete (source files left unchanged).')


---
## 2. 📝 Training Data Generation (Enhanced)


In [None]:
# ============================================================
# Cell 2: Training Data Generation (Refined English)
# ============================================================
import json
import os
import random

os.makedirs('data', exist_ok=True)
PERSONAS_PATH = 'data/personas.json'
UTTERANCES_PATH = 'data/player_utterances.json'
OUTPUT_PATH = 'data/npc_training_v2.json'

# Each input falls back to a single built-in entry when its JSON file is absent.
if not os.path.exists(PERSONAS_PATH):
    personas = {'merchant': {'persona_en': 'You are a Merchant.', 'traits': ['friendly'], 'id': 'merchant'}}
else:
    with open(PERSONAS_PATH, 'r', encoding='utf-8') as f:
        personas = json.load(f)

if not os.path.exists(UTTERANCES_PATH):
    utterances = {'greetings': {'en': ['Hello!']}}
else:
    with open(UTTERANCES_PATH, 'r', encoding='utf-8') as f:
        utterances = json.load(f)


def generate_heuristic_response(persona, category, player_input):
    """Produce a templated NPC reply for one training sample.

    The display name is the persona ``id`` with ``npc_`` removed and
    capitalised (``'NPC'`` when no id is given). One trait is drawn at random
    from ``traits`` (``'friendly'`` when the list is empty). ``'greetings'``
    returns one of four greeting lines chosen at random, ``'trade_related'``
    returns a fixed trade line, and every other category returns a line that
    echoes ``player_input``.
    """
    display_name = persona.get('id', 'NPC').replace('npc_', '').capitalize()
    trait_pool = persona.get('traits', [])
    if trait_pool:
        mood = random.choice(trait_pool)
    else:
        mood = 'friendly'

    if category == 'trade_related':
        return f"{display_name} eyes your gear carefully. 'I deal in quality only. Are you buying or just looking?'"
    if category != 'greetings':
        return f"{display_name} considers your words deeply. 'Interesting... {player_input} is not something I hear every day.'"

    greeting_lines = (
        f"{display_name} looks at you with a {mood} expression. 'Welcome, traveler. What brings you here?'",
        f"'Ah, a new face!' {display_name} exclaims. 'I hope your journey was smoother than mine.'",
        f"{display_name} pauses for a moment. 'I have much to share, but tell me, what is your business in this village?'",
        f"The {display_name} nods slowly. 'Greetings. I am here to help, if you have the coin.'",
    )
    return random.choice(greeting_lines)


# Normalise personas to a non-empty list: dict -> its values, list -> as is,
# anything else (or an empty result) -> one default persona.
persona_list = (
    list(personas.values()) if isinstance(personas, dict)
    else personas if isinstance(personas, list)
    else []
) or [{'persona_en': 'You are an NPC.', 'traits': ['friendly'], 'id': 'npc_default'}]

# Keep only categories that carry a non-empty English utterance list.
categories = []
for key, value in utterances.items():
    if not isinstance(value, dict):
        continue
    english_lines = value.get('en', [])
    if isinstance(english_lines, list) and english_lines:
        categories.append(key)
if not categories:
    utterances = {'greetings': {'en': ['Hello!']}}
    categories = ['greetings']


# Draw 1200 random (persona, category, utterance) triples and pair each prompt
# with a heuristic completion.
dataset = []
for _ in range(1200):
    p = random.choice(persona_list)
    c = random.choice(categories)
    q = random.choice(utterances[c].get('en', ['Hello']))
    a = generate_heuristic_response(p, c, q)
    ctx = {
        'memories': [],
        'current_emotion': {'description': 'neutral', 'valence': 0.0},
        'knowledge': [],
        'npc_info': {'name': p.get('id', 'NPC'), 'persona': p.get('persona_en', '')},
    }
    prompt = ''.join([
        '[INSTRUCTION] Respond strictly in English.\n[CONTEXT]\n',
        json.dumps(ctx, ensure_ascii=False),
        '\n\n[PLAYER] ',
        q,
        '\n\n[NPC] ',
    ])
    dataset.append({'prompt': prompt, 'completion': a})

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=1, ensure_ascii=False)

print(f'Generated {len(dataset)} training samples at {OUTPUT_PATH}')


In [None]:
# ============================================================
# Cell 3: Trainer selection and dry-run
# ============================================================
import os
import subprocess
import sys

TRAIN_SCRIPT = 'scripts/train_qlora.py'
if not os.path.exists(TRAIN_SCRIPT):
    raise FileNotFoundError(f'{TRAIN_SCRIPT} not found. Ensure repository files are present.')

# Dataset locations in order of preference; the first one that exists is used.
candidates = [
    'data/npc_training_v2.json',
    'data/npc_training_v2.jsonl',
    'data/npc_training.json',
    'data/npc_training.jsonl',
]
TRAIN_DATASET = None
for dataset_path in candidates:
    if os.path.exists(dataset_path):
        TRAIN_DATASET = dataset_path
        break
if TRAIN_DATASET is None:
    raise FileNotFoundError('No training dataset found in expected paths.')

# Falls back to 'auto' when the environment-setup cell has not been run.
target_accelerator = globals().get('TRAIN_ACCELERATOR', 'auto')

print(f'Using train script: {TRAIN_SCRIPT}')
print(f'Using dataset: {TRAIN_DATASET}')
print(f'Accelerator target: {target_accelerator}')

# Dry-run to validate config and dataset before long training.
dry_cmd = [sys.executable, TRAIN_SCRIPT]
dry_cmd += ['--data', TRAIN_DATASET]
dry_cmd += ['--output-dir', 'outputs/npc_model']
dry_cmd += ['--accelerator', target_accelerator]
dry_cmd += ['--dry-run']
subprocess.check_call(dry_cmd)
print('Dry-run validation completed.')



---
## 3. 🚀 QLoRA Fine-Tuning


In [None]:
# ============================================================
# Cell 4: Execute fine-tuning (TPU/GPU aware) and optional GGUF export
# ============================================================
import glob
import importlib.util
import os
import shutil
import subprocess
import sys

# Artifacts go to /kaggle/working when it exists, otherwise the current directory.
WORK_DIR = '/kaggle/working' if os.path.exists('/kaggle/working') else os.getcwd()
os.makedirs(WORK_DIR, exist_ok=True)

# Status flags for this cell; both stay False unless the step succeeds.
TRAINING_SUCCESS = False
GGUF_EXPORT_SUCCESS = False

train_script = 'scripts/train_qlora.py'
# Fall back to defaults when the earlier cells were not executed.
train_data = globals().get('TRAIN_DATASET', 'data/npc_training_v2.json')
accelerator = globals().get('TRAIN_ACCELERATOR', 'auto')
output_dir = 'outputs/npc_model'

if accelerator == 'tpu':
    os.environ.setdefault('PJRT_DEVICE', 'TPU')

train_args = [
    '--data',
    train_data,
    '--output-dir',
    output_dir,
    '--accelerator',
    accelerator,
    '--epochs',
    '1',
    '--max-seq-length',
    '1024',
    '--learning-rate',
    '2e-4',
    '--gradient-checkpointing',
]

if accelerator == 'cuda':
    train_args += ['--use-4bit', '--batch-size', '2', '--gradient-accumulation-steps', '4']
elif accelerator == 'tpu':
    # TPU cannot use bitsandbytes 4-bit. Increase grad accumulation for stable global batch.
    train_args += ['--no-4bit', '--batch-size', '1', '--gradient-accumulation-steps', '16']
else:
    train_args += ['--no-4bit', '--batch-size', '1', '--gradient-accumulation-steps', '8']

# find_spec() imports the parent packages of a dotted name, so it raises
# ModuleNotFoundError (an ImportError) when torch_xla itself is missing.
# Treat that the same as "launcher not found" so the single-process fallback
# below is reached instead of crashing the cell.
xla_run_available = False
if accelerator == 'tpu':
    try:
        xla_run_available = importlib.util.find_spec('torch_xla.distributed.xla_run') is not None
    except (ImportError, ValueError):
        xla_run_available = False

if xla_run_available:
    tpu_cores = os.environ.get('NPC_TPU_CORES', '8')
    train_cmd = [
        sys.executable,
        '-m',
        'torch_xla.distributed.xla_run',
        '--num_cores',
        str(tpu_cores),
        train_script,
    ] + train_args
else:
    if accelerator == 'tpu':
        print('torch_xla xla_run launcher not found; using single-process TPU execution.')
    train_cmd = [sys.executable, train_script] + train_args

print('Running training command:')
print(' '.join(train_cmd))

# A training failure is reported but does not stop the notebook.
try:
    subprocess.check_call(train_cmd)
    TRAINING_SUCCESS = True
    print('Fine-tuning completed.')
except Exception as exc:
    print(f'Warning: fine-tuning failed: {exc}')

if TRAINING_SUCCESS and accelerator != 'tpu' and os.path.exists('scripts/export_gguf.py'):
    # Optional export path. On TPU we skip by default to avoid long CPU merge/convert.
    gguf_out = os.path.join(WORK_DIR, 'npc-phi3.gguf')
    export_cmd = [
        sys.executable,
        'scripts/export_gguf.py',
        '--adapter',
        output_dir,
        '--base-model',
        'microsoft/Phi-3-mini-4k-instruct',
        '--output',
        gguf_out,
        '--merged-dir',
        'outputs/merged',
    ]
    print('Attempting GGUF export...')
    try:
        subprocess.check_call(export_cmd)
        GGUF_EXPORT_SUCCESS = True
    except Exception as exc:
        print(f'Warning: GGUF export failed: {exc}')

# Ensure any generated GGUF is copied to work dir (existing files are not overwritten).
for src in glob.glob('**/*.gguf', recursive=True):
    dst = os.path.join(WORK_DIR, os.path.basename(src))
    if os.path.abspath(src) != os.path.abspath(dst) and not os.path.exists(dst):
        shutil.copy2(src, dst)

if accelerator == 'tpu':
    # Only announce completion when the training command actually succeeded.
    if TRAINING_SUCCESS:
        print('TPU training complete. To export GGUF, run export_gguf.py later on a GPU/CPU runtime.')
    else:
        print('TPU training did not complete; resolve the failure above before exporting GGUF.')
print(f'TRAINING_SUCCESS={TRAINING_SUCCESS}, GGUF_EXPORT_SUCCESS={GGUF_EXPORT_SUCCESS}')



---
## 4. 📦 GGUF Export


In [None]:
# ============================================================
# Cell 5: GGUF Export Status
# ============================================================
import glob
import os

WORK_DIR = '/kaggle/working' if os.path.exists('/kaggle/working') else os.getcwd()

# Locations searched for an exported model, in order of preference.
gguf_patterns = [
    os.path.join(WORK_DIR, '*.gguf'),
    '/tmp/model_export*.gguf',
    '/tmp/model_export*/**/*.gguf',
    '/tmp/model_export_gguf/*.gguf',
    'outputs/npc_model/*.gguf',
    '*.gguf',
    '**/*.gguf',
]

# recursive=True is required for '**' to descend through nested directories;
# without it '**' behaves like a single '*'. Patterns without '**' are
# unaffected by the flag.
all_ggufs = []
for pattern in gguf_patterns:
    all_ggufs.extend(glob.glob(pattern, recursive=True))
# Several patterns overlap; drop repeated paths while keeping first-seen order.
all_ggufs = list(dict.fromkeys(all_ggufs))

# Keep files larger than 200 MiB whose path does not contain one of the
# excluded keywords (vocab / embedding / bge / bert).
candidates = [
    f
    for f in all_ggufs
    if os.path.exists(f)
    and os.path.getsize(f) > 200 * 1024 * 1024
    and not any(x in f.lower() for x in ['vocab', 'embedding', 'bge', 'bert'])
]

if candidates:
    trained_model_path = candidates[0]
    print(f'GGUF found: {trained_model_path}')
else:
    trained_model_path = None
    print('Warning: GGUF model not found yet. Ollama registration will be skipped.')


---
## 5. 🤖 Ollama Serving


In [None]:
# ============================================================
# Cell 6: Ollama Serving (Safe Register)
# ============================================================
import glob
import os
import shutil
import subprocess
import time

import requests

# Set to True only after `ollama create` succeeds; the demo cell checks it.
OLLAMA_READY = False
OLLAMA_TAGS_URL = 'http://localhost:11434/api/tags'
ollama_bin = shutil.which('ollama')

if not ollama_bin:
    print('Warning: ollama binary not found. Skipping model registration.')
else:
    print('Starting or checking Ollama server...')
    server_ok = False
    try:
        if requests.get(OLLAMA_TAGS_URL, timeout=2).status_code == 200:
            server_ok = True
            print('Ollama is already running.')
    except Exception as exc:
        print(f'Warning: initial Ollama health check failed: {exc}')

    if not server_ok:
        try:
            # Discard server output: nothing reads these streams, and an
            # unread PIPE blocks the child once the OS pipe buffer fills.
            ollama_proc = subprocess.Popen(
                [ollama_bin, 'serve'],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except Exception as exc:
            print(f'Warning: failed to start Ollama: {exc}')
        else:
            # Poll for readiness (up to 15 attempts, 2 s apart) instead of
            # relying on one fixed sleep.
            for _attempt in range(15):
                time.sleep(2)
                try:
                    if requests.get(OLLAMA_TAGS_URL, timeout=5).status_code == 200:
                        server_ok = True
                        break
                except Exception:
                    # Not accepting connections yet; try again.
                    continue
            if not server_ok:
                print('Warning: failed to start Ollama: server did not become ready in time.')

    # Prefer the path found by the GGUF status cell; otherwise search again.
    tm_path = globals().get('trained_model_path')
    if not tm_path or not os.path.exists(tm_path):
        all_ggufs = (
            glob.glob('model_gguf/*.gguf')
            + glob.glob('*.gguf')
            + glob.glob('outputs/*.gguf')
            + glob.glob('/kaggle/working/*.gguf')
        )
        candidates = [
            f
            for f in all_ggufs
            if os.path.exists(f)
            and os.path.getsize(f) > 200 * 1024 * 1024
            and not any(x in f.lower() for x in ['vocab', 'embedding', 'bge', 'bert'])
        ]
        if candidates:
            tm_path = candidates[0]

    if server_ok and tm_path and os.path.exists(tm_path):
        # Modelfile: stop sequences match the section markers of the prompt format.
        lines = [
            f'FROM {tm_path}',
            'PARAMETER temperature 0.7',
            'PARAMETER stop "[PLAYER]"',
            'PARAMETER stop "[INSTRUCTION]"',
            'PARAMETER stop "[CONTEXT]"',
            'PARAMETER stop "<|end|>"',
            'SYSTEM "You are an NPC. Always respond strictly in English as the [NPC] speaker. Do not repeat the prompt."',
        ]
        with open('Modelfile', 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

        print(f'Registering model npc-ai from: {tm_path}')
        result = subprocess.run(
            [ollama_bin, 'create', 'npc-ai', '-f', 'Modelfile'],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            OLLAMA_READY = True
            print('Ollama model registration succeeded.')
        else:
            print('Warning: Ollama model registration failed:')
            # Only the tail of stderr is shown to keep the output short.
            print(result.stderr[-500:])
    else:
        print('Warning: Ollama server/model prerequisites not satisfied; skipping registration.')



---
## 6. 🎮 Integrated Demo (Enhanced)


In [None]:
# ============================================================
# Cell 7: Integrated Demo (Clean Turns)
# ============================================================
import json

import requests


def query_npc(player_input):
    """Ask the registered ``npc-ai`` Ollama model for a reply to *player_input*.

    Returns the text after the last ``[NPC]`` marker in the model response,
    stripped. Returns ``'[Ollama model is not ready]'`` when the global
    ``OLLAMA_READY`` flag is missing or false, ``'[Error <status>]'`` for a
    non-200 HTTP status, and ``'[Error: <exc>]'`` when the request raises.

    The context uses the same keys, key order and JSON escaping as the
    training prompts built in the data-generation cell, so inference matches
    the format the model was fine-tuned on.
    """
    if not globals().get('OLLAMA_READY', False):
        return '[Ollama model is not ready]'

    ctx = {
        'memories': [],
        'current_emotion': {'description': 'neutral', 'valence': 0.0},
        'knowledge': [],
        'npc_info': {'name': 'Blacksmith', 'persona': 'A friendly blacksmith.'},
    }
    prompt = (
        '[INSTRUCTION] Respond strictly in English.\n[CONTEXT]\n'
        + json.dumps(ctx, ensure_ascii=False)
        + '\n\n[PLAYER] '
        + player_input
        + '\n\n[NPC] '
    )

    try:
        payload = {
            'model': 'npc-ai',
            'prompt': prompt,
            'stream': False,
            # Same stop markers as the Modelfile, including '[CONTEXT]'.
            'options': {'stop': ['[PLAYER]', '[INSTRUCTION]', '[CONTEXT]', '<|end|>']},
        }
        res = requests.post('http://localhost:11434/api/generate', json=payload, timeout=60)
        if res.status_code == 200:
            text = res.json().get('response', '[No response]')
            # Drop anything the model echoed before its own [NPC] turn.
            return text.split('[NPC]')[-1].strip()
        return f'[Error {res.status_code}]'
    except Exception as exc:
        return f'[Error: {exc}]'


# Run the demo on two sample player lines.
for inp in ('Hello! I am new here.', 'What is the curse?'):
    npc_reply = query_npc(inp)
    print(f'Player: {inp}\nNPC: {npc_reply}\n')


---
## 7. 📊 Quality Evaluation


In [None]:
print('Evaluating responses...')
# Spot check: send a few fixed queries and show the first 80 characters of each reply.
if 'query_npc' in globals():
    test_queries = ['Hello!', 'Who are you?', 'Tell me a story.']
    for q in test_queries:
        resp = query_npc(q)
        preview = resp[:80]
        print(f'Q: {q}\nA: {preview}...\n')
else:
    print('query_npc is unavailable; skipping evaluation.')


---
## 8. 🛠️ C++ Engine Compilation


In [None]:
# ============================================================
# Cell 10: C++ Engine Compilation (Optimized)
# ============================================================
import os
import subprocess

# Configure and build the C++ engine out-of-tree in cpp/build.
# NOTE(review): no CMAKE_BUILD_TYPE is passed here — confirm CMakeLists.txt
# selects an optimized configuration before trusting benchmark numbers.
if not os.path.exists('cpp'):
    print('Warning: cpp/ not found.')
else:
    os.makedirs('cpp/build', exist_ok=True)
    try:
        subprocess.check_call(['cmake', '..'], cwd='cpp/build')
        # One build job per reported CPU, at least one.
        jobs = max(1, os.cpu_count() or 1)
        build_cmd = ['cmake', '--build', '.', f'-j{jobs}']
        subprocess.check_call(build_cmd, cwd='cpp/build')
        print('Compilation successful.')
    except Exception as exc:
        print(f'Warning: C++ compilation failed: {exc}')


---
## 9. 📈 Performance Benchmarking


In [None]:
# ============================================================
# Cell 11: C++ Engine Benchmarks
# ============================================================
import os
import subprocess

# Run each benchmark binary found in cpp/build and echo its output.
if os.path.exists('cpp/build'):
    print('Running C++ engine benchmarks...')
    benchmarks = ['bench_engine', 'bench_memory', 'bench_retrieval', 'ablation_suite']

    for bench in benchmarks:
        path = f'cpp/build/{bench}'
        if not os.path.exists(path):
            print(f'Warning: benchmark binary not found: {path}')
            continue

        print(f'\nExecuting {bench}...')
        print('-' * 40)
        try:
            res = subprocess.run([path], capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            print(f'Warning: {bench} timed out after 5 minutes.')
            continue
        except Exception as exc:
            print(f'Warning: failed to run {bench}: {exc}')
            continue

        print(res.stdout)
        if res.stderr:
            print(f'Stderr: {res.stderr}')
        # A crashed or failing benchmark must not look like a successful run.
        if res.returncode != 0:
            print(f'Warning: {bench} exited with code {res.returncode}.')
else:
    print('Warning: cpp/build not found. Run the compilation cell first.')


---
## 10. 📊 Ablation Study Visualization


In [None]:
# ============================================================
# Cell 12: Visualize Ablation Results
# ============================================================
import os, json

# Plotting is optional: without pandas/matplotlib the cell only reports a skip.
try:
    import pandas as pd
    import matplotlib.pyplot as plt
except Exception as exc:
    pd = None
    plt = None
    print(f'Warning: plotting libraries unavailable: {exc}')

results_path = 'cpp/build/ablation_results.json'
if pd is None or plt is None:
    print('Skipping ablation visualization because pandas/matplotlib are missing.')
elif os.path.exists(results_path):
    print(f"📈 Loading ablation results from {results_path}...")
    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # One row per configuration; metrics absent from an entry default to 0.
    records = []
    for config, metrics in data.items():
        if not isinstance(metrics, dict):
            print(f'Warning: skipping malformed ablation entry: {config}')
            continue
        records.append({
            'Configuration': config,
            'Latency p95 (ms)': metrics.get('latency_p95_ms', 0),
            'Throughput (tok/s)': metrics.get('throughput_tok_s', 0),
            'Memory (MB)': metrics.get('peak_memory_mb', 0),
        })

    if not records:
        # An empty frame has no 'Configuration' column, so plotting would raise.
        print('Warning: ablation results contain no usable entries; nothing to plot.')
    else:
        df = pd.DataFrame(records)
        # print() rather than display() so the cell also works outside a notebook.
        print(df)

        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # (column, bar colour, panel title, y-axis label) for each subplot.
        panels = [
            ('Latency p95 (ms)', 'salmon', '95th Percentile Latency (Lower is Better)', 'Milliseconds (ms)'),
            ('Throughput (tok/s)', 'skyblue', 'Generation Throughput (Higher is Better)', 'Tokens per Second'),
            ('Memory (MB)', 'lightgreen', 'Peak Memory Usage (Lower is Better)', 'Megabytes (MB)'),
        ]
        for ax, (column, color, title, ylabel) in zip(axes, panels):
            df.plot(x='Configuration', y=column, kind='bar', ax=ax, color=color, legend=False)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()
else:
    print(f"⚠️ Ablation results not found at {results_path}. Make sure Cell 11 ran successfully.")



## Proposal Evaluation (Batched)
Generate expanded scenarios and run proposal evaluation in batches for Kaggle stability.


In [None]:
import os
import subprocess
import sys

# Hugging Face cache: under /kaggle/working when it exists, else ./hf_cache.
if os.path.exists("/kaggle/working"):
    hf_cache = "/kaggle/working/hf_cache"
else:
    hf_cache = os.path.abspath("hf_cache")
os.makedirs(hf_cache, exist_ok=True)

# Step 1: generate the expanded scenario file.
scenarios_path = "data/proposal_eval_scenarios_large.jsonl"
generate_cmd = [sys.executable, "scripts/generate_proposal_scenarios_large.py"]
generate_cmd += ["--variants-per-base", "14"]
generate_cmd += ["--output", scenarios_path]
subprocess.check_call(generate_cmd)

# Step 2: run the batched evaluation over that file.
eval_options = {
    "--scenarios": scenarios_path,
    "--batch-size": "28",
    "--repeats": "1",
    "--max-tokens": "80",
    "--temperature": "0.2",
    "--baseline-models": "phi3:latest",
    "--bertscore-model-type": "roberta-large",
    "--bertscore-batch-size": "16",
    "--bertscore-cache-dir": hf_cache,
}
cmd = [sys.executable, "scripts/run_proposal_alignment_eval_batched.py"]
for flag, value in eval_options.items():
    cmd += [flag, value]
print("Running:", " ".join(cmd))
subprocess.check_call(cmd)


## Human Evaluation Pack (Optional)
Build blind multi-rater annotation files from the latest proposal run.


In [None]:
import pathlib
import subprocess
import sys

# Collect run directories under artifacts/proposal; the last one in sorted
# (path) order is treated as the latest run.
proposal_root = pathlib.Path("artifacts/proposal")
run_dirs = []
if proposal_root.exists():
    run_dirs = sorted(entry for entry in proposal_root.iterdir() if entry.is_dir())
if not run_dirs:
    raise RuntimeError("No proposal runs found under artifacts/proposal. Run proposal eval first.")
latest_run = run_dirs[-1]

pack_cmd = [sys.executable, "scripts/build_human_eval_pack.py"]
pack_cmd += ["--run-dir", str(latest_run)]
pack_cmd += ["--annotators", "annotator_1,annotator_2,annotator_3"]
pack_cmd += ["--shared-ratio", "0.35"]
subprocess.check_call(pack_cmd)


## Publication Benchmark Suite
Run non-mock benchmark suite with retrieval security checks.


In [None]:
import subprocess
import sys

# Publication benchmark run with both security benchmarks switched on.
cmd = [sys.executable, "scripts/run_publication_benchmark_suite.py"]
cmd += ["--repeats", "1"]
cmd += ["--max-tokens", "64"]
cmd += ["--temperature", "0.2"]
cmd += ["--run-security-benchmark", "--run-security-spoofed-benchmark"]
print("Running:", " ".join(cmd))
subprocess.check_call(cmd)


## Proposal Quality Gate
Evaluate whether latest proposal/publication artifacts satisfy the quality bar.


In [None]:
import subprocess
import sys

# Quality gate over the latest proposal and publication runs; the security
# benchmark is required.
cmd = [sys.executable, "scripts/proposal_quality_gate.py"]
cmd += ["--proposal-run", "latest"]
cmd += ["--publication-run", "latest"]
cmd += ["--require-security-benchmark"]
print("Running:", " ".join(cmd))
subprocess.check_call(cmd)


## Full Artifact Checkout (Recommended)
Run the complete proposal/publication pipeline and emit a single manifest with all output paths.


In [None]:
import subprocess
import sys

# Option: set True to skip keyword/random ablation baselines in publication retrieval metrics.
SKIP_ABLATION_BASELINES = False

cmd = [sys.executable, "scripts/run_kaggle_full_results.py"]
cmd += ["--host", "http://127.0.0.1:11434"]
cmd += ["--skip-ablation-baselines"] if SKIP_ABLATION_BASELINES else []

print("Running:", " ".join(cmd))
subprocess.check_call(cmd)
