# In [None]:
# demo.ipynb

# 1) Basic environment + path setup
from pathlib import Path
import os
import sys
import pandas as pd

print("🔧 Project LANTERN — Demo Runner")

# Locate the repository root relative to the notebook's working directory.
# Checked in order:
#   1. cwd is <repo>/notebooks and <repo>/data exists -> use the parent.
#   2. cwd itself contains both data/ and config/     -> use cwd.
#   3. otherwise fall back to the parent of cwd; the validation below reports
#      a clear error if that guess is wrong.
notebook_cwd = Path.cwd().resolve()

if notebook_cwd.name == "notebooks" and (notebook_cwd.parent / "data").exists():
    PROJECT_ROOT = notebook_cwd.parent
elif (notebook_cwd / "data").exists() and (notebook_cwd / "config").exists():
    PROJECT_ROOT = notebook_cwd
else:
    PROJECT_ROOT = notebook_cwd.parent

DATA_ROOT = PROJECT_ROOT / "data"
CONFIG_ROOT = PROJECT_ROOT / "config"
EPSTEIN_IMAGE_ROOT = DATA_ROOT / "epstein_curated_v1"
OCR_CACHE_DIR = DATA_ROOT / "ocr_cache"
EXPORT_DIR = DATA_ROOT / "outputs"

print(f"📁 PROJECT_ROOT  : {PROJECT_ROOT}")
print(f"📁 DATA_ROOT     : {DATA_ROOT}")
print(f"📁 CONFIG_ROOT   : {CONFIG_ROOT}")

# Validate the layout with explicit exceptions rather than `assert`, which is
# removed when Python runs with -O. `is_dir()` also rejects a plain file that
# happens to be named data/config, which would otherwise only fail later in
# mkdir() with a less helpful error.
if not DATA_ROOT.is_dir():
    raise FileNotFoundError(
        f"DATA_ROOT missing (expected data/ at repo root). Looked at: {DATA_ROOT}"
    )
if not CONFIG_ROOT.is_dir():
    raise FileNotFoundError(
        f"CONFIG_ROOT missing (expected config/ at repo root). Looked at: {CONFIG_ROOT}"
    )

# Output locations are created on demand; existing directories are left as-is.
OCR_CACHE_DIR.mkdir(parents=True, exist_ok=True)
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Wire repo into sys.path.
# Each missing entry is inserted at index 0, so entries added later in this
# loop end up ahead of earlier ones on sys.path. Entries already present are
# skipped and keep their existing position.
for p in [PROJECT_ROOT, PROJECT_ROOT / "src", PROJECT_ROOT / "agents", PROJECT_ROOT / "tools"]:
    p_str = str(p)
    if p_str not in sys.path:
        sys.path.insert(0, p_str)

# 2) Imports from the project
from agents.ocr_agent import OCRAgent, OCRAgentConfig
from agents.threading_agent import ThreadingAgent, ThreadingAgentConfig
from agents.extraction_agent import ExtractionAgent, ExtractionConfig
from src.pipeline import run_pipeline

# Reached only if every project import above succeeded.
print("✅ Imports OK — OCRAgent, ThreadingAgent, ExtractionAgent, run_pipeline")

# 3) Load full manifest
# The manifest is expected directly under the data directory.
manifest_path = DATA_ROOT / "manifest.csv"

# Explicit exception instead of `assert`: assertions are stripped under -O,
# which would let a missing manifest surface later as a pandas error.
if not manifest_path.is_file():
    raise FileNotFoundError(f"Manifest not found at: {manifest_path}")

manifest_df = pd.read_csv(manifest_path)
print(f"✅ Loaded manifest with {len(manifest_df)} rows.")

# 4) Instantiate agents with sensible defaults
# OCR agent: built from an explicit config object.
# NOTE(review): the unit of `timeout` is not visible here (presumably
# seconds) — confirm against OCRAgentConfig.
ocr_config = OCRAgentConfig(
    model_name="gpt-4o-mini",   # or "gpt-4o" in richer environments
    max_retries=2,
    timeout=30,
)
ocr_agent = OCRAgent(ocr_config)

# NOTE(review): ThreadingAgentConfig is imported above but never used, and
# ThreadingAgent is the only agent constructed from a bare keyword argument
# instead of a config object. Confirm against ThreadingAgent.__init__ whether
# this should be ThreadingAgent(ThreadingAgentConfig(enable_threads=True)).
threading_agent = ThreadingAgent(enable_threads=True)

# Extraction agent: insights enabled, summary length capped, debug output off.
extraction_config = ExtractionConfig(
    enable_insights=True,
    max_summary_chars=600,
    debug=False,
)
extraction_agent = ExtractionAgent(extraction_config)

print("✅ Agents instantiated.")

# 5) Run the pipeline on the full manifest
from time import perf_counter

print("\n🚀 Running full Project LANTERN pipeline on manifest.csv")
print(f"   OCR cache dir : {OCR_CACHE_DIR}")
print(f"   Export dir    : {EXPORT_DIR}")
print("─" * 44)

# Wall-clock timing around the single pipeline call.
start = perf_counter()

# The OCR cache is both read and written, and JSONL export is requested into
# EXPORT_DIR. The call returns the enriched pages and the sequence summaries.
enriched_pages, sequence_summaries = run_pipeline(
    manifest_df=manifest_df,
    ocr_agent=ocr_agent,
    threading_agent=threading_agent,
    extraction_agent=extraction_agent,
    epstein_image_root=EPSTEIN_IMAGE_ROOT,
    ocr_cache_dir=OCR_CACHE_DIR,
    use_ocr_cache=True,
    save_ocr_cache=True,
    export_dir=EXPORT_DIR,
    export_jsonl=True,
)

elapsed = perf_counter() - start

print("\n✅ Demo pipeline complete.")
print(f"   • Enriched pages     : {len(enriched_pages)}")
print(f"   • Sequence summaries : {len(sequence_summaries)}")
print(f"   • Elapsed time       : {elapsed:0.2f} seconds")
print("")
print("Artifacts written to:")
# The artifact file names are assumed by this notebook, not returned by
# run_pipeline, so flag any expected file that is not actually on disk
# instead of reporting it as written.
for artifact_name in ("pages.jsonl", "sequences.jsonl"):
    artifact_path = EXPORT_DIR / artifact_name
    missing_note = "" if artifact_path.exists() else "  (not found)"
    print(f"   • {artifact_path}{missing_note}")
