In [14]:
# === Cell 1: Planner (no downloads) ===
#!/usr/bin/env python3
"""
Plans the next FMP jobs to run (no API calls), prints a status snapshot and a preview list.
Run this first. It defines PLANNED_JOBS and STATUS_MAP for Cell 2 to consume.
"""

import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

import duckdb

# ---------------- CONFIG (shared across cells) ----------------
# FMP API key. Taken from the FMP_API_KEY environment variable when it is set and
# non-empty, so the secret does not have to live in the notebook; the literal is kept
# only as a backward-compatible fallback.
# NOTE(review): the fallback key is stored in plain text — consider rotating it and
# dropping the literal once the environment variable is configured.
API_KEY   = os.environ.get("FMP_API_KEY") or "c5PobUQjaaMTHySILWqmWi9uyIDqYJBi"
DB_PATH   = "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb"
RAW_ROOT  = Path("/Users/martingobbo/stock-dashboard/data/raw/fmp")

# One raw-data directory per statement family; each holds one <ticker>.jsonl file.
SUBDIRS = {
    "income_statement": RAW_ROOT / "income_statement",
    "balance_sheet":    RAW_ROOT / "balance_sheet",
    "cash_flow":        RAW_ROOT / "cash_flow",
    "ratios":           RAW_ROOT / "ratios",
}

BASE = "https://financialmodelingprep.com/api/v3"
# Index-aligned with STATUS_COLS: ENDPOINTS[i] is the request that satisfies STATUS_COLS[i].
# Each entry is (SUBDIRS key, URL template with a {sym} placeholder, query parameters).
ENDPOINTS: List[Tuple[str, str, dict]] = [
    ("income_statement",  f"{BASE}/income-statement/{{sym}}",        {"period": "annual",  "limit": "60"}),  # inc_FY
    ("income_statement",  f"{BASE}/income-statement/{{sym}}",        {"period": "quarter", "limit": "40"}),  # inc_Q
    ("balance_sheet",     f"{BASE}/balance-sheet-statement/{{sym}}", {"period": "annual",  "limit": "60"}),  # bs_FY
    ("balance_sheet",     f"{BASE}/balance-sheet-statement/{{sym}}", {"period": "quarter", "limit": "40"}),  # bs_Q
    ("cash_flow",         f"{BASE}/cash-flow-statement/{{sym}}",     {"period": "annual",  "limit": "60"}),  # cf_FY
    ("cash_flow",         f"{BASE}/cash-flow-statement/{{sym}}",     {"period": "quarter", "limit": "40"}),  # cf_Q
    ("ratios",            f"{BASE}/ratios/{{sym}}",                  {"limit": "60"}),                       # ratios
]
STATUS_COLS = ["inc_FY", "inc_Q", "bs_FY", "bs_Q", "cf_FY", "cf_Q", "ratios"]

# Tweakable display knobs for planning
STATUS_LIMIT = 40   # show first N tickers in the status table
PREVIEW      = 30   # show first N planned jobs

# ---------------- Helpers (shared with Cell 2) ----------------
def ensure_dirs():
    """Create every directory listed in SUBDIRS, including parents, if it is missing."""
    for name in SUBDIRS:
        SUBDIRS[name].mkdir(parents=True, exist_ok=True)

def get_all_tickers_from_duckdb() -> List[str]:
    """Return every ticker in the dim_ticker table, sorted alphabetically.

    Opens DB_PATH read-only and always closes the connection afterwards, so this
    process does not keep holding a lock on the database file.

    Raises:
        duckdb.Error: if the database cannot be opened or the query fails.
    """
    con = duckdb.connect(DB_PATH, read_only=True)
    try:
        rows = con.execute("SELECT ticker FROM dim_ticker ORDER BY ticker;").fetchall()
    finally:
        # Release the file handle/lock even when the query raises.
        con.close()
    return [r[0] for r in rows]

def file_has_any_rows(path: Path) -> bool:
    """Return True if the file at *path* contains at least one non-blank line.

    Whitespace-only lines are ignored, matching how jsonl_has_period skips blank
    lines. A missing or unreadable file counts as having no rows.
    """
    try:
        with path.open("r") as f:
            return any(line.strip() for line in f)
    except (OSError, ValueError):
        # OSError: missing file, directory, permission problem.
        # ValueError: covers UnicodeDecodeError raised while decoding lines.
        return False

def jsonl_has_period(path: Path, want_period: str) -> bool:
    """Return True if any JSON object line in *path* has a matching "period" value.

    want_period 'annual' -> accepts 'FY'/'annual'/'year' (case-insensitive)
    any other want_period (i.e. 'quarter') -> accepts strings starting with 'Q'
    (case-insensitive), which covers 'quarter' as well as Q1..Q4

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped rather than aborting the scan. A missing or unreadable
    file yields False.
    """
    annual_tags = {"fy", "annual", "year"}
    want_annual = want_period == "annual"
    try:
        with path.open("r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except (ValueError, RecursionError):
                    # Malformed (or pathologically nested) line: ignore it.
                    continue
                if not isinstance(obj, dict):
                    # A list/str/number line has no "period" key; keep scanning
                    # instead of failing on .get and giving up on the whole file.
                    continue
                p = str(obj.get("period", "")).strip().lower()
                if want_annual:
                    if p in annual_tags:
                        return True
                elif p.startswith("q"):  # "quarter" also starts with "q"
                    return True
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: undecodable bytes.
        return False
    return False

def detect_status_for_symbol(sym: str) -> Dict[str, bool]:
    """Report, per STATUS_COLS label, whether data for *sym* is already on disk.

    Statement labels (inc/bs/cf, FY and Q) are satisfied when the symbol's JSONL
    file holds at least one row of the corresponding period; "ratios" is
    satisfied when its file has any rows at all.
    """
    status = dict.fromkeys(STATUS_COLS, False)

    # (label prefix, SUBDIRS key) for the three statement families.
    statement_dirs = (
        ("inc", "income_statement"),
        ("bs", "balance_sheet"),
        ("cf", "cash_flow"),
    )
    for prefix, subdir in statement_dirs:
        jsonl_path = SUBDIRS[subdir] / f"{sym}.jsonl"
        status[f"{prefix}_FY"] = jsonl_has_period(jsonl_path, "annual")
        status[f"{prefix}_Q"] = jsonl_has_period(jsonl_path, "quarter")

    status["ratios"] = file_has_any_rows(SUBDIRS["ratios"] / f"{sym}.jsonl")
    return status

def print_status_table(status_map: Dict[str, Dict[str, bool]], limit: int = 40):
    """Print a per-ticker presence table followed by a universe-wide summary.

    Args:
        status_map: ticker -> {status label -> present?}, in display order.
        limit: maximum number of ticker rows to print; zero or a negative value
            prints no rows. The summary always covers every ticker in status_map.
    """
    cols = ["ticker"] + STATUS_COLS + ["missing"]
    header = " | ".join(f"{c:>8s}" for c in cols)
    print("\n=== STATUS (first {} tickers) ===".format(limit))
    print(header)
    print("-" * len(header))

    total_missing = 0
    for shown, (sym, st) in enumerate(status_map.items()):
        missing = sum(1 for v in st.values() if not v)
        total_missing += missing
        # Check the limit before printing so limit <= 0 shows nothing.
        if shown < limit:
            row = [sym] + [("✔" if st[c] else "·") for c in STATUS_COLS] + [str(missing)]
            print(" | ".join(f"{c:>8s}" for c in row))

    total_syms = len(status_map)
    print(f"\n[summary] symbols: {total_syms} | total missing endpoints: {total_missing}\n")

def missing_count(st: Dict[str, bool]) -> int:
    """Return how many entries of the status dict *st* have a falsy value."""
    present = sum(1 for flag in st.values() if flag)
    return len(st) - present

def plan_jobs(
    status_map: Dict[str, Dict[str, bool]],
    priority: Tuple[str, ...] = ("AON",),
) -> List[Tuple[str, str]]:
    """Build the ordered list of (ticker, label) jobs still missing on disk.

    Tickers with nothing missing are dropped. The rest are ordered by ascending
    number of missing labels, then alphabetically, so nearly-complete tickers are
    finished first. Tickers named in *priority* that still need work are moved
    to the front, in the order given.

    Args:
        status_map: ticker -> {status label -> present?}.
        priority: tickers to schedule ahead of everything else; defaults to
            ("AON",), which reproduces the previous hard-coded behaviour.

    Returns:
        One (ticker, label) pair per missing label, labels in STATUS_COLS order.
    """
    # Count once per ticker instead of re-counting in the filter and the sort key.
    counts = {sym: missing_count(st) for sym, st in status_map.items()}
    needers = sorted(
        (sym for sym, n in counts.items() if n > 0),
        key=lambda s: (counts[s], s),
    )

    pending = set(needers)
    front: List[str] = []
    for sym in priority:
        if sym in pending:  # skip unknown, complete, or repeated tickers
            front.append(sym)
            pending.discard(sym)
    ordered = front + [sym for sym in needers if sym in pending]

    return [
        (sym, label)
        for sym in ordered
        for label in STATUS_COLS
        if not status_map[sym][label]
    ]

# ---------------- Run planner ----------------
# Make sure the raw-data directories exist, then load the ticker universe.
ensure_dirs()
TICKERS: List[str] = get_all_tickers_from_duckdb()
print(f"[info] total dim_ticker symbols: {len(TICKERS)}")

# Snapshot of what is already on disk, one entry per ticker.
STATUS_MAP: Dict[str, Dict[str, bool]] = {
    sym: detect_status_for_symbol(sym) for sym in TICKERS
}
print_status_table(STATUS_MAP, limit=STATUS_LIMIT)

# Ordered list of (ticker, label) pairs that Cell 2 will download.
PLANNED_JOBS: List[Tuple[str, str]] = plan_jobs(STATUS_MAP)

print("=== PLANNED JOBS (preview) ===")
if PLANNED_JOBS:
    for i, (sym, label) in enumerate(PLANNED_JOBS[:PREVIEW], 1):
        # Labels ending in _FY / _Q are period-specific; anything else covers all periods.
        if label.endswith("_FY"):
            scope = "annual"
        elif label.endswith("_Q"):
            scope = "quarter"
        else:
            scope = "all"
        print(f"{i:2d}. {sym:<8s} → {label:<7s} ({scope})")
    if len(PLANNED_JOBS) > PREVIEW:
        print(f"... and {len(PLANNED_JOBS) - PREVIEW} more")
else:
    print("(no pending jobs — everything appears present)")


IOException: IO Error: Could not set lock on file "/Users/martingobbo/stock-dashboard/data/serving/analytics.duckdb": Conflicting lock is held in /usr/local/bin/node (PID 16999) by user martingobbo. See also https://duckdb.org/docs/stable/connect/concurrency

In [12]:
# === Cell 2: Downloader (executes the planned jobs, respects budget) ===
#!/usr/bin/env python3
"""
Consumes PLANNED_JOBS from Cell 1 and downloads ONLY missing endpoints, within a hard budget.
Re-checks disk presence before each call; appends JSONL rows.
"""

from pathlib import Path
from typing import Dict, List, Tuple
import json, time, requests

# ---- Runtime knobs ----
# Hard cap on the number of API attempts this cell makes (failed attempts count too).
BUDGET: int = 300
# Pause, in seconds, after each attempt (successful or not) before starting the next one.
SLEEP: float = 1.0

# ---- Helpers that rely on Cell 1 symbols (SUBDIRS, ENDPOINTS, STATUS_COLS, jsonl_has_period, file_has_any_rows) ----
def append_jsonl(path: Path, rows: List[dict]):
    """Append each dict in *rows* to *path* as one JSON document per line.

    Does nothing (and does not create the file) when *rows* is empty.
    """
    if rows:
        with path.open("a") as out:
            out.writelines(json.dumps(record) + "\n" for record in rows)

def get_json(url: str, params: Dict[str, str], session: requests.Session) -> Tuple[int, List[dict]]:
    """GET *url* with *params* plus the API key and return (status, rows).

    Returns:
        (-1, [])      when the request itself raises (no HTTP response obtained);
        (code, [])    for any non-200 HTTP status;
        (200, rows)   on success, where rows is the decoded JSON if it is a list
                      and [] if the body is not valid JSON or not a list.
    """
    query = dict(params or {})
    query.update(apikey=API_KEY)

    try:
        resp = session.get(url, params=query, timeout=30)
    except Exception:
        return (-1, [])

    status = resp.status_code
    if status != 200:
        return (status, [])

    try:
        payload = resp.json()
        rows = payload if isinstance(payload, list) else []
    except Exception:
        rows = []
    return (200, rows)

def label_to_endpoint_idx(label: str) -> int:
    """Return the position of *label* in STATUS_COLS (raises ValueError if absent)."""
    position = STATUS_COLS.index(label)
    return position

def is_job_already_satisfied(sym: str, label: str) -> bool:
    """Check on disk whether the (sym, label) job already has data.

    "ratios" needs any row in the ratios file. Labels prefixed inc_/bs_/cf_ need
    a row of the right period in the matching statement file: annual when the
    label ends in "FY", quarter otherwise. Unrecognised labels return False.
    """
    filename = f"{sym}.jsonl"
    if label == "ratios":
        return file_has_any_rows(SUBDIRS["ratios"] / filename)

    prefix_to_subdir = {
        "inc_": "income_statement",
        "bs_": "balance_sheet",
        "cf_": "cash_flow",
    }
    for prefix, subdir in prefix_to_subdir.items():
        if label.startswith(prefix):
            period = "annual" if label.endswith("FY") else "quarter"
            return jsonl_has_period(SUBDIRS[subdir] / filename, period)
    return False

def run_job(sym: str, label: str, session: requests.Session) -> Tuple[bool, int]:
    """Download one endpoint for *sym* and append the rows to its JSONL file.

    Returns (success, rows_appended). success is True for any HTTP 200, even
    when the response carried zero rows; any other status yields (False, 0).
    """
    subdir, tmpl, params = ENDPOINTS[label_to_endpoint_idx(label)]
    status_code, data = get_json(tmpl.format(sym=sym), params, session)

    if status_code != 200:
        print(f"[warn] {sym} {label} HTTP {status_code} (attempt counted)")
        return (False, 0)

    out_path = SUBDIRS[subdir] / f"{sym}.jsonl"
    # Compare presence before and after the append to tell whether this call
    # is what made the label satisfied.
    was_present = is_job_already_satisfied(sym, label)
    append_jsonl(out_path, data)
    newly_present = is_job_already_satisfied(sym, label) and not was_present

    got = len(data)
    if not newly_present:
        print(f"[ok] {sym:6s} {label:7s} → +{got:3d} rows (no new presence) → {out_path}")
    else:
        print(f"[ok] {sym:6s} {label:7s} → +{got:3d} rows → {out_path}")
    return (True, got)

# ---- Execute plan ----
# Cell 1 must have run in this session: it defines PLANNED_JOBS (and STATUS_MAP).
if "PLANNED_JOBS" not in globals():
    raise RuntimeError("PLANNED_JOBS not found. Run Cell 1 first.")

calls_left = BUDGET
jobs_done = 0
total_rows = 0

if calls_left > 0 and PLANNED_JOBS:
    with requests.Session() as s:
        for sym, label in PLANNED_JOBS:
            if calls_left <= 0:
                print("[stop] budget exhausted before next job")
                break

            # The plan may be stale: re-check the disk and skip without spending budget.
            if is_job_already_satisfied(sym, label):
                if "STATUS_MAP" in globals():
                    STATUS_MAP[sym][label] = True
                continue

            # Every attempt consumes budget, whether or not it succeeds.
            success, got = run_job(sym, label, s)
            calls_left -= 1
            jobs_done += 1
            total_rows += got

            # Only mark the label present if the data really landed on disk.
            if success and "STATUS_MAP" in globals() and is_job_already_satisfied(sym, label):
                STATUS_MAP[sym][label] = True

            if calls_left <= 0:
                print("[stop] budget exhausted after this job")
                break

            if SLEEP > 0:
                time.sleep(SLEEP)

    # Recount what is still missing, if Cell 1's status snapshot is available.
    if "STATUS_MAP" in globals():
        remaining_missing = sum(not v for st in STATUS_MAP.values() for v in st.values())
    else:
        remaining_missing = None

    print(f"\n[done] jobs attempted: {jobs_done} | rows appended: {total_rows} | calls remaining: {calls_left}")
    if remaining_missing is not None:
        print(f"[after] total missing endpoints across universe: {remaining_missing}\n")
else:
    print("\n[stop] nothing to do (no budget or no planned jobs)\n")


[ok] BRK.B  ratios  → +  0 rows (no new presence) → /Users/martingobbo/stock-dashboard/data/raw/fmp/ratios/BRK.B.jsonl
[ok] KLAC   inc_Q   → + 40 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/income_statement/KLAC.jsonl
[ok] KLAC   bs_FY   → + 40 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/balance_sheet/KLAC.jsonl
[ok] KLAC   bs_Q    → + 40 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/balance_sheet/KLAC.jsonl
[ok] KLAC   cf_FY   → + 36 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/cash_flow/KLAC.jsonl
[ok] KLAC   cf_Q    → + 40 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/cash_flow/KLAC.jsonl
[ok] KLAC   ratios  → + 40 rows → /Users/martingobbo/stock-dashboard/data/raw/fmp/ratios/KLAC.jsonl
[ok] BF.B   inc_FY  → +  0 rows (no new presence) → /Users/martingobbo/stock-dashboard/data/raw/fmp/income_statement/BF.B.jsonl
[ok] BF.B   inc_Q   → +  0 rows (no new presence) → /Users/martingobbo/stock-dashboard/data/raw/fmp/income_statement/BF.B.j