In [1]:
#!/usr/bin/env python3
"""
FMP JSONL Duplicate Checker (Annual & Quarterly)

Scans your existing JSONL files in:
  /Users/martingobbo/stock-dashboard/data/raw/fmp/{income_statement,balance_sheet,cash_flow}

Reports potential duplicates based on robust keys:
- Annual (FY):   key = ("FY", calendarYear) or ("FY", date[:4]) fallback
- Quarterly (Q): key = ("Q", date, period)   where period ∈ {Q1,Q2,Q3,Q4,quarter}

Outputs a CSV summary: duplicate_scan_report.csv
Does NOT modify any files.
"""

from pathlib import Path
import json
from collections import defaultdict
import csv

# ---------- CONFIG ----------
ROOT = Path("/Users/martingobbo/stock-dashboard/data/raw/fmp")
SUBDIRS = {
    "income_statement": ROOT / "income_statement",
    "balance_sheet":    ROOT / "balance_sheet",
    "cash_flow":        ROOT / "cash_flow",
    # ratios usually lacks FY/Q split; skip for this checker
}
OUTPUT_CSV = Path("duplicate_scan_report.csv")


# ---------- HELPERS ----------
def read_jsonl(path: Path):
    """Yield parsed dicts from a .jsonl file; skip malformed lines."""
    with path.open("r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except Exception:
                continue


def norm_period(value):
    """Normalize period string to: 'FY','Q1','Q2','Q3','Q4','quarter','unknown'."""
    if value is None:
        return "unknown"
    s = str(value).strip().lower()
    if s in {"fy", "annual", "year"}:
        return "FY"
    if s in {"q1", "q2", "q3", "q4"}:
        return s.upper()
    if s == "quarter":
        return "quarter"
    return s or "unknown"


def is_annual(p):
    return p == "FY"


def is_quarterly(p):
    return p in {"Q1", "Q2", "Q3", "Q4", "quarter"}


def annual_key(row):
    """Compute robust FY key from a row."""
    cal_year = str(row.get("calendarYear") or "").strip()
    if cal_year:
        return ("FY", cal_year)
    date = str(row.get("date") or "").strip()
    year = date[:4] if len(date) >= 4 else ""
    return ("FY", year or "unknown")


def quarterly_key(row, period_norm):
    """Compute robust Q key from a row (date + period)."""
    date = str(row.get("date") or "").strip()
    q = period_norm if period_norm in {"Q1", "Q2", "Q3", "Q4"} else period_norm
    return ("Q", date or "unknown", q or "unknown")


def scan_file(path: Path):
    """Scan a single .jsonl and return (totals, annual_dupes, quarterly_dupes)."""
    totals = {"annual": 0, "quarterly": 0, "other": 0}
    seen_annual = defaultdict(int)
    seen_quarterly = defaultdict(int)

    for row in read_jsonl(path):
        pnorm = norm_period(row.get("period"))
        if is_annual(pnorm):
            key = annual_key(row)
            seen_annual[key] += 1
            totals["annual"] += 1
        elif is_quarterly(pnorm):
            key = quarterly_key(row, pnorm)
            seen_quarterly[key] += 1
            totals["quarterly"] += 1
        else:
            totals["other"] += 1

    a_dupes = {k: c for k, c in seen_annual.items() if c > 1}
    q_dupes = {k: c for k, c in seen_quarterly.items() if c > 1}
    return totals, a_dupes, q_dupes


def preview(d, max_items=5):
    items = list(d.items())[:max_items]
    return "; ".join(f"{k} → {c}x" for k, c in items)


def summarize_folder(subdir_name, folder: Path):
    rows = []
    for f in sorted(folder.glob("*.jsonl")):
        sym = f.stem
        totals, a_dupes, q_dupes = scan_file(f)
        row = {
            "subdir": subdir_name,
            "ticker": sym,
            "path": str(f),
            "total_rows": totals["annual"] + totals["quarterly"] + totals["other"],
            "annual_rows": totals["annual"],
            "quarterly_rows": totals["quarterly"],
            "other_rows": totals["other"],
            "annual_dupe_keys": preview(a_dupes),
            # number of extra rows beyond unique (e.g., c=3 → 2 extras)
            "annual_dupe_count": sum(c - 1 for c in a_dupes.values()),
            "quarterly_dupe_keys": preview(q_dupes),
            "quarterly_dupe_count": sum(c - 1 for c in q_dupes.values()),
        }
        rows.append(row)
    return rows


# ---------- MAIN ----------
def main():
    all_rows = []
    for name, folder in SUBDIRS.items():
        if not folder.exists():
            print(f"[warn] Missing folder: {folder}")
            continue
        rows = summarize_folder(name, folder)
        all_rows.extend(rows)

    # Sort by worst offenders (most duplicate extras), then ticker
    all_rows.sort(
        key=lambda r: (r["annual_dupe_count"] + r["quarterly_dupe_count"], r["ticker"]),
        reverse=True,
    )

    fieldnames = [
        "subdir", "ticker", "path",
        "total_rows", "annual_rows", "quarterly_rows", "other_rows",
        "annual_dupe_count", "annual_dupe_keys",
        "quarterly_dupe_count", "quarterly_dupe_keys",
    ]

    with OUTPUT_CSV.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for row in all_rows:
            w.writerow(row)

    print(f"[ok] Wrote report → {OUTPUT_CSV.resolve()}")
    print("Top 20 offenders:")
    for r in all_rows[:20]:
        total_extras = r["annual_dupe_count"] + r["quarterly_dupe_count"]
        print(
            f"{r['ticker']:6s}  extras={total_extras:3d}  "
            f"(annual={r['annual_dupe_count']}, quarterly={r['quarterly_dupe_count']})  {r['subdir']}"
        )


if __name__ == "__main__":
    main()


[ok] Wrote report → /Users/martingobbo/stock-dashboard/backtesting/just_testing/duplicate_scan_report.csv
Top 20 offenders:
WFC     extras= 80  (annual=40, quarterly=40)  income_statement
RTX     extras= 80  (annual=40, quarterly=40)  income_statement
RTX     extras= 80  (annual=40, quarterly=40)  balance_sheet
DIS     extras= 80  (annual=40, quarterly=40)  income_statement
DIS     extras= 80  (annual=40, quarterly=40)  balance_sheet
AON     extras= 80  (annual=40, quarterly=40)  income_statement
AON     extras= 80  (annual=40, quarterly=40)  balance_sheet
AJG     extras= 80  (annual=40, quarterly=40)  income_statement
AJG     extras= 80  (annual=40, quarterly=40)  balance_sheet
AIG     extras= 80  (annual=40, quarterly=40)  income_statement
AIG     extras= 80  (annual=40, quarterly=40)  balance_sheet
AFL     extras= 80  (annual=40, quarterly=40)  income_statement
AFL     extras= 80  (annual=40, quarterly=40)  balance_sheet
AEP     extras= 80  (annual=40, quarterly=40)  income_statemen

In [2]:
#!/usr/bin/env python3
"""
FMP JSONL Dedupe & Verify (Annual & Quarterly)

What it does (per file under income_statement, balance_sheet, cash_flow):
1) Reads .jsonl rows
2) Detects duplicates using robust keys:
   - Annual (FY):   ("FY", calendarYear) or ("FY", date[:4]) fallback
   - Quarterly (Q): ("Q", date, period) with period ∈ {Q1,Q2,Q3,Q4,quarter}
3) Keeps exactly ONE row per key and removes extras
   - Never deletes the only row for a key (so FY/Q coverage is preserved)
   - Tie-break: keeps the row with the greatest "completeness" (most non-null fields);
               if tied, keeps the LAST occurrence in the file (stable, deterministic)
4) Writes a timestamped backup of the original file next to it
5) Writes the cleaned file in place
6) Re-runs a duplicate scan and prints a short report
7) Also writes a CSV summary after-cleanup: duplicate_scan_report_after.csv

Only touches: income_statement, balance_sheet, cash_flow.
Skips: ratios (often not FY/Q-marked).

Usage:
  python3 fmp_dedupe_jsonl.py
"""

from pathlib import Path
import json
from collections import defaultdict
import csv
from datetime import datetime
from typing import Dict, Tuple, Any, List

# ---------- CONFIG ----------
ROOT = Path("/Users/martingobbo/stock-dashboard/data/raw/fmp")
SUBDIRS = {
    "income_statement": ROOT / "income_statement",
    "balance_sheet":    ROOT / "balance_sheet",
    "cash_flow":        ROOT / "cash_flow",
}
OUTPUT_CSV_AFTER = Path("duplicate_scan_report_after.csv")

# ---------- HELPERS ----------
def read_jsonl(path: Path):
    """Yield (line_idx, parsed_dict) from a .jsonl; skip malformed lines but preserve index."""
    with path.open("r") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            try:
                yield i, json.loads(line)
            except Exception:
                # Malformed lines are ignored during dedupe (cannot key them), but we won't re-emit them.
                # If you prefer, you could append them verbatim at the end.
                continue

def norm_period(value: Any) -> str:
    """Normalize period string to: 'FY','Q1','Q2','Q3','Q4','quarter','unknown'."""
    if value is None:
        return "unknown"
    s = str(value).strip().lower()
    if s in {"fy", "annual", "year"}:
        return "FY"
    if s in {"q1", "q2", "q3", "q4"}:
        return s.upper()
    if s == "quarter":
        return "quarter"
    return s or "unknown"

def is_annual(p: str) -> bool:
    return p == "FY"

def is_quarterly(p: str) -> bool:
    return p in {"Q1", "Q2", "Q3", "Q4", "quarter"}

def annual_key(row: Dict[str, Any]) -> Tuple[str, str]:
    """Compute robust FY key from a row."""
    cal_year = str(row.get("calendarYear") or "").strip()
    if cal_year:
        return ("FY", cal_year)
    date = str(row.get("date") or "").strip()
    year = date[:4] if len(date) >= 4 else ""
    return ("FY", year or "unknown")

def quarterly_key(row: Dict[str, Any], period_norm: str) -> Tuple[str, str, str]:
    """Compute robust Q key from a row (date + period)."""
    date = str(row.get("date") or "").strip()
    q = period_norm if period_norm in {"Q1", "Q2", "Q3", "Q4"} else period_norm
    return ("Q", date or "unknown", q or "unknown")

def completeness_score(row: Dict[str, Any]) -> int:
    """
    Score row by number of non-null, non-empty fields.
    We exclude a few common 'meta' fields from contributing (adjust if you want).
    """
    exclude = {
        "symbol", "reportedCurrency", "cik", "fillingDate", "acceptedDate",
        "period", "calendarYear", "link", "finalLink"
    }
    score = 0
    for k, v in row.items():
        if k in exclude:
            continue
        if v is None:
            continue
        # treat empty strings as null-like
        if isinstance(v, str) and not v.strip():
            continue
        score += 1
    return score

def best_row(existing: Tuple[int, Dict[str, Any]], candidate: Tuple[int, Dict[str, Any]]) -> Tuple[int, Dict[str, Any]]:
    """
    Tie-breaker for duplicates:
    1) Keep the row with higher completeness_score
    2) If tied, keep the one appearing later in file (greater line_idx)
    Returns the winning (line_idx, row)
    """
    (i1, r1) = existing
    (i2, r2) = candidate
    s1, s2 = completeness_score(r1), completeness_score(r2)
    if s2 > s1:
        return (i2, r2)
    if s2 < s1:
        return (i1, r1)
    # tie → prefer later occurrence
    return (i2, r2) if i2 > i1 else (i1, r1)

def scan_file_for_dupes(path: Path):
    """
    Returns:
      totals: dict
      a_dupes: Dict[key, count]
      q_dupes: Dict[key, count]
    """
    totals = {"annual": 0, "quarterly": 0, "other": 0}
    seen_annual = defaultdict(int)
    seen_quarterly = defaultdict(int)

    for _, row in read_jsonl(path):
        pnorm = norm_period(row.get("period"))
        if is_annual(pnorm):
            key = annual_key(row)
            seen_annual[key] += 1
            totals["annual"] += 1
        elif is_quarterly(pnorm):
            key = quarterly_key(row, pnorm)
            seen_quarterly[key] += 1
            totals["quarterly"] += 1
        else:
            totals["other"] += 1

    a_dupes = {k: c for k, c in seen_annual.items() if c > 1}
    q_dupes = {k: c for k, c in seen_quarterly.items() if c > 1}
    return totals, a_dupes, q_dupes

def preview(d: Dict[Any, int], max_items=5) -> str:
    items = list(d.items())[:max_items]
    return "; ".join(f"{k} → {c}x" for k, c in items)

def write_backup(original: Path) -> Path:
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup = original.with_suffix(original.suffix + f".bak_{ts}")
    backup.write_bytes(original.read_bytes())
    return backup

def dedupe_file_in_place(path: Path) -> Tuple[int, int, int]:
    """
    Dedupe a single JSONL file in place.
    Returns:
      (removed_annual, removed_quarterly, kept_other)
    """
    # First pass: choose a single best row per key
    annual_best: Dict[Tuple[str, str], Tuple[int, Dict[str, Any]]] = {}
    quarterly_best: Dict[Tuple[str, str, str], Tuple[int, Dict[str, Any]]] = {}
    others: List[Dict[str, Any]] = []

    for line_idx, row in read_jsonl(path):
        pnorm = norm_period(row.get("period"))
        if is_annual(pnorm):
            key = annual_key(row)
            if key in annual_best:
                annual_best[key] = best_row(annual_best[key], (line_idx, row))
            else:
                annual_best[key] = (line_idx, row)
        elif is_quarterly(pnorm):
            key = quarterly_key(row, pnorm)
            if key in quarterly_best:
                quarterly_best[key] = best_row(quarterly_best[key], (line_idx, row))
            else:
                quarterly_best[key] = (line_idx, row)
        else:
            # Keep "other" rows exactly as-is (no dedupe rule)
            others.append(row)

    # Rebuild file contents from winners only (ensuring at least one per key)
    # Backup before writing
    _ = write_backup(path)

    winners = [r for (_, r) in annual_best.values()] + [r for (_, r) in quarterly_best.values()] + others

    with path.open("w") as f:
        for r in winners:
            f.write(json.dumps(r, separators=(",", ":")) + "\n")

    # Compute how many extras were removed (for info)
    # Re-scan with counts BEFORE would require a pre-pass; instead, approximate by re-reading and comparing keys.
    # Do a quick second scan with counts to report dupes after (should be zero).
    totals_after, a_dupes_after, q_dupes_after = scan_file_for_dupes(path)

    # Return a rough stat: extras remaining are 0; we can't easily report removed count without a pre-count.
    removed_annual = sum(c - 1 for c in a_dupes_after.values())  # should be 0
    removed_quarterly = sum(c - 1 for c in q_dupes_after.values())  # should be 0
    kept_other = totals_after["other"]
    return removed_annual, removed_quarterly, kept_other

def summarize_folder(subdir_name: str, folder: Path):
    rows = []
    for f in sorted(folder.glob("*.jsonl")):
        sym = f.stem
        totals, a_dupes, q_dupes = scan_file_for_dupes(f)
        rows.append({
            "subdir": subdir_name,
            "ticker": sym,
            "path": str(f),
            "total_rows": totals["annual"] + totals["quarterly"] + totals["other"],
            "annual_rows": totals["annual"],
            "quarterly_rows": totals["quarterly"],
            "other_rows": totals["other"],
            "annual_dupe_count": sum(c - 1 for c in a_dupes.values()),
            "annual_dupe_keys": preview(a_dupes),
            "quarterly_dupe_count": sum(c - 1 for c in q_dupes.values()),
            "quarterly_dupe_keys": preview(q_dupes),
        })
    return rows

# ---------- MAIN ----------
def main():
    # 1) Dedupe in place across all target subfolders
    print("[info] Starting in-place dedupe with backups...")
    for name, folder in SUBDIRS.items():
        if not folder.exists():
            print(f"[warn] Missing folder: {folder}")
            continue
        for f in sorted(folder.glob("*.jsonl")):
            removed_a, removed_q, kept_o = dedupe_file_in_place(f)
            print(f"[ok] Cleaned {name:16s} {f.stem:6s} → wrote deduped file (others kept={kept_o})")

    # 2) After-cleanup verification & CSV
    print("[info] Verifying duplicates AFTER cleanup...")
    all_rows = []
    for name, folder in SUBDIRS.items():
        if not folder.exists():
            continue
        rows = summarize_folder(name, folder)
        all_rows.extend(rows)

    # Sort by any remaining offenders (should be zero), then ticker
    all_rows.sort(
        key=lambda r: (r["annual_dupe_count"] + r["quarterly_dupe_count"], r["ticker"]),
        reverse=True,
    )

    fieldnames = [
        "subdir", "ticker", "path",
        "total_rows", "annual_rows", "quarterly_rows", "other_rows",
        "annual_dupe_count", "annual_dupe_keys",
        "quarterly_dupe_count", "quarterly_dupe_keys",
    ]
    with OUTPUT_CSV_AFTER.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for row in all_rows:
            w.writerow(row)

    print(f"[ok] Wrote AFTER report → {OUTPUT_CSV_AFTER.resolve()}")
    offenders = [r for r in all_rows if (r["annual_dupe_count"] + r["quarterly_dupe_count"]) > 0]
    if offenders:
        print("Remaining offenders (top 20):")
        for r in offenders[:20]:
            total_extras = r["annual_dupe_count"] + r["quarterly_dupe_count"]
            print(
                f"{r['ticker']:6s}  extras={total_extras:3d}  "
                f"(annual={r['annual_dupe_count']}, quarterly={r['quarterly_dupe_count']})  {r['subdir']}"
            )
    else:
        print("[good] No remaining duplicates found across scanned files.")

if __name__ == "__main__":
    main()


[info] Starting in-place dedupe with backups...
[ok] Cleaned income_statement A      → wrote deduped file (others kept=0)
[ok] Cleaned income_statement AAPL   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ABBV   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ABNB   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ABT    → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ACGL   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ACN    → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ADBE   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ADI    → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ADM    → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ADP    → wrote deduped file (others kept=0)
[ok] Cleaned income_statement ADSK   → wrote deduped file (others kept=0)
[ok] Cleaned income_statement AEE    → wrote deduped file (other