In [1]:
"""
diagnose_candidates.py

Quick diagnostic for Step 07 candidate pool vs Golden Set.
Helps explain why Step 08 fidelity is so low.
"""

import pandas as pd
from config import TMDB_DIR, GOLDEN_TITLES
from utils import normalize_for_matching_extended

# --- Load candidates from Step 07
# release_group_id is read as text so the identifiers are not coerced to numbers.
cands_path = TMDB_DIR / "tmdb_input_candidates_clean.csv"
cands = pd.read_csv(cands_path, dtype={"release_group_id": str})

n_candidates = len(cands)
print(f"Loaded {n_candidates:,} candidate rows from {cands_path}")

# --- Normalize titles for alignment
# Missing titles are replaced by "" first, so the normalizer always receives a string.
titles_filled = cands["title"].fillna("")
cands["norm"] = titles_filled.apply(normalize_for_matching_extended)

# --- Inspect overall stats
print("\n=== Candidate Pool Summary ===")
print("release_group_secondary_type value counts:")
# Missing values are kept as their own bucket (dropna=False); show the ten largest.
secondary_type_counts = cands["release_group_secondary_type"].value_counts(dropna=False)
print(secondary_type_counts.head(10))
print("\nMost common words in titles:")
# One row per whitespace-separated token of the normalized title, then tally
# the tokens and keep the twenty most frequent.
title_tokens = cands["norm"].str.split().explode()
top_words = title_tokens.value_counts().head(20)
print(top_words)

# --- Golden Set coverage
print("\n=== Golden Set Coverage ===")
# Coarse sanity filter on release year (1900-2025). No per-title year is
# available in this loop, so this is NOT a +/-5-year window around each film.
# The filter does not depend on the Golden title, so it is computed once here
# instead of once per title.
rows = cands[cands["year"].between(1900, 2025)]
for gtitle in GOLDEN_TITLES:
    g_norm = normalize_for_matching_extended(gtitle)
    tokens = g_norm.split()
    if tokens:
        # Loose match: any candidate whose normalized title contains the first
        # normalized token of the Golden title. regex=False makes the token a
        # literal substring, so characters that survive normalization are never
        # interpreted as regex syntax.
        # NOTE(review): if the first token is a very common word (e.g. an
        # article) this matches a large share of the pool -- confirm that
        # first-token matching is the intended looseness.
        subset = rows[rows["norm"].str.contains(tokens[0], na=False, regex=False)]
    else:
        # Normalization left nothing to match on: report zero candidates
        # instead of raising IndexError on an empty token list.
        subset = rows.iloc[0:0]

    print(f"\nüé¨ {gtitle} ‚Üí norm={g_norm}")
    print(f"  Candidates found: {len(subset)}")
    print(subset[["title", "year", "release_group_secondary_type"]].head(20).to_string(index=False))


Loaded 237,235 candidate rows from D:\Capstone_Staging\data\tmdb\tmdb_input_candidates_clean.csv

=== Candidate Pool Summary ===
release_group_secondary_type value counts:
release_group_secondary_type
Soundtrack    237235
Name: count, dtype: int64

Most common words in titles:
norm
the            14721
of             11846
hits            7198
collection      6981
music           4694
in              4321
de              4022
rock            3735
best            3473
and             3305
songs           2910
jazz            2899
love            2892
to              2870
for             2739
christmas       2722
sampler         2655
compilation     2477
classics        2475
dance           2458
Name: count, dtype: int64

=== Golden Set Coverage ===

üé¨ Titanic ‚Üí norm=titanic
  Candidates found: 10
                                        title  year release_group_secondary_type
                                      Titanic  2024                   Soundtrack
                          

In [16]:
import csv
import re
from pathlib import Path

import pandas as pd

# --- Config
# path: joined release TSV that is scanned line by line.
# out_csv: destination of the audit rows.
path = Path(r"D:\Capstone_Staging\data\joined_release_data.tsv")
out_csv = Path(r"D:\Capstone_Staging\data\golden_audit.csv")

# Golden Set targets (expand if needed).
# Written as compact (title, artist, year) rows and expanded into dicts keyed
# by "title" / "artist" / "year".
golden_targets = [
    dict(zip(("title", "artist", "year"), row))
    for row in (
        ("Jaws", "John Williams", 1975),
        ("Gladiator", "Hans Zimmer", 2000),
        ("Frozen", "Christophe Beck", 2013),
        ("The Lord of the Rings", "Howard Shore", 2001),
        ("Inception", "Hans Zimmer", 2010),
        ("Jurassic Park", "John Williams", 1993),
        ("Titanic", "James Horner", 1997),
        ("Star Wars", "John Williams", 1977),
        ("The Dark Knight", "Hans Zimmer", 2008),
        ("E.T.", "John Williams", 1982),
    )
]

# --- Streaming search (captures all hits, not just 10)
def find_candidates(title, artist, year, window=5, source=None):
    """Stream a release TSV and collect the lines that look like a match.

    A line is kept when, compared case-insensitively, it
      * contains ``title`` as a substring,
      * contains ``artist`` as a substring (skipped when ``artist`` is falsy),
      * mentions at least one year in ``[year - window, year + window]`` as a
        standalone number.

    Years are matched with digit boundaries, so a year embedded in a longer
    digit run (a barcode such as ``892001002509``, a numeric id) does not
    count. A year that starts a date such as ``1978-06-16`` still counts,
    because the character after it is not a digit.

    Args:
        title: Text that must appear somewhere in the line.
        artist: Text that must also appear; ``""``/``None`` disables the check.
        year: Centre of the accepted year range.
        window: Half-width of the year range, in years. A negative window
            accepts no year, so the result is empty.
        source: File to scan. Defaults to the module-level ``path``.

    Returns:
        List of ``(index, text)`` tuples, where ``index`` is the 0-based
        position of the line in the file and ``text`` is the stripped line.
    """
    years = [str(y) for y in range(year - window, year + window + 1)]
    if not years:
        # Empty year range: no line can satisfy the year condition.
        return []

    # Everything below is invariant across lines, so it is built once rather
    # than once per line of the file.
    year_re = re.compile(
        r"(?<!\d)(?:" + "|".join(re.escape(y) for y in years) + r")(?!\d)"
    )
    title_lc = title.lower()
    artist_lc = artist.lower() if artist else ""

    hits = []
    with open(path if source is None else source, encoding="utf-8") as f:
        for i, line in enumerate(f):
            lower = line.lower()
            if title_lc not in lower:
                continue
            if artist_lc and artist_lc not in lower:
                continue
            if year_re.search(lower):
                hits.append((i, line.strip()))
    return hits

# --- Collect results
# One audit record per matching line, accumulated across every Golden target.
all_hits = []
for target in golden_targets:
    title = target["title"]
    artist = target["artist"]
    year = target["year"]
    matches = find_candidates(title, artist, year)
    print(f"\nüé¨ {title} (expect {artist}, ~{year})")
    if matches:
        # On-screen preview: first five hits only, each clipped to 200 characters.
        for idx, text in matches[:5]:
            print(f"  Line {idx}: {text[:200]}...")
    else:
        print("  ‚ö†Ô∏è No strong matches found")
    # Save all matches (not truncated); extending with nothing is a no-op
    # when there were no matches.
    all_hits.extend(
        {
            "golden_title": title,
            "expected_artist": artist,
            "expected_year": year,
            "line_number": idx,
            "raw_text": text,
        }
        for idx, text in matches
    )

# --- Write audit CSV
# The file is only written when there is at least one record.
if not all_hits:
    print("\n‚ö†Ô∏è No matches found for any Golden titles.")
else:
    # Column order follows the key order of the first record.
    columns = list(all_hits[0])
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        for record in all_hits:
            writer.writerow(record)

    print(f"\n‚úÖ Saved {len(all_hits)} matches to {out_csv}")



üé¨ Jaws (expect John Williams, ~1975)
  Line 4489307: 5050997	5b9feacb-c556-4838-8245-1d204784464d	Jaws: Music From the Original Motion Picture Soundtrack	94	104397	1	\N	120	28	\N	1975 mix	0	-1	2025-06-29 07:00:15.952301+00	94	John Williams	1	27955	2011-...

üé¨ Gladiator (expect Hans Zimmer, ~2000)
  Line 558594: 1544068	801aff2f-9b9b-46bf-99f7-b0b225370e6d	Gladiator: Music From the Motion Picture	816664	200626	1	\N	120	28	892001002509	limited edition	0	-1	2019-08-21 13:00:18.426885+00	816664	Hans Zimmer & Lis...

üé¨ Frozen (expect Christophe Beck, ~2013)
  Line 1858501: 1372032	3c2da54f-3dfc-4be7-a46f-c55d4aab3e43	Frozen (deluxe edition soundtrack)	2480354	1325522	1	1	120	28	050087299439	2013 film soundtrack, jewel case	0	-1	2020-10-09 17:05:39.05572+00	2480354	Krist...
  Line 1858532: 1598260	d42c03d5-c932-4f02-bbf6-c282d3da7079	Frozen	2480354	1325522	1	3	120	28	8808678259155	2013 film soundtrack, deluxe edition	0	-1	2019-08-17 08:00:58.752861+00	2480354	Kristen Anderson‚ÄêLop