In [3]:
import json
from pathlib import Path

import pandas as pd
from utils import get_random_movie_id, get_metadata
from imdb_scraper import scrape_reviews
from mojo_scraper import scrape_worldwide_box_office

# ---------- CONFIG ----------
OUTPUT_PATH = Path("data/mini.jsonl")
TARGET = 30          # number of movies to scrape
MIN_VOTES = 1_000    # rating threshold
# Upper bound on random draws so the loop cannot spin forever when too few
# candidates have both reviews and box-office data.
MAX_ATTEMPTS = TARGET * 100


def _nan_to_none(value):
    """Return None for a scalar missing value (NaN/NaT/NA), else *value*.

    ``pd.isna`` returns an array for list-like input, and the truth value of
    an array is ambiguous, so only scalars are tested; containers are passed
    through unchanged.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


# create / truncate output file safely (the parent directory may not exist yet)
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.write_text("", encoding="utf-8")

scraped = 0
attempts = 0
written_ids = set()  # IDs already saved; a repeated random draw must not be written twice
while scraped < TARGET and attempts < MAX_ATTEMPTS:
    attempts += 1
    movie_id = get_random_movie_id(min_votes=MIN_VOTES)
    if movie_id in written_ids:
        continue
    try:
        # — reviews first (quick exit if none) —
        reviews = scrape_reviews(movie_id)
        if not reviews:
            continue

        # — box‑office (mandatory) —
        try:
            worldwide_gross = scrape_worldwide_box_office(movie_id)
        except Exception as e:
            print(f"⚠️  skipping {movie_id} (no box‑office): {e}")
            continue  # must have box‑office to keep

        # — metadata —
        meta_raw = get_metadata(movie_id)
        # replace pandas NaN/NaT with None so JSON is valid
        meta = {k: _nan_to_none(v) for k, v in meta_raw.items()}

        record = {
            **meta,
            "worldwide_box_office": worldwide_gross,
            "review_bodies": reviews,
        }

        # Append one record at a time so finished rows survive an interrupted run.
        with OUTPUT_PATH.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

        written_ids.add(movie_id)
        scraped += 1
        print(
            f"✅  {scraped}/{TARGET} | {movie_id} | {meta.get('primaryTitle')}"
        )

    except Exception as e:
        # Best-effort scrape: report the failure and move on to the next draw.
        print(f"⚠️  skipping {movie_id}: {e}")

if scraped < TARGET:
    print(f"⚠️  stopped after {attempts} attempts with {scraped}/{TARGET} movies")

⚠️  skipping tt2327119 (no box‑office): Worldwide gross not found
⚠️  skipping tt6905692 (no box‑office): Worldwide gross not found
⚠️  skipping tt15469890 (no box‑office): Worldwide gross not found
✅  1/30 | tt0265343 | Monsoon Wedding
✅  2/30 | tt9802890 | Christmas Jars
⚠️  skipping tt2973418 (no box‑office): Worldwide gross not found
⚠️  skipping tt1032125 (no box‑office): Worldwide gross not found
⚠️  skipping tt3487994 (no box‑office): Worldwide gross not found
✅  3/30 | tt0411118 | Anthony Zimmer
⚠️  skipping tt0808348 (no box‑office): Worldwide gross not found
⚠️  skipping tt0369036 (no box‑office): Worldwide gross not found
⚠️  skipping tt9567818 (no box‑office): Worldwide gross not found
⚠️  skipping tt0082183 (no box‑office): Worldwide gross not found
⚠️  skipping tt0582962 (no box‑office): Worldwide gross not found
⚠️  skipping tt0400690 (no box‑office): Worldwide gross not found
⚠️  skipping tt12867916 (no box‑office): Worldwide gross not found
⚠️  skipping tt0113939 (no b