# OMDb Enrichment Script

This notebook contains the Python code I used to call the OMDb API and create the
`omdb_mojo_batch*_enriched.csv` files for my project.

The OMDb API provides a free API key for **personal and educational, non-commercial use**, which
I obtained specifically for this course project. The script is designed to automatically fetch
metadata (IMDb rating, Metascore, awards, votes, etc.) for all movies in the Box Office Mojo
budget dataset (`Mojo_budget_update.csv`).

There are 3,245 movies in this dataset. Manually looking up each title on OMDb/IMDb and copying
the information into a CSV would be extremely time-consuming and error-prone. Therefore, an
automated Python script that sends HTTP requests, handles API limits (daily quota), retries on
errors, and periodically saves checkpoints is both **necessary and appropriate**.

The specific Python techniques required for this script (HTTP requests, session handling, retry
logic with backoff, checkpointing, simple text parsing of the `Awards` field into Oscar flags)
go beyond the basic Python material covered in this course. For this reason, I used ChatGPT as a
coding assistant to help me structure and refine this enrichment script. I then adapted it to my
own dataset, chose parameter values (batch sizes, sleep times), executed it myself, and verified
the resulting CSV files.


**AI assistance note:** ChatGPT was used to help design and refine this script; all execution,
debugging, and validation of the final data were done by me.


In [None]:
import pandas as pd
import numpy as np
import re, time, requests
from datetime import datetime

# ----- run configuration (edit per batch before running) -----
BOM_CSV      = "/content/Mojo_budget_update.csv"
OMDB_KEY     = "key"   # <= OMDb API key (placeholder value; the real key was obtained by email from OMDb)
OUT_CSV      = "/content/omdb_mojo_batch4.csv" # output file for this batch (batch 4)
BATCH_START  = 2700                            # inclusive row index into the filtered tt-ID list
BATCH_END    = 3245                            # exclusive row index. The key has a 1,000-request daily limit, so earlier batches used 900 rows per day; this batch covers rows 2700-3244
SLEEP_SEC    = 0.35                            # pause between requests (~3 req/sec)
CHECKPOINT_EVERY = 100                         # save the output CSV every N fetched rows
MAX_RETRIES  = 3                               # attempts per id before an error row is recorded
TIMEOUT_SEC  = 12                              # per-request timeout passed to session.get


# ----- read the Box Office Mojo table and work out this run's ids -----
bom = pd.read_csv(BOM_CSV, encoding="utf-8")
# header names are compared in lower case with surrounding whitespace removed
bom = bom.rename(columns=lambda header: header.strip().lower())

# the IMDb id column must be called movie_id; fall back to the first column
if "movie_id" not in bom.columns:
    first_header = bom.columns[0]
    bom = bom.rename(columns={first_header: "movie_id"})

# only rows whose id looks like a real IMDb title id ("tt" + 6 or more digits) survive
bom["movie_id"] = bom["movie_id"].astype(str)
looks_like_tt_id = bom["movie_id"].str.contains(r"^tt\d{6,}$", na=False)
bom = bom.loc[looks_like_tt_id].copy()

# the batch is a positional slice of the filtered rows, de-duplicated afterwards
id_series = bom["movie_id"].dropna().astype(str)
batch_window = id_series.iloc[BATCH_START:BATCH_END]
batch_ids = batch_window.unique().tolist()
print(f"Batch size (unique tt-ids): {len(batch_ids)}")

# ----- resume-safe: read existing output and skip fetched -----
# Column layout of the output CSV; fetch results use these same keys.
cols = [
    "imdb_id","omdb_title","omdb_year","awards_text","omdb_boxoffice",
    "omdb_imdb_rating","omdb_metascore","omdb_imdb_votes",
    "response_ok","error_msg","fetched_at"
]
try:
    out = pd.read_csv(OUT_CSV)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty file means "start fresh". Any other read failure
    # (corrupt CSV, permission problem) is left to propagate so that an existing
    # checkpoint is never silently discarded and later overwritten.
    out = pd.DataFrame(columns=cols)
    todo = batch_ids
    print(f"No previous output found. Will fetch {len(todo)} ids.")
else:
    # normalize expected columns: add any that an older output file lacks
    for c in cols:
        if c not in out.columns:
            out[c] = np.nan
    # NOTE: every id already present counts as fetched, including rows whose
    # response_ok is False, so failed lookups are not retried on resume.
    have = set(out["imdb_id"].dropna().astype(str).unique())
    todo = [i for i in batch_ids if i not in have]
    print(f"Resuming: {len(have)} already fetched, {len(todo)} remaining.")

# ----- helper: turn a money string into a number -----
def parse_money(s):
    """Return the digits of *s* as a float, or NaN when there are none.

    Every non-digit character (currency sign, thousands separators, spaces,
    and also any decimal point) is dropped before conversion, so
    "$1,234" becomes 1234.0. Missing values and digit-free text such as
    "N/A" give NaN.
    """
    if not pd.isna(s):
        digits_only = re.sub(r"[^\d]", "", str(s))
        if digits_only:
            return float(digits_only)
    return np.nan

# ----- fetch function with retries -----
# One session object is shared by every request made through fetch_omdb.
session = requests.Session()

# (output column, OMDb JSON key) pairs copied from a response into a result row.
_OMDB_FIELDS = (
    ("omdb_title", "Title"),
    ("omdb_year", "Year"),
    ("awards_text", "Awards"),
    ("omdb_boxoffice", "BoxOffice"),
    ("omdb_imdb_rating", "imdbRating"),
    ("omdb_metascore", "Metascore"),
    ("omdb_imdb_votes", "imdbVotes"),
)


def _omdb_row(imdb_id, payload, ok, error_msg):
    """Build one output row; *payload* is the decoded JSON dict, or None if no response was decoded."""
    from datetime import timezone  # local import: the file only imports `datetime` itself

    row = {"imdb_id": imdb_id}
    for column, key in _OMDB_FIELDS:
        row[column] = np.nan if payload is None else payload.get(key)
    row["response_ok"] = ok
    row["error_msg"] = error_msg
    # Same naive-UTC ISO string that utcnow().isoformat() produced, without
    # calling the deprecated datetime.utcnow().
    row["fetched_at"] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return row


def fetch_omdb(imdb_id):
    """Look up one IMDb id on OMDb and return a result row (dict).

    Parameters:
        imdb_id: IMDb title id string, sent as the ``i`` query parameter.

    Returns:
        A dict with the keys imdb_id, omdb_title, omdb_year, awards_text,
        omdb_boxoffice, omdb_imdb_rating, omdb_metascore, omdb_imdb_votes,
        response_ok, error_msg and fetched_at.
        * API answered with Response == "True": response_ok is True.
        * API answered otherwise: response_ok is False and error_msg holds
          the API's "Error" text; this is returned at once, without retrying.
        * Request or JSON decoding raised on all MAX_RETRIES attempts:
          response_ok is False, the data fields are NaN and error_msg holds
          the last exception text.

    This function never raises for a failed lookup, so a long batch keeps
    going; the failure is recorded in the returned row instead.
    """
    # HTTPS keeps the API key out of plain-text traffic, and `params` lets
    # requests do the URL encoding instead of string formatting.
    params = {"i": imdb_id, "apikey": OMDB_KEY}
    last_err = ""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get("https://www.omdbapi.com/", params=params, timeout=TIMEOUT_SEC)
            j = r.json()
            ok = j.get("Response") == "True"
            error_msg = "" if ok else j.get("Error", "Response=False")
        except Exception as e:
            # Deliberately broad: any failure (network, timeout, bad JSON,
            # unexpected payload type) becomes a retry and finally an error row.
            last_err = str(e)
            # small, growing backoff; pointless after the final attempt
            if attempt < MAX_RETRIES:
                time.sleep(0.8 * attempt)
        else:
            return _omdb_row(imdb_id, j, ok, error_msg)
    # after retries, return error row
    return _omdb_row(imdb_id, None, False, last_err or "max_retries_reached")

# ----- main loop with checkpointing -----
count_start = len(out)
pending = []  # rows fetched since the last merge into `out`

try:
    for i, imdb_id in enumerate(todo, 1):
        row = fetch_omdb(imdb_id)

        # Stop once the API reports an exhausted quota: continuing would only
        # record a failed row for every remaining id, and the resume logic
        # would then treat those ids as already fetched.
        # NOTE(review): assumes OMDb words this error as "Request limit reached!" - confirm.
        if not row["response_ok"] and "limit reached" in str(row["error_msg"]).lower():
            print(f"Request limit reported at {imdb_id}; stopping after {i-1}/{len(todo)} ids.")
            break

        # Collect rows in a list and merge them in bulk; concatenating a
        # one-row frame per iteration copies the whole table every time.
        pending.append(row)

        if i % CHECKPOINT_EVERY == 0:
            out = pd.concat([out, pd.DataFrame(pending)], ignore_index=True)
            pending = []
            out.to_csv(OUT_CSV, index=False)
            print(f"[Checkpoint] fetched {i}/{len(todo)} this run; total rows now {len(out)}")

        time.sleep(SLEEP_SEC)
finally:
    # final save - also runs when the loop is interrupted or raises, so rows
    # fetched since the last checkpoint are not lost
    if pending:
        out = pd.concat([out, pd.DataFrame(pending)], ignore_index=True)
        pending = []
    out.to_csv(OUT_CSV, index=False)

print(f"Done. Fetched {len(out)-count_start} rows this run; wrote {OUT_CSV}")

# (optional) add parsed numeric columns for quick checks
out2 = out.copy()
out2["omdb_boxoffice_usd"] = out2["omdb_boxoffice"].apply(parse_money)

# flags from awards text
# Work on a temporary string view instead of overwriting the column: casting
# the column itself with astype(str) turned missing values into the literal
# text "nan"/"None" in the saved CSV. The cast is still needed here so the
# .str accessor works when the column holds no strings at all.
awards = out2["awards_text"].fillna("").astype(str)
out2["oscar_winner_flag"]  = awards.str.contains(r"\bwon\b.*\boscar", case=False, na=False).astype(int)
out2["oscar_nominee_flag"] = awards.str.contains(r"\bnominat.*\boscar", case=False, na=False).astype(int)

# quick sanity print
# NOTE(review): OMDb appears to send the literal text "N/A" for fields it has
# no value for - confirm. Such cells are counted as absent here; otherwise the
# coverage figures come out as 1.0 whenever the request itself succeeded.
boxoffice_present = out2["omdb_boxoffice"].notna() & (out2["omdb_boxoffice"] != "N/A")
rating_present = out2["omdb_imdb_rating"].notna() & (out2["omdb_imdb_rating"] != "N/A")
print("Coverage (boxoffice text present):", boxoffice_present.mean())
print("Coverage (public IMDb rating present):", rating_present.mean())

# save an enriched version too (optional)
enriched_csv = OUT_CSV.replace(".csv", "_enriched.csv")
out2.to_csv(enriched_csv, index=False)
print("Also wrote:", enriched_csv)


Batch size (unique tt-ids): 543
No previous output found. Will fetch 543 ids.


  "fetched_at": datetime.utcnow().isoformat()


[Checkpoint] fetched 100/543 this run; total rows now 100
[Checkpoint] fetched 200/543 this run; total rows now 200
[Checkpoint] fetched 300/543 this run; total rows now 300
[Checkpoint] fetched 400/543 this run; total rows now 400
[Checkpoint] fetched 500/543 this run; total rows now 500
✅ Done. Fetched 543 rows this run; wrote /content/omdb_mojo_batch4.csv
Coverage (boxoffice text present): 1.0
Coverage (public IMDb rating present): 1.0
💾 Also wrote: /content/omdb_mojo_batch4_enriched.csv
