In [3]:
# 02_data_cleaning.ipynb
# Project: The Golden Age Myth – IMDb Analysis
# Author: Prateek Chandra

import pandas as pd
from pathlib import Path

# -----------------------------
# Configuration
# -----------------------------
DATA_PATH = Path("../data/processed/imdb_merged_raw.csv")
OUTPUT_DIR = Path("../data/processed")

MIN_YEAR = 1900  # earliest startYear kept (inclusive)
MAX_YEAR = 2025  # latest startYear kept (inclusive)

# Movies: include theatrical + TV movies
MOVIE_TYPES = ["movie", "tvMovie"]
# TV content
TV_TYPES = ["tvSeries", "tvMiniSeries"]


# -----------------------------
# Cleaning helpers
# -----------------------------
def clean_years(frame, min_year=MIN_YEAR, max_year=MAX_YEAR):
    """Return a copy of ``frame`` restricted to rows with a usable ``startYear``.

    ``startYear`` is parsed with ``pd.to_numeric(errors="coerce")``; rows whose
    value cannot be parsed, or falls outside ``[min_year, max_year]``
    (both ends inclusive), are dropped. The surviving column is cast to ``int``.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a ``startYear`` column. It is not modified.
    min_year, max_year : int
        Inclusive bounds of the years to keep.

    Returns
    -------
    pd.DataFrame
        New frame with the original index labels of the kept rows and the
        same column order as ``frame``.
    """
    years = pd.to_numeric(frame["startYear"], errors="coerce")
    # between() is inclusive on both ends and evaluates to False for NaN, so
    # this single mask drops unparseable years and out-of-range years together.
    keep = years.between(min_year, max_year)
    kept = frame.loc[keep].copy()
    kept["startYear"] = years[keep].astype(int)
    return kept


def impute_runtime(frame):
    """Return ``runtimeMinutes`` as numbers with missing values filled in.

    Values are parsed with ``pd.to_numeric(errors="coerce")``. Each missing
    value is replaced by the median runtime of rows sharing its ``titleType``.
    The frame mixes very different kinds of titles (episodes, shorts, movies,
    ...), so one global median would be dominated by the most common type.
    Rows whose title type has no valid runtime at all fall back to the
    overall median.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain ``runtimeMinutes`` and ``titleType``. It is not modified.

    Returns
    -------
    pd.Series
        Numeric runtimes aligned on ``frame.index``.
    """
    runtime = pd.to_numeric(frame["runtimeMinutes"], errors="coerce")
    median_by_type = runtime.groupby(frame["titleType"]).transform("median")
    # fillna aligns on the index, so each gap receives its own group's median.
    return runtime.fillna(median_by_type).fillna(runtime.median())


def select_title_types(frame, title_types):
    """Return an independent copy of the rows whose ``titleType`` is listed.

    Copying (rather than returning a slice of ``frame``) lets callers modify
    the result without pandas raising ``SettingWithCopyWarning``.
    """
    return frame[frame["titleType"].isin(title_types)].copy()


# -----------------------------
# Pipeline
# -----------------------------
# In a notebook kernel __name__ is "__main__", so this runs whenever the cell
# is executed and leaves df / movies_df / tv_df in the global namespace. The
# guard only keeps the helpers above importable without touching the disk.
if __name__ == "__main__":
    # -----------------------------
    # Load merged data
    # -----------------------------
    # The saved output of this cell shows a warning raised by this call,
    # presumably a DtypeWarning (mixed types). low_memory=False makes pandas
    # infer each column's dtype from the whole file instead of chunk by chunk.
    df = pd.read_csv(DATA_PATH, low_memory=False)
    print("Initial shape:", df.shape)

    # -----------------------------
    # Year cleaning
    # -----------------------------
    df = clean_years(df)
    print("After year cleaning:", df.shape)

    # -----------------------------
    # Decade feature
    # -----------------------------
    df["decade"] = (df["startYear"] // 10) * 10

    # -----------------------------
    # Runtime & genre cleanup
    # -----------------------------
    df["runtimeMinutes"] = impute_runtime(df)
    df["genres"] = df["genres"].fillna("Unknown")

    # -----------------------------
    # Inspect title types (important)
    # -----------------------------
    print("Top titleTypes:")
    print(df["titleType"].value_counts().head(10))

    # -----------------------------
    # SAFE content separation
    # -----------------------------
    movies_df = select_title_types(df, MOVIE_TYPES)
    tv_df = select_title_types(df, TV_TYPES)

    print("Movies shape (before vote filter):", movies_df.shape)
    print("TV shape (before vote filter):", tv_df.shape)

    # -----------------------------
    # DO NOT filter votes here
    # Vote filtering belongs to analysis notebooks
    # -----------------------------

    # -----------------------------
    # Save cleaned data
    # -----------------------------
    movies_df.to_csv(OUTPUT_DIR / "movies_clean.csv", index=False)
    tv_df.to_csv(OUTPUT_DIR / "tv_clean.csv", index=False)

    print("Saved movies_clean.csv and tv_clean.csv")

    # -----------------------------
    # Sanity check
    # -----------------------------
    print("Movies per decade:")
    print(movies_df["decade"].value_counts().sort_index().head(10))


  df = pd.read_csv(DATA_PATH)


Initial shape: (1627720, 9)
After year cleaning: (1622303, 9)
Top titleTypes:
titleType
tvEpisode       829262
movie           338126
short           172924
tvSeries        108948
video            56928
tvMovie          56079
tvMiniSeries     24581
videoGame        19132
tvSpecial        13815
tvShort           2508
Name: count, dtype: int64
Movies shape (before vote filter): (394205, 10)
TV shape (before vote filter): (133529, 10)
Saved movies_clean.csv and tv_clean.csv
Movies per decade:
decade
1900      164
1910     2112
1920     4024
1930     9385
1940     9284
1950    13631
1960    20205
1970    27716
1980    32045
1990    35769
Name: count, dtype: int64
