In [1]:
# 03_feature_engineering.ipynb
# Project: The Golden Age Myth – IMDb Analysis
# Author: Prateek Chandra

# This notebook performs feature engineering required for
# bias analysis and visualization.

# ---
# Cell 1: Imports
import pandas as pd
import numpy as np
from pathlib import Path

# ---
# Cell 2: Load Cleaned Data
# Read the cleaned movie and TV tables from the processed-data folder,
# then report each table's (rows, columns) shape.
DATA_DIR = Path("../data/processed")
movies = pd.read_csv(DATA_DIR.joinpath("movies_clean.csv"))
tv = pd.read_csv(DATA_DIR.joinpath("tv_clean.csv"))

for _label, _frame in (("Movies:", movies), ("TV:", tv)):
    print(_label, _frame.shape)

# ---
# Cell 3: Vote-Weighted Rating
# Weighted rating = averageRating * ln(1 + numVotes).
# np.log1p computes ln(1 + x), so a title with zero votes gets a weight of 0
# rather than the -inf that a plain log would produce.
for _frame in (movies, tv):
    _frame["weightedRating"] = _frame["averageRating"] * np.log1p(_frame["numVotes"])

# ---
# Cell 4: Popularity Buckets
# Bucket every title by its vote count:
#   [0, 100] -> Very Low, (100, 1000] -> Low, (1000, 10000] -> Medium,
#   (10000, 100000] -> High, above 100000 -> Very High.
bins = [0, 100, 1000, 10000, 100000, np.inf]
labels = ["Very Low", "Low", "Medium", "High", "Very High"]

# pd.cut builds right-closed intervals by default, which would leave the first
# bin open at 0. include_lowest=True closes it, so a title with exactly 0 votes
# is labelled "Very Low" instead of falling outside every bin and becoming NaN.
# All positive vote counts are bucketed exactly as before.
movies["popularity"] = pd.cut(
    movies["numVotes"], bins=bins, labels=labels, include_lowest=True
)
tv["popularity"] = pd.cut(
    tv["numVotes"], bins=bins, labels=labels, include_lowest=True
)

# ---
# Cell 5: Genre Explosion
# Split the comma-separated "genres" string into a list, then explode it so
# each title gets one row per genre; all other columns are repeated per row.
# The source frames are left untouched (assign returns a new DataFrame).
movies_genres = (
    movies
    .assign(genres=lambda frame: frame["genres"].str.split(","))
    .explode("genres")
)
tv_genres = (
    tv
    .assign(genres=lambda frame: frame["genres"].str.split(","))
    .explode("genres")
)

# ---
# Cell 6: Decade-Level Aggregations
# One summary row per decade of movies:
#   mean_rating     - mean of averageRating
#   weighted_rating - mean of the vote-weighted rating
#   title_count     - number of non-null tconst values
#   avg_votes       - mean of numVotes
# reset_index() turns the "decade" group key back into an ordinary column.
movie_decade_stats = (
    movies
    .groupby("decade")
    .agg(
        mean_rating=pd.NamedAgg(column="averageRating", aggfunc="mean"),
        weighted_rating=pd.NamedAgg(column="weightedRating", aggfunc="mean"),
        title_count=pd.NamedAgg(column="tconst", aggfunc="count"),
        avg_votes=pd.NamedAgg(column="numVotes", aggfunc="mean"),
    )
    .reset_index()
)

# ---
# Cell 7: Save Engineered Data
# Write every engineered table to DATA_DIR as CSV, without the row index.
# Files are written in the order listed.
for _filename, _table in (
    ("movies_features.csv", movies),
    ("tv_features.csv", tv),
    ("movies_genres.csv", movies_genres),
    ("tv_genres.csv", tv_genres),
    ("movie_decade_stats.csv", movie_decade_stats),
):
    _table.to_csv(DATA_DIR / _filename, index=False)

print("Feature-engineered datasets saved")

# ---
# Cell 8: Preview
# Display the first five decade rows as a quick sanity check of the aggregation.
movie_decade_stats.head(n=5)

Movies: (394205, 10)
TV: (133529, 10)
Feature-engineered datasets saved


Unnamed: 0,decade,mean_rating,weighted_rating,title_count,avg_votes
0,1900,3.467073,11.111494,164,35.780488
1,1910,5.781108,21.224718,2112,137.564394
2,1920,5.99498,24.167339,4024,573.929423
3,1930,6.08179,27.626578,9385,768.846777
4,1940,6.189401,29.074374,9284,1218.827122
