In [None]:
# 06_vote_weighted_trends.ipynb
# Project: The Golden Age Myth – IMDb Analysis
# Author: Prateek Chandra

# This notebook compares raw average ratings with vote-weighted
# ratings to show how small-sample inflation affects older decades.

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pathlib import Path

pio.templates.default = "plotly_white"

# ---
# Cell 2: Load Feature Data
# Read the movie feature table from the processed-data directory
# (the path is relative, so it resolves against the working directory).
DATA_DIR = Path("../data/processed")
movies = pd.read_csv(DATA_DIR.joinpath("movies_features.csv"))

# ---
# Cell 3: Type Safety & Cleaning
# Coerce the three analysis columns to numeric; values that cannot be
# parsed become NaN and are dropped right after, which is what allows
# the decade column to be cast to int.
numeric_columns = ["decade", "averageRating", "numVotes"]
for column_name in numeric_columns:
    movies[column_name] = pd.to_numeric(movies[column_name], errors="coerce")

movies = movies.dropna(subset=numeric_columns).astype({"decade": int})

# ---
# Cell 4: Remove Sparse Decades
# Keep only decades that contain at least MIN_TITLES titles.
MIN_TITLES = 20
titles_per_decade = movies["decade"].value_counts()
valid_decades = titles_per_decade.loc[titles_per_decade.ge(MIN_TITLES)].index
movies = movies.loc[movies["decade"].isin(valid_decades)]

In [4]:
# ---
# Cell 5: Compute Raw vs Vote-Weighted Mean

def weighted_mean(group):
    """Return the log-vote-weighted mean rating for one group of titles.

    Each title's ``averageRating`` is weighted by ``log1p(numVotes)``, so
    titles with more votes count for more while the logarithm damps the
    influence of very large vote counts.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows providing ``averageRating`` and ``numVotes`` columns.

    Returns
    -------
    float
        The weighted mean, or NaN when the weights sum to zero (an empty
        group, or every title has zero votes).
    """
    ratings = np.asarray(group["averageRating"], dtype=float)
    weights = np.log1p(np.asarray(group["numVotes"], dtype=float))
    # np.average raises ZeroDivisionError when the weights sum to zero;
    # return NaN instead so one degenerate group cannot abort the whole
    # per-decade aggregation.
    if weights.sum() == 0:
        return float("nan")
    return np.average(ratings, weights=weights)

# Per-decade summary: plain mean rating next to the vote-weighted mean.
# The two columns the aggregation reads are selected explicitly so the
# grouping column is not passed into apply(); recent pandas versions emit
# a deprecation warning when apply() operates on the grouping columns.
summary = (
    movies.groupby("decade")[["averageRating", "numVotes"]]
    .apply(
        lambda titles: pd.Series({
            "raw_mean": titles["averageRating"].mean(),
            "vote_weighted_mean": weighted_mean(titles),
        })
    )
    .reset_index()
)

In [5]:
# ---
# Cell 6: Plot Comparison
# Draw both per-decade series on one figure so they can be compared directly.
# Each pair is (summary column, legend label).
trace_specs = [
    ("raw_mean", "Raw Mean"),
    ("vote_weighted_mean", "Vote-Weighted Mean"),
]

fig = go.Figure()
for column_name, legend_label in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=summary["decade"],
            y=summary[column_name],
            mode="lines+markers",
            name=legend_label,
        )
    )

fig.update_layout(
    title="Raw vs Vote-Weighted Average Ratings by Decade",
    xaxis_title="Decade",
    yaxis_title="Average Rating",
)
fig.show()

In [6]:
# ---
# Cell 7: Tabular Comparison
# Show the five earliest decades as a plain-text table.
print(summary.sort_values(by="decade").head(n=5))

   decade  raw_mean  vote_weighted_mean
0    1900  3.467073            3.581167
1    1910  5.781108            5.848430
2    1920  5.994980            6.129121
3    1930  6.081790            6.146560
4    1940  6.189401            6.280788
