In [None]:
# 04_rating_distribution_by_decade.ipynb
# Project: The Golden Age Myth – IMDb Analysis
# Author: Prateek Chandra

# This notebook analyzes the distribution of ratings over decades for Movies and TV Shows
# using interactive Plotly visualizations.

In [8]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
from pathlib import Path

# Use the white Plotly theme as the default for figures created after this point
pio.templates.default = "plotly_white"

# -----------------------------
# Read the processed feature tables
# -----------------------------
DATA_DIR = Path("../data/processed")
movies, tv = (
    pd.read_csv(DATA_DIR / filename)
    for filename in ("movies_features.csv", "tv_features.csv")
)

# -----------------------------
# Cleaning & type safety
# -----------------------------
# Both tables are cleaned in place: decade and rating are coerced to numbers
# (unparseable values become NaN), rows missing either value are dropped, and
# the decade is finally stored as an integer.
for frame in (movies, tv):
    decade_numeric = pd.to_numeric(frame["decade"], errors="coerce")
    # "col_decade" is a numeric copy of the decade; no cell shown here reads it.
    frame["col_decade"] = decade_numeric
    frame["decade"] = decade_numeric
    frame["averageRating"] = pd.to_numeric(frame["averageRating"], errors="coerce")
    frame.dropna(subset=["decade", "averageRating"], inplace=True)
    frame["decade"] = frame["decade"].astype(int)

# Minimum number of titles a decade must have
MIN_TITLES = 20

# Quick sanity check of the cleaned movie table
print("Total movies:", movies.shape)
# .tolist() converts the NumPy integers to plain Python ints so the list
# prints as [1900, 1910, ...] rather than [np.int64(1900), np.int64(1910), ...]
print("Unique decades:", sorted(movies["decade"].unique().tolist()))
print("\nMovies per decade:")
print(movies["decade"].value_counts().sort_index().head(15))

Total movies: (394205, 13)
Unique decades: [np.int64(1900), np.int64(1910), np.int64(1920), np.int64(1930), np.int64(1940), np.int64(1950), np.int64(1960), np.int64(1970), np.int64(1980), np.int64(1990), np.int64(2000), np.int64(2010), np.int64(2020)]

Movies per decade:
decade
1900       164
1910      2112
1920      4024
1930      9385
1940      9284
1950     13631
1960     20205
1970     27716
1980     32045
1990     35769
2000     62853
2010    109054
2020     67963
Name: count, dtype: int64


In [9]:
# -----------------------------
# MOVIES: Interactive Boxplot
# -----------------------------
# Keep only decades that have at least MIN_TITLES movies
decade_counts = movies["decade"].value_counts()
valid_decades = decade_counts.loc[lambda counts: counts >= MIN_TITLES].index
movies_filtered = movies.loc[movies["decade"].isin(valid_decades)]

# One box per decade, coloured by decade; the legend is switched off
fig = px.box(
    movies_filtered,
    x="decade",
    y="averageRating",
    color="decade",
    labels={"decade": "Decade", "averageRating": "Average Rating"},
    title="Movie Rating Distribution by Decade",
)
fig.update_layout(showlegend=False)
fig.show()

In [10]:
import pandas as pd
import plotly.express as px

# Weighted average function
def weighted_avg(df, value_col, weight_col):
    """Return the weighted mean of ``df[value_col]`` using ``df[weight_col]`` as weights.

    Rows where either the value or the weight is missing are left out of both
    the numerator and the denominator, so a missing value cannot dilute the
    result through its weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the value and weight columns.
    value_col : str
        Name of the column to average.
    weight_col : str
        Name of the column providing the weights.

    Returns
    -------
    float
        The weighted mean, or NaN when the usable weights sum to zero
        (including the case of an empty table).
    """
    values = df[value_col]
    weights = df[weight_col]

    # Use only rows where both pieces of information are present
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()

    # Avoid dividing by zero when there is nothing to weight by
    if total_weight == 0:
        return float("nan")

    return (values[usable] * weights[usable]).sum() / total_weight

# Compute weighted mean rating per decade.
# Only the two columns the helper reads are selected before .apply, so the
# function is never applied to the "decade" grouping column. Applying a
# function to the grouping columns is deprecated in recent pandas releases.
weighted_decade = (
    movies_filtered
    .groupby("decade")[["averageRating", "numVotes"]]
    .apply(lambda x: weighted_avg(x, "averageRating", "numVotes"))
    .reset_index(name="weightedRating")
)

# Plot the vote-weighted rating per decade as a line with point markers
fig = px.line(
    weighted_decade,
    x="decade",
    y="weightedRating",
    labels={"weightedRating": "Vote-Weighted Rating", "decade": "Decade"},
    title="Vote-Weighted Average Movie Rating by Decade",
    markers=True,
)
fig.show()


In [11]:
# -----------------------------
# TV: Interactive Boxplot
# -----------------------------
# Keep only decades that have at least MIN_TITLES TV titles
decade_counts_tv = tv["decade"].value_counts()
valid_decades_tv = decade_counts_tv.loc[lambda counts: counts >= MIN_TITLES].index
tv_filtered = tv.loc[tv["decade"].isin(valid_decades_tv)]

# One box per decade, coloured by decade; the legend is switched off
fig = px.box(
    tv_filtered,
    x="decade",
    y="averageRating",
    color="decade",
    labels={"decade": "Decade", "averageRating": "Average Rating"},
    title="TV Rating Distribution by Decade",
)
fig.update_layout(showlegend=False)
fig.show()

In [12]:
# -----------------------------
# Median Trend: Movies vs TV
# -----------------------------
# The medians are computed on the MIN_TITLES-filtered frames built for the
# box plots, so decades with too few titles are left out here as well.
movie_median = movies_filtered.groupby("decade")["averageRating"].median().reset_index()
movie_median["Type"] = "Movies"

tv_median = tv_filtered.groupby("decade")["averageRating"].median().reset_index()
tv_median["Type"] = "TV"

# ignore_index gives the stacked table a clean 0..n-1 index rather than
# repeating each input's own row labels
combined_median = pd.concat([movie_median, tv_median], ignore_index=True)

# One line per title type, with a marker at each decade
fig = px.line(combined_median,
              x="decade",
              y="averageRating",
              color="Type",
              markers=True,
              title="Median Ratings by Decade: Movies vs TV",
              labels={"decade": "Decade", "averageRating": "Median Rating"})
fig.show()