In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Local cache files for the two datasets; the loading calls below read these
# when they exist instead of downloading again.
films_path = Path("films.csv.gz")
genres_path = Path("genres.csv.gz")


def read_cached_csv(fp: Path, link: str) -> pd.DataFrame:
    """Load a CSV from a local cache, downloading it first if necessary.

    Parameters
    ----------
    fp:
        Location of the cached copy. When it already exists it is read and
        returned, and ``link`` is never touched.
    link:
        URL or path handed to ``pd.read_csv`` when there is no cached copy.

    Returns
    -------
    pd.DataFrame
        The parsed table, read either from ``fp`` or from ``link``.
    """
    if fp.exists():
        return pd.read_csv(fp)

    df = pd.read_csv(link)

    # Make sure the cache directory exists so the first run does not fail.
    fp.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling file and rename it into place, so an interrupted
    # write never leaves a truncated cache that later runs would trust.
    # The temporary name keeps fp's suffix so pandas infers the same compression.
    partial = fp.with_name(f".partial-{fp.name}")
    try:
        df.to_csv(partial, index=False)
        partial.replace(fp)
    finally:
        if partial.exists():
            partial.unlink()
    return df


# The first run downloads each dataset; later runs read the local cache.
# NOTE(review): both links use plain http — consider https if the host supports it.
films = read_cached_csv(
    fp=films_path,
    link="http://tmp-borza-public-cyx.s3.amazonaws.com/p26-films.csv.gz",
)
genres = read_cached_csv(
    fp=genres_path,
    link="http://tmp-borza-public-cyx.s3.amazonaws.com/p26-genres.csv.gz",
)

films.head()

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
genres.sample(10, random_state=0)

In [None]:
# Left join: every row of `films` is kept, repeated once per matching row in
# `genres`; columns coming from `genres` are NaN where nothing matches.
df = films.merge(genres, on="imdb_id", how="left")

# Fixed seed keeps the preview stable across reruns.
df.sample(10, random_state=0)

In [None]:
# A

# One chained expression: aggregate per genre, keep genres with at least
# five counted films, then rank by average rating (best first).
top_genres_a = (
    df.groupby("genre")
    .agg(avg_rating=("rating", "mean"), films=("imdb_id", "count"))
    .loc[lambda stats: stats["films"].ge(5)]
    .sort_values(by="avg_rating", ascending=False)
)

top_genres_a.head()

In [None]:
# B

# Step-by-step version with a named intermediate for every stage.
grouped = df.groupby("genre")

aggregated = grouped.agg(avg_rating=("rating", "mean"), films=("imdb_id", "count"))

# Keep genres with at least five counted films.
filtered = aggregated.loc[aggregated["films"].ge(5)]

top_genres_b = filtered.sort_values("avg_rating", ascending=False)

top_genres_b.head()

In [None]:
# Films with at least 10,000 ratings and a runtime of 80-180 minutes (inclusive).
popular = films.loc[
    films["rating_count"].ge(10_000) & films["length_minutes"].between(80, 180)
]

avg_rating_by_year = popular.groupby("year")["rating"].mean()

# The identical filter is written out a second time for the median.
popular_again = films.loc[
    films["rating_count"].ge(10_000) & films["length_minutes"].between(80, 180)
]

median_rating_by_year = popular_again.groupby("year")["rating"].median()

avg_rating_by_year.head()

In [None]:
def filter_popular_reasonable_length(
    df: pd.DataFrame,
    min_rating_count: int = 10_000,
    min_length_minutes: int = 80,
    max_length_minutes: int = 180,
) -> pd.DataFrame:
    """Keep films that are widely rated and have a typical feature runtime.

    Parameters
    ----------
    df:
        Frame with ``rating_count`` and ``length_minutes`` columns.
    min_rating_count:
        Smallest ``rating_count`` a film may have and still be kept.
    min_length_minutes, max_length_minutes:
        Inclusive bounds on ``length_minutes``.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that satisfy both conditions; ``df`` is not modified.
    """
    is_popular = df["rating_count"] >= min_rating_count
    # Series.between includes both endpoints by default.
    has_reasonable_length = df["length_minutes"].between(
        min_length_minutes, max_length_minutes
    )
    return df[is_popular & has_reasonable_length]


popular = films.pipe(filter_popular_reasonable_length)

# Group once and derive both yearly statistics from the same grouping.
rating_by_year = popular.groupby("year")["rating"]
avg_rating_by_year = rating_by_year.mean()
median_rating_by_year = rating_by_year.median()

avg_rating_by_year.head()

In [None]:
def add_decade_column_mutating(df: pd.DataFrame) -> None:
    """Add a ``decade`` column to ``df`` in place.

    The decade is ``year`` rounded down to a multiple of ten. The caller's
    frame is modified and nothing is returned.
    """
    decade_start = df["year"].floordiv(10).mul(10)
    df["decade"] = decade_start


# Work on a copy so the in-place helper leaves `films` itself untouched.
films_copy = films.copy(deep=True)
add_decade_column_mutating(df=films_copy)

films_copy.head()

In [None]:
def add_decade_column(
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Return a new frame with a ``decade`` column added.

    The decade is ``year`` rounded down to a multiple of ten. The input
    frame is not modified.
    """
    return df.assign(decade=lambda frame: frame["year"] // 10 * 10)


# Pure variant: `films` is left unchanged and a new frame is returned.
films_with_decade = films.pipe(add_decade_column)

films_with_decade.head()

In [None]:
def compute_weighted_rating(
    df: pd.DataFrame,
    min_votes_quantile: float = 0.75,
) -> pd.DataFrame:
    """Add a ``weighted_rating`` column that shrinks ratings toward the mean.

    For each row, with ``v = rating_count`` and ``R = rating``::

        weighted_rating = v / (v + m) * R + m / (v + m) * C

    where ``C`` is the mean ``rating`` of ``df`` and ``m`` is the
    ``min_votes_quantile`` quantile of ``rating_count``. Rows with few votes
    are pulled toward ``C``; rows with many votes stay close to ``R``.

    Parameters
    ----------
    df:
        Frame with ``rating`` and ``rating_count`` columns.
    min_votes_quantile:
        Quantile of ``rating_count`` used as the vote threshold ``m``.

    Returns
    -------
    pd.DataFrame
        A new frame with the extra column; ``df`` is not modified. Both ``C``
        and ``m`` are computed from the rows of ``df`` as passed in.
    """
    mean_rating = df["rating"].mean()
    vote_threshold = df["rating_count"].quantile(min_votes_quantile)

    votes = df["rating_count"]
    total_weight = votes + vote_threshold

    return df.assign(
        weighted_rating=(
            (votes / total_weight) * df["rating"]
            + (vote_threshold / total_weight) * mean_rating
        )
    )


# Chain the three helpers with .pipe. The filter runs first, so the mean and
# the vote threshold used for the weighted rating come from the filtered films.
analysis_df = (
    films
    .pipe(filter_popular_reasonable_length)
    .pipe(add_decade_column)
    .pipe(compute_weighted_rating)
)

analysis_df.head()

In [None]:
# Python-level iteration over the groups, one mean per decade.
avg_by_decade_a = {
    decade: group["weighted_rating"].mean()
    for decade, group in analysis_df.groupby("decade")
}

avg_by_decade_a

In [None]:
# Vectorised version: pandas computes every group mean in a single call.
avg_by_decade_b = (
    analysis_df.groupby("decade")["weighted_rating"]
    .mean()
    .to_dict()
)

avg_by_decade_b

In [None]:
def average_rating_per_genre(
    df: pd.DataFrame,
) -> pd.Series:
    """Return the mean ``rating`` for each ``genre``.

    The result is a Series named ``rating`` and indexed by ``genre``.
    """
    ratings_by_genre = df.groupby("genre")["rating"]
    return ratings_by_genre.mean()


# `df` is the films-with-genres frame produced by the merge above.
avg_genre = df.pipe(average_rating_per_genre)

avg_genre.head()

In [None]:
topn = 16

rolling = (
    df.groupby(["genre", "year"])
    # From here on `imdb_id` holds the count of non-null ids per (genre, year).
    .agg({"rating": "mean", "metacritic": "mean", "imdb_id": "count"})
    .reset_index()
    .loc[lambda yearly: yearly["year"].gt(1970)]
    # Keep only the `topn` genres with the largest total count after 1970.
    .loc[
        lambda yearly: yearly["genre"].isin(
            yearly.groupby("genre")["imdb_id"].sum().sort_values().tail(topn).index
        )
    ]
    .sort_values(by=["genre", "year"])
    .set_index("year")
    .groupby("genre")[["rating", "metacritic", "imdb_id"]]
    # The window is 8 consecutive rows per genre, which is only 8 calendar
    # years when a genre has a row for every year.
    .rolling(window=8, min_periods=1)
    .mean()
    .reset_index()
    # Rows from 1971-1980 feed the rolling window but are not kept.
    .loc[lambda smoothed: smoothed["year"].gt(1980)]
)

rolling.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def plot_genre_buckets(rolling: pd.DataFrame, n_rows: int = 3) -> None:
    """Plot per-genre rating and metacritic trends on a grid of panels.

    Genres are ordered by their mean ``imdb_id`` value in ``rolling`` (highest
    first) and split into ``n_rows`` buckets. Each bucket gets one row of the
    figure: ``rating`` over ``year`` on the left and ``metacritic`` over
    ``year`` on the right, one line per genre.

    Parameters
    ----------
    rolling:
        Frame with ``genre``, ``year``, ``rating``, ``metacritic`` and
        ``imdb_id`` columns.
    n_rows:
        Number of panel rows, i.e. the number of genre buckets.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    popularity = rolling.groupby("genre")["imdb_id"].mean().sort_values(ascending=False)

    genres_sorted = popularity.index.tolist()
    buckets = np.array_split(genres_sorted, n_rows)
    # squeeze=False always yields a 2-D (n_rows, 2) array of axes, so a
    # single-row figure needs no special handling.
    fig, axes = plt.subplots(
        n_rows,
        2,
        figsize=(18, 5 * n_rows),
        sharex=True,
        sharey="col",
        squeeze=False,
    )

    # Column titles go on the top row only, since each column shares one y axis.
    axes[0, 0].set_title("Rating by year")
    axes[0, 1].set_title("Metacritic by year")

    for (ax_left, ax_right), bucket in zip(axes, buckets):
        subset = rolling[rolling["genre"].isin(bucket)]
        # Both panels of a row draw the same genres, so only the right-hand
        # panel carries the legend.
        sns.lineplot(
            data=subset,
            x="year",
            y="rating",
            hue="genre",
            ax=ax_left,
            legend=False,
        )
        sns.lineplot(
            data=subset,
            x="year",
            y="metacritic",
            hue="genre",
            ax=ax_right,
        )

    fig.tight_layout()
    plt.show()


# Four rows of panels; the genres present in `rolling` are split across them.
plot_genre_buckets(rolling, n_rows=4)