In [1]:
import os
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb

# Global plotting style: apply seaborn's "whitegrid" style for the figures drawn after this point.
sns.set(style="whitegrid")

In [2]:
# Fetch the IMDb dataset through kagglehub. `path` is the local directory
# that the following cells walk and read the TSV files from.
DATASET_HANDLE = "vivekananda99/imdb-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Dataset downloaded to:", path)

Dataset downloaded to: /kaggle/input/imdb-dataset


In [3]:
# Print the downloaded dataset as an indented directory tree.
for root, dirs, files in os.walk(path):
    # Depth comes from the path *relative* to the dataset root. The previous
    # root.replace(path, "") removed every occurrence of `path` inside `root`
    # and miscounted the depth when `path` ended with a separator.
    rel_root = os.path.relpath(root, path)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = " " * 4 * level
    # normpath drops a trailing separator so basename is never empty.
    print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
    sub_indent = " " * 4 * (level + 1)
    for f in files:
        print(f"{sub_indent}{f}")

imdb-dataset/
    title.basics.tsv
    title.episode.tsv
    title.principals.tsv
    title.ratings.tsv
    name.basics.tsv
    title.akas.tsv
    title.crew.tsv


In [4]:
import duckdb

In [5]:
# Location of the title.basics TSV inside the downloaded dataset
file_path = os.path.join(path, "title.basics.tsv")

# In-memory DuckDB connection; the later cells run their SQL through `con`
con = duckdb.connect(database=":memory:")

# Materialise only the rows whose titleType is 'tvSeries' and that have a
# primaryTitle. The file is read as tab-delimited with \N as the NULL marker.
create_tv_series_sql = f"""
    CREATE TABLE tv_series AS
    SELECT *
    FROM read_csv_auto('{file_path}', delim='\t', nullstr='\\N')
    WHERE titleType = 'tvSeries'
      AND primaryTitle IS NOT NULL
"""
con.execute(create_tv_series_sql)

# Report how many rows ended up in tv_series
(series_count,) = con.execute("SELECT COUNT(*) FROM tv_series").fetchone()
print("Total TV Series in dataset:", series_count)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Total TV Series in dataset: 289564


In [6]:
# Paths to episode and ratings files
episode_path = os.path.join(path, "title.episode.tsv")
ratings_path = os.path.join(path, "title.ratings.tsv")

# One row per rated episode: parent series id, season/episode numbers and the
# episode's average rating. Episodes without a parent, a season number or a
# rating are dropped.
# CREATE OR REPLACE keeps this cell re-runnable; a plain CREATE TABLE fails on
# the second execution because the table already exists.
# NOTE(review): this join is not restricted to the `tv_series` table built in
# the previous cell, so parents of any titleType are included -- confirm that
# this is intended.
con.execute(f"""
    CREATE OR REPLACE TABLE episode_ratings AS
    SELECT e.parentTconst AS series_id,
           e.seasonNumber::INTEGER AS seasonNumber,
           e.episodeNumber::INTEGER AS episodeNumber,
           r.averageRating::DOUBLE AS averageRating
    FROM read_csv_auto('{episode_path}', delim='\t', nullstr='\\N') e
    JOIN read_csv_auto('{ratings_path}', delim='\t', nullstr='\\N') r
    ON e.tconst = r.tconst
    WHERE e.parentTconst IS NOT NULL
      AND e.seasonNumber IS NOT NULL
      AND r.averageRating IS NOT NULL
""")

# Aggregate average rating and rated-episode count per season per series.
# NOTE(review): the Python name below holds whatever con.execute() returns for
# the DDL statement, not the aggregated rows; the data lives in the DuckDB
# table `season_avg_ratings`. The binding is kept only for compatibility.
season_avg_ratings = con.execute("""
    CREATE OR REPLACE TABLE season_avg_ratings AS
    SELECT series_id,
           seasonNumber,
           AVG(averageRating) AS season_avg_rating,
           COUNT(*) AS episodes_count
    FROM episode_ratings
    GROUP BY series_id, seasonNumber
    ORDER BY series_id, seasonNumber
""")

# Fetch a small preview for inspection; the explicit ORDER BY makes the
# preview independent of the table's physical row order.
df_season_ratings = con.execute("""
    SELECT *
    FROM season_avg_ratings
    ORDER BY series_id, seasonNumber
    LIMIT 10
""").fetchdf()
df_season_ratings

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,series_id,seasonNumber,season_avg_rating,episodes_count
0,tt0039123,3,7.9,2
1,tt0039123,4,5.8,1
2,tt0039123,7,6.266667,3
3,tt0039123,8,8.2,2
4,tt0039123,9,7.95,2
5,tt0039123,10,6.1,1
6,tt0039123,11,7.7,3
7,tt0039125,1,6.1,2
8,tt0040021,1,7.9,1
9,tt0040021,2,7.366667,3


In [8]:
def _scalar(sql):
    """Run ``sql`` on the notebook's DuckDB connection and return the first column of the first row."""
    return con.execute(sql).fetchone()[0]


# Distinct series present in the season-level table
total_series = _scalar("SELECT COUNT(DISTINCT series_id) FROM season_avg_ratings")

# Each row of season_avg_ratings is one (series, season) pair
total_seasons = _scalar("SELECT COUNT(*) FROM season_avg_ratings")

# Sum of the per-season episode counts
total_episodes = _scalar("SELECT SUM(episodes_count) FROM season_avg_ratings")

print(
    f"Total Series analyzed: {total_series}",
    f"Total Seasons analyzed: {total_seasons}",
    f"Total Episodes analyzed: {total_episodes}",
    sep="\n",
)


Total Series analyzed: 43465
Total Seasons analyzed: 80341
Total Episodes analyzed: 817324


In [9]:
# Seasons per series, counted from the rows of season_avg_ratings.
# NOTE(review): this counts the seasons that appear in season_avg_ratings
# (i.e. that have at least one rated episode), not the highest seasonNumber --
# the preview above shows tt0039123 with seasons 3, 4, 7, ... only. Confirm
# this is the intended definition of "series length".
series_lengths = con.execute("""
    SELECT series_id,
           COUNT(*) AS total_seasons
    FROM season_avg_ratings
    GROUP BY series_id
""").fetchdf()

# Season-level rows annotated with their series' season count.
# A window function computes the same per-series COUNT(*) in place, so the
# aggregation is no longer duplicated in a self-join subquery. The explicit
# ORDER BY makes the row order of `df` deterministic.
df = con.execute("""
    SELECT series_id,
           seasonNumber,
           season_avg_rating,
           episodes_count,
           COUNT(*) OVER (PARTITION BY series_id) AS total_seasons
    FROM season_avg_ratings
    ORDER BY series_id, seasonNumber
""").fetchdf()

# Stratify series by length
def length_category(n):
    """Return the length-bucket label for a series with ``n`` seasons.

    ``n <= 2`` maps to "Short (1-2)", ``n <= 5`` to "Medium (3-5)",
    and everything else to "Long (6+)".
    """
    # Inclusive upper bounds, checked in ascending order; first match wins.
    for upper_bound, label in ((2, "Short (1-2)"), (5, "Medium (3-5)")):
        if n <= upper_bound:
            return label
    return "Long (6+)"

# Label every season row with the length bucket of its parent series
df["series_length_category"] = df["total_seasons"].map(length_category)

# Inspect the stratification: distinct series per bucket
series_per_category = df.groupby("series_length_category")["series_id"].nunique()
series_per_category

series_length_category
Long (6+)        2039
Medium (3-5)     4611
Short (1-2)     36815
Name: series_id, dtype: int64