In [1]:
import os
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb

# Use seaborn's "whitegrid" style for plotting.
sns.set(style="whitegrid")

In [2]:
# Download the "vivekananda99/imdb-dataset" dataset via kagglehub; the returned
# location is kept in `path` and used by later cells to build file paths.
path = kagglehub.dataset_download("vivekananda99/imdb-dataset")
print("Dataset downloaded to:", path)

Dataset downloaded to: /kaggle/input/imdb-dataset


In [3]:
# Print the contents of the downloaded dataset as an indented tree:
# each directory name is followed by "/", and its files are listed one
# indentation step (4 spaces) deeper.
for current_dir, _subdirs, filenames in os.walk(path):
    # Depth = number of path separators left after stripping the dataset root.
    depth = current_dir.replace(path, "").count(os.sep)
    print(" " * 4 * depth + os.path.basename(current_dir) + "/")
    file_prefix = " " * 4 * (depth + 1)
    for filename in filenames:
        print(file_prefix + filename)

imdb-dataset/
    title.basics.tsv
    title.episode.tsv
    title.principals.tsv
    title.ratings.tsv
    name.basics.tsv
    title.akas.tsv
    title.crew.tsv


In [4]:
# Build the `movies_filtered` table in a fresh in-memory DuckDB database.
# The TSV is read with tab as delimiter and "\N" treated as NULL; only rows
# with titleType 'movie', a startYear of 1995 or later, and a non-empty
# genres value are kept.
file_path = f"{path}/title.basics.tsv"
con = duckdb.connect(database=':memory:')

create_movies_filtered_sql = f"""
    CREATE TABLE movies_filtered AS
    SELECT *
    FROM read_csv_auto('{file_path}', delim='\t', nullstr='\\N')
    WHERE titleType = 'movie'
      AND startYear IS NOT NULL
      AND CAST(startYear AS INTEGER) >= 1995
      AND genres IS NOT NULL
      AND genres != ''
"""
con.execute(create_movies_filtered_sql)

# Number of rows that survived the filter; shown as the cell output.
(filtered_count,) = con.execute("SELECT COUNT(*) FROM movies_filtered").fetchone()
filtered_count

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

366304

In [5]:
# Explode the comma-separated `genres` column so that each (movie, genre)
# pair becomes its own row in `movies_expanded`.
#
# CREATE OR REPLACE keeps this cell idempotent: re-running it on the same
# connection rebuilds the table instead of failing because it already exists.
con.execute("""
    CREATE OR REPLACE TABLE movies_expanded AS
    SELECT
        tconst,
        CAST(startYear AS INTEGER) AS startYear,
        TRIM(genre) AS genre
    FROM movies_filtered
    LEFT JOIN UNNEST(string_split(genres, ',')) AS g(genre)
    ON TRUE
""")

# Number of (movie, genre) rows after the expansion; shown as the cell output.
expanded_count = con.execute("SELECT COUNT(*) FROM movies_expanded").fetchone()[0]
expanded_count

562530

In [6]:
# Count rows per genre in `movies_expanded`, most frequent genre first.
genre_count_sql = """
    SELECT genre, COUNT(*) AS cnt
    FROM movies_expanded
    GROUP BY genre
    ORDER BY cnt DESC
"""
genre_counts = con.execute(genre_count_sql).fetchdf()

# Allocate a per-genre sample quota proportional to each genre's share of
# all rows. The quota is truncated to an integer, so the quotas can sum to
# slightly less than `target_total`.
target_total = 100_000
genre_share = genre_counts["cnt"] / genre_counts["cnt"].sum()
genre_counts["sample_size"] = (genre_share * target_total).astype(int)

genre_counts

Unnamed: 0,genre,cnt,sample_size
0,Drama,133957,23813
1,Documentary,116498,20709
2,Comedy,65223,11594
3,Thriller,29212,5192
4,Action,28867,5131
5,Horror,26719,4749
6,Romance,26457,4703
7,Crime,19426,3453
8,Adventure,14213,2526
9,Biography,13796,2452


In [7]:
# Draw a stratified random sample: for every genre keep at most
# `sample_size` randomly chosen rows of `movies_expanded`.
#
# The per-genre quotas are exposed to DuckDB as a view and joined in, rather
# than building one sub-query per genre with the genre name pasted into the
# SQL text. This avoids quoting problems with data-derived strings and runs
# as a single query.
con.register("genre_sample_sizes", genre_counts[["genre", "sample_size"]])

# Seed DuckDB's RANDOM() so the draw can be repeated on re-run.
# NOTE(review): repeatability may still depend on DuckDB's thread scheduling;
# confirm against the DuckDB documentation if exact reproducibility matters.
SAMPLE_SEED = 0.42
con.execute("SELECT setseed(?)", [SAMPLE_SEED])

df_sampled = con.execute("""
    SELECT tconst, startYear, genre, rn
    FROM (
        SELECT
            m.tconst,
            m.startYear,
            m.genre,
            q.sample_size,
            ROW_NUMBER() OVER (PARTITION BY m.genre ORDER BY RANDOM()) AS rn
        FROM movies_expanded AS m
        JOIN genre_sample_sizes AS q
          ON m.genre = q.genre
    )
    WHERE rn <= sample_size
""").fetchdf()

# (rows, columns) of the sample; columns are tconst, startYear, genre, rn.
df_sampled.shape

(99985, 4)

In [8]:
# Attach ratings to the sampled (movie, genre) rows. The inner join drops
# sampled titles that have no row in title.ratings.tsv, and the WHERE clause
# drops rows whose rating or vote count is NULL.
ratings_path = f"{path}/title.ratings.tsv"

# Expose the sampled DataFrame to DuckDB under the name `sampled_movies`.
con.register("sampled_movies", df_sampled)

# CREATE OR REPLACE keeps this cell idempotent: re-running it on the same
# connection rebuilds the table instead of failing because it already exists.
con.execute(f"""
    CREATE OR REPLACE TABLE sampled_ratings AS
    SELECT
        s.tconst,
        s.startYear,
        s.genre,
        r.averageRating,
        r.numVotes
    FROM sampled_movies s
    JOIN read_csv_auto('{ratings_path}', delim='\t', nullstr='\\N') r
    USING (tconst)
    WHERE r.averageRating IS NOT NULL
      AND r.numVotes IS NOT NULL
""")

# Number of sampled rows that have a rating; shown as the cell output.
final_count = con.execute("SELECT COUNT(*) FROM sampled_ratings").fetchone()[0]
final_count

65037

In [9]:
# Pull the rated sample into pandas with explicit numeric types, then compute
# a few basic data-quality figures for display.
final_select_sql = """
    SELECT 
        tconst,
        startYear,
        genre,
        CAST(averageRating AS DOUBLE) AS averageRating,
        CAST(numVotes AS INTEGER) AS numVotes
    FROM sampled_ratings
"""
final_df = con.execute(final_select_sql).fetchdf()

summary = final_df.describe(include='all')      # per-column descriptive stats
duplicate_rows = final_df.duplicated().sum()    # fully identical rows
missing_values = final_df.isna().sum()          # NULL count per column
row_count = final_df.shape[0]                   # total number of rows

(row_count, missing_values, duplicate_rows, summary)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


(65037,
 tconst           0
 startYear        0
 genre            0
 averageRating    0
 numVotes         0
 dtype: int64,
 0,
            tconst     startYear  genre  averageRating      numVotes
 count       65037  65037.000000  65037   65037.000000  6.503700e+04
 unique      58829           NaN     26            NaN           NaN
 top     tt3511812           NaN  Drama            NaN           NaN
 freq            3           NaN  16081            NaN           NaN
 mean          NaN   2013.445593    NaN       6.145351  7.279073e+03
 std           NaN      7.762885    NaN       1.454814  5.289774e+04
 min           NaN   1995.000000    NaN       1.000000  5.000000e+00
 25%           NaN   2008.000000    NaN       5.200000  2.600000e+01
 50%           NaN   2015.000000    NaN       6.300000  1.140000e+02
 75%           NaN   2020.000000    NaN       7.100000  6.910000e+02
 max           NaN   2025.000000    NaN      10.000000  2.528612e+06)

In [10]:
# Write the final sampled, rated frame to a CSV file in the working directory,
# without the DataFrame index column.
final_df.to_csv("genre_movies_sampled.csv", index=False)