In [1]:
import os
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb

sns.set(style="whitegrid")

In [2]:
path = kagglehub.dataset_download("vivekananda99/imdb-dataset")
print("Dataset downloaded to:", path)

Mounting files to /kaggle/input/imdb-dataset...
Dataset downloaded to: /kaggle/input/imdb-dataset


In [3]:
for root, dirs, files in os.walk(path):
    level = root.replace(path, "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = " " * 4 * (level + 1)
    for f in files:
        print(f"{sub_indent}{f}")

imdb-dataset/
    title.basics.tsv
    title.episode.tsv
    title.principals.tsv
    title.ratings.tsv
    name.basics.tsv
    title.akas.tsv
    title.crew.tsv


In [6]:
con = duckdb.connect(database='imdb.duckdb')

# Construct file paths properly
basics_path = os.path.join(path, "title.basics.tsv")
ratings_path = os.path.join(path, "title.ratings.tsv")
crew_path = os.path.join(path, "title.crew.tsv")
principals_path = os.path.join(path, "title.principals.tsv")

# Create DuckDB views
con.execute(f"""
    CREATE OR REPLACE VIEW basics AS 
    SELECT * FROM read_csv_auto(
        '{basics_path}',
        delim='\t',
        nullstr='\\N',
        header=True
    );

    CREATE OR REPLACE VIEW ratings AS 
    SELECT * FROM read_csv_auto(
        '{ratings_path}',
        delim='\t',
        nullstr='\\N',
        header=True
    );

    CREATE OR REPLACE VIEW crew AS 
    SELECT * FROM read_csv_auto(
        '{crew_path}',
        delim='\t',
        nullstr='\\N',
        header=True
    );

    CREATE OR REPLACE VIEW principals AS 
    SELECT * FROM read_csv_auto(
        '{principals_path}',
        delim='\t',
        nullstr='\\N',
        header=True
    );
""")

print("Loaded basics, ratings, crew, principals into DuckDB views successfully.")

Loaded basics, ratings, crew, principals into DuckDB views successfully.


In [7]:
print("showing basics sample")
display(con.execute("SELECT * FROM basics LIMIT 5").df())

print("showing ratings sample")
display(con.execute("SELECT * FROM ratings LIMIT 5").df())

print("showing crew sample")
display(con.execute("SELECT * FROM crew LIMIT 5").df())

print("showing principals sample")
display(con.execute("SELECT * FROM principals LIMIT 5").df())

showing basics sample


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,Short


showing ratings sample


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2183
1,tt0000002,5.5,302
2,tt0000003,6.4,2260
3,tt0000004,5.2,194
4,tt0000005,6.2,2999


showing crew sample


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


showing principals sample


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0005690,producer,producer,
3,tt0000001,4,nm0374658,cinematographer,director of photography,
4,tt0000002,1,nm0721526,director,,


In [21]:
query = """
CREATE OR REPLACE TABLE homepage_master AS
WITH actor_agg AS (
    SELECT 
        tconst,
        string_agg(nconst, ',') AS actors
    FROM principals
    WHERE category IN ('actor', 'actress')
    GROUP BY tconst
),
crew_clean AS (
    SELECT tconst, directors
    FROM crew
    WHERE directors IS NOT NULL AND directors != ''
)
SELECT
    b.tconst,
    b.titleType,
    b.startYear,
    b.genres,
    r.averageRating,
    r.numVotes,
    c.directors,
    a.actors
FROM basics b
INNER JOIN ratings r ON r.tconst = b.tconst
INNER JOIN crew_clean c ON c.tconst = b.tconst
INNER JOIN actor_agg a ON a.tconst = b.tconst
WHERE
    b.titleType IS NOT NULL
    AND b.startYear IS NOT NULL
    AND b.genres IS NOT NULL
    AND r.averageRating IS NOT NULL
    AND r.numVotes IS NOT NULL;
"""

con.execute(query)

print("homepage_master created with directors and actors")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

homepage_master created with directors and actors


In [22]:
# homepage_master overview
print(con.execute("SELECT COUNT(*) AS rows FROM homepage_master").df())

      rows
0  1125644


In [23]:
# homepage_master sample
display(con.execute("SELECT * FROM homepage_master LIMIT 5").df())

Unnamed: 0,tconst,titleType,startYear,genres,averageRating,numVotes,directors,actors
0,tt0042688,movie,1950,"Comedy,Crime,Film-Noir",6.3,202,nm0355284,"nm0007222,nm0676492,nm0003110,nm0943978,nm0204..."
1,tt0042692,movie,1950,"Drama,Music,Romance",7.1,4213,"nm0000019,nm0490444","nm0208375,nm0215744,nm0556399,nm0457839,nm0535..."
2,tt0042951,movie,1950,Drama,6.0,126,nm0758508,"nm0338901,nm0181305,nm0350486,nm0001931,nm0325..."
3,tt0043020,movie,1951,Drama,5.9,52,nm0883334,"nm0231728,nm0133668,nm0082121,nm0734624,nm0033..."
4,tt0043132,movie,1950,"Crime,Drama,Film-Noir",7.5,10875,nm0695937,"nm0000763,nm0000074,nm0581282,nm0293099,nm0876..."


In [24]:
print(con.execute("""
SELECT
    SUM(CASE WHEN titleType IS NULL THEN 1 END) AS missing_titleType,
    SUM(CASE WHEN startYear IS NULL THEN 1 END) AS missing_startYear,
    SUM(CASE WHEN genres IS NULL THEN 1 END) AS missing_genres,
    SUM(CASE WHEN averageRating IS NULL THEN 1 END) AS missing_rating,
    SUM(CASE WHEN numVotes IS NULL THEN 1 END) AS missing_votes,
    SUM(CASE WHEN directors IS NULL THEN 1 END) AS missing_directors,
    SUM(CASE WHEN tconst IS NULL THEN 1 END) AS missing_nconst
FROM homepage_master
""").df())

   missing_titleType  missing_startYear  missing_genres  missing_rating  \
0                NaN                NaN             NaN             NaN   

   missing_votes  missing_directors  missing_nconst  
0            NaN                NaN             NaN  


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [25]:
con.execute("""
COPY homepage_master 
TO 'homepage_master.csv' 
(FORMAT 'csv', HEADER TRUE)
""")

print("homepage_master.csv saved")

homepage_master.csv saved
