In [2]:
import os
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb

sns.set(style="whitegrid")

In [3]:
path = kagglehub.dataset_download("vivekananda99/imdb-dataset")
print("Dataset downloaded to:", path)

Dataset downloaded to: /kaggle/input/imdb-dataset


In [4]:
for root, dirs, files in os.walk(path):
    level = root.replace(path, "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = " " * 4 * (level + 1)
    for f in files:
        print(f"{sub_indent}{f}")

imdb-dataset/
    title.basics.tsv
    title.episode.tsv
    title.principals.tsv
    title.ratings.tsv
    name.basics.tsv
    title.akas.tsv
    title.crew.tsv


In [5]:
import duckdb
import os

# Connect to DuckDB (persistent file)
con = duckdb.connect("imdb.duckdb")

# Base path where TSV files were downloaded
base_path = path  

# Load each TSV into DuckDB as a table
con.execute(f"""
CREATE OR REPLACE TABLE title_basics AS 
SELECT * FROM read_csv_auto('{os.path.join(base_path, "title.basics.tsv")}', delim='\t', nullstr='\\N');
""")

con.execute(f"""
CREATE OR REPLACE TABLE title_ratings AS 
SELECT * FROM read_csv_auto('{os.path.join(base_path, "title.ratings.tsv")}', delim='\t', nullstr='\\N');
""")

con.execute(f"""
CREATE OR REPLACE TABLE title_crew AS 
SELECT * FROM read_csv_auto('{os.path.join(base_path, "title.crew.tsv")}', delim='\t', nullstr='\\N');
""")

con.execute(f"""
CREATE OR REPLACE TABLE name_basics AS 
SELECT * FROM read_csv_auto('{os.path.join(base_path, "name.basics.tsv")}', delim='\t', nullstr='\\N');
""")

print("Tables created successfully!")

# Show table names to confirm
con.execute("SHOW TABLES;").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Tables created successfully!


Unnamed: 0,name
0,name_basics
1,title_basics
2,title_crew
3,title_ratings


In [6]:
# Create the merged table with director, movie, rating, numVotes, runtime
con.execute("""
CREATE OR REPLACE TABLE director_movies AS
SELECT
    b.tconst,
    b.primaryTitle,
    r.averageRating,
    r.numVotes,
    b.runtimeMinutes,
    n.primaryName AS director,
    d.directorId
FROM title_basics b
JOIN title_ratings r 
    ON b.tconst = r.tconst
JOIN (
    SELECT 
        tconst,
        unnest(string_split(directors, ',')) AS directorId
    FROM title_crew
) d ON b.tconst = d.tconst
JOIN name_basics n 
    ON d.directorId = n.nconst
WHERE b.titleType = 'movie'
""")

print("director_movies table created successfully!")

# quick preview
con.execute("SELECT * FROM director_movies LIMIT 10;").df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

director_movies table created successfully!


Unnamed: 0,tconst,primaryTitle,averageRating,numVotes,runtimeMinutes,director,directorId
0,tt0255059,La ciudad es nuestra,6.1,9,56,Tino Calabuig,nm0129255
1,tt1414111,The Tiger Next Door,7.0,29,86,Camilla Calamandrei,nm0129280
2,tt0151650,Movie Madness,7.6,17,80,Steve Sayre,nm0129286
3,tt0197375,Core mio,6.6,10,90,Stefano Calanchi,nm0129306
4,tt2012665,Repentance,4.9,2920,90,Philippe Caland,nm0129310
5,tt0039364,Il fabbro del convento,5.3,11,88,Max Calandri,nm0129327
6,tt0175289,Vacanta la mare,6.6,49,75,Andrei Calarasu,nm0129340
7,tt0424242,Longo Caminho da Morte,6.6,27,85,Júlio Calasso,nm0129350
8,tt0074844,Makarras Conexion,3.4,11,88,Francisco Calatrava,nm0129364
9,tt0074844,Makarras Conexion,3.4,11,88,Manuel Calatrava,nm0129365


In [7]:
con.execute("""
COPY director_movies
TO 'director_movies.csv'
WITH (HEADER, DELIMITER ',');
""")

print("CSV exported successfully → director_movies.csv")

CSV exported successfully → director_movies.csv


In [8]:
con.execute("SELECT COUNT(*) AS total_rows FROM director_movies;").df()

Unnamed: 0,total_rows
0,380614


In [9]:
# Create filtered table: directors with >=3 movies, runtime removed
con.execute("""
    CREATE OR REPLACE TABLE director_movies_filtered AS
    WITH movie_counts AS (
        SELECT director, COUNT(*) AS movie_count
        FROM director_movies
        GROUP BY director
    )
    SELECT 
        d.director,
        d.primaryTitle,
        d.averageRating,
        d.numVotes
    FROM director_movies d
    JOIN movie_counts mc
        ON d.director = mc.director
    WHERE mc.movie_count >= 3;
""")

# Export to CSV
con.execute("""
COPY director_movies_filtered
TO 'director_movies_filter.csv'
WITH (HEADER, DELIMITER ',');
""")

print("CSV exported successfully → director_movies_filtered.csv")

# Row count
con.execute("SELECT COUNT(*) AS total_rows FROM director_movies_filtered;").df()

CSV exported successfully → director_movies_filtered.csv


Unnamed: 0,total_rows
0,241734
