Data source: IMDb Non-Commercial Datasets (the `*.tsv.gz` files loaded below) — https://developer.imdb.com/non-commercial-datasets/


In [1]:
import polars as pl
import os
import orjson

In [2]:
# Directory holding the downloaded IMDb `*.tsv.gz` dumps. Set the
# IMDB_DATA_DIR environment variable to point somewhere else; when it is
# unset, the original hard-coded location is used.
data_dir = os.environ.get("IMDB_DATA_DIR", "/Users/maxwoolf/Downloads")

# Load the title metadata. quote_char=None makes the parser treat '"' as an
# ordinary character instead of a quoting delimiter. No null_values are
# configured, so IMDb's "\N" marker stays a literal string in the str columns
# (visible in the preview below).
df_titles = pl.read_csv(
    os.path.join(data_dir, "title.basics.tsv.gz"),
    separator="\t",
    ignore_errors=True,
    quote_char=None,
)

df_titles

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
str,str,str,str,i64,i64,str,str,str
"""tt0000001""","""short""","""Carmencita""","""Carmencita""",0,1894,"""\N""","""1""","""Documentary,Short"""
"""tt0000002""","""short""","""Le clown et ses chiens""","""Le clown et ses chiens""",0,1892,"""\N""","""5""","""Animation,Short"""
"""tt0000003""","""short""","""Poor Pierrot""","""Pauvre Pierrot""",0,1892,"""\N""","""5""","""Animation,Comedy,Romance"""
"""tt0000004""","""short""","""Un bon bock""","""Un bon bock""",0,1892,"""\N""","""12""","""Animation,Short"""
"""tt0000005""","""short""","""Blacksmith Scene""","""Blacksmith Scene""",0,1893,"""\N""","""1""","""Short"""
…,…,…,…,…,…,…,…,…
"""tt9916848""","""tvEpisode""","""Episode #3.17""","""Episode #3.17""",0,2009,"""\N""","""\N""","""Drama"""
"""tt9916850""","""tvEpisode""","""Episode #3.19""","""Episode #3.19""",0,2010,"""\N""","""\N""","""Drama"""
"""tt9916852""","""tvEpisode""","""Episode #3.20""","""Episode #3.20""",0,2010,"""\N""","""\N""","""Drama"""
"""tt9916856""","""short""","""The Wind""","""The Wind""",0,2015,"""\N""","""27""","""Short"""


In [3]:
# Distinct values of titleType, to decide which ones to keep.
df_titles.get_column("titleType").unique().to_list()

['tvSpecial',
 'tvPilot',
 'tvEpisode',
 'tvMovie',
 'short',
 'tvShort',
 'video',
 'tvSeries',
 'tvMiniSeries',
 'videoGame',
 'movie']

In [4]:
# Keep feature films only: theatrical movies and TV movies.
df_titles = df_titles.filter(
    (pl.col("titleType") == "movie") | (pl.col("titleType") == "tvMovie")
)
df_titles

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
str,str,str,str,i64,i64,str,str,str
"""tt0000009""","""movie""","""Miss Jerry""","""Miss Jerry""",0,1894,"""\N""","""45""","""Romance"""
"""tt0000147""","""movie""","""The Corbett-Fitzsimmons Fight""","""The Corbett-Fitzsimmons Fight""",0,1897,"""\N""","""100""","""Documentary,News,Sport"""
"""tt0000502""","""movie""","""Bohemios""","""Bohemios""",0,1905,"""\N""","""100""","""\N"""
"""tt0000574""","""movie""","""The Story of the Kelly Gang""","""The Story of the Kelly Gang""",0,1906,"""\N""","""70""","""Action,Adventure,Biography"""
"""tt0000591""","""movie""","""The Prodigal Son""","""L'enfant prodigue""",0,1907,"""\N""","""90""","""Drama"""
…,…,…,…,…,…,…,…,…
"""tt9916622""","""movie""","""Rodolpho Teóphilo - O Legado d…","""Rodolpho Teóphilo - O Legado d…",0,2015,"""\N""","""57""","""Documentary"""
"""tt9916680""","""movie""","""De la ilusión al desconcierto:…","""De la ilusión al desconcierto:…",0,2007,"""\N""","""100""","""Documentary"""
"""tt9916706""","""movie""","""Dankyavar Danka""","""Dankyavar Danka""",0,2013,"""\N""","""\N""","""Comedy"""
"""tt9916730""","""movie""","""6 Gunn""","""6 Gunn""",0,2017,"""\N""","""116""","""Drama"""


In [5]:
# Load the ratings table, then keep only titles with at least 30 votes.
df_ratings = pl.read_csv(
    os.path.join(data_dir, "title.ratings.tsv.gz"),
    separator="\t",
    ignore_errors=True,
)
df_ratings = df_ratings.filter(pl.col("numVotes").ge(30))

df_ratings

tconst,averageRating,numVotes
str,f64,i64
"""tt0000001""",5.7,2131
"""tt0000002""",5.6,289
"""tt0000003""",6.4,2167
"""tt0000004""",5.3,184
"""tt0000005""",6.2,2894
…,…,…
"""tt9916362""",6.4,6002
"""tt9916380""",8.2,120
"""tt9916544""",6.8,81
"""tt9916578""",7.4,48


Make the base table as small as possible before unnesting/joining.


In [6]:
# Inner-join movies with their ratings (titles without a qualifying rating
# row drop out), discard any null ratings, and order newest tconst first.
df_denorm = (
    df_titles.join(df_ratings, on="tconst", how="inner")
    .drop_nulls(subset=["averageRating"])
    .sort("tconst", descending=True)
)

df_denorm

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
str,str,str,str,i64,i64,str,str,str,f64,i64
"""tt9916362""","""movie""","""Coven""","""Akelarre""",0,2020,"""\N""","""92""","""Drama,History""",6.4,6002
"""tt9916270""","""movie""","""Il talento del calabrone""","""Il talento del calabrone""",0,2020,"""\N""","""84""","""Thriller""",5.8,1501
"""tt9916190""","""movie""","""Safeguard""","""Safeguard""",0,2020,"""\N""","""95""","""Action,Adventure,Thriller""",3.6,262
"""tt9916160""","""movie""","""Drømmeland""","""Drømmeland""",0,2019,"""\N""","""72""","""Documentary""",6.2,52
"""tt9915790""","""movie""","""Bobbyr Bondhura""","""Bobbyr Bondhura""",0,2019,"""\N""","""106""","""Family""",7.0,44
…,…,…,…,…,…,…,…,…,…,…
"""tt0000630""","""movie""","""Hamlet""","""Amleto""",0,1908,"""\N""","""\N""","""Drama""",3.1,30
"""tt0000591""","""movie""","""The Prodigal Son""","""L'enfant prodigue""",0,1907,"""\N""","""90""","""Drama""",5.6,30
"""tt0000574""","""movie""","""The Story of the Kelly Gang""","""The Story of the Kelly Gang""",0,1906,"""\N""","""70""","""Action,Adventure,Biography""",6.0,971
"""tt0000147""","""movie""","""The Corbett-Fitzsimmons Fight""","""The Corbett-Fitzsimmons Fight""",0,1897,"""\N""","""100""","""Documentary,News,Sport""",5.3,549


In [7]:
# Keys of the titles that survived the movie + rating filters.
valid_tconst = df_denorm.select("tconst")

df_principals = pl.read_csv(
    os.path.join(data_dir, "title.principals.tsv.gz"),
    separator="\t",
    ignore_errors=True,
    quote_char=None,
)

# Semi join: keep only the credit rows whose tconst appears in valid_tconst,
# without adding any columns from it. This replaces `is_in(valid_tconst)`,
# which was handed a DataFrame and depended on it being implicitly coerced
# to a Series.
df_principals = df_principals.join(valid_tconst, on="tconst", how="semi")

df_names = pl.read_csv(
    os.path.join(data_dir, "name.basics.tsv.gz"),
    separator="\t",
    ignore_errors=True,
    quote_char=None,
)

# Inner join on nconst attaches each credited person's name record; credits
# whose nconst has no match in df_names are dropped.
df_principals = df_principals.join(df_names, on="nconst")

df_principals

tconst,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
str,i64,str,str,str,str,str,str,str,str,str
"""tt0023926""",6,"""nm0000001""","""actor""","""\N""","""[""Fred Astaire""]""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
"""tt0024025""",5,"""nm0000001""","""actor""","""\N""","""[""Fred Ayres""]""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
"""tt0025164""",1,"""nm0000001""","""actor""","""\N""","""[""Guy Holden""]""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
"""tt0026942""",2,"""nm0000001""","""actor""","""\N""","""[""Huck Haines""]""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
"""tt0027125""",1,"""nm0000001""","""actor""","""\N""","""[""Jerry Travers""]""","""Fred Astaire""","""1899""","""1987""","""actor,miscellaneous,producer""","""tt0072308,tt0050419,tt0053137,…"
…,…,…,…,…,…,…,…,…,…,…
"""tt33337159""",4,"""nm9993693""","""actress""","""\N""","""\N""","""Apsara Rani""","""1996""","""\N""","""actress""","""tt12856788,tt8302382,tt1384750…"
"""tt10140990""",9,"""nm9993694""","""actor""","""\N""","""[""Susheel Panda""]""","""Chinmay Mishra""","""\N""","""\N""","""actor,director,producer""","""tt18361688,tt18687502,tt873775…"
"""tt18361688""",9,"""nm9993694""","""director""","""\N""","""\N""","""Chinmay Mishra""","""\N""","""\N""","""actor,director,producer""","""tt18361688,tt18687502,tt873775…"
"""tt18361688""",10,"""nm9993694""","""writer""","""story and screenplay""","""\N""","""Chinmay Mishra""","""\N""","""\N""","""actor,director,producer""","""tt18361688,tt18687502,tt873775…"


In [8]:
# Categories that get a dedicated *_names column; every other category is
# collected into the parallel principal_names / principal_roles lists.
credited_categories = ["director", "writer", "producer", "actor", "actress"]

category = pl.col("category")
person = pl.col("primaryName")
is_other_role = ~category.is_in(credited_categories)

# Sort by (tconst, ordering) first so each aggregated list follows the
# credit ordering, then collapse to one row per title.
# NOTE(review): a person credited on several rows of the same title shows up
# more than once in a list (see the repeated names in the output below).
df_principals_agg = (
    df_principals.sort(["tconst", "ordering"])
    .group_by("tconst")
    .agg(
        person.filter(category == "director").alias("director_names"),
        person.filter(category == "writer").alias("writer_names"),
        person.filter(category == "producer").alias("producer_names"),
        person.filter(category.is_in(["actor", "actress"])).alias("actor_names"),
        person.filter(is_other_role).alias("principal_names"),
        category.filter(is_other_role).alias("principal_roles"),
    )
)

df_principals_agg

tconst,director_names,writer_names,producer_names,actor_names,principal_names,principal_roles
str,list[str],list[str],list[str],list[str],list[str],list[str]
"""tt18815700""","[""Kazuya Shiraishi""]","[""Ryû Kushiki"", ""Ryô Takada""]","[""Kazumi Fukase"", ""Shintaro Hori"", ""Takuro Nagai""]","[""Sadao Abe"", ""Koshi Mizukami"", … ""Ryô Satô""]","[""Naoya Ikeda""]","[""cinematographer""]"
"""tt0277371""","[""Joel Gallen""]","[""Mike Bender"", ""Adam Jay Epstein"", … ""Phil Beauman""]","[""Neal H. Moritz""]","[""Chyler Leigh"", ""Jaime Pressly"", … ""Sam Huntington""]","[""Theodore Shapiro"", ""Reynaldo Villalobos"", … ""Joseph T. Garrity""]","[""composer"", ""cinematographer"", … ""production_designer""]"
"""tt0335036""","[""Lupita Aquino-Kashiwahara"", ""Leroy Salvador""]","[""Tina Loy"", ""Jose Javier Reyes"", ""Armando Lao""]","[""William C. Leary""]","[""Herbert Bautista"", ""Herbert Bautista"", … ""Lenlen Oreta""]","[""Mon Del Rosario"", ""Johnny Araojo"", … ""Edgar Martin Littaua""]","[""composer"", ""cinematographer"", … ""production_designer""]"
"""tt0933389""","[""Michael Hoffman Jr.""]","[""Meghan Jones""]","[""Raymond L. Blagmon"", ""Michael Hoffman Jr.""]","[""Joe Estevez"", ""Reggie Bannister"", … ""Nick Bubb""]","[""Joseph Beaty"", ""Joseph Butera III"", … ""Pamela M. Staton""]","[""composer"", ""composer"", … ""casting_director""]"
"""tt0102933""","[""John Patterson""]","[""Jack Olsen"", ""Richard Fielder""]","[""Randy T. Siegel""]","[""Elizabeth Montgomery"", ""Dale Midkiff"", … ""Keith Atkinson""]","[""Richard Gibbs"", ""Jules Brenner"", … ""John Leimanis""]","[""composer"", ""cinematographer"", … ""production_designer""]"
…,…,…,…,…,…,…
"""tt0299569""","[""Walter Tennyson""]","[""Ian Walker""]",[],"[""Anthony Hulme"", ""C. Denier Warren"", … ""Charles Paton""]","[""Desmond Dickinson"", ""Etta Simpson""]","[""cinematographer"", ""editor""]"
"""tt0032785""","[""Otakar Vávra""]","[""Zdenek Stepánek"", ""Otakar Vávra""]",[],"[""Zdenek Stepánek"", ""Frantisek Kreuzmann"", … ""Václav Vydra""]","[""Jan Roth""]","[""cinematographer""]"
"""tt10244900""","[""Michael Robison""]","[""Nancy Grace"", ""Michelle Ricci""]","[""Christian Bruyère""]","[""Kellie Martin"", ""Viv Leacock"", … ""Vanessa Walsh""]","[""Hamish Thomson"", ""Christopher Wishart""]","[""composer"", ""production_designer""]"
"""tt0087364""","[""Leon de Winter""]","[""Leon de Winter""]",[],"[""Johan Leysen"", ""Linda van Dyck"", … ""José María Blanco""]","[""Henk Van Eeghen""]","[""editor""]"


## Reaggregate Everything and Build JSON Representation


In [9]:
# Attach the per-title credit lists to the rated-movie table (inner join, so
# titles without any aggregated credits are dropped).
df_final = df_denorm.join(df_principals_agg, on="tconst", how="inner")

df_final

tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director_names,writer_names,producer_names,actor_names,principal_names,principal_roles
str,str,str,str,i64,i64,str,str,str,f64,i64,list[str],list[str],list[str],list[str],list[str],list[str]
"""tt9916362""","""movie""","""Coven""","""Akelarre""",0,2020,"""\N""","""92""","""Drama,History""",6.4,6002,"[""Pablo Agüero""]","[""Pablo Agüero"", ""Katell Guillou""]","[""Iker Ganuza"", ""Fred Prémel"", ""Koldo Zuazua""]","[""Amaia Aberasturi"", ""Alex Brendemühl"", … ""Elena Uriz""]","[""Maite Arroitajauregi"", ""Aránzazu Calleja"", … ""Mikel Serrano""]","[""composer"", ""composer"", … ""production_designer""]"
"""tt9916270""","""movie""","""Il talento del calabrone""","""Il talento del calabrone""",0,2020,"""\N""","""84""","""Thriller""",5.8,1501,"[""Giacomo Cimini""]","[""Giacomo Cimini"", ""Lorenzo Collalti"", ""Alessandro Regaldo""]","[""Isabella Cocuzza"", ""Arturo Paglia""]","[""Sergio Castellitto"", ""Lorenzo Richelmy"", … ""Bianca Friscelli""]","[""Dimitri Scarlato"", ""Maurizio Calvesi"", … ""Ivana Gargiulo""]","[""composer"", ""cinematographer"", … ""production_designer""]"
"""tt9916190""","""movie""","""Safeguard""","""Safeguard""",0,2020,"""\N""","""95""","""Action,Adventure,Thriller""",3.6,262,"[""Fraser Precious""]","[""Fraser Precious""]","[""Fraser Precious"", ""Megan Young""]","[""Patrick Gallagher"", ""Akie Kotabe"", … ""Lee Byford""]","[""Simone Vallecorsa"", ""Matt Perren"", … ""Kelly Toode""]","[""composer"", ""cinematographer"", … ""production_designer""]"
"""tt9916160""","""movie""","""Drømmeland""","""Drømmeland""",0,2019,"""\N""","""72""","""Documentary""",6.2,52,"[""Joost van der Wiel""]",[],"[""Wout Conijn""]",[],"[""Nils Leidal"", ""Tobias Borkert"", … ""Herman P. Koerts""]","[""self"", ""composer"", … ""editor""]"
"""tt9915790""","""movie""","""Bobbyr Bondhura""","""Bobbyr Bondhura""",0,2019,"""\N""","""106""","""Family""",7.0,44,"[""Sudipa Chatterjee""]",[],"[""Akash Chatterjee""]","[""Kaushik Sen"", ""Sreelekha Mitra"", … ""Bhaswar Chatterjee""]","[""Santanu Mukherjee""]","[""editor""]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""tt0000630""","""movie""","""Hamlet""","""Amleto""",0,1908,"""\N""","""\N""","""Drama""",3.1,30,"[""Mario Caserini""]","[""William Shakespeare""]","[""Giuseppe de Liguoro""]","[""Fernanda Negri Pouget""]",[],[]
"""tt0000591""","""movie""","""The Prodigal Son""","""L'enfant prodigue""",0,1907,"""\N""","""90""","""Drama""",5.6,30,"[""Michel Carré""]","[""Michel Carré""]",[],"[""Georges Wague"", ""Henri Gouget"", … ""Gilberte Sergy""]",[],[]
"""tt0000574""","""movie""","""The Story of the Kelly Gang""","""The Story of the Kelly Gang""",0,1906,"""\N""","""70""","""Action,Adventure,Biography""",6.0,971,"[""Charles Tait""]","[""Charles Tait""]","[""W.A. Gibson"", ""Millard Johnson"", … ""Nevin Tait""]","[""Elizabeth Tait"", ""John Tait"", … ""John Forde""]","[""Eric Chapus"", ""Millard Johnson"", … ""Reg Perry""]","[""composer"", ""cinematographer"", … ""cinematographer""]"
"""tt0000147""","""movie""","""The Corbett-Fitzsimmons Fight""","""The Corbett-Fitzsimmons Fight""",0,1897,"""\N""","""100""","""Documentary,News,Sport""",5.3,549,"[""Enoch J. Rector""]",[],"[""William A. Brady""]",[],"[""James J. Corbett"", ""Bob Fitzsimmons"", … ""Enoch J. Rector""]","[""self"", ""self"", … ""cinematographer""]"


In [10]:
def movie_json(row):
    """Serialize one denormalized movie row into a pretty-printed JSON string.

    Args:
        row: Mapping with the keys ``primaryTitle``, ``genres``, ``isAdult``,
            ``startYear``, ``runtimeMinutes``, ``director_names``,
            ``writer_names``, ``producer_names``, ``actor_names``,
            ``principal_names`` and ``principal_roles``.

    Returns:
        str: JSON text indented by two spaces. ``genres`` is a list, or null
        when the value is missing (``None``, empty, or the ``"\\N"`` marker).
        ``runtime_minutes`` is an int, or null when the value is missing or
        not a valid integer. ``principals`` is a list of single-entry
        ``{name: role}`` objects.
    """
    missing_marker = "\\N"

    # Treat a null/empty value the same as the "\N" marker instead of
    # crashing on `None.split(...)`.
    raw_genres = row["genres"]
    if raw_genres and raw_genres != missing_marker:
        genres = raw_genres.split(",")
    else:
        genres = None

    # int() rejects "\N" (ValueError) and None (TypeError); both mean that no
    # usable runtime is available.
    try:
        runtime_minutes = int(row["runtimeMinutes"])
    except (TypeError, ValueError):
        runtime_minutes = None

    # The two lists are parallel: the i-th role belongs to the i-th name.
    principal_names = row["principal_names"] or []
    principal_roles = row["principal_roles"] or []
    principals = [
        {name: role} for name, role in zip(principal_names, principal_roles)
    ]

    json_dict = {
        "title": row["primaryTitle"],
        "genres": genres,
        "is_adult": row["isAdult"] == 1,
        "release_year": row["startYear"],
        "runtime_minutes": runtime_minutes,
        "directors": row["director_names"],
        "writers": row["writer_names"],
        "producers": row["producer_names"],
        "actors": row["actor_names"],
        "principals": principals,
    }
    return orjson.dumps(json_dict, option=orjson.OPT_INDENT_2).decode("utf-8")

In [11]:
# Preview the JSON produced for the first row of df_final.
print(movie_json(df_final.head(1).to_dicts()[0]))

{
  "title": "Coven",
  "genres": [
    "Drama",
    "History"
  ],
  "is_adult": false,
  "release_year": 2020,
  "runtime_minutes": 92,
  "directors": [
    "Pablo Agüero"
  ],
  "writers": [
    "Pablo Agüero",
    "Katell Guillou"
  ],
  "producers": [
    "Iker Ganuza",
    "Fred Prémel",
    "Koldo Zuazua"
  ],
  "actors": [
    "Amaia Aberasturi",
    "Alex Brendemühl",
    "Daniel Fanego",
    "Garazi Urkola",
    "Yune Nogueiras",
    "Jone Laspiur",
    "Irati Saez de Urabain",
    "Lorea Ibarra",
    "Asier Oruesagasti",
    "Elena Uriz"
  ],
  "principals": [
    {
      "Maite Arroitajauregi": "composer"
    },
    {
      "Aránzazu Calleja": "composer"
    },
    {
      "Javier Agirre": "cinematographer"
    },
    {
      "Teresa Font": "editor"
    },
    {
      "Txabe Atxa": "casting_director"
    },
    {
      "Nathalie Camidebach": "casting_director"
    },
    {
      "Florencia Inés González": "casting_director"
    },
    {
      "Mikel Serrano": "production_de

https://stackoverflow.com/a/77558612


In [12]:
# Pack every column into one struct per row so movie_json receives the whole
# row as a dict, and store the resulting text in a new "json" column.
df_final = df_final.with_columns(
    pl.struct(pl.all()).map_elements(movie_json, return_dtype=str).alias("json")
)

df_final.shape

(238628, 18)

In [13]:
# Spot check a well-known title: Star Wars: A New Hope
df_final.filter(pl.col("tconst").eq("tt0076759")).glimpse()

Rows: 1
Columns: 18
$ tconst                <str> 'tt0076759'
$ titleType             <str> 'movie'
$ primaryTitle          <str> 'Star Wars: Episode IV - A New Hope'
$ originalTitle         <str> 'Star Wars'
$ isAdult               <i64> 0
$ startYear             <i64> 1977
$ endYear               <str> '\\N'
$ runtimeMinutes        <str> '121'
$ genres                <str> 'Action,Adventure,Fantasy'
$ averageRating         <f64> 8.6
$ numVotes              <i64> 1494026
$ director_names  <list[str]> ['George Lucas']
$ writer_names    <list[str]> ['George Lucas']
$ producer_names  <list[str]> ['Gary Kurtz', 'Rick McCallum']
$ actor_names     <list[str]> ['Mark Hamill', 'Harrison Ford', 'Carrie Fisher', 'Alec Guinness', 'Peter Cushing', 'Anthony Daniels', 'Kenny Baker', 'Peter Mayhew', 'David Prowse', 'Phil Brown']
$ principal_names <list[str]> ['John Williams', 'Gilbert Taylor', 'Richard Chew', 'T.M. Christopher', 'Paul Hirsch', 'Marcia Lucas', 'Dianne Crittenden', 'Irene Lamb', 'Vic 

In [14]:
# Spot check a recent title (Sonic 3, released December 2024) to confirm the
# dump contains recent data
df_final.filter(pl.col("tconst").eq("tt18259086")).glimpse()

Rows: 1
Columns: 18
$ tconst                <str> 'tt18259086'
$ titleType             <str> 'movie'
$ primaryTitle          <str> 'Sonic the Hedgehog 3'
$ originalTitle         <str> 'Sonic the Hedgehog 3'
$ isAdult               <i64> 0
$ startYear             <i64> 2024
$ endYear               <str> '\\N'
$ runtimeMinutes        <str> '110'
$ genres                <str> 'Action,Adventure,Comedy'
$ averageRating         <f64> 7.0
$ numVotes              <i64> 43854
$ director_names  <list[str]> ['Jeff Fowler']
$ writer_names    <list[str]> ['Pat Casey', 'Josh Miller', 'John Whittington']
$ producer_names  <list[str]> ['Toby Ascher', 'Neal H. Moritz', 'Toru Nakahara', 'Hitoshi Okuno']
$ actor_names     <list[str]> ['Jim Carrey', 'Jim Carrey', 'Ben Schwartz', 'Keanu Reeves', 'Idris Elba', "Colleen O'Shaughnessey", 'James Marsden', 'Tika Sumpter', 'Lee Majdoub', 'Krysten Ritter', 'Adam Pally']
$ principal_names <list[str]> ['Tom Holkenborg', 'Brandon Trost', 'Al LeVine', 'Sophie Holland

In [15]:
# Persist only the key, the sort/filter fields and the JSON text.
output_columns = ["tconst", "startYear", "numVotes", "averageRating", "json"]
output_path = os.path.join(data_dir, "test_movie_json_input.parquet")
df_final.select(output_columns).write_parquet(output_path)