In [1]:
import polars as pl
import os
import json

Reuse the processed dataset, since the actors it contains have already been filtered.


In [2]:
# Directory holding the processed parquet file. Defaults to the original local
# path; set the MOVIE_DATA_DIR environment variable to run this notebook on
# another machine without editing the cell.
data_dir = os.environ.get("MOVIE_DATA_DIR", "/Users/maxwoolf/Downloads")

# Load the processed movie table. The movie metadata (including the actors
# list used below) is stored as a JSON string in the `json` column.
df = pl.read_parquet(
    os.path.join(data_dir, "test_movie_json_input.parquet"),
)

df

tconst,startYear,averageRating,json
str,i64,f64,str
"""tt9916730""",2017,7.0,"""{""title"":""6 Gunn"",""genres"":[""D…"
"""tt9916706""",2013,7.7,"""{""title"":""Dankyavar Danka"",""ge…"
"""tt9916538""",2019,8.3,"""{""title"":""Kuambil Lagi Hatiku""…"
"""tt9916460""",2019,8.9,"""{""title"":""Pink Taxi"",""genres"":…"
"""tt9916428""",2019,3.6,"""{""title"":""The Secret of China""…"
…,…,…,…
"""tt0000591""",1907,5.6,"""{""title"":""The Prodigal Son"",""g…"
"""tt0000574""",1906,6.0,"""{""title"":""The Story of the Kel…"
"""tt0000502""",1905,4.4,"""{""title"":""Bohemios"",""genres"":n…"
"""tt0000147""",1897,5.2,"""{""title"":""The Corbett-Fitzsimm…"


...but we will need to process the JSON again. See https://stackoverflow.com/a/73129057

The JSON array is extracted as a `str`, so it must be converted to a `List[str]`, which can be hacked with `json_decode()`.


In [35]:
# Try the actor extraction on the first five rows before running it on the
# full frame.
df_actors_test = (
    df.head(5)
    .with_columns(
        # json_path_match returns the `actors` array as a JSON *string*;
        # json_decode parses that string into a real list column. The dtype is
        # passed explicitly so the result is always list[str] instead of
        # depending on schema inference over the sampled rows.
        # (`pl.Utf8` is the string dtype; newer polars also calls it `pl.String`.)
        actors=pl.col("json")
        .str.json_path_match(r"$.actors")
        .str.json_decode(dtype=pl.List(pl.Utf8))
    )
    .select(pl.col("actors"))
)

df_actors_test

actors
list[str]
"[""Devadhar Archit"", ""Sunil Barve"", … ""Atul Todankar""]"
"[""Makarand Anaspure"", ""Anvay Bendre"", … ""Pravin Tarde""]"
"[""Lala Karmela"", ""Cut Mini Theo"", … ""Marwoto""]"
"[""Argiris Tafralidis"", ""Christina Tafralidou"", … ""Giorgos Vezirgiannidis""]"
"[""Kenan Heppe"", ""Wang Peng Kai"", … ""Shenyang Xiao""]"


Run on the full set, and simultaneously aggregate metrics.


In [40]:
# Per-actor aggregate over the full dataset: number of movies each actor
# appears in and the mean rating of those movies.
# Note: actors are keyed by name string only, so distinct people who share a
# name are merged into one row.
df_actors = (
    df.with_columns(
        # Extract the `actors` array as a JSON string, then parse it into a
        # list column. The explicit dtype guarantees list[str] regardless of
        # what schema inference would derive from the rows it samples.
        # (`pl.Utf8` is the string dtype; newer polars also calls it `pl.String`.)
        actors=pl.col("json")
        .str.json_path_match(r"$.actors")
        .str.json_decode(dtype=pl.List(pl.Utf8))
    )
    # One row per (movie, actor) pair.
    .explode("actors")
    # Drop rows that have no actor value after exploding.
    .filter(pl.col("actors").is_not_null())
    .group_by("actors")
    .agg(
        movie_count=pl.col("actors").len(),
        avg_movie_rating=pl.col("averageRating").mean(),
    )
    # Most prolific actors first.
    .sort("movie_count", descending=True)
)

df_actors

actors,movie_count,avg_movie_rating
str,u32,f64
"""Brahmanandam""",810,5.946296
"""Jagathy Sreekumar""",532,5.597556
"""Shakti Kapoor""",486,5.215844
"""Eric Roberts""",449,4.816927
"""Mammootty""",417,6.4
…,…,…
"""Mark Dancewicz""",1,5.1
"""Ramon Magsaysay""",1,6.3
"""Delly Malik Muharyoso""",1,6.6
"""Gordon White""",1,8.1


In [49]:
# Show the ten actors with the most movies as plain Python dicts (one per row),
# which prints the unrounded average ratings.
top_actors = df_actors.head(10)
top_actors.rows(named=True)

[{'actors': 'Brahmanandam',
  'movie_count': 810,
  'avg_movie_rating': 5.946296296296295},
 {'actors': 'Jagathy Sreekumar',
  'movie_count': 532,
  'avg_movie_rating': 5.597556390977438},
 {'actors': 'Shakti Kapoor',
  'movie_count': 486,
  'avg_movie_rating': 5.215843621399179},
 {'actors': 'Eric Roberts',
  'movie_count': 449,
  'avg_movie_rating': 4.816926503340759},
 {'actors': 'Mammootty',
  'movie_count': 417,
  'avg_movie_rating': 6.3999999999999995},
 {'actors': 'Mohammad Ali',
  'movie_count': 379,
  'avg_movie_rating': 5.6255936675461715},
 {'actors': 'Mohanlal',
  'movie_count': 378,
  'avg_movie_rating': 6.337830687830692},
 {'actors': 'Nassar',
  'movie_count': 378,
  'avg_movie_rating': 6.119312169312167},
 {'actors': 'Aruna Irani',
  'movie_count': 377,
  'avg_movie_rating': 5.737665782493368},
 {'actors': 'Mithun Chakraborty',
  'movie_count': 376,
  'avg_movie_rating': 5.754255319148937}]

In [45]:
# Spot-check a single well-known actor against the aggregated table.
is_keanu_reeves = pl.col("actors") == "Keanu Reeves"
df_actors.filter(is_keanu_reeves)

actors,movie_count,avg_movie_rating
str,u32,f64
"""Keanu Reeves""",69,6.275362


Per Reeves' IMDb page, he has acted (at time of writing) in [78 movies](https://www.imdb.com/name/nm0000206/?ref_=nv_sr_srsg_0_tt_7_nm_1_in_0_q_keanu%2520reeves) with a rating, so the count of 69 found here is close but may not be comprehensive.
