## Data Loading

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

titles_schema = StructType([
    StructField("id", StringType()),
    StructField("title", StringType()),
    StructField("type", StringType()),
    StructField("description", StringType()),
    StructField("release_year", IntegerType()),
    StructField("age_certification", StringType()),
    StructField("runtime", IntegerType()),
    StructField("genres", StringType()),  # will fix later
    StructField("production_countries", StringType()),  # will fix later
    StructField("seasons", IntegerType()),
    StructField("imdb_id", StringType()),
    StructField("imdb_score", DoubleType()),
    StructField("imdb_votes", IntegerType()),
    StructField("tmdb_popularity", DoubleType()),
    StructField("tmdb_score", DoubleType())
])

credits_schema = StructType([
    StructField("person_id", IntegerType()),
    StructField("id", StringType()),
    StructField("name", StringType()),
    StructField("character", StringType()),
    StructField("role", StringType())
])

In [0]:
# titles_df = spark.read.format("csv") \
#     .option("header","true") \
#     .schema(titles_schema) \
#     .load("/Volumes/workspace/default/netflix/titles.csv") 
# titles_df.write.format("delta").mode("overwrite").saveAsTable("titles_df")

In [0]:
titles_df = spark.read.table("titles_df")

In [0]:
# credits_df = spark.read.format("csv") \
#     .option("header","true") \
#     .schema(credits_schema) \
#     .load("/Volumes/workspace/default/netflix/credits.csv") 
# credits_df.write.format("delta").mode("overwrite").saveAsTable("credits_df")

In [0]:
credits_df = spark.read.table("credits_df")

## Data Cleaning

In [0]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import ArrayType, StringType

titles_df = titles_df.withColumn("genres_array", from_json("genres", ArrayType(StringType()))) \
    .withColumn("countries_array", from_json("production_countries", ArrayType(StringType())))

In [0]:
print(titles_df.dtypes)
display(titles_df.limit(10))

[('id', 'string'), ('title', 'string'), ('type', 'string'), ('description', 'string'), ('release_year', 'int'), ('age_certification', 'string'), ('runtime', 'int'), ('genres', 'string'), ('production_countries', 'string'), ('seasons', 'int'), ('imdb_id', 'string'), ('imdb_score', 'double'), ('imdb_votes', 'int'), ('tmdb_popularity', 'double'), ('tmdb_score', 'double'), ('genres_array', 'array<string>'), ('countries_array', 'array<string>')]


id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,genres_array,countries_array
ts300399,Five Came Back: The Reference Films,SHOW,"""This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discussed in the docuseries """"Five Came Back.""""""",1945,TV-MA,51,['documentation'],['US'],,,,,0.6,,List(documentation),List(US)
tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,,40.965,8.179,"List(drama, crime)",List(US)
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,,10.01,7.3,"List(drama, action, thriller, european)",List(US)
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,,15.461,7.811,"List(fantasy, action, comedy)",List(GB)
tm120801,The Dirty Dozen,MOVIE,"12 American military prisoners in World War II are ordered to infiltrate a well-guarded enemy château and kill the Nazi officers vacationing there. The soldiers, most of whom are facing death sentences for a variety of violent crimes, agree to the mission and the possible commuting of their sentences.",1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,,20.398,7.6,"List(war, action)","List(GB, US)"
ts22164,Monty Python's Flying Circus,SHOW,"A British sketch comedy series with the shows being composed of surreality, risqué or innuendo-laden humour, sight gags and observational sketches without punchlines.",1969,TV-14,30,"['comedy', 'european']",['GB'],,tt0063929,8.8,,17.617,8.306,"List(comedy, european)",List(GB)
tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, but through a series of ridiculous events, he gains a reputation as the Messiah. When he's not dodging his followers or being scolded by his shrill mother, the hapless Brian has to contend with the pompous Pontius Pilate and acronym-obsessed members of a separatist movement. Rife with Monty Python's signature absurdity, the tale finds Brian's life paralleling Biblical lore, albeit with many more laughs.",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,,17.77,7.8,List(comedy),List(GB)
tm14873,Dirty Harry,MOVIE,"When a madman dubbed 'Scorpio' terrorizes San Francisco, hard-nosed cop, Harry Callahan – famous for his take-no-prisoners approach to law enforcement – is tasked with hunting down the psychopath. Harry eventually collars Scorpio in the process of rescuing a kidnap victim, only to see him walk on technicalities. Now, the maverick detective is determined to nail the maniac himself.",1971,R,102,"['thriller', 'action', 'crime']",['US'],,tt0066999,7.7,,12.817,7.5,"List(thriller, action, crime)",List(US)
tm119281,Bonnie and Clyde,MOVIE,"In the 1930s, bored waitress Bonnie Parker falls in love with an ex-con named Clyde Barrow and together they start a violent crime spree through the country, stealing cars and robbing banks.",1967,R,110,"['crime', 'drama', 'action']",['US'],,tt0061418,7.7,,15.687,7.5,"List(crime, drama, action)",List(US)
tm98978,The Blue Lagoon,MOVIE,"Two small children and a ship's cook survive a shipwreck and find safety on an idyllic tropical island. Soon, however, the cook dies and the young boy and girl are left on their own. Days become years and Emmeline and Richard make a home for themselves surrounded by exotic creatures and nature's beauty. But will they ever see civilization again?",1980,R,104,"['romance', 'action', 'drama']",['US'],,tt0080453,5.8,,50.324,6.156,"List(romance, action, drama)",List(US)


In [0]:
from pyspark.sql.functions import col, isnull, count, when
nulls = titles_df.select([count(when(isnull(c),c)).alias(c) for c in titles_df.columns])
nulls.display()

id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,genres_array,countries_array
0,3,18,40,324,2594,303,97,96,5881,541,737,5914,289,458,329,243


In [0]:
cnulls = credits_df.select([count(when(isnull(c),c)).alias(c) for c in credits_df.columns])
cnulls.display()

person_id,id,name,character,role
0,0,0,9772,0


In [0]:
display(credits_df.where((col("character").isNull()) & (col("role") == "ACTOR")).limit(10))

person_id,id,name,character,role
548006,tm70993,Peter Brett,,ACTOR
557168,tm135083,Loutfi El Hakim,,ACTOR
557170,tm135083,F. El Demerdache,,ACTOR
557171,tm135083,Said El Araby,,ACTOR
557172,tm135083,Hana Abdel Fattah,,ACTOR
557173,tm135083,Safia Sarwat,,ACTOR
1521416,tm135083,Sherine,,ACTOR
1521414,tm135083,Soheir,,ACTOR
1321149,tm135083,Esmat Mahmoud,,ACTOR
1521441,tm135083,Nawal Morsi,,ACTOR


In [0]:
from pyspark.sql.functions import lit
credits_df = credits_df.withColumn("character",
    when((col("character").isNull()) & (col("role") == "ACTOR"), lit("Unknown Character")).otherwise(col("character")))

credits_df = credits_df.withColumn(
    "character",
    when(col("character").isNull(), lit("N/A")).otherwise(col("character"))
)

In [0]:
from pyspark.sql.functions import lit
titles_df = titles_df.withColumn("imdb_votes",
    when((col("imdb_votes").isNull()) & (col("imdb_score").isNotNull()), lit(100)) \
    .when(col("imdb_votes").isNull(), lit(0)) \
        .otherwise(col("imdb_votes"))
    )

In [0]:
critical_col = ['id', 'title', 'release_year', 'imdb_id']
titles_df = titles_df.dropna(subset=critical_col)

In [0]:
titles_df = titles_df.fillna("No Description", "description")

In [0]:
import random
ty = ["MOVIE","SHOW"]
titles_df = titles_df.fillna(random.choice(ty),"type")

In [0]:
from pyspark.sql.functions import desc
mode = (titles_df.filter(col("age_certification").isNotNull()).groupBy("age_certification").count().orderBy(desc("count")).first()["age_certification"])
print(mode)
titles_df = titles_df.fillna(mode,"age_certification")

TV-MA


In [0]:
from pyspark.sql.functions import mean
mean_runtime = titles_df.select(mean("runtime")).first()[0]
print(mean_runtime)
titles_df = titles_df.fillna(mean_runtime, "runtime")

78.73397312859885


In [0]:
titles_df = titles_df.withColumn("seasons",
    when(col("seasons").isNull(), when(col("type") == "SHOW", 1).otherwise(0)).otherwise(col("seasons")))

In [0]:

score_means = titles_df.select(*[mean(col(c)).alias(c) for c in ["tmdb_score", "imdb_score", "tmdb_popularity"]]).first()
print(score_means)
titles_df = titles_df.fillna({
    "tmdb_score": score_means["tmdb_score"],
    "imdb_score": score_means["imdb_score"],
    "tmdb_popularity": score_means["tmdb_popularity"]
    })

Row(tmdb_score=6.812709456740442, imdb_score=6.54076694503787, tmdb_popularity=23.47914915188145)


In [0]:
from pyspark.sql.functions import array
titles_df = titles_df.withColumn("genres_array",
    when(col("genres_array").isNull(), array().cast("array<string>")).otherwise(col("genres_array"))
)

In [0]:
titles_df = titles_df.withColumn("countries_array",
    when(col("countries_array").isNull(), array().cast("array<string>")).otherwise(col("countries_array"))
    )

## Data Analysis using PySpark

In [0]:
from pyspark.sql.functions import explode

exploded_genres = titles_df.withColumn("genre", explode("genres_array"))
exploded_countries = titles_df.withColumn("country", explode("countries_array"))
display(exploded_countries.limit(10))

id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,genres_array,countries_array,country
tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,100,40.965,8.179,"List(drama, crime)",List(US),US
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US),US
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,100,15.461,7.811,"List(fantasy, action, comedy)",List(GB),GB
tm120801,The Dirty Dozen,MOVIE,"12 American military prisoners in World War II are ordered to infiltrate a well-guarded enemy château and kill the Nazi officers vacationing there. The soldiers, most of whom are facing death sentences for a variety of violent crimes, agree to the mission and the possible commuting of their sentences.",1967,TV-MA,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,100,20.398,7.6,"List(war, action)","List(GB, US)",GB
tm120801,The Dirty Dozen,MOVIE,"12 American military prisoners in World War II are ordered to infiltrate a well-guarded enemy château and kill the Nazi officers vacationing there. The soldiers, most of whom are facing death sentences for a variety of violent crimes, agree to the mission and the possible commuting of their sentences.",1967,TV-MA,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,100,20.398,7.6,"List(war, action)","List(GB, US)",US
ts22164,Monty Python's Flying Circus,SHOW,"A British sketch comedy series with the shows being composed of surreality, risqué or innuendo-laden humour, sight gags and observational sketches without punchlines.",1969,TV-14,30,"['comedy', 'european']",['GB'],1,tt0063929,8.8,100,17.617,8.306,"List(comedy, european)",List(GB),GB
tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, but through a series of ridiculous events, he gains a reputation as the Messiah. When he's not dodging his followers or being scolded by his shrill mother, the hapless Brian has to contend with the pompous Pontius Pilate and acronym-obsessed members of a separatist movement. Rife with Monty Python's signature absurdity, the tale finds Brian's life paralleling Biblical lore, albeit with many more laughs.",1979,R,94,['comedy'],['GB'],0,tt0079470,8.0,100,17.77,7.8,List(comedy),List(GB),GB
tm14873,Dirty Harry,MOVIE,"When a madman dubbed 'Scorpio' terrorizes San Francisco, hard-nosed cop, Harry Callahan – famous for his take-no-prisoners approach to law enforcement – is tasked with hunting down the psychopath. Harry eventually collars Scorpio in the process of rescuing a kidnap victim, only to see him walk on technicalities. Now, the maverick detective is determined to nail the maniac himself.",1971,R,102,"['thriller', 'action', 'crime']",['US'],0,tt0066999,7.7,100,12.817,7.5,"List(thriller, action, crime)",List(US),US
tm119281,Bonnie and Clyde,MOVIE,"In the 1930s, bored waitress Bonnie Parker falls in love with an ex-con named Clyde Barrow and together they start a violent crime spree through the country, stealing cars and robbing banks.",1967,R,110,"['crime', 'drama', 'action']",['US'],0,tt0061418,7.7,100,15.687,7.5,"List(crime, drama, action)",List(US),US
tm98978,The Blue Lagoon,MOVIE,"Two small children and a ship's cook survive a shipwreck and find safety on an idyllic tropical island. Soon, however, the cook dies and the young boy and girl are left on their own. Days become years and Emmeline and Richard make a home for themselves surrounded by exotic creatures and nature's beauty. But will they ever see civilization again?",1980,R,104,"['romance', 'action', 'drama']",['US'],0,tt0080453,5.8,100,50.324,6.156,"List(romance, action, drama)",List(US),US


In [0]:
exploded_df = titles_df.withColumn("genre", explode("genres_array")).withColumn("country", explode("countries_array"))
display(exploded_df.limit(10))

id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,genres_array,countries_array,genre,country
tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,100,40.965,8.179,"List(drama, crime)",List(US),drama,US
tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,100,40.965,8.179,"List(drama, crime)",List(US),crime,US
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US),drama,US
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US),action,US
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US),thriller,US
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US),european,US
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,100,15.461,7.811,"List(fantasy, action, comedy)",List(GB),fantasy,GB
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,100,15.461,7.811,"List(fantasy, action, comedy)",List(GB),action,GB
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,100,15.461,7.811,"List(fantasy, action, comedy)",List(GB),comedy,GB
tm120801,The Dirty Dozen,MOVIE,"12 American military prisoners in World War II are ordered to infiltrate a well-guarded enemy château and kill the Nazi officers vacationing there. The soldiers, most of whom are facing death sentences for a variety of violent crimes, agree to the mission and the possible commuting of their sentences.",1967,TV-MA,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,100,20.398,7.6,"List(war, action)","List(GB, US)",war,GB


In [0]:
exploded_genres.groupBy("genre").avg("tmdb_score").orderBy("avg(tmdb_score)", ascending=False).show()
exploded_genres.groupBy("genre").avg("imdb_score").orderBy("avg(imdb_score)", ascending=False).show()

+-------------+------------------+
|        genre|   avg(tmdb_score)|
+-------------+------------------+
|    animation| 7.293797938967137|
|      reality| 7.183429691355684|
|      history| 7.171325958979685|
|       family| 7.169489345044626|
|      fantasy|7.1154994483910174|
|        scifi| 7.111733757976429|
|documentation| 7.039824511008362|
|        music| 7.034559849473412|
|          war| 7.031749459423207|
|        sport|  7.01549352642803|
|        drama|  6.85058051914697|
|       action|  6.81617335734635|
|        crime| 6.811674200184313|
|      western| 6.770095114275396|
|      romance| 6.757829571354383|
|       comedy| 6.728796571898445|
|     thriller| 6.625630437516879|
|     european| 6.577847267388113|
|       horror|6.4787646111965955|
+-------------+------------------+

+-------------+------------------+
|        genre|   avg(imdb_score)|
+-------------+------------------+
|      history| 7.129435483870959|
|          war|              7.06|
|documentation|6.99

In [0]:
exploded_countries.groupBy("country").count().orderBy("count", ascending = False).show()

+-------+-----+
|country|count|
+-------+-----+
|     US| 2090|
|     IN|  586|
|     GB|  364|
|     JP|  231|
|     FR|  229|
|     ES|  195|
|     KR|  194|
|     CA|  188|
|     DE|  127|
|     MX|  115|
|     BR|   97|
|     CN|   91|
|     TR|   80|
|     AU|   79|
|     PH|   78|
|     IT|   76|
|     NG|   63|
|     AR|   62|
|     ID|   58|
|     TW|   52|
+-------+-----+
only showing top 20 rows


In [0]:
exploded_countries.groupBy("country").avg("imdb_score").where(col("country") == "IN").show()
exploded_countries.groupBy("country").avg("tmdb_score").orderBy("avg(tmdb_score)", ascending=False).show()

+-------+-----------------+
|country|  avg(imdb_score)|
+-------+-----------------+
|     IN|6.413829069496906|
+-------+-----------------+

+-------+------------------+
|country|   avg(tmdb_score)|
+-------+------------------+
|     NP|               8.1|
|     PK| 8.066666666666666|
|     CD|               8.0|
|     MW|               7.9|
|     VE|               7.8|
|     CU|               7.7|
|     KR| 7.687186228712478|
|     TZ| 7.656354728370221|
|     HR|               7.6|
|     BS|             7.518|
|     BT|               7.4|
|     KE| 7.370903152246814|
|     JP| 7.327440856393772|
|     VA|               7.3|
|     KN|               7.3|
|     TH|7.2937236485580135|
|     CN| 7.266054945054947|
|     RU| 7.246666666666668|
|     SA|7.2125418913480885|
|     PT| 7.159999999999999|
+-------+------------------+
only showing top 20 rows


In [0]:
from pyspark.sql.functions import sum
exploded_countries.groupBy("country").agg(sum(col("tmdb_popularity").cast("double")).alias("total_popularity")).orderBy("total_popularity", ascending=False).show()

+-------+------------------+
|country|  total_popularity|
+-------+------------------+
|     US| 61234.76319321513|
|     GB|          8626.368|
|     JP| 8389.846149151883|
|     KR| 6665.676149151882|
|     ES| 6277.817447455646|
|     FR| 4755.061999999997|
|     CA| 4127.265000000001|
|     IN| 3491.780491518816|
|     MX|3239.3559999999998|
|     TW|2635.3420000000006|
|     DE|          2563.323|
|     CO|2466.1701491518816|
|     AU|2078.8680000000004|
|     ZA|2058.8361491518813|
|     CN|1956.4939999999997|
|     IT|1908.2029999999995|
|     PL|1625.4199999999996|
|     BE|1302.1670000000001|
|     AR|1111.5859999999998|
|     BR|1110.1291491518814|
+-------+------------------+
only showing top 20 rows


In [0]:
exploded_countries.groupBy("country").avg("runtime").orderBy("avg(runtime)", ascending=False).show()

+-------+-----------------+
|country|     avg(runtime)|
+-------+-----------------+
|     SU|            154.0|
|     BS|            144.0|
|     MT|            142.5|
|     NP|            138.0|
|     KG|            128.0|
|     IN|125.6433447098976|
|     DZ|            124.0|
|     GR|            124.0|
|     BF|            124.0|
|     PK|            123.0|
|     LT|            118.5|
|     CM|            118.0|
|     ZW|            116.0|
|     HU|            116.0|
|     GH|           114.25|
|     IQ|            113.0|
|     MW|            113.0|
|     CU|            113.0|
|Lebanon|            112.0|
|     AL|            111.0|
+-------+-----------------+
only showing top 20 rows


In [0]:
from datetime import datetime
from zoneinfo import ZoneInfo
now = datetime.now(ZoneInfo("Asia/Kolkata"))
print(now.strftime("%Y-%m-%d %H-%M-%S"))
print(now.strftime("%I-%p-%M-%S"))
print(now.strftime("%j"))
print(now.strftime("%B-%d-%Y"))
print(now.strftime("%Z"))

today = datetime.today()
print(f"Today is:", today.strftime("%A"),",", "Today is:", today.strftime("%B"))

2025-07-04 18-58-11
06-PM-58-11
185
July-04-2025
IST
Today is: Friday , Today is: July


In [0]:
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.window import Window
exploded_genres = exploded_genres.withColumn("Normalized_Score", col("tmdb_score")/avg("tmdb_score").over(Window.partitionBy("release_year")))
genre_ranking = exploded_genres.groupBy("genre").agg(count("*").alias("count"), avg("Normalized_score").alias("avg_normalized_score")).orderBy(desc("avg_normalized_score"))
genre_ranking.show()

+-------------+-----+--------------------+
|        genre|count|avg_normalized_score|
+-------------+-----+--------------------+
|    animation|  600|   1.061730606618677|
|       family|  624|  1.0437442727671633|
|      reality|  212|  1.0410284718902578|
|      history|  248|  1.0408630150696698|
|      fantasy|  588|   1.035859828294048|
|        scifi|  525|  1.0340934113166695|
|          war|  150|  1.0246572235100562|
|documentation|  832|  1.0238872313925538|
|        music|  239|  1.0226656799992777|
|        sport|  167|  1.0202944994908683|
|        drama| 2742|  0.9973465083068569|
|       action| 1058|  0.9933045893407961|
|        crime|  893|   0.992257369941982|
|      western|   39|  0.9858902088645044|
|      romance|  922|   0.984407573694766|
|       comedy| 2124|  0.9799020623976237|
|     thriller| 1155|  0.9650671475396626|
|     european|  407|  0.9632572395194806|
|       horror|  357|  0.9430239241182721|
+-------------+-----+--------------------+



In [0]:
print(titles_df.count(),credits_df.count())

5222 77801


In [0]:
from pyspark.sql.functions import broadcast
bjoin_df = credits_df.join(broadcast(titles_df), on="id", how="inner")
actors_df = bjoin_df.filter(col("role") == "ACTOR")

actors_df.groupBy("name").agg(count("*").alias("appearances"), avg("tmdb_score").alias("avg_tmdb_score")).filter(col("appearances") > 5).orderBy(desc("avg_tmdb_score")).show()

+-------------------+-----------+-----------------+
|               name|appearances|   avg_tmdb_score|
+-------------------+-----------+-----------------+
|       Jeon Bae-soo|          6|8.241999999999999|
|      Kim Sun-young|          7|8.172714285714285|
|       Ahn Nae-sang|          8|           8.1625|
|       Lee Jung-eun|          9|8.158999999999999|
|       Megumi Ogata|          6|8.149166666666666|
|   Miyuki Sawashiro|          7|8.145999999999999|
|        Kim Hye-eun|          6|            8.136|
|      Kim Mi-kyeong|         11|8.107818181818182|
|      Lee Joon-hyuk|          6|8.102333333333332|
|    Takehito Koyasu|          9|8.100000000000001|
|     Jang Hyun-sung|          6|8.012166666666667|
|       Jun Fukuyama|         11|8.008727272727272|
|Yoshitsugu Matsuoka|          6|8.003666666666666|
| Nobunaga Shimazaki|          6|7.991499999999999|
|   Tatsuhisa Suzuki|          6|7.971999999999999|
|     You Chea-myung|          6|7.964499999999998|
|         Ke

In [0]:
from pyspark.sql.functions import broadcast
bjoin_df = credits_df.join(broadcast(exploded_countries), on="id", how="inner")
actors_df = bjoin_df.filter((col("role") == "ACTOR") & (col("country") == "IN"))

actors_df.groupBy("name").agg(count("*").alias("appearances"), avg("tmdb_score").alias("avg_tmdb_score")).filter(col("appearances") > 15).orderBy(desc("avg_tmdb_score")).show()

+--------------------+-----------+------------------+
|                name|appearances|    avg_tmdb_score|
+--------------------+-----------+------------------+
|          Aamir Khan|         16|             7.057|
| Nawazuddin Siddiqui|         19| 6.984210526315789|
|      Shah Rukh Khan|         22| 6.710045454545456|
|Priyanka Chopra J...|         17| 6.651529411764705|
|    Amitabh Bachchan|         20| 6.621049999999999|
|    Naseeruddin Shah|         16| 6.549999999999999|
|        Rani Mukerji|         16|         6.5200625|
|              Nassar|         17| 6.494117647058823|
|       Murali Sharma|         16|6.4430000000000005|
|         Anupam Kher|         18| 6.371166666666665|
|             Om Puri|         18|  6.30626163648558|
|        Paresh Rawal|         20|              6.22|
|         Boman Irani|         24| 6.208333333333333|
| Kareena Kapoor Khan|         24| 6.175874999999999|
|          Ajay Devgn|         18| 6.144444444444444|
+--------------------+------

In [0]:
exploded_df.groupBy("genre","country").count().orderBy(desc("count")).show()

+-------------+-------+-----+
|        genre|country|count|
+-------------+-------+-----+
|       comedy|     US|  879|
|        drama|     US|  851|
|documentation|     US|  499|
|        drama|     IN|  450|
|     thriller|     US|  422|
|       action|     US|  412|
|       family|     US|  325|
|        crime|     US|  320|
|        scifi|     US|  255|
|    animation|     US|  254|
|       comedy|     IN|  251|
|      fantasy|     US|  249|
|      romance|     US|  226|
|      romance|     IN|  195|
|     thriller|     IN|  177|
|       horror|     US|  173|
|       action|     IN|  162|
|        drama|     GB|  161|
|        drama|     KR|  149|
|        drama|     JP|  145|
+-------------+-------+-----+
only showing top 20 rows


In [0]:
from pyspark.sql.functions import row_number
w = Window.partitionBy("release_year").orderBy(desc("tmdb_score"))
top_titles = titles_df.withColumn("rank", row_number().over(w)).filter("rank < 3")
top_titles.select("release_year", "title", "tmdb_score").orderBy("release_year", "rank").show()

+------------+--------------------+-----------------+
|release_year|               title|       tmdb_score|
+------------+--------------------+-----------------+
|          13| the two series h...|6.812709456740442|
|          15| the exotic creat...|6.812709456740442|
|          24| they feel like t...|6.812709456740442|
|          25| Histoire et Hôpi...|6.812709456740442|
|          27| or will it be ga...|6.812709456740442|
|          43| the raven-haired...|6.812709456740442|
|          46| and cable TV SET...|6.812709456740442|
|          48| but can't get th...|6.812709456740442|
|          51| but one of the s...|6.812709456740442|
|          59| technology has t...|6.812709456740442|
|          64| South Korea. The...|6.812709456740442|
|          69| Heo Im travels t...|6.812709456740442|
|        1954|     White Christmas|              7.2|
|        1954|     The Blazing Sun|              7.0|
|        1956|         Dark Waters|              5.9|
|        1958|       Cairo S

In [0]:

joined_df = titles_df.join(credits_df, on="id", how="inner")

In [0]:
from pyspark.sql.functions import collect_list, size, array_distinct

actor_pairs = credits_df \
    .filter(col("role") == "ACTOR").groupBy("id").agg(collect_list("name").alias("actors")).withColumn("pair_count", size(array_distinct("actors")))

actor_pairs.orderBy(desc("pair_count")).show(truncate = False)

+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
joined_df.show()

+-------+-----------+-----+--------------------+------------+-----------------+-------+------------------+--------------------+-------+---------+----------+----------+---------------+----------+--------------+---------------+---------+------------------+--------------------+-----+
|     id|      title| type|         description|release_year|age_certification|runtime|            genres|production_countries|seasons|  imdb_id|imdb_score|imdb_votes|tmdb_popularity|tmdb_score|  genres_array|countries_array|person_id|              name|           character| role|
+-------+-----------+-----+--------------------+------------+-----------------+-------+------------------+--------------------+-------+---------+----------+----------+---------------+----------+--------------+---------------+---------+------------------+--------------------+-----+
|tm84618|Taxi Driver|MOVIE|A mentally unstab...|        1976|                R|    114|['drama', 'crime']|              ['US']|      0|tt0075314|       8.

In [0]:
credits_df.groupBy("name") \
    .count().orderBy("count", ascending=False).limit(10).show()

+--------------------+-----+
|                name|count|
+--------------------+-----+
|         Boman Irani|   25|
| Kareena Kapoor Khan|   25|
|      Shah Rukh Khan|   23|
|    Takahiro Sakurai|   21|
|         Raúl Campos|   20|
|        Paresh Rawal|   20|
|    Amitabh Bachchan|   20|
|Priyanka Chopra J...|   20|
|           Jan Suter|   19|
|          Aamir Khan|   19|
+--------------------+-----+



In [0]:
titles_df.groupBy("type").count().show()

+--------------------+-----+
|                type|count|
+--------------------+-----+
| classmates and t...|    1|
|                2019|    1|
|                2017|    1|
| cast out of the ...|    1|
|               MOVIE| 3349|
|          two courts|    1|
|                SHOW| 1855|
|                2010|    1|
|                2022|    1|
|                2015|    1|
|      as of May 2008|    1|
|                1998|    1|
| after all. A boo...|    1|
| and they are abo...|    1|
|                2006|    1|
|                2018|    1|
|                2011|    2|
|                2020|    1|
|                2001|    1|
+--------------------+-----+



In [0]:
titles_df.filter("type = 'MOVIE' and imdb_score is not null") \
    .orderBy("imdb_score", ascending=False).select("title", "imdb_score").limit(10).show()

+--------------------+----------+
|               title|imdb_score|
+--------------------+----------+
|Chhota Bheem & Kr...|       9.1|
|               Major|       9.1|
|   C/o Kancharapalem|       8.9|
|David Attenboroug...|       8.9|
|        Forrest Gump|       8.8|
|          GoodFellas|       8.7|
|Chhota Bheem & Kr...|       8.7|
|          Anbe Sivam|       8.7|
|Chhota Bheem Neel...|       8.7|
| A Lion in the House|       8.7|
+--------------------+----------+



In [0]:
titles_df.groupBy("release_year").count().orderBy("release_year").show()

+------------+-----+
|release_year|count|
+------------+-----+
|          13|    1|
|          15|    1|
|          24|    1|
|          25|    1|
|          27|    1|
|          43|    1|
|          46|    1|
|          48|    1|
|          51|    1|
|          59|    1|
|          64|    1|
|          69|    1|
|        1954|    2|
|        1956|    1|
|        1958|    1|
|        1959|    1|
|        1960|    1|
|        1961|    1|
|        1963|    1|
|        1966|    1|
+------------+-----+
only showing top 20 rows


In [0]:
credits_df.filter("role = 'ACTOR'").groupBy("name") \
    .count().orderBy("count", ascending=False).limit(10).show()


+--------------------+-----+
|                name|count|
+--------------------+-----+
| Kareena Kapoor Khan|   25|
|         Boman Irani|   25|
|      Shah Rukh Khan|   23|
|    Takahiro Sakurai|   21|
|Priyanka Chopra J...|   20|
|        Paresh Rawal|   20|
|    Amitabh Bachchan|   20|
| Nawazuddin Siddiqui|   19|
|         Anupam Kher|   19|
|      Junichi Suwabe|   19|
+--------------------+-----+



In [0]:
credits_df.filter("name = 'Robert De Niro'") \
    .join(titles_df, "id").select("title", "release_year", "imdb_score").show()

+--------------------+------------+----------+
|               title|release_year|imdb_score|
+--------------------+------------+----------+
|         Taxi Driver|        1976|       8.2|
|          GoodFellas|        1990|       8.7|
|Once Upon a Time ...|        1984|       8.3|
|Jim Norton: Mouth...|        2017|       7.0|
|        The Irishman|        2019|       7.8|
+--------------------+------------+----------+



In [0]:
from pyspark.sql.functions import explode

titles_df.withColumn("genre", explode("genres_array")) \
    .groupBy("genres_array").avg("tmdb_score").orderBy("avg(tmdb_score)", ascending=False).show()

+--------------------+---------------+
|        genres_array|avg(tmdb_score)|
+--------------------+---------------+
|[family, romance,...|           10.0|
|[family, fantasy,...|           10.0|
|[thriller, horror...|           10.0|
|[fantasy, comedy,...|           10.0|
|[drama, scifi, co...|           10.0|
|[comedy, family, ...|           10.0|
|[crime, sport, do...|           10.0|
|[family, music, a...|           10.0|
|[animation, comed...|           10.0|
|[animation, famil...|           10.0|
|[scifi, action, f...|            9.5|
|[animation, comed...|            9.3|
|[scifi, action, d...|          9.081|
|[action, scifi, t...|            9.0|
|[drama, thriller,...|            9.0|
|[scifi, fantasy, ...|            9.0|
|[fantasy, scifi, ...|            9.0|
|[action, comedy, ...|            8.9|
|[fantasy, reality...|            8.9|
|[action, scifi, f...|            8.9|
+--------------------+---------------+
only showing top 20 rows


In [0]:
titles_df.filter("type = 'SHOW'") \
    .orderBy("tmdb_popularity", ascending=False).select("title", "tmdb_popularity").show()

+--------------------+---------------+
|               title|tmdb_popularity|
+--------------------+---------------+
|     Stranger Things|       2226.231|
|       Resident Evil|       1387.392|
|      Peaky Blinders|       1038.077|
|             Lucifer|        922.112|
|Money Heist (Kore...|        904.326|
|      Grey's Anatomy|        803.786|
|Money Heist: From...|        712.899|
|Kung Fu Panda: Th...|        656.832|
|    The Walking Dead|        597.277|
|  All of Us Are Dead|        491.672|
|          Boo, Bitch|        481.983|
| The Vampire Diaries|        479.354|
|           Control Z|        473.002|
|My Little Pony: M...|        400.035|
|The Umbrella Academy|        388.952|
|        Supernatural|        388.093|
|          Squid Game|        361.925|
|        Breaking Bad|        353.848|
|           Riverdale|        341.862|
|Pablo Escobar: Th...|        337.869|
+--------------------+---------------+
only showing top 20 rows


In [0]:
display(titles_df.limit(10))

id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,genres_array,countries_array
tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works as a night-time taxi driver in New York City where the perceived decadence and sleaze feed his urge for violent action.,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,100,40.965,8.179,"List(drama, crime)",List(US)
tm154986,Deliverance,MOVIE,"Intent on seeing the Cahulawassee River before it's turned into one huge lake, outdoor fanatic Lewis Medlock takes his friends on a river-rafting trip they'll never forget into the dangerous American back-country.",1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,100,10.01,7.3,"List(drama, action, thriller, european)",List(US)
tm127384,Monty Python and the Holy Grail,MOVIE,"""King Arthur, accompanied by his squire, recruits his Knights of the Round Table, including Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Robin the Not-Quite-So-Brave-As-Sir-Lancelot and Sir Galahad the Pure. On the way, Arthur battles the Black Knight who, despite having had all his limbs chopped off, insists he can still fight. They reach Camelot, but Arthur decides not to enter, as """"it is a silly place"""".""",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,100,15.461,7.811,"List(fantasy, action, comedy)",List(GB)
tm120801,The Dirty Dozen,MOVIE,"12 American military prisoners in World War II are ordered to infiltrate a well-guarded enemy château and kill the Nazi officers vacationing there. The soldiers, most of whom are facing death sentences for a variety of violent crimes, agree to the mission and the possible commuting of their sentences.",1967,TV-MA,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,100,20.398,7.6,"List(war, action)","List(GB, US)"
ts22164,Monty Python's Flying Circus,SHOW,"A British sketch comedy series with the shows being composed of surreality, risqué or innuendo-laden humour, sight gags and observational sketches without punchlines.",1969,TV-14,30,"['comedy', 'european']",['GB'],1,tt0063929,8.8,100,17.617,8.306,"List(comedy, european)",List(GB)
tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, but through a series of ridiculous events, he gains a reputation as the Messiah. When he's not dodging his followers or being scolded by his shrill mother, the hapless Brian has to contend with the pompous Pontius Pilate and acronym-obsessed members of a separatist movement. Rife with Monty Python's signature absurdity, the tale finds Brian's life paralleling Biblical lore, albeit with many more laughs.",1979,R,94,['comedy'],['GB'],0,tt0079470,8.0,100,17.77,7.8,List(comedy),List(GB)
tm14873,Dirty Harry,MOVIE,"When a madman dubbed 'Scorpio' terrorizes San Francisco, hard-nosed cop, Harry Callahan – famous for his take-no-prisoners approach to law enforcement – is tasked with hunting down the psychopath. Harry eventually collars Scorpio in the process of rescuing a kidnap victim, only to see him walk on technicalities. Now, the maverick detective is determined to nail the maniac himself.",1971,R,102,"['thriller', 'action', 'crime']",['US'],0,tt0066999,7.7,100,12.817,7.5,"List(thriller, action, crime)",List(US)
tm119281,Bonnie and Clyde,MOVIE,"In the 1930s, bored waitress Bonnie Parker falls in love with an ex-con named Clyde Barrow and together they start a violent crime spree through the country, stealing cars and robbing banks.",1967,R,110,"['crime', 'drama', 'action']",['US'],0,tt0061418,7.7,100,15.687,7.5,"List(crime, drama, action)",List(US)
tm98978,The Blue Lagoon,MOVIE,"Two small children and a ship's cook survive a shipwreck and find safety on an idyllic tropical island. Soon, however, the cook dies and the young boy and girl are left on their own. Days become years and Emmeline and Richard make a home for themselves surrounded by exotic creatures and nature's beauty. But will they ever see civilization again?",1980,R,104,"['romance', 'action', 'drama']",['US'],0,tt0080453,5.8,100,50.324,6.156,"List(romance, action, drama)",List(US)
tm44204,The Guns of Navarone,MOVIE,"A team of allied saboteurs are assigned an impossible mission: infiltrate an impregnable Nazi-held island and destroy the two enormous long-range field guns that prevent the rescue of 2,000 trapped British soldiers.",1961,TV-MA,158,"['action', 'drama', 'war']","['GB', 'US']",0,tt0054953,7.5,100,13.844,7.3,"List(action, drama, war)","List(GB, US)"


In [0]:
credits_df.groupBy("id").count().orderBy("count", ascending=False).show()
credits_df.groupBy("id").count().summary("count", "min", "25%", "50%", "75%", "max", "mean").show()

+--------+-----+
|      id|count|
+--------+-----+
| tm32982|  208|
|tm244149|  174|
| tm39888|  161|
|tm158304|  138|
|tm467467|  138|
|tm155787|  136|
|tm111828|  129|
|tm979026|  127|
|tm191110|  118|
| tm60292|  116|
|tm845437|  116|
| tm41792|  113|
| tm24088|  110|
| tm88045|  108|
|tm204163|  105|
|tm172683|  102|
|tm244174|   99|
|tm315344|   98|
| tm58382|   98|
|tm191013|   96|
+--------+-----+
only showing top 20 rows
+-------+---------+------------------+
|summary|       id|             count|
+-------+---------+------------------+
|  count|     5489|              5489|
|    min|tm1000037|                 1|
|    25%|     NULL|                 5|
|    50%|     NULL|                10|
|    75%|     NULL|                18|
|    max|    ts987|               208|
|   mean|     NULL|14.173984332300966|
+-------+---------+------------------+



### Salting

In [0]:
skewed_keys_df = credits_df.groupBy("id").count().filter("count > 100")
skewed_ids = [row["id"] for row in skewed_keys_df.collect()]

In [0]:
from pyspark.sql.functions import concat_ws, floor, rand

credits_salted = credits_df.withColumn("original_id", col("id")).withColumn(
    "id_salted",
    when(
        col("id").isin(skewed_ids),
        concat_ws("_", col("id"), floor(rand(seed=68) * 10).cast("int"))
    ).otherwise(col("id"))
)

credits_salted = credits_salted.alias("credits")


In [0]:
from pyspark.sql.functions import explode, sequence, lit

titles_skewed = titles_df.filter(col("id").isin(skewed_ids))
titles_not_skewed = titles_df.filter(~col("id").isin(skewed_ids))

titles_salted = titles_skewed.withColumn(
    "salt", explode(sequence(lit(0), lit(9)))
).withColumn(
    "id_salted", concat_ws("_", col("id"), col("salt"))
).drop("salt")

titles_not_skewed = titles_not_skewed.withColumn("id_salted", col("id"))

titles_salted_final = titles_salted.unionByName(titles_not_skewed)

titles_salted_final = titles_salted_final.alias("titles")


In [0]:
joined_df = credits_salted.join(
    titles_salted_final,
    on="id_salted",
    how="inner"
).select(
    col("credits.original_id").alias("credit_id"),
    col("credits.name"),
    col("credits.role"),
    col("titles.title"),
    col("titles.type"),
    col("titles.release_year")
)


In [0]:
joined_df.show(10, truncate=False)

+---------+---------------+-----+-----------+-----+------------+
|credit_id|name           |role |title      |type |release_year|
+---------+---------------+-----+-----------+-----+------------+
|tm84618  |Robert De Niro |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Jodie Foster   |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Albert Brooks  |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Harvey Keitel  |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Cybill Shepherd|ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Peter Boyle    |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Leonard Harris |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Diahnne Abbott |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Gino Ardito    |ACTOR|Taxi Driver|MOVIE|1976        |
|tm84618  |Martin Scorsese|ACTOR|Taxi Driver|MOVIE|1976        |
+---------+---------------+-----+-----------+-----+------------+
only showing top 10 rows


In [0]:
from pyspark.sql.window import Window
w = Window.partitionBy("id").orderBy(desc("character"))

credits_df.withColumn("actor_rank", row_number().over(w)).filter("actor_rank = 1").show()

+---------+---------+--------------------+--------------------+--------+----------+
|person_id|       id|                name|           character|    role|actor_rank|
+---------+---------+--------------------+--------------------+--------+----------+
|  1279270|tm1000037|           Aziz Dyab|               Yusuf|   ACTOR|         1|
|    61651|tm1000147|        Olwen Fouéré|              Royale|   ACTOR|         1|
|  1094198| tm100015|         Isa Briones|              Sunday|   ACTOR|         1|
|  1793832|tm1000166|        Saron Sakina|                 N/A|DIRECTOR|         1|
|   160320|tm1000185|Sebastian Stankie...|             Wiesiek|   ACTOR|         1|
|   219025| tm100027| Maya-Gozel Aimedova|episode (uncredited)|   ACTOR|         1|
|  1658977|tm1000296|           Zhang Zhe| young Nezha (voice)|   ACTOR|         1|
|  1054158|tm1000551|     Suliman Ibrahim|               Somto|   ACTOR|         1|
|  2058257|tm1000599|Ehuana Yaira Yano...|                Self|   ACTOR|    

In [0]:
titles_df.groupBy("type").pivot("age_certification").agg(count(lit(1))).display()

type,G,NC-17,PG,PG-13,R,TV-14,TV-G,TV-MA,TV-PG,TV-Y,TV-Y7,"['action', 'animation', 'comedy', 'family']","['animation', 'comedy', 'family', 'music', 'fantasy', 'european']","['comedy', 'drama', 'romance']",['comedy'],"['drama', 'comedy', 'reality']","['drama', 'comedy']","['drama', 'fantasy', 'romance']","['drama', 'romance']","['drama', 'scifi', 'thriller', 'european']",['drama'],"['scifi', 'drama', 'animation', 'music']","['scifi', 'fantasy', 'comedy', 'drama']"
classmates and the media follow his every move. He meets Chen Qing Qing,,,,,,1.0,,,,,,,,,,,,,,,,,
2019,,,,,,,,,,,,,,,,,,,,,,1.0,
2017,,,,,,,,,,,,,,,,,,1.0,,,,,
cast out of the pod. They know there's only one way they will be allowed to rejoin the pod: They must get legs,,,,,,,1.0,,,,,,,,,,,,,,,,
MOVIE,100.0,14.0,220.0,421.0,511.0,,,2083.0,,,,,,,,,,,,,,,
two courts,,,,,,,,1.0,,,,,,,,,,,,,,,
SHOW,,,,,,410.0,66.0,1015.0,165.0,95.0,104.0,,,,,,,,,,,,
2010,,,,,,,,,,,,1.0,,,,,,,,,,,
2022,,,,,,,,,,,,,,,,,,,,,1.0,,
2015,,,,,,,,,,,,,,,,,,,,,,,1.0
