In [101]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("IMDB Batch Pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

#  Google Storage File Path
gsc_file_path = 'gs://de_2024_574440/data/data.csv'  
# Create data frame
df = spark.read.format("csv").option("header", "true") \
       .load(gsc_file_path)
df.printSchema()



root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)
 |-- releaseYear: string (nullable = true)



In [87]:
df.show(5)

+---------+--------------------+-----+--------------------+--------+-----------+-------+
|       id|               title| type|              genres|numVotes|releaseYear|avg_rat|
+---------+--------------------+-----+--------------------+--------+-----------+-------+
|tt0000009|          Miss Jerry|movie|             Romance|     215|       1894|    5.4|
|tt0000147|The Corbett-Fitzs...|movie|Documentary, News...|     539|       1897|    5.2|
|tt0000502|            Bohemios|movie|                NULL|      18|       1905|    4.4|
|tt0000574|The Story of the ...|movie|Action, Adventure...|     941|       1906|    6.0|
|tt0000591|    The Prodigal Son|movie|               Drama|      28|       1907|    5.7|
+---------+--------------------+-----+--------------------+--------+-----------+-------+
only showing top 5 rows



In [91]:
from pyspark.sql.functions import col, countDistinct

#Select only rows with movies or TvSeries
df.where((df.type=="movie") | (df.type=="tvSeries")).select(countDistinct("id")).show(5)

+------------------+
|count(DISTINCT id)|
+------------------+
|            973121|
+------------------+



In [104]:
from pyspark.sql.functions import col, split
#Remove rows with Null values for key columns
df = df.filter(df.averageRating != 'NULL')

df = df.filter(df.genres != 'NULL')

df = df.filter(df.releaseYear != 'NULL')

#Transform average rating into float
df = df.withColumn("avg_rat", col("averageRating").cast("float")).drop("averageRating")

#Clean genres, so that only the main genre is available per row
df_t = df.withColumn("genre_t", split(df.genres, ', ')).drop("genres")

df_1g = df_t.withColumn("one_genre", col("genre_t").getItem(0)).drop("genre_t")

#Display the table
df_1g.show()
              

+---------+--------------------+-----+--------+-----------+-------+-----------+
|       id|               title| type|numVotes|releaseYear|avg_rat|  one_genre|
+---------+--------------------+-----+--------+-----------+-------+-----------+
|tt0000009|          Miss Jerry|movie|     215|       1894|    5.4|    Romance|
|tt0000147|The Corbett-Fitzs...|movie|     539|       1897|    5.2|Documentary|
|tt0000574|The Story of the ...|movie|     941|       1906|    6.0|     Action|
|tt0000591|    The Prodigal Son|movie|      28|       1907|    5.7|      Drama|
|tt0000615|  Robbery Under Arms|movie|      27|       1907|    4.3|      Drama|
|tt0000630|              Hamlet|movie|      30|       1908|    3.1|      Drama|
|tt0000675|         Don Quijote|movie|      22|       1908|    4.2|      Drama|
|tt0000679|The Fairylogue an...|movie|      78|       1908|    5.2|  Adventure|
|tt0000886|Hamlet, Prince of...|movie|      40|       1910|    4.7|      Drama|
|tt0000941|      Locura de amor|movie|  

In [105]:
from pyspark.sql import Window
from pyspark.sql.functions import dense_rank, desc

#Create windows to partition by genre and rank based the average rating
window_gender=Window.partitionBy("one_genre").orderBy(desc("avg_rat"))

df_1g = df_1g.withColumn("genre_rank", dense_rank().over(window_gender))

df_1g.show()

+----------+--------------------+------------+--------+-----------+-------+---------+----------+
|        id|               title|        type|numVotes|releaseYear|avg_rat|one_genre|genre_rank|
+----------+--------------------+------------+--------+-----------+-------+---------+----------+
| tt7156934|   Independent Roads|       movie|      10|       2012|    9.9|Adventure|         1|
| tt1053817|    Buried in Tucson|       movie|      15|       2007|    9.8|Adventure|         2|
| tt8114896|               Parto|       movie|      85|       2010|    9.8|Adventure|         2|
| tt1419921| Flying Over Everest|       movie|      12|       2004|    9.7|Adventure|         3|
|tt24224080|Jendela Seribu Su...|       movie|      92|       2023|    9.7|Adventure|         3|
|tt24228604|          OdBita Pot|    tvSeries|     874|       2022|    9.7|Adventure|         3|
|tt33083593|        Vikaasaparva|       movie|     505|       2024|    9.7|Adventure|         3|
| tt9824856|  Avalakki Pavalak

In [106]:
#Create windows and rank by year

window_year=Window.partitionBy("releaseYear").orderBy(desc("avg_rat"))

df_1g = df_1g.withColumn("year_rank", dense_rank().over(window_year))

df_1g.show()



+----------+--------------------+-----+--------+-----------+-------+-----------+----------+---------+
|        id|               title| type|numVotes|releaseYear|avg_rat|  one_genre|genre_rank|year_rank|
+----------+--------------------+-----+--------+-----------+-------+-----------+----------+---------+
| tt0000147|The Corbett-Fitzs...|movie|     539|       1897|    5.2|Documentary|        49|        1|
| tt0229676|Reproduction of t...|movie|      63|       1897|    4.4|Documentary|        57|        2|
| tt0138774|Saída dos Operári...|movie|      10|       1898|    4.6|Documentary|        55|        1|
| tt0235357|Dressing Paper Dolls|movie|      25|       1898|    3.6|Documentary|        65|        2|
| tt0221040|Buck Dance, Ute I...|movie|      25|       1898|    3.4|Documentary|        67|        3|
| tt0230366|Jeffries-Sharkey ...|movie|      78|       1899|    3.9|Documentary|        62|        1|
| tt0230765|Sharkey-McCoy Fig...|movie|      34|       1899|    3.3|Documentary|  

In [107]:
#Rank by both genre and year

window=Window.partitionBy("one_genre", "releaseYear").orderBy(desc("avg_rat"))

df_1g = df_1g.withColumn("year_genre_rank", dense_rank().over(window))

df_1g.show()

+---------+--------------------+-----+--------+-----------+-------+---------+----------+---------+---------------+
|       id|               title| type|numVotes|releaseYear|avg_rat|one_genre|genre_rank|year_rank|year_genre_rank|
+---------+--------------------+-----+--------+-----------+-------+---------+----------+---------+---------------+
|tt0000574|The Story of the ...|movie|     941|       1906|    6.0|   Action|        40|        1|              1|
|tt0877604|         Chûshingura|movie|      24|       1910|    5.5|   Action|        45|        5|              1|
|tt0299498|Attack on the Gol...|movie|      25|       1911|    4.1|   Action|        59|       18|              1|
|tt0006206|        Les Vampires|movie|    5574|       1915|    7.3|   Action|        27|       11|              1|
|tt0005905|       The Plunderer|movie|      31|       1915|    6.7|   Action|        33|       17|              2|
|tt0356838|     The Loyal Rebel|movie|      19|       1915|    6.7|   Action|   