In [101]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: standalone master, application name, and a small
# resource footprint (2g driver memory, one core for driver and per executor).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment 2 Top5 Dataset")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes so that paths using the
# gs:// scheme can be resolved by Hadoop.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Location of the input CSV in Google Cloud Storage.
gsc_file_path = 'gs://de_2024_574440/data/data.csv'

# Load the CSV with its header row; no schema is inferred, so every column is
# read as a string and is cast later where a numeric type is needed.
df = spark.read.option("header", "true").csv(gsc_file_path)

df.printSchema()



root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)
 |-- releaseYear: string (nullable = true)



In [None]:
from pyspark.sql.functions import col, countDistinct

# Keep only movies and TV series that were released in 2024.
# Both conditions are applied in a single pass; rows whose `type` or
# `releaseYear` is null do not satisfy the predicate and are dropped.
is_movie_or_series = col("type").isin("movie", "tvSeries")
is_released_2024 = col("releaseYear") == 2024

df = df.filter(is_movie_or_series & is_released_2024)

In [104]:
from pyspark.sql.functions import split

# Drop rows where any key column holds the literal placeholder string 'NULL'.
# This runs before the casts, while the columns are still strings.
has_key_values = (
    (col("averageRating") != 'NULL')
    & (col("genres") != 'NULL')
    & (col("releaseYear") != 'NULL')
)
df = df.filter(has_key_values)

# Convert the string columns to numeric types (rating -> float, the rest -> int).
numeric_types = {
    "averageRating": "float",
    "releaseYear": "integer",
    "numVotes": "integer",
}
for column_name, target_type in numeric_types.items():
    df = df.withColumn(column_name, col(column_name).cast(target_type))

# Keep only the first listed genre of each title as `one_genre`
# and discard the original multi-genre column.
main_genre = split(col("genres"), ', ').getItem(0)
df = df.withColumn("one_genre", main_genre).drop("genres")

# Show the first rows of the cleaned table.
df.show()
              

+---------+--------------------+-----+--------+-----------+-------+-----------+
|       id|               title| type|numVotes|releaseYear|avg_rat|  one_genre|
+---------+--------------------+-----+--------+-----------+-------+-----------+
|tt0000009|          Miss Jerry|movie|     215|       1894|    5.4|    Romance|
|tt0000147|The Corbett-Fitzs...|movie|     539|       1897|    5.2|Documentary|
|tt0000574|The Story of the ...|movie|     941|       1906|    6.0|     Action|
|tt0000591|    The Prodigal Son|movie|      28|       1907|    5.7|      Drama|
|tt0000615|  Robbery Under Arms|movie|      27|       1907|    4.3|      Drama|
|tt0000630|              Hamlet|movie|      30|       1908|    3.1|      Drama|
|tt0000675|         Don Quijote|movie|      22|       1908|    4.2|      Drama|
|tt0000679|The Fairylogue an...|movie|      78|       1908|    5.2|  Adventure|
|tt0000886|Hamlet, Prince of...|movie|      40|       1910|    4.7|      Drama|
|tt0000941|      Locura de amor|movie|  

In [105]:
from pyspark.sql import Window
from pyspark.sql.functions import dense_rank, desc

# Number of rank positions to keep within each genre.
TOP_N = 5

# Rank titles inside each main genre from highest to lowest average rating.
window_genre = Window.partitionBy("one_genre").orderBy(desc("averageRating"))

df = df.withColumn("genre_rank", dense_rank().over(window_genre))

# Keep the top TOP_N rank positions per genre.
# The rank column created above is `genre_rank`; filtering on any other name
# fails because no such column exists in the frame.
# dense_rank assigns the same rank to tied ratings, so a genre can contribute
# more than TOP_N rows when several titles share a rating.
df = df.filter(df["genre_rank"] <= TOP_N)

df.show()



+----------+--------------------+------------+--------+-----------+-------+---------+----------+
|        id|               title|        type|numVotes|releaseYear|avg_rat|one_genre|genre_rank|
+----------+--------------------+------------+--------+-----------+-------+---------+----------+
| tt7156934|   Independent Roads|       movie|      10|       2012|    9.9|Adventure|         1|
| tt1053817|    Buried in Tucson|       movie|      15|       2007|    9.8|Adventure|         2|
| tt8114896|               Parto|       movie|      85|       2010|    9.8|Adventure|         2|
| tt1419921| Flying Over Everest|       movie|      12|       2004|    9.7|Adventure|         3|
|tt24224080|Jendela Seribu Su...|       movie|      92|       2023|    9.7|Adventure|         3|
|tt24228604|          OdBita Pot|    tvSeries|     874|       2022|    9.7|Adventure|         3|
|tt33083593|        Vikaasaparva|       movie|     505|       2024|    9.7|Adventure|         3|
| tt9824856|  Avalakki Pavalak

In [None]:
# Persist the result to the Google Cloud Storage bucket as CSV with a header row.
# Spark writes a directory of part files at this path rather than a single file,
# and with the default save mode the write fails if the path already exists.
output_path = "gs://de_2024_574440/data/data_genre24_top5.csv"

df.write.option("header", True).csv(output_path)