In [25]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration for the standalone cluster: master URL, application
# name, and the driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment 2 Batch Pipeline")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# The Spark session is the entry point to the Spark SQL engine; getOrCreate()
# returns an already-running session if one exists.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes so that paths using the
# gs:// scheme can be resolved by the Hadoop file system layer.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Location of the input CSV in Google Cloud Storage.
gsc_file_path = 'gs://de_2024_574440/data/data.csv'

# Load the CSV with its first line as the header. No schema is supplied, so
# every column is read as a string (see the printed schema).
df = spark.read.csv(gsc_file_path, header=True)

df.printSchema()



root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)
 |-- releaseYear: string (nullable = true)



In [26]:
from pyspark.sql.functions import col, countDistinct

# Keep only the titles whose type is a movie or a TV series.
df = df.where(col("type").isin("movie", "tvSeries"))

In [27]:
from pyspark.sql.functions import split

# Drop rows whose key columns hold the literal string 'NULL'. A single
# conjunction keeps exactly the rows that pass all three comparisons.
df = df.filter(
    (col("averageRating") != 'NULL')
    & (col("genres") != 'NULL')
    & (col("releaseYear") != 'NULL')
)

# The CSV was loaded as strings; convert the numeric columns in place.
df = (
    df.withColumn("averageRating", col("averageRating").cast("float"))
    .withColumn("releaseYear", col("releaseYear").cast("integer"))
    .withColumn("numVotes", col("numVotes").cast("integer"))
)

# Reduce the genre list to its first entry: split on ', ' and keep item 0 as
# `one_genre`, then discard the original `genres` column.
df = (
    df.withColumn("one_genre", split(col("genres"), ', ').getItem(0))
    .drop("genres")
)

# Show the first rows of the cleaned table.
df.show()
              

+----------+--------------------+--------+-------------+--------+-----------+-----------+
|        id|               title|    type|averageRating|numVotes|releaseYear|  one_genre|
+----------+--------------------+--------+-------------+--------+-----------+-----------+
| tt0096167|Stars in Broad Da...|   movie|          7.1|      98|       2024|      Drama|
| tt0162252|    Come the Morning|   movie|          6.0|     155|       2024|      Drama|
| tt0202523|  The Perfect Shadow|   movie|          8.9|      11|       2024|      Drama|
| tt0254777|   Seevalaperi Pandi|   movie|          6.9|      59|       2024|     Action|
| tt0287075|    Universal Groove|   movie|          5.5|     165|       2024|      Drama|
| tt0327785|   The Killer's Game|   movie|          5.7|    8273|       2024|     Action|
| tt0368133|The Promise of Pe...|   movie|          6.6|       8|       2024|Documentary|
| tt0375008|              Remedy|   movie|          4.1|     253|       2024|      Crime|
| tt082942

In [33]:
from pyspark.sql.functions import desc, rank, row_number
from pyspark.sql.window import Window

# Rank titles inside each (genre, release year) group, highest rating first.
#
# `Window` and `desc` are imported here so the cell does not depend on names
# left in the kernel by some other cell (it must survive Restart & Run All).
#
# row_number() hands out consecutive positions, so rows that tie on
# averageRating would otherwise receive an arbitrary, run-dependent order.
# Ties are therefore broken by numVotes (most votes first) and finally by id
# (assumed to be unique per title -- TODO confirm) to make the rank stable.
window = (
    Window.partitionBy("one_genre", "releaseYear")
    .orderBy(desc("averageRating"), desc("numVotes"), "id")
)

df = df.withColumn("year_genre_rank", row_number().over(window))

df.show()

+----------+--------------------+--------+-------------+--------+-----------+---------+---------+---------------+
|        id|               title|    type|averageRating|numVotes|releaseYear|one_genre|year_rank|year_genre_rank|
+----------+--------------------+--------+-------------+--------+-----------+---------+---------+---------------+
|tt28356500|               Anger|   movie|         10.0|       6|       2024|   Action|        1|              1|
|tt26637593|Devaki Nandana Va...|   movie|          9.8|    2172|       2024|   Action|        3|              2|
|tt33075815|              Kaveri|   movie|          9.8|    1250|       2024|   Action|        3|              3|
|tt33663099|               Guxim|   movie|          9.8|       5|       2024|   Action|        3|              4|
|tt10958258|        Marine Drive|   movie|          9.7|      46|       2024|   Action|        4|              5|
|tt32395168|         The Visitor|   movie|          7.6|      15|       2024|    Adult| 

In [35]:
# Cloud Storage location the BigQuery connector uses to stage the export
# before it is loaded into the table.
# NOTE(review): the connector documentation describes this option as a bucket
# name; confirm that the installed connector version accepts a value carrying
# the gs:// scheme and a sub-path.
bucket = "gs://de_2024_574440/temp_de2024"
spark.conf.set('temporaryGcsBucket', bucket)

# Write the DataFrame to the BigQuery table, replacing any existing contents.
(
    df.write
    .format('bigquery')
    .mode("overwrite")
    .option('table', 'still-entity-435508-a1.Movies.IMDB_Comprehensive')
    .save()
)