In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, isnan, count, udf, round as spark_round
from pyspark.sql.types import StringType, FloatType


from src.utils.s3_manager import S3Manager
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os 
import nltk


In [77]:
# Location of the input data. The original developer path stays as the default so
# existing runs are unaffected; set ALLOCINE_DATA_DIR to run the notebook elsewhere.
data_dir = os.environ.get("ALLOCINE_DATA_DIR", "/Users/ilan/big-data-airflow-project/data")

In [78]:
# Reuse the active Spark session if one exists, otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("EDA with Spark")
    .getOrCreate()
)

In [118]:
# Parquet files carry their own schema, so the CSV-only `header` / `inferSchema`
# options were removed: they have no effect on a Parquet read.
df = spark.read.parquet(os.path.join(data_dir, "allocine_movies.parquet"))

In [80]:
# Inspect column names and types as loaded from the Parquet file.
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Release Date: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Actors: string (nullable = true)
 |-- Press Rating: string (nullable = true)
 |-- Spectator Rating: string (nullable = true)
 |-- Synopsis: string (nullable = true)


In [81]:
# Preview the raw rows (show() prints the first 20 rows, truncating long cells).
df.show()

+--------------------+--------+------+---------------+------------------+--------------------+------------+----------------+--------------------+
|               Title|Duration| Genre|   Release Date|          Director|              Actors|Press Rating|Spectator Rating|            Synopsis|
+--------------------+--------+------+---------------+------------------+--------------------+------------+----------------+--------------------+
|The Dark Knight, ...|2h 32min|Action|            N/A| Christopher Nolan|Christian Bale, P...|         4,0|             4,5|Batman entreprend...|
|           Gladiator|2h 35min|Action|            N/A|      Ridley Scott|Russell Crowe, Jo...|         4,3|             4,5|Le général romain...|
|Spider-Man : Acro...|2h 21min|Action|            N/A|Joaquim Dos Santos|Stéphane Bak, Sha...|         4,1|             4,4|Après avoir retro...|
|Spider-Man : New ...|1h 57min|Action|            N/A|   Bob Persichetti|        Jake Johnson|         4,2|             4,4|

In [82]:
# Row count triggers a Spark job; the column count is read from the schema.
n_rows, n_columns = df.count(), len(df.columns)
print("The shape of the allocine dataset is ", (n_rows, n_columns))

The shape of the allocine dataset is  (3379, 9)


In [83]:
# Summary statistics per column. Every column is still a string at this point,
# so mean/stddev come out NULL for most of them.
df.describe().show()

+-------+--------------------+--------+-------+------------+--------------+--------------------+------------+----------------+--------------------+
|summary|               Title|Duration|  Genre|Release Date|      Director|              Actors|Press Rating|Spectator Rating|            Synopsis|
+-------+--------------------+--------+-------+------------+--------------+--------------------+------------+----------------+--------------------+
|  count|                3379|    3378|   3379|        3379|          3376|                3372|        3374|            3379|                3376|
|   mean|               796.0|    NULL|   NULL|        NULL|          NULL|                NULL|        NULL|            NULL|                NULL|
| stddev|   874.0533164515766|    NULL|   NULL|        NULL|          NULL|                NULL|        NULL|            NULL|                NULL|
|    min|                 '71|0h 06min| Action| 1 août 2001|  Costa-Gavras| Anémone, Josiane...|         1,3|   

In [84]:
# Columns removed before further processing.
columns_to_drop = ["Release Date", "Director"]
df = df.drop(*columns_to_drop)

# Renaming columns to match the Netflix dataset

In [85]:
# Duration -> Runtime and Synopsis -> Summary, applied as one chained expression.
df = (
    df
    .withColumnRenamed("Duration", "Runtime")
    .withColumnRenamed("Synopsis", "Summary")
)

# Dealing with missing values

In [86]:
# One aggregate per column: how many rows hold a NULL or a NaN in that column.
null_or_nan_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in df.columns
]
missing_values = df.select(null_or_nan_counts)
missing_values.show()

+-----+-------+-----+------+------------+----------------+-------+
|Title|Runtime|Genre|Actors|Press Rating|Spectator Rating|Summary|
+-----+-------+-----+------+------------+----------------+-------+
|    0|      1|    0|     7|           5|               0|      3|
+-----+-------+-----+------+------------+----------------+-------+


#### Runtime column

In [87]:
# Discard rows whose Runtime is missing; the other columns are left untouched.
df = df.dropna(how="any", subset=["Runtime"])

In [88]:
def convert_runtime_to_interval(runtime):
    """Bucket a runtime string such as ``"2h 32min"`` into a duration label.

    Args:
        runtime: Text with an hour part suffixed by ``h`` and/or a minute part
            suffixed by ``min`` (``"2h 32min"``, ``"2h32min"``, ``"45min"``,
            ``"2h"``). ``None`` is accepted.

    Returns:
        ``'< 30 minutes'``, ``'30 - 60 mins'``, ``'1-2 hour'`` (up to and
        including exactly two hours) or ``'> 2 hrs'``. ``None`` when ``runtime``
        is ``None`` or cannot be parsed, so a malformed row produces a null
        instead of aborting the whole Spark job.
    """
    if runtime is None:
        return None

    # Turn the unit markers into separators so "2h32min" and "2h 32min" tokenize alike.
    tokens = runtime.replace('min', ' ').replace('h', ' ').split()
    try:
        values = [int(token) for token in tokens]
    except ValueError:
        return None

    if len(values) == 2:
        hours, minutes = values
    elif len(values) == 1:
        # A single number: the presence of an "h" marker decides hours vs. minutes.
        hours, minutes = (values[0], 0) if 'h' in runtime else (0, values[0])
    else:
        return None

    # Whole minutes keep the boundary comparisons exact (no float rounding).
    total_minutes = hours * 60 + minutes
    if total_minutes > 120:
        return '> 2 hrs'
    if total_minutes < 30:
        return '< 30 minutes'
    if total_minutes < 60:
        return '30 - 60 mins'
    return '1-2 hour'
    

In [89]:
# Wrap the Python bucketing function as a Spark UDF that yields a string column.
convert_runtime_udf = udf(f=convert_runtime_to_interval, returnType=StringType())

In [90]:
# Overwrite Runtime with its duration bucket label.
runtime_bucket = convert_runtime_udf(col("Runtime"))
df = df.withColumn("Runtime", runtime_bucket)

In [91]:
# Check that Runtime now holds bucket labels instead of raw durations.
df.show()

+--------------------+--------+------+--------------------+------------+----------------+--------------------+
|               Title| Runtime| Genre|              Actors|Press Rating|Spectator Rating|             Summary|
+--------------------+--------+------+--------------------+------------+----------------+--------------------+
|The Dark Knight, ...| > 2 hrs|Action|Christian Bale, P...|         4,0|             4,5|Batman entreprend...|
|           Gladiator| > 2 hrs|Action|Russell Crowe, Jo...|         4,3|             4,5|Le général romain...|
|Spider-Man : Acro...| > 2 hrs|Action|Stéphane Bak, Sha...|         4,1|             4,4|Après avoir retro...|
|Spider-Man : New ...|1-2 hour|Action|        Jake Johnson|         4,2|             4,4|SPIDER-MAN : NEW ...|
|Indiana Jones et ...| > 2 hrs|Action|Harrison Ford, Se...|         4,4|             4,4|L'archéologue ave...|
|              Matrix| > 2 hrs|Action|Keanu Reeves, Lau...|         3,4|             4,4|Programmeur anony...|
|

#### Rating column

We are going to merge the spectator rating and the press rating into a single column called `Rating`.

In [92]:
from pyspark.sql.functions import regexp_replace

# Both rating columns are strings with a decimal comma (e.g. "4,0"); "--" is treated
# as a missing value. Apply the same three steps to each column: comma -> dot,
# "--" -> NULL, cast to float.
for rating_column in ("Press Rating", "Spectator Rating"):
    decimal_text = regexp_replace(col(rating_column), ",", ".")
    without_placeholder = when(decimal_text == "--", None).otherwise(decimal_text)
    df = df.withColumn(rating_column, without_placeholder.cast(FloatType()))

In [99]:
from pyspark.sql.functions import mean

# Compute both column means in a single aggregation (one Spark job instead of two).
# An aggregate without groupBy always yields exactly one row.
rating_means = df.select(
    mean(col("Press Rating")).alias("mean_press"),
    mean(col("Spectator Rating")).alias("mean_spectator"),
).first()
mean_press = rating_means["mean_press"]
mean_spectator = rating_means["mean_spectator"]

# Replace missing ratings with the mean of their own column.
df = df.na.fill({"Press Rating": mean_press, "Spectator Rating": mean_spectator})

In [103]:
# Rating = average of press and spectator ratings, rounded to one decimal;
# the two source columns are dropped afterwards.
average_rating = (col("Press Rating") + col("Spectator Rating")) / 2
df = (
    df
    .withColumn("Rating", spark_round(average_rating, 1))
    .drop("Press Rating", "Spectator Rating")
)

In [104]:
# Confirm the merged Rating column replaced the two source rating columns.
df.show()

+--------------------+--------+------+--------------------+--------------------+------+
|               Title| Runtime| Genre|              Actors|             Summary|Rating|
+--------------------+--------+------+--------------------+--------------------+------+
|The Dark Knight, ...| > 2 hrs|Action|Christian Bale, P...|Batman entreprend...|   4.3|
|           Gladiator| > 2 hrs|Action|Russell Crowe, Jo...|Le général romain...|   4.4|
|Spider-Man : Acro...| > 2 hrs|Action|Stéphane Bak, Sha...|Après avoir retro...|   4.3|
|Spider-Man : New ...|1-2 hour|Action|        Jake Johnson|SPIDER-MAN : NEW ...|   4.3|
|Indiana Jones et ...| > 2 hrs|Action|Harrison Ford, Se...|L'archéologue ave...|   4.4|
|              Matrix| > 2 hrs|Action|Keanu Reeves, Lau...|Programmeur anony...|   3.9|
|              Ip Man|1-2 hour|Action|Donnie Yen, Simon...|Film biographique...|   4.2|
|Les Aventuriers d...|1-2 hour|Action|Karen Allen, Paul...|1936. L'aventurie...|   4.5|
|   Top Gun: Maverick| > 2 hrs|A

#### Transform the Genre column: create one column per genre, set to 1 if the movie belongs to that genre and 0 otherwise

In [112]:
from pyspark.sql.functions import split

# Genres that each get their own indicator column.
# NOTE(review): the grouped Genre output saved in this notebook shows French labels
# (e.g. "Drame", "Policier"); of the names below only "Action" and "Thriller"
# appear there verbatim.
genres = [
    "Action",
    "Adventure",
    "Drama",
    "Sci-Fi",
    "Crime",
    "Thriller",
    "Comedy",
    "Biography",
    "Documentary",
]

# Genre is a ", "-separated string; turn it into an array of labels.
df = df.withColumn("Genre", split(df["Genre"], ", "))

In [115]:
from pyspark.sql.functions import array_contains

# The Genre values in this dataset are French ("Drame", "Policier", "Comédie", ...),
# so matching only the English column name leaves every indicator except Action and
# Thriller at 0. Each English genre therefore also accepts its French equivalent.
# NOTE(review): "Comédie dramatique" and "Historique" are deliberately left unmapped;
# decide which English genre, if any, they should count towards.
FRENCH_GENRE_LABELS = {
    "Adventure": ["Aventure"],
    "Drama": ["Drame"],
    "Sci-Fi": ["Science Fiction"],
    "Crime": ["Policier"],
    "Comedy": ["Comédie"],
    "Biography": ["Biopic"],
    "Documentary": ["Documentaire"],
}

for genre in genres:
    accepted_labels = [genre] + FRENCH_GENRE_LABELS.get(genre, [])
    # 1 when the Genre array holds the English name or any of its French equivalents.
    belongs_to_genre = array_contains(col("Genre"), accepted_labels[0])
    for label in accepted_labels[1:]:
        belongs_to_genre = belongs_to_genre | array_contains(col("Genre"), label)
    df = df.withColumn(genre, belongs_to_genre.cast("integer"))

In [116]:
# Preview the frame with the per-genre indicator columns.
# NOTE(review): the saved output below still lists Duration / Press Rating / Synopsis,
# so it came from a freshly reloaded df rather than the cleaned one — re-run the
# notebook top to bottom to refresh it.
df.show()

+--------------------+--------+--------+---------------+------------------+--------------------+------------+----------------+--------------------+------+---------+-----+------+-----+--------+------+---------+-----------+
|               Title|Duration|   Genre|   Release Date|          Director|              Actors|Press Rating|Spectator Rating|            Synopsis|Action|Adventure|Drama|Sci-Fi|Crime|Thriller|Comedy|Biography|Documentary|
+--------------------+--------+--------+---------------+------------------+--------------------+------------+----------------+--------------------+------+---------+-----+------+-----+--------+------+---------+-----------+
|The Dark Knight, ...|2h 32min|[Action]|            N/A| Christopher Nolan|Christian Bale, P...|         4,0|             4,5|Batman entreprend...|     1|        0|    0|     0|    0|       0|     0|        0|          0|
|           Gladiator|2h 35min|[Action]|            N/A|      Ridley Scott|Russell Crowe, Jo...|         4,3|   

In [117]:
# Number of movies per Genre value, most frequent first.
genre_counts = df.groupBy("Genre").count()
genre_counts.orderBy(col("count").desc()).show()

+--------------------+-----+
|               Genre|count|
+--------------------+-----+
|           [Romance]|  200|
|          [Thriller]|  200|
|           [Famille]|  200|
|        [Historique]|  200|
|           [Comédie]|  200|
|       [Fantastique]|  200|
|[Comédie dramatique]|  200|
|          [Policier]|  200|
|         [Animation]|  200|
|            [Biopic]|  200|
|   [Science Fiction]|  200|
|             [Drame]|  200|
|            [Action]|  200|
|           [Musical]|  199|
| [Epouvante-horreur]|  195|
|           [Western]|  193|
|            [Guerre]|  192|
+--------------------+-----+


+-----------+-----+
|      Genre|count|
+-----------+-----+
|      Crime|  200|
|    Romance|  200|
|   Thriller|  200|
|    Famille|  200|
|      Drama|  200|
|Documentary|  200|
|    Comédie|  200|
|Fantastique|  200|
|  Animation|  200|
|     Biopic|  200|
|     Comedy|  200|
|     Action|  200|
|     Sci-Fi|  200|
|    Musical|  199|
|     Horror|  195|
|    Western|  193|
|     Guerre|  192|
+-----------+-----+
