In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, isnan, count, udf
from pyspark.sql.types import StringType

from src.utils.s3_manager import S3Manager
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os 
import nltk


In [2]:
# Root folder holding the project's data files. Set the ALLOCINE_DATA_DIR
# environment variable to point somewhere else; the original local path is
# kept as the default so existing runs behave the same.
data_dir = os.environ.get("ALLOCINE_DATA_DIR", "/Users/ilan/big-data-airflow-project/data")

In [3]:
# Spark entry point used by every cell below (application name: "EDA with Spark").
spark = (
    SparkSession.builder
    .appName("EDA with Spark")
    .getOrCreate()
)

24/05/18 20:16:31 WARN Utils: Your hostname, Ordinateur-portable-de-Ilan.local resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
24/05/18 20:16:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/18 20:16:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/18 20:16:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load the scraped Allociné movies. Parquet files carry their own schema, so the
# CSV-only reader options (`header`, `inferSchema`) previously passed here had no
# effect and are dropped.
df = spark.read.parquet(os.path.join(data_dir, "allocine_movies.parquet"))

                                                                                

In [6]:
# Print the column names and types of the loaded frame.
df.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Release Date: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Actors: string (nullable = true)
 |-- Press Rating: string (nullable = true)
 |-- Spectator Rating: string (nullable = true)
 |-- Synopsis: string (nullable = true)


In [5]:
# Preview the first rows of the raw dataset.
df.show()

                                                                                

+--------------------+--------+------+---------------+------------------+--------------------+------------+----------------+--------------------+
|               Title|Duration| Genre|   Release Date|          Director|              Actors|Press Rating|Spectator Rating|            Synopsis|
+--------------------+--------+------+---------------+------------------+--------------------+------------+----------------+--------------------+
|The Dark Knight, ...|2h 32min|Action|            N/A| Christopher Nolan|Christian Bale, P...|         4,0|             4,5|Batman entreprend...|
|           Gladiator|2h 35min|Action|            N/A|      Ridley Scott|Russell Crowe, Jo...|         4,3|             4,5|Le général romain...|
|Spider-Man : Acro...|2h 21min|Action|            N/A|Joaquim Dos Santos|Stéphane Bak, Sha...|         4,1|             4,4|Après avoir retro...|
|Spider-Man : New ...|1h 57min|Action|            N/A|   Bob Persichetti|        Jake Johnson|         4,2|             4,4|

In [6]:
# Report the dataset size as a (rows, columns) tuple.
n_rows = df.count()
n_cols = len(df.columns)
print("The shape of the allocine dataset is ", (n_rows, n_cols))

The shape of the allocine dataset is  (3379, 9)


In [7]:
# Summary statistics per column. Every column is loaded as a string, so the
# mean/stddev rows are mostly NULL — NOTE(review): a value there presumably
# comes only from entries that happen to parse as numbers; confirm.
df.describe().show()

24/05/17 18:46:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 5:>                                                          (0 + 1) / 1]

+-------+--------------------+--------+-------+------------+--------------+--------------------+------------+----------------+--------------------+
|summary|               Title|Duration|  Genre|Release Date|      Director|              Actors|Press Rating|Spectator Rating|            Synopsis|
+-------+--------------------+--------+-------+------------+--------------+--------------------+------------+----------------+--------------------+
|  count|                3379|    3378|   3379|        3379|          3376|                3372|        3374|            3379|                3376|
|   mean|               796.0|    NULL|   NULL|        NULL|          NULL|                NULL|        NULL|            NULL|                NULL|
| stddev|   874.0533164515766|    NULL|   NULL|        NULL|          NULL|                NULL|        NULL|            NULL|                NULL|
|    min|                 '71|0h 06min| Action| 1 août 2001|  Costa-Gavras| Anémone, Josiane...|         1,3|   

                                                                                

In [32]:
# Remove the "Release Date" and "Director" columns.
# NOTE(review): presumably dropped because they are not needed for the comparison
# with the Netflix dataset — confirm.
df = df.drop("Release Date", "Director")

# Renaming columns to match the Netflix dataset

In [7]:
# Duration -> Runtime and Synopsis -> Summary, applied in one chained expression.
df = (
    df.withColumnRenamed("Duration", "Runtime")
      .withColumnRenamed("Synopsis", "Summary")
)

# Dealing with missing values

In [8]:
# One-row frame holding, for each column, the number of entries flagged as
# missing (NULL or NaN). The NaN test is applied to every column, whatever its type.
missing_values = df.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
)
missing_values.show()

+-----+-------+-----+------------+--------+------+------------+----------------+-------+
|Title|Runtime|Genre|Release Date|Director|Actors|Press Rating|Spectator Rating|Summary|
+-----+-------+-----+------------+--------+------+------------+----------------+-------+
|    0|      1|    0|           0|       3|     7|           5|               0|      3|
+-----+-------+-----+------------+--------+------+------------+----------------+-------+


#### Runtime column

In [23]:
# Discard the rows that have no Runtime value before the column is bucketed below.
df = df.dropna(subset=["Runtime"])

In [24]:
def convert_runtime_to_interval(runtime):
    """Bucket a runtime string such as '2h 32min' into a coarse duration label.

    Args:
        runtime: Duration text made of an optional hours part ('<n>h') followed
            by an optional minutes part ('<n>min'), e.g. '0h 06min', '45min'
            or '2h'. May be None.

    Returns:
        One of '< 30 minutes', '30 - 60 mins', '1-2 hour' or '> 2 hrs'.
        Exactly two hours falls in '1-2 hour'. None is returned when the input
        is None, blank or not in the expected format, so that a Spark UDF built
        on this function yields a null value instead of failing the whole job.
    """
    if runtime is None:
        return None

    text = str(runtime).strip().lower()
    # Split on the hour marker; without one, the whole text is the minutes part.
    hours_text, hour_marker, minutes_text = text.partition('h')
    if not hour_marker:
        hours_text, minutes_text = '', text
    hours_text = hours_text.strip()
    minutes_text = minutes_text.replace('min', '').strip()

    if not hours_text and not minutes_text:
        return None
    try:
        # Work in whole minutes so the bucket boundaries are compared exactly.
        total_minutes = 60 * int(hours_text or 0) + int(minutes_text or 0)
    except ValueError:
        return None

    if total_minutes > 120:
        return '> 2 hrs'
    if total_minutes < 30:
        return '< 30 minutes'
    if total_minutes < 60:
        return '30 - 60 mins'
    return '1-2 hour'
    

In [25]:
# Wrap the Python bucketing function as a Spark UDF that returns a string column.
convert_runtime_udf = udf(f=convert_runtime_to_interval, returnType=StringType())

In [28]:
# Replace the raw Runtime text with its duration bucket. The column is overwritten
# in place, so this cell consumes its own output if it is executed a second time.
df = df.withColumn("Runtime", convert_runtime_udf(col("Runtime")))

In [31]:
# Preview the frame after the Runtime column has been bucketed.
df.show()

+--------------------+--------+------+-----------------+------------------+--------------------+------------+----------------+--------------------+----------------+
|               Title| Runtime| Genre|     Release Date|          Director|              Actors|Press Rating|Spectator Rating|             Summary|Runtime Category|
+--------------------+--------+------+-----------------+------------------+--------------------+------------+----------------+--------------------+----------------+
|The Dark Knight, ...| > 2 hrs|Action|              N/A| Christopher Nolan|Christian Bale, P...|         4,0|             4,5|Batman entreprend...|         > 2 hrs|
|           Gladiator| > 2 hrs|Action|              N/A|      Ridley Scott|Russell Crowe, Jo...|         4,3|             4,5|Le général romain...|         > 2 hrs|
|Spider-Man : Acro...| > 2 hrs|Action|              N/A|Joaquim Dos Santos|Stéphane Bak, Sha...|         4,1|             4,4|Après avoir retro...|         > 2 hrs|
|Spider-Ma

#### Rating column

We will merge the spectator rating and the press rating into a single column called `rating`.

In [None]:
# TODO: the rating merge described above is not implemented yet; this cell only
# evaluates the current frame.
df