<h1>Imports for Amazon Dataset Cleaning</h1>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, lower, when
from pyspark.sql.types import IntegerType

<h1>Initialize SparkSession</h1>

In [2]:
spark = SparkSession.builder.appName("AmazonDataCleaning").getOrCreate()

<h1>Load the Dataset</h1>

In [3]:
df = spark.read.csv("amazon prime movies.csv",
                    header = True, 
                    inferSchema = True,
                    escape = ",",
                   mode = "PERMISSIVE")
print("Dataset Schema")
df.printSchema()

Dataset Schema
root
 |-- Movie Name: string (nullable = true)
 |-- Language: string (nullable = true)
 |-- IMDb Rating: string (nullable = true)
 |-- Running Time: string (nullable = true)
 |-- Year of Release: string (nullable = true)
 |-- Maturity Rating: string (nullable = true)
 |-- Plot: string (nullable = true)



<h1><b>----- Data Cleaning -------</b></h1>

<h1>Renaming the Columns</h1>

In [4]:
for column in df.columns:
    new_col = column.strip().lower().replace(" ","_")
    df = df.withColumnRenamed(column, new_col)

<h1>Converting running_time (hr and min) data into minutes</h1>

In [5]:
df = df.withColumn(
    "hours",
    when(
        regexp_extract(col("running_time"), r"(\d+)\s*h", 1) !="",
        regexp_extract(col("running_time"), r"(\d+)\s*h", 1).cast(IntegerType())
    ).otherwise(0)
)
df = df.withColumn(
    "minutes", 
    when(
        regexp_extract(col("running_time"), r"(\d+)\s*min", 1) !="",
        regexp_extract(col("running_time"), r"(\d+)\s*min", 1).cast(IntegerType())
    ).otherwise(0)
)
df = df.withColumn("total_running_time_minutes",(col("hours")*60+col("minutes")))
df = df.drop("hours")
df = df.drop("minutes")
df = df.drop("plot")
df = df.drop("running_time")

<h1>Replacing None values of imdb_rating with 0</h1>

In [6]:
df.show()

+--------------------+--------+-----------+---------------+---------------+--------------------------+
|          movie_name|language|imdb_rating|year_of_release|maturity_rating|total_running_time_minutes|
+--------------------+--------+-----------+---------------+---------------+--------------------------+
|          John Rambo|   Tamil|          7|           2008|            18+|                        86|
|        American Pie| English|          7|           1999|            18+|                        95|
|           Bombshell| English|        6.8|           2019|            18+|                       108|
|          Love Birds|   Tamil|        5.1|           1996|            All|                       160|
|               Hippi|  Telugu|          5|           2019|            18+|                       140|
|Honey Bunny As Su...|   Tamil|       None|           2018|             7+|                        49|
|               Ayyaa|   Tamil|        5.7|           2005|            Al

In [7]:
df = df.withColumn(
    "imdb_ratings",
    when((col("imdb_rating")=='None')|(col("imdb_rating")==''),None
    ).otherwise(col("imdb_rating"))
)


In [8]:
df = df.na.fill('0')
df = df.drop("imdb_rating")


In [9]:
print("Revised Schema")
df.printSchema()

Revised Schema
root
 |-- movie_name: string (nullable = false)
 |-- language: string (nullable = false)
 |-- year_of_release: string (nullable = false)
 |-- maturity_rating: string (nullable = false)
 |-- total_running_time_minutes: integer (nullable = true)
 |-- imdb_ratings: string (nullable = false)



In [10]:
df.show()

+--------------------+--------+---------------+---------------+--------------------------+------------+
|          movie_name|language|year_of_release|maturity_rating|total_running_time_minutes|imdb_ratings|
+--------------------+--------+---------------+---------------+--------------------------+------------+
|          John Rambo|   Tamil|           2008|            18+|                        86|           7|
|        American Pie| English|           1999|            18+|                        95|           7|
|           Bombshell| English|           2019|            18+|                       108|         6.8|
|          Love Birds|   Tamil|           1996|            All|                       160|         5.1|
|               Hippi|  Telugu|           2019|            18+|                       140|           5|
|Honey Bunny As Su...|   Tamil|           2018|             7+|                        49|           0|
|               Ayyaa|   Tamil|           2005|            All| 

In [11]:
df.write.mode("overwrite").csv("C:/Users/jayshgup/Downloads/OTT RECOMNDATION DATA ANALYTICS/amazonDatasetCleaning", header=True)