**Setting up the environment**

In [None]:
# Mount Google Drive at /content/gdrive; the input CSV is read from, and the
# output splits are written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Install the headless OpenJDK 8 package quietly (-qq, stdout discarded).
# JAVA_HOME is pointed at a java-8-openjdk directory further down.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [None]:
# Unpack the downloaded Spark archive into the current working directory.
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [None]:
import os

# Environment variables naming the JDK and the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})

In [None]:
# Install findspark quietly (-q); it is imported in the next cell.
!pip install -q findspark

In [None]:
import findspark
# Make the pyspark package from the Spark distribution importable in this kernel.
findspark.init()
# Last expression of the cell: displays the Spark home directory that was found.
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

**Importing Movie Data**

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session named 'MovieAvgRating';
# the Spark UI port is set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName('MovieAvgRating')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
# Load the movie/rating CSV from the shared drive.
# The first row is used as the header and column types are inferred from the data.
movie_csv_path = "/content/gdrive/Shareddrives/CSC522 Project/Data/Netflix_with_IMDB_with_customerIDcsv.csv"
df = spark.read.csv(movie_csv_path, header=True, inferSchema=True)

In [None]:
# Show the column names and inferred types of the raw table.
df.printSchema()

root
 |-- Actor1: string (nullable = true)
 |-- Actor2: string (nullable = true)
 |-- Actor3: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer Id: integer (nullable = true)
 |-- Director1: string (nullable = true)
 |-- Director2: string (nullable = true)
 |-- Genre1: string (nullable = true)
 |-- Genre2: string (nullable = true)
 |-- Genre3: string (nullable = true)
 |-- Language: string (nullable = true)
 |-- Movie Id11: integer (nullable = true)
 |-- Movie ID12: integer (nullable = true)
 |-- Production Company: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Writer1: string (nullable = true)
 |-- Writer2: string (nullable = true)
 |-- Year Of Release: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Rating: integer (nullable = true)



In [None]:
# Cast the integer ID and year columns to strings so they can be treated as
# categorical values. 'Customer Id' is replaced in place; 'Movie ID' and 'Year'
# are added as new columns next to 'Movie Id11' and 'Year Of Release'.
from pyspark.sql.types import StringType

string_type = StringType()
df_chType = (
    df
    .withColumn("Customer Id", df['Customer Id'].cast(string_type))
    .withColumn("Movie ID", df['Movie Id11'].cast(string_type))
    .withColumn("Year", df['Year Of Release'].cast(string_type))
)

In [None]:
# Confirm the casts: 'Customer Id', 'Movie ID' and 'Year' should now be strings.
df_chType.printSchema()

root
 |-- Actor1: string (nullable = true)
 |-- Actor2: string (nullable = true)
 |-- Actor3: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- Director1: string (nullable = true)
 |-- Director2: string (nullable = true)
 |-- Genre1: string (nullable = true)
 |-- Genre2: string (nullable = true)
 |-- Genre3: string (nullable = true)
 |-- Language: string (nullable = true)
 |-- Movie Id11: integer (nullable = true)
 |-- Movie ID12: integer (nullable = true)
 |-- Production Company: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Writer1: string (nullable = true)
 |-- Writer2: string (nullable = true)
 |-- Year Of Release: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Movie ID: string (nullable = true)
 |-- Year: string (nullable = true)



In [None]:
# Replace NULLs with explicit placeholders so every categorical column that is
# later passed to StringIndexer is NULL-free (StringIndexer's default
# handleInvalid='error' raises on NULL values at transform time).
# People columns (actors, directors, writers) share the 'No Name' placeholder.
my_fill_cols = df_chType.na.fill('No Name',subset=['Actor1', 'Actor2', 'Actor3', 'Director1', 'Director2', 'Writer1', 'Writer2'])
# Genre1 is included with Genre2/Genre3: it is indexed later as well, so it
# needs the same NULL protection as the other genre columns.
my_fill_cols = my_fill_cols.na.fill('No Genre',subset=['Genre1', 'Genre2', 'Genre3'])
my_fill_cols = my_fill_cols.na.fill('No Country',subset=['Country'])
my_fill_cols = my_fill_cols.na.fill('No Language',subset=['Language'])
my_fill_cols = my_fill_cols.na.fill('No Company',subset=['Production Company'])
# Duration is numeric, so a missing value becomes 0.
my_fill_cols = my_fill_cols.na.fill(0,subset=['Duration'])
# 'Year' is the string copy of 'Year Of Release', so it takes a string placeholder.
my_fill_cols = my_fill_cols.na.fill('No Year',subset=['Year'])

In [None]:
# Keep the movie key, the descriptive attributes and the two numeric columns.
# 'Customer Id' and 'Title' are intentionally left out.
kept_columns = [
    'Movie ID',
    'Actor1', 'Actor2', 'Actor3',
    'Country',
    'Director1', 'Director2',
    'Genre1', 'Genre2', 'Genre3',
    'Language',
    'Production Company',
    'Writer1', 'Writer2',
    'Year',
    'Duration',
    'Rating',
]
my_cols = my_fill_cols.select(kept_columns)

In [None]:
# Collapse the rows to one per combination of movie key and descriptive
# attributes, averaging the remaining numeric columns. The result has
# 'avg(Duration)' and 'avg(Rating)' columns.
movie_keys = [
    'Movie ID',
    'Actor1', 'Actor2', 'Actor3',
    'Country',
    'Director1', 'Director2',
    'Genre1', 'Genre2', 'Genre3',
    'Language',
    'Production Company',
    'Writer1', 'Writer2',
    'Year',
]
grouped_cols = my_cols.groupBy(*movie_keys).mean()

In [None]:
# Preview the per-movie aggregates, including avg(Duration) and avg(Rating).
grouped_cols.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+---------+--------+--------+--------------------+--------------------+-------------------+--------------------+----+-------------+------------------+
|Movie ID|              Actor1|              Actor2|              Actor3|             Country|           Director1|Director2|   Genre1|  Genre2|  Genre3|            Language|  Production Company|            Writer1|             Writer2|Year|avg(Duration)|       avg(Rating)|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+---------+--------+--------+--------------------+--------------------+-------------------+--------------------+----+-------------+------------------+
|     843|Hannah Taylor Gordon|        Talya Gordon|      Lindsay Duncan|                  UK|     Patricia Rozema|  No Name|   Comedy|   Drama| Romance|             English|A

**Working with Categorical Columns**

In [None]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [None]:
# Map every categorical string column to a numeric label index.
# Output columns are the input name plus an 'Index' suffix, except
# 'Production Company', which becomes 'PCIndex'.
actor_cols = ['Actor1', 'Actor2', 'Actor3']
director_cols = ['Director1', 'Director2']
genre_cols = ['Genre1', 'Genre2', 'Genre3']
writer_cols = ['Writer1', 'Writer2']

actors_indexer = StringIndexer(
    inputCols=actor_cols,
    outputCols=[name + 'Index' for name in actor_cols],
)
country_indexer = StringIndexer(inputCol='Country', outputCol='CountryIndex')
directors_indexer = StringIndexer(
    inputCols=director_cols,
    outputCols=[name + 'Index' for name in director_cols],
)
genres_indexer = StringIndexer(
    inputCols=genre_cols,
    outputCols=[name + 'Index' for name in genre_cols],
)
language_indexer = StringIndexer(inputCol='Language', outputCol='LanguageIndex')
pc_indexer = StringIndexer(inputCol='Production Company', outputCol='PCIndex')
writers_indexer = StringIndexer(
    inputCols=writer_cols,
    outputCols=[name + 'Index' for name in writer_cols],
)
year_indexer = StringIndexer(inputCol='Year', outputCol='YearIndex')

In [None]:
from pyspark.ml import Pipeline

# Chain all string indexers into one pipeline so they are fitted and applied together.
indexer_stages = [
    actors_indexer,
    country_indexer,
    directors_indexer,
    genres_indexer,
    language_indexer,
    pc_indexer,
    writers_indexer,
    year_indexer,
]
pipeline_indexer = Pipeline(stages=indexer_stages)

In [None]:
# Fit the indexers on the per-movie table, then append the *Index columns to it.
indexer_model = pipeline_indexer.fit(grouped_cols)
my_indexer_cols = indexer_model.transform(grouped_cols)

In [None]:
# Inspect the schema after indexing: each categorical column now has a matching double-typed *Index column.
my_indexer_cols.printSchema()

root
 |-- Movie ID: string (nullable = true)
 |-- Actor1: string (nullable = false)
 |-- Actor2: string (nullable = false)
 |-- Actor3: string (nullable = false)
 |-- Country: string (nullable = false)
 |-- Director1: string (nullable = false)
 |-- Director2: string (nullable = false)
 |-- Genre1: string (nullable = true)
 |-- Genre2: string (nullable = false)
 |-- Genre3: string (nullable = false)
 |-- Language: string (nullable = false)
 |-- Production Company: string (nullable = false)
 |-- Writer1: string (nullable = false)
 |-- Writer2: string (nullable = false)
 |-- Year: string (nullable = false)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)
 |-- Actor1Index: double (nullable = false)
 |-- Actor2Index: double (nullable = false)
 |-- Actor3Index: double (nullable = false)
 |-- CountryIndex: double (nullable = false)
 |-- Director1Index: double (nullable = false)
 |-- Director2Index: double (nullable = false)
 |-- Genre1Index: double (null

**Feature Selection**

In [None]:
# Assemble the 14 categorical index columns into a single 'features' vector column.
# Positions in the vector follow the order of this list.
index_feature_cols = [
    'Actor1Index', 'Actor2Index', 'Actor3Index',
    'CountryIndex',
    'Director1Index', 'Director2Index',
    'Genre1Index', 'Genre2Index', 'Genre3Index',
    'LanguageIndex',
    'PCIndex',
    'Writer1Index', 'Writer2Index',
    'YearIndex',
]
assembler_index = VectorAssembler(inputCols=index_feature_cols, outputCol='features')

In [None]:
# Append the assembled 'features' vector column to the indexed table.
my_feature_cols = assembler_index.transform(my_indexer_cols)

In [None]:
# Inspect the schema after assembling; a 'features' column has been added.
my_feature_cols.printSchema()

root
 |-- Movie ID: string (nullable = true)
 |-- Actor1: string (nullable = false)
 |-- Actor2: string (nullable = false)
 |-- Actor3: string (nullable = false)
 |-- Country: string (nullable = false)
 |-- Director1: string (nullable = false)
 |-- Director2: string (nullable = false)
 |-- Genre1: string (nullable = true)
 |-- Genre2: string (nullable = false)
 |-- Genre3: string (nullable = false)
 |-- Language: string (nullable = false)
 |-- Production Company: string (nullable = false)
 |-- Writer1: string (nullable = false)
 |-- Writer2: string (nullable = false)
 |-- Year: string (nullable = false)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)
 |-- Actor1Index: double (nullable = false)
 |-- Actor2Index: double (nullable = false)
 |-- Actor3Index: double (nullable = false)
 |-- CountryIndex: double (nullable = false)
 |-- Director1Index: double (nullable = false)
 |-- Director2Index: double (nullable = false)
 |-- Genre1Index: double (null

In [None]:
# List all column names, ending with the new 'features' column.
my_feature_cols.columns

['Movie ID',
 'Actor1',
 'Actor2',
 'Actor3',
 'Country',
 'Director1',
 'Director2',
 'Genre1',
 'Genre2',
 'Genre3',
 'Language',
 'Production Company',
 'Writer1',
 'Writer2',
 'Year',
 'avg(Duration)',
 'avg(Rating)',
 'Actor1Index',
 'Actor2Index',
 'Actor3Index',
 'CountryIndex',
 'Director1Index',
 'Director2Index',
 'Genre1Index',
 'Genre2Index',
 'Genre3Index',
 'LanguageIndex',
 'PCIndex',
 'Writer1Index',
 'Writer2Index',
 'YearIndex',
 'features']

In [None]:
# Keep only the assembled feature vector and the target (average rating) for feature selection.
feature_data = my_feature_cols.select('features', 'avg(Rating)')

In [None]:
# Chi-square feature selection: keep 7 of the 14 assembled features, written
# to a new 'selectedFeatures' vector column.
# NOTE(review): the label 'avg(Rating)' is a continuous average, while
# ChiSqSelector is documented for categorical labels - confirm this is intended.
from pyspark.ml.feature import ChiSqSelector

selector = ChiSqSelector(
    numTopFeatures=7,
    labelCol='avg(Rating)',
    featuresCol='features',
    outputCol='selectedFeatures',
)
selector_model = selector.fit(feature_data)
selected_feature_data = selector_model.transform(feature_data)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
# Last expression of the cell: display the first row with both vectors.
selected_feature_data.head(1)

ChiSqSelector output with top 7 features selected


[Row(features=DenseVector([783.0, 1743.0, 1229.0, 2.0, 1277.0, 0.0, 1.0, 0.0, 1.0, 0.0, 13.0, 42.0, 931.0, 5.0]), avg(Rating)=3.714586439785984, selectedFeatures=DenseVector([783.0, 1229.0, 0.0, 1.0, 0.0, 42.0, 5.0]))]

In [None]:
# Compare 'features' with 'selectedFeatures' on the first five rows to see which vector positions were kept.
selected_feature_data.head(5)

[Row(features=DenseVector([783.0, 1743.0, 1229.0, 2.0, 1277.0, 0.0, 1.0, 0.0, 1.0, 0.0, 13.0, 42.0, 931.0, 5.0]), avg(Rating)=3.714586439785984, selectedFeatures=DenseVector([783.0, 1229.0, 0.0, 1.0, 0.0, 42.0, 5.0])),
 Row(features=DenseVector([502.0, 770.0, 1960.0, 9.0, 424.0, 0.0, 0.0, 1.0, 0.0, 0.0, 813.0, 290.0, 277.0, 0.0]), avg(Rating)=2.848148148148148, selectedFeatures=DenseVector([502.0, 1960.0, 1.0, 0.0, 0.0, 290.0, 0.0])),
 Row(features=DenseVector([1146.0, 235.0, 1506.0, 0.0, 709.0, 0.0, 1.0, 0.0, 1.0, 0.0, 178.0, 1390.0, 932.0, 0.0]), avg(Rating)=2.3627329192546584, selectedFeatures=DenseVector([1146.0, 1506.0, 0.0, 1.0, 0.0, 1390.0, 0.0])),
 Row(features=DenseVector([41.0, 1580.0, 1670.0, 0.0, 26.0, 0.0, 3.0, 0.0, 1.0, 0.0, 1366.0, 184.0, 0.0, 28.0]), avg(Rating)=2.5395894428152492, selectedFeatures=DenseVector([41.0, 1670.0, 0.0, 1.0, 0.0, 184.0, 28.0])),
 Row(features=DenseVector([1314.0, 1139.0, 920.0, 0.0, 1049.0, 0.0, 1.0, 0.0, 1.0, 0.0, 771.0, 1123.0, 0.0, 6.0]), a

In [None]:
# Final modelling table: seven index columns, the average duration and the
# target 'avg(Rating)'. The seven index columns are hard-coded; they correspond
# to the vector positions kept in the recorded ChiSqSelector output.
final_columns = [
    'Actor1Index',
    'Actor3Index',
    'Genre2Index',
    'Genre3Index',
    'LanguageIndex',
    'Writer1Index',
    'YearIndex',
    'avg(Duration)',
    'avg(Rating)',
]
my_final_cols = my_feature_cols.select(final_columns)

In [None]:
# Inspect the schema of the final modelling table (seven index columns plus the two averages).
my_final_cols.printSchema()

root
 |-- Actor1Index: double (nullable = false)
 |-- Actor3Index: double (nullable = false)
 |-- Genre2Index: double (nullable = false)
 |-- Genre3Index: double (nullable = false)
 |-- LanguageIndex: double (nullable = false)
 |-- Writer1Index: double (nullable = false)
 |-- YearIndex: double (nullable = false)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)



**Split into training, validation, and testing sets**

In [None]:
# Split into training, validation and test sets in two steps, both with seed 24:
# first 60% training / 40% hold-out, then the hold-out is split 50/50.
# Row counts in the recorded run: 1343 training, 417 validation, 503 test.
train_and_holdout = my_final_cols.randomSplit([0.6, 0.4], seed=24)
training_data = train_and_holdout[0]
vali_data = train_and_holdout[1]

holdout_halves = vali_data.randomSplit([0.5, 0.5], seed=24)
validation_data = holdout_halves[0]
testing_data = holdout_halves[1]

In [None]:
# Number of rows in the training split.
training_data.count()

1343

In [None]:
# Number of rows in the validation split.
validation_data.count()

417

In [None]:
# Number of rows in the test split.
testing_data.count()

503

In [None]:
# Inspect the training split's schema; the recorded output shows the same columns as the final modelling table.
training_data.printSchema()

root
 |-- Actor1Index: double (nullable = false)
 |-- Actor3Index: double (nullable = false)
 |-- Genre2Index: double (nullable = false)
 |-- Genre3Index: double (nullable = false)
 |-- LanguageIndex: double (nullable = false)
 |-- Writer1Index: double (nullable = false)
 |-- YearIndex: double (nullable = false)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)



In [None]:
# Save the three splits to Google Drive as comma-separated CSV with a header row,
# overwriting any previous output at the same path.
splits_to_save = [
    (training_data, "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/training_avgRating.csv"),
    (validation_data, "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/validation_avgRating.csv"),
    (testing_data, "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/testing_avgRating.csv"),
]
for split_df, target_path in splits_to_save:
    csv_writer = (
        split_df.write
        .format('csv')
        .option('header', True)
        .option('sep', ',')
        .mode('overwrite')
    )
    csv_writer.save(target_path)