In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [6]:
!pip install -q findspark

In [7]:
import findspark
findspark.init()
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName('MovieTrees_avgRating').config('spark.ui.port', '4050').getOrCreate()

**Loading Data**

In [9]:
training_data = spark.read.load("/content/gdrive/MyDrive/Colab Notebooks/CSC 522/training_avgRating.csv",
                     format="csv", inferSchema=True, header=True)
validation_data = spark.read.load("/content/gdrive/MyDrive/Colab Notebooks/CSC 522/validation_avgRating.csv",
                     format="csv", inferSchema=True, header=True)
testing_data = spark.read.load("/content/gdrive/MyDrive/Colab Notebooks/CSC 522/testing_avgRating.csv",
                     format="csv", inferSchema=True, header=True)

In [34]:
training_data.printSchema()

root
 |-- Actor1Index: double (nullable = true)
 |-- Actor3Index: double (nullable = true)
 |-- Genre2Index: double (nullable = true)
 |-- Genre3Index: double (nullable = true)
 |-- LanguageIndex: double (nullable = true)
 |-- Writer1Index: double (nullable = true)
 |-- YearIndex: double (nullable = true)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)



**One-Hot Encoding**

In [10]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder)

In [11]:
# Encoding
actors_encoder = OneHotEncoder(inputCols=['Actor1Index', 'Actor3Index'], outputCols=['Actor1Vec', 'Actor3Vec'])
genres_encoder = OneHotEncoder(inputCols=['Genre2Index','Genre3Index'], outputCols=['Genre2Vec','Genre3Vec'])
language_encoder = OneHotEncoder(inputCol='LanguageIndex', outputCol='LanguageVec')
writers_encoder = OneHotEncoder(inputCol='Writer1Index', outputCol='Writer1Vec')
year_encoder = OneHotEncoder(inputCol='YearIndex', outputCol='YearVec')

In [12]:
assembler_encoder = VectorAssembler(inputCols=['Actor1Vec',
 'Actor3Vec',
 'Genre2Vec',
 'Genre3Vec',
 'LanguageVec',
 'Writer1Vec',
 'YearVec',
 'avg(Duration)'],outputCol='features')

**Create the Random Forests and Gradient Boosted Trees Models then fit the data**

In [13]:
from pyspark.ml.regression import (DecisionTreeRegressor, GBTRegressor, RandomForestRegressor)

In [14]:
# Hyperparameters
dtc_maxDepth = 5
dtc_maxBins = 32
rfc_maxDepth = 5
rfc_numTrees = 20
rfc_subsamplingRate = 1.0
gbt_maxDepth = 5
gbt_maxBins = 32
gbt_subsamplingRate = 1.0
gbt_lossType = 'squared'
gbt_maxIter = 40

In [16]:
dtr = DecisionTreeRegressor(featuresCol='features', labelCol='avg(Rating)', maxDepth=dtc_maxDepth, maxBins=dtc_maxBins)
rfr = RandomForestRegressor(featuresCol='features', labelCol='avg(Rating)', maxDepth=rfc_maxDepth, numTrees=rfc_numTrees, subsamplingRate=rfc_subsamplingRate)
gbr = GBTRegressor(featuresCol='features', labelCol='avg(Rating)', maxDepth=gbt_maxDepth, maxBins=gbt_maxBins, subsamplingRate=gbt_subsamplingRate, lossType=gbt_lossType, maxIter=gbt_maxIter)

In [17]:
# Setting ML Pipeline
from pyspark.ml import Pipeline
dtr_pipe = Pipeline(stages=[actors_encoder, 
                            genres_encoder, 
                            language_encoder, 
                            writers_encoder, 
                            year_encoder,
                            assembler_encoder,
                            dtr])
rfr_pipe = Pipeline(stages=[actors_encoder, 
                            genres_encoder, 
                            language_encoder, 
                            writers_encoder, 
                            year_encoder,
                            assembler_encoder,
                            rfr])
gbr_pipe = Pipeline(stages=[actors_encoder, 
                            genres_encoder, 
                            language_encoder, 
                            writers_encoder, 
                            year_encoder,
                            assembler_encoder,
                            gbr])

In [18]:
# Train the models
dtr_model = dtr_pipe.fit(training_data)

In [19]:
rfr_model = rfr_pipe.fit(training_data)

In [20]:
gbr_model = gbr_pipe.fit(training_data)

In [21]:
# Training Prediction
dtr_train_pred = dtr_model.transform(training_data)
rfr_train_pred = rfr_model.transform(training_data)
gbr_train_pred = gbr_model.transform(training_data)

In [22]:
# Validation Prediction
dtr_vali_pred = dtr_model.transform(validation_data)
rfr_vali_pred = rfr_model.transform(validation_data)
gbr_vali_pred = gbr_model.transform(validation_data)

In [23]:
# Testing Prediction
dtr_test_pred = dtr_model.transform(testing_data)
rfr_test_pred = rfr_model.transform(testing_data)
gbr_test_pred = gbr_model.transform(testing_data)

In [24]:
dtr_vali_pred.select('prediction', 'avg(Rating)').show()

+------------------+------------------+
|        prediction|       avg(Rating)|
+------------------+------------------+
| 3.271430864039803|3.5068171133051247|
|2.7181973527526435|3.0462850182704018|
|2.7181973527526435| 3.289972899728997|
|2.9896782762211562|3.6488453211450564|
|  3.06310389718217|2.9618320610687023|
| 3.460523056888338| 3.242487588189182|
|2.9896782762211562| 3.422812618364807|
| 3.460523056888338|3.3658757062146893|
|2.7181973527526435| 3.008522727272727|
|  3.06310389718217| 2.324561403508772|
|2.7181973527526435| 2.247311827956989|
|  3.06310389718217|  3.68073006932409|
| 3.460523056888338|3.5166927490871154|
| 3.460523056888338|  3.11860300797416|
|2.9896782762211562| 2.345646437994723|
| 3.271430864039803|3.4093439028503965|
|1.7520661157024793|3.5155555555555558|
| 3.271430864039803| 3.471118780353299|
|2.5259937884596084|  3.40037493304767|
| 3.271430864039803|2.1891319689484825|
+------------------+------------------+
only showing top 20 rows



**Evaluate and campare the results**

In [25]:
from pyspark.ml.evaluation import (RegressionEvaluator, RankingEvaluator)

In [39]:
my_rmse_eval = RegressionEvaluator(labelCol='avg(Rating)', predictionCol='prediction')
#my_map_eval

In [27]:
dtr_train_rmse = my_rmse_eval.evaluate(dtr_train_pred)
rfr_train_rmse = my_rmse_eval.evaluate(rfr_train_pred)
gbr_train_rmse = my_rmse_eval.evaluate(gbr_train_pred)

In [36]:
dtr_train_rmse

0.4222156236029416

In [37]:
rfr_train_rmse

0.4298285746025916

In [38]:
gbr_train_rmse

0.357348147371972

In [40]:
dtr_vali_rmse = rmse_eval.evaluate(dtr_vali_pred)
rfr_vali_rmse = rmse_eval.evaluate(rfr_vali_pred)
gbr_vali_rmse = rmse_eval.evaluate(gbr_vali_pred)

Py4JJavaError: ignored

In [None]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had training and validation RMSE of: {0:2.2f}, {0:2.2f}'.format(dtr_train_rmse, dtr_vali_rmse))
print('-'*80)
print('A random forest ensemble had training and validation RMSE of: {0:2.2f}, {0:2.2f}'.format(rfr_train_rmse, rfr_vali_rmse))
print('-'*80)
print('A ensemble using GBT had training and validation RMSE of: {0:2.2f}, {0:2.2f}'.format(gbr_train_rmse, gbr_vali_rmse))