In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the CSV
# paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Install headless OpenJDK 8 via apt; -qq plus the stdout redirect keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
# Unpack the downloaded Spark tarball into the current working directory.
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [8]:
import os

# Tell downstream tools where Java and Spark live.
# NOTE(review): both paths assume the Colab apt/tar setup cells — confirm if the environment changes.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})

In [5]:
!pip install -q findspark

In [9]:
import findspark
# Make the unpacked Spark installation importable as `pyspark` from this kernel.
findspark.init()
# Bare last expression: the cell displays the Spark home that findspark located.
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [10]:
from pyspark.sql import SparkSession

# Local-mode session for this notebook; the Spark UI port is pinned to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName('MovieLTrees')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

**Loading Data**

In [11]:
# All three splits are read with the same options: CSV format, column names
# taken from the header row, column types inferred from the data.
csv_options = dict(format="csv", inferSchema=True, header=True)

training_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/training.csv", **csv_options)
validation_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/validation.csv", **csv_options)
testing_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/testing.csv", **csv_options)

**One-Hot Encoding**

In [12]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder)

In [13]:
# Encoding
# One-hot encoders for the categorical index columns that end up in the feature vector.
customer_encoder = OneHotEncoder(
    inputCol='CustomerIndex',
    outputCol='CustomerVec',
)
actors_encoder = OneHotEncoder(
    inputCols=['Actor1Index', 'Actor2Index', 'Actor3Index'],
    outputCols=['Actor1Vec', 'Actor2Vec', 'Actor3Vec'],
)
country_encoder = OneHotEncoder(
    inputCol='CountryIndex',
    outputCol='CountryVec',
)
directors_encoder = OneHotEncoder(
    inputCols=['Director1Index', 'Director2Index'],
    outputCols=['Director1Vec', 'Director2Vec'],
)
genre1_encoder = OneHotEncoder(
    inputCol='Genre1Index',
    outputCol='Genre1Vec',
)
# Disabled encoders, kept for reference; none of their outputs are assembled into 'features'.
#genres_encoder = OneHotEncoder(inputCols=['Genre1Index','Genre2Index','Genre3Index'], outputCols=['Genre1Vec','Genre2Vec','Genre3Vec'])
#language_encoder = OneHotEncoder(inputCol='LanguageIndex', outputCol='LanguageVec')
#pc_encoder = OneHotEncoder(inputCol='PCIndex', outputCol='PCVec')
#writers_encoder = OneHotEncoder(inputCols=['Writer1Index', 'Writer2Index'], outputCols=['Writer1Vec', 'Writer2Vec'])
#year_encoder = OneHotEncoder(inputCol='YearIndex', outputCol='YearVec')

In [14]:
# Columns concatenated into the single 'features' vector: every one-hot output
# plus the 'Duration' column taken straight from the input data.
feature_columns = [
    'CustomerVec',
    'Actor1Vec',
    'Actor2Vec',
    'Actor3Vec',
    'CountryVec',
    'Director1Vec',
    'Director2Vec',
    'Genre1Vec',
    'Duration',
]
assembler_encoder = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [15]:
# Setting ML Pipeline
from pyspark.ml import Pipeline

# The encoders come first; the assembler is last because it consumes their output columns.
feature_stages = [
    customer_encoder,
    actors_encoder,
    country_encoder,
    directors_encoder,
    genre1_encoder,
    assembler_encoder,
]
pipeline = Pipeline(stages=feature_stages)

In [16]:
# Fit the feature pipeline on the training split, then apply it to that same split
# to obtain the 'features' column the models train on.
feature_model = pipeline.fit(training_data)
training_trans_data = feature_model.transform(training_data)

**Create the Random Forest and Gradient Boosted Trees models, then fit them to the training data**

In [17]:
from pyspark.ml.classification import (DecisionTreeClassifier, GBTClassifier, RandomForestClassifier)
from pyspark.ml.regression import (DecisionTreeRegressor, GBTRegressor, RandomForestRegressor)

In [18]:
# Hyperparameters
# Single decision tree (shared by the classifier and the regressor).
dtc_maxDepth, dtc_maxBins = 3, 8
# Random forest (shared by the classifier and the regressor).
rfc_maxDepth, rfc_numTrees = 4, 5
rfc_subsamplingRate = 1.0
# Gradient-boosted trees.
gbt_maxDepth, gbt_maxBins = 3, 10
gbt_subsamplingRate = 1.0
gbt_lossType = 'squared'
gbt_maxIter = 20

In [19]:
# Classifier variants of the three tree models, all predicting 'Rating' from 'features'.
dtc = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=dtc_maxDepth,
    maxBins=dtc_maxBins,
    maxMemoryInMB=512,
)
rfc = RandomForestClassifier(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=rfc_maxDepth,
    numTrees=rfc_numTrees,
    subsamplingRate=rfc_subsamplingRate,
)
# GBTClassifier only supports the 'logistic' loss; the shared gbt_lossType
# ('squared') is a regressor loss and would be rejected when the classifier is fit.
# NOTE(review): GBTClassifier handles binary labels only — confirm 'Rating' is
# binary before training this model.
gbc = GBTClassifier(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=gbt_maxDepth,
    maxBins=gbt_maxBins,
    subsamplingRate=gbt_subsamplingRate,
    lossType='logistic',
    maxIter=gbt_maxIter,
)

In [20]:
# Regressor variants of the three tree models, all predicting 'Rating' from 'features'.
dtr = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=dtc_maxDepth,
    maxBins=dtc_maxBins,
)
rfr = RandomForestRegressor(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=rfc_maxDepth,
    numTrees=rfc_numTrees,
    subsamplingRate=rfc_subsamplingRate,
)
gbr = GBTRegressor(
    featuresCol='features',
    labelCol='Rating',
    maxDepth=gbt_maxDepth,
    maxBins=gbt_maxBins,
    subsamplingRate=gbt_subsamplingRate,
    lossType=gbt_lossType,
    maxIter=gbt_maxIter,
)

In [None]:
# Train the models
#dtc_model = dtc.fit(training_trans_data)

In [None]:
# Train the random forest regressor on the pipeline-transformed training split.
rfr_model = rfr.fit(training_trans_data)

In [None]:
# Train the gradient-boosted trees regressor on the pipeline-transformed training split.
gbr_model = gbr.fit(training_trans_data)

In [None]:
# Validation Prediction
# The tree models read the assembled 'features' column, which the raw validation
# split does not contain. Run the split through the feature pipeline first; the
# pipeline is fit on the training split so the encoders are derived from the
# same data the models were trained on.
validation_trans_data = pipeline.fit(training_data).transform(validation_data)
#dtc_vali_pred = dtc_model.transform(validation_trans_data)
rfr_vali_pred = rfr_model.transform(validation_trans_data)
gbr_vali_pred = gbr_model.transform(validation_trans_data)

**Evaluate and compare the results**

In [None]:
from pyspark.ml.evaluation import (RegressionEvaluator, RankingEvaluator)

In [None]:
# RMSE evaluator: compares the 'prediction' column against the true 'Rating'.
# predictionCol and metricName are spelled out but equal the evaluator's defaults.
my_rmse_eval = RegressionEvaluator(
    labelCol='Rating',
    predictionCol='prediction',
    metricName='rmse',
)
#my_map_eval

In [None]:
# Score the validation predictions. The prediction frames are named
# rfr_vali_pred / gbr_vali_pred (regressor outputs); the result names
# rfc_rmse / gbt_rmse are kept because the reporting cell prints them.
#dtc_rmse = my_rmse_eval.evaluate(dtc_vali_pred)
rfc_rmse = my_rmse_eval.evaluate(rfr_vali_pred)
gbt_rmse = my_rmse_eval.evaluate(gbr_vali_pred)

In [None]:
# Report one RMSE line per model, each preceded by an 80-character rule.
rmse_report = [
    #('A single decision tree had an RMSE of: {0:2.2f}', dtc_rmse),
    ('A random forest ensemble had an RMSE of: {0:2.2f}', rfc_rmse),
    ('A ensemble using GBT had an RMSE of: {0:2.2f}', gbt_rmse),
]

print("Here are the results!")
for template, rmse_value in rmse_report:
    print('-'*80)
    print(template.format(rmse_value))