In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [6]:
!pip install -q findspark

In [7]:
import findspark
findspark.init()
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName('MovieLTrees').config('spark.ui.port', '4050').getOrCreate()

In [9]:
import pandas as pd

train_df = pd.read_pickle('converted_training.pkl')
test_df = pd.read_pickle('converted_test.pkl')
train_data = spark.createDataFrame(train_df)
test_data = spark.createDataFrame(test_df)

In [10]:
training = train_data.select(['Actor1',
 'Actor2',
 'Actor3',
 'Country',
 'Director1',
 'Director2',
 'Genre1',
 'Genre2',
 'Genre3',
 'Language',
 'Production_Company',
 'Writer1',
 'Writer2',
 'Year',
 'Duration',
 'Rating'])
test = test_data.select(['Actor1',
 'Actor2',
 'Actor3',
 'Country',
 'Director1',
 'Director2',
 'Genre1',
 'Genre2',
 'Genre3',
 'Language',
 'Production_Company',
 'Writer1',
 'Writer2',
 'Year',
 'Duration',
 'Rating'])

In [17]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.regression import (DecisionTreeRegressor, GBTRegressor, RandomForestRegressor)

In [12]:
assembler_encoder = VectorAssembler(inputCols=['Actor1',
 'Actor2',
 'Actor3',
 'Country',
 'Director1',
 'Director2',
 'Genre1',
 'Genre2',
 'Genre3',
 'Language',
 'Production_Company',
 'Writer1',
 'Writer2',
 'Year',
 'Duration'],outputCol='features')

encoded_train = assembler_encoder.transform(training)

In [52]:
# Create the Random Forests and Gradient Boosted Trees Models then fit the data

# Hyperparameters
dtc_maxDepth = 10
dtc_maxBins = 32
rfc_maxDepth = 10
rfc_numTrees = 32
rfc_subsamplingRate = 1.0
gbt_maxDepth = 10
gbt_maxBins = 32
gbt_subsamplingRate = 1.0
gbt_lossType = 'squared'
gbt_maxIter = 200

dtr = DecisionTreeRegressor(featuresCol='features', labelCol='Rating', maxDepth=dtc_maxDepth, maxBins=dtc_maxBins)
rfr = RandomForestRegressor(featuresCol='features', labelCol='Rating', maxDepth=rfc_maxDepth, numTrees=rfc_numTrees, subsamplingRate=rfc_subsamplingRate)
gbt = GBTRegressor(featuresCol='features', labelCol='Rating', maxDepth=gbt_maxDepth, maxBins=gbt_maxBins, subsamplingRate=gbt_subsamplingRate, lossType=gbt_lossType, maxIter=gbt_maxIter)

In [53]:
# Train the models

dtr_model = dtr.fit(encoded_train)
rfr_model = rfr.fit(encoded_train)
gbt_model = gbt.fit(encoded_train)

In [54]:
# Test Prediction
encoded_test = assembler_encoder.transform(test)

dtr_test_pred = dtr_model.transform(encoded_test)
rfr_test_pred = rfr_model.transform(encoded_test)
gbt_test_pred = gbt_model.transform(encoded_test)

In [55]:
from pyspark.sql.functions import round, col
dtr_pred = dtr_test_pred.select(['Rating', round('prediction').alias('prediction')])
rfr_pred = rfr_test_pred.select(['Rating', round('prediction').alias('prediction')])
gbt_pred = gbt_test_pred.select(['Rating', round('prediction').alias('prediction')])

In [56]:
from pyspark.ml.evaluation import (MulticlassClassificationEvaluator, RegressionEvaluator, RankingEvaluator)

my_acc_eval = MulticlassClassificationEvaluator(labelCol='Rating', metricName='accuracy')
my_rmse_eval = RegressionEvaluator(labelCol='Rating')

dtr_rmse = my_rmse_eval.evaluate(dtr_pred)
rfr_rmse = my_rmse_eval.evaluate(rfr_pred)
gbt_rmse = my_rmse_eval.evaluate(gbt_pred)

dtr_acc = my_acc_eval.evaluate(dtr_pred)
rfr_acc = my_acc_eval.evaluate(rfr_pred)
gbt_acc = my_acc_eval.evaluate(gbt_pred)

In [51]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had an RMSE of: {0:2.2f}, an accuracy of: {0:2.2f}'.format(dtr_rmse, dtr_acc))
print('-'*80)
print('A random forest ensemble had an RMSE of: {0:2.2f}, an accuracy of: {0:2.2f}'.format(rfr_rmse, rfr_acc))
print('-'*80)
print('A ensemble using GBT had an RMSE of: {0:2.2f}, an accuracy of: {0:2.2f}'.format(gbt_rmse, gbt_acc))

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an RMSE of: 1.16, an accuracy of: 1.16
--------------------------------------------------------------------------------
A random forest ensemble had an RMSE of: 1.16, an accuracy of: 1.16
--------------------------------------------------------------------------------
A ensemble using GBT had an RMSE of: 1.16, an accuracy of: 1.16


In [57]:
import numpy as np

np.save('dtr_predictions.npy', np.array(dtr_pred.select('prediction').collect()))
np.save('rfr_predictions.npy', np.array(rfr_pred.select('prediction').collect()))
np.save('gbt_predictions.npy', np.array(gbt_pred.select('prediction').collect()))