In [1]:
# Mount Google Drive into the Colab filesystem; the CSV inputs read later
# in this notebook are located under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Install a headless Java 8 JDK (the JAVA_HOME configured further down points
# at it); -qq plus the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
# Unpack the downloaded Spark tarball into the working directory; SPARK_HOME
# (set further down) expects the result at /content/spark-3.1.1-bin-hadoop2.7.
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
!pip install -q findspark

In [6]:
import os

# Point the Spark tooling at the JDK and at the unpacked Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})

In [7]:
import findspark
# Initialise findspark so that the `pyspark` imports in the following cells
# resolve against the Spark installation configured via SPARK_HOME.
findspark.init()

In [8]:
# Sanity check: display the Spark installation path that findspark resolved.
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [9]:
from pyspark.sql import SparkSession

# Create (or reuse) the local-mode Spark session used by the whole notebook;
# the Spark UI port is pinned to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName('MovieLogReg_avgRating')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

**Loading Data**

In [10]:
# Reader settings shared by all three splits: CSV with a header row and
# column types inferred from the data.
csv_options = dict(format="csv", inferSchema=True, header=True)

training_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/training_avgRating.csv", **csv_options)
validation_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/validation_avgRating.csv", **csv_options)
testing_data = spark.read.load(
    "/content/gdrive/MyDrive/Colab Notebooks/CSC 522/testing_avgRating.csv", **csv_options)

In [11]:
# Import the functions module under the conventional `F` alias rather than
# `from pyspark.sql.functions import round`, which would replace Python's
# builtin round() with the Spark column function for the rest of the notebook.
from pyspark.sql import functions as F

# Derive the classification label `avgRating` for each split by rounding the
# mean rating to 0 decimal places. The result is still a double column.
RATING_COL = 'avg(Rating)'
train_DATA = training_data.withColumn('avgRating', F.round(RATING_COL, 0))
vali_DATA = validation_data.withColumn('avgRating', F.round(RATING_COL, 0))
test_DATA = testing_data.withColumn('avgRating', F.round(RATING_COL, 0))

In [12]:
# Disabled experiment: the triple-quoted block below is only a string
# expression, so none of it executes (the cell merely echoes the string).
# It would cast the `avgRating` label of every split to IntegerType; as it is
# inactive, the label stays a double column.
'''
from pyspark.sql.types import IntegerType
train_DATA = train_DATA.withColumn('avgRating',train_DATA['avgRating'].cast(IntegerType()))
vali_DATA = vali_DATA.withColumn('avgRating',vali_DATA['avgRating'].cast(IntegerType()))
test_DATA = test_DATA.withColumn('avgRating',test_DATA['avgRating'].cast(IntegerType()))
'''

"\nfrom pyspark.sql.types import IntegerType\ntrain_DATA = train_DATA.withColumn('avgRating',train_DATA['avgRating'].cast(IntegerType()))\nvali_DATA = vali_DATA.withColumn('avgRating',vali_DATA['avgRating'].cast(IntegerType()))\ntest_DATA = test_DATA.withColumn('avgRating',test_DATA['avgRating'].cast(IntegerType()))\n"

In [13]:
# Inspect the inferred column types and confirm the derived `avgRating`
# label column was added to the training split.
train_DATA.printSchema()

root
 |-- Actor1Index: double (nullable = true)
 |-- Actor3Index: double (nullable = true)
 |-- Genre2Index: double (nullable = true)
 |-- Genre3Index: double (nullable = true)
 |-- LanguageIndex: double (nullable = true)
 |-- Writer1Index: double (nullable = true)
 |-- YearIndex: double (nullable = true)
 |-- avg(Duration): double (nullable = true)
 |-- avg(Rating): double (nullable = true)
 |-- avgRating: double (nullable = true)



**One-Hot Encoding**

In [14]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer, OneHotEncoder)

In [15]:
# Encoding
# One-hot encode every indexed categorical column available in the data.
# Encoders for the Customer, Country, Director1/2, Genre1 and PC index columns
# are intentionally omitted: those columns are not part of this dataset's schema.
#
# handleInvalid='keep': OneHotEncoder's default ('error') raises during
# transform() when a row carries a category index that was not present in the
# data the encoder was fitted on. The encoders are fitted on the training
# split only, so the validation/testing splits can contain such indices;
# 'keep' maps them to an extra "unknown" category instead of aborting the job.
# NOTE(review): this is the suspected cause of the Py4JJavaError raised when
# the full validation set is evaluated - confirm against the Java stack trace.
INVALID_CATEGORY_POLICY = 'keep'

actors_encoder = OneHotEncoder(inputCols=['Actor1Index', 'Actor3Index'],
                               outputCols=['Actor1Vec', 'Actor3Vec'],
                               handleInvalid=INVALID_CATEGORY_POLICY)
genres_encoder = OneHotEncoder(inputCols=['Genre2Index', 'Genre3Index'],
                               outputCols=['Genre2Vec', 'Genre3Vec'],
                               handleInvalid=INVALID_CATEGORY_POLICY)
language_encoder = OneHotEncoder(inputCol='LanguageIndex', outputCol='LanguageVec',
                                 handleInvalid=INVALID_CATEGORY_POLICY)
writers_encoder = OneHotEncoder(inputCol='Writer1Index', outputCol='Writer1Vec',
                                handleInvalid=INVALID_CATEGORY_POLICY)
year_encoder = OneHotEncoder(inputCol='YearIndex', outputCol='YearVec',
                             handleInvalid=INVALID_CATEGORY_POLICY)

In [16]:
# Columns merged into the single `features` vector: the one-hot vectors
# produced by the encoders plus the numeric average duration.
feature_columns = [
    'Actor1Vec',
    'Actor3Vec',
    'Genre2Vec',
    'Genre3Vec',
    'LanguageVec',
    'Writer1Vec',
    'YearVec',
    'avg(Duration)',
]
assembler_encoder = VectorAssembler(inputCols=feature_columns, outputCol='features')

**Create the Logistic Regression Model and fit the data**

In [17]:
from pyspark.ml.classification import LogisticRegression

In [18]:
# Hyperparameters
# These three values are forwarded unchanged to LogisticRegression as
# maxIter, regParam and elasticNetParam respectively.
# NOTE(review): the validation preview in this notebook shows prediction 3.0
# for all 20 displayed rows; presumably this regularisation setting is strong
# enough to collapse the model onto a single class - confirm and tune.
iteration = 200
regParam = 0.3
elasticNetParam = 0.8

In [19]:
# Logistic regression classifier: reads the assembled `features` vector and
# predicts the rounded `avgRating` label, using the hyperparameters set above.
log_reg_movie = LogisticRegression(
    featuresCol='features',
    labelCol='avgRating',
    maxIter=iteration,
    regParam=regParam,
    elasticNetParam=elasticNetParam,
)

In [20]:
# Setting ML Pipeline
from pyspark.ml import Pipeline

# Stage order matters: the encoders create the *Vec columns, the assembler
# combines them into `features`, and the classifier consumes `features`.
pipeline_stages = [
    actors_encoder,
    genres_encoder,
    language_encoder,
    writers_encoder,
    year_encoder,
    assembler_encoder,
    log_reg_movie,
]
pipeline = Pipeline(stages=pipeline_stages)

In [21]:
# Fit the data: run the whole pipeline (encoders, assembler, classifier) on
# the training split and keep the resulting fitted pipeline model.
log_reg_model = pipeline.fit(train_DATA)

In [22]:
# Validation Prediction: apply the fitted pipeline to the validation split.
# The result carries the `prediction` column that is inspected and scored below.
vali_pred = log_reg_model.transform(vali_DATA)

In [23]:
# Preview predicted vs. actual label side by side; show() prints only the
# first 20 rows, so this is a spot check rather than a full evaluation.
vali_pred.select('prediction', 'avgRating').show()

+----------+---------+
|prediction|avgRating|
+----------+---------+
|       3.0|      4.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      4.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      2.0|
|       3.0|      2.0|
|       3.0|      4.0|
|       3.0|      4.0|
|       3.0|      3.0|
|       3.0|      2.0|
|       3.0|      3.0|
|       3.0|      4.0|
|       3.0|      3.0|
|       3.0|      3.0|
|       3.0|      2.0|
+----------+---------+
only showing top 20 rows



**Evaluating The Results**

In [24]:
from pyspark.ml.evaluation import (MulticlassClassificationEvaluator, RegressionEvaluator, RankingEvaluator)

In [25]:
# Two views on the same predictions, both comparing the `prediction` column
# against the `avgRating` label: a classification F1 score and an RMSE.
# The metric and prediction column are spelled out; they equal the defaults.
my_f1_eval = MulticlassClassificationEvaluator(
    labelCol='avgRating', predictionCol='prediction', metricName='f1')
my_rmse_eval = RegressionEvaluator(
    labelCol='avgRating', predictionCol='prediction', metricName='rmse')

In [26]:
# Score the validation predictions with both evaluators.
# `f1` has to be assigned here: a later cell displays it and would raise
# NameError on a fresh kernel if this line stayed commented out.
# NOTE(review): evaluate() scans every validation row (unlike the 20-row
# preview), so a data-dependent failure in an upstream pipeline stage will
# surface in this cell as a Py4JJavaError - inspect the Java stack trace.
f1 = my_f1_eval.evaluate(vali_pred)
rmse = my_rmse_eval.evaluate(vali_pred)

Py4JJavaError: ignored

In [None]:
# Display the validation F1 score (requires `f1` to have been assigned by the
# evaluation cell).
f1

In [None]:
# Display the validation RMSE computed by the evaluation cell.
rmse