In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM. The CSV files read later in this
# notebook are addressed through this mount point.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
!pip install -q findspark

In [6]:
import os

# Point the process environment at the JDK and at the unpacked Spark
# distribution so that later cells can locate both.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})

In [7]:
import findspark
# pyspark is never pip-installed in this notebook; this call is what makes the
# copy bundled with the unpacked Spark distribution importable below.
# Called without arguments, so findspark has to locate Spark on its own
# (presumably through the SPARK_HOME variable set in the previous cell — confirm).
findspark.init()

In [8]:
# Sanity check: display the Spark home directory that findspark resolved.
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [9]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session: local master, application
# name 'MovieLogReg', Spark UI bound to port 4050.
# NOTE(review): per the Spark docs a plain "local" master runs with a single
# worker thread; "local[*]" would use every core — consider it if fitting is slow.
spark = (
    SparkSession.builder
    .master("local")
    .appName('MovieLogReg')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

**Loading Data**

In [10]:
# Folder on the mounted Drive that holds the three dataset splits.
DATA_DIR = "/content/gdrive/MyDrive/Colab Notebooks/CSC 522"

# Reader settings shared by every split: CSV with a header row, column types
# inferred from the data.
CSV_READ_OPTIONS = dict(format="csv", inferSchema=True, header=True)


def _read_split(split_name):
    """Load `<DATA_DIR>/<split_name>.csv` as a Spark DataFrame."""
    return spark.read.load(f"{DATA_DIR}/{split_name}.csv", **CSV_READ_OPTIONS)


training_data = _read_split("training")
validation_data = _read_split("validation")
testing_data = _read_split("testing")

**One-Hot Encoding**

In [12]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer, OneHotEncoder)

In [13]:
# One-hot encoders turning `*Index` columns into `*Vec` columns. The `*Index`
# columns are expected to already be present in the loaded CSVs (no indexing
# step appears in this notebook).
# Single-column features use inputCol/outputCol; the multi-slot features
# (actors, directors) share one encoder through inputCols/outputCols.
# Encoders for Genre2/Genre3, Language, PC, Writer1/Writer2 and Year were
# drafted here earlier but are not part of this model.
# NOTE(review): handleInvalid is left at its default; confirm the validation and
# testing splits contain no index value that training lacks, otherwise
# transform() may raise.
actor_slots = ['Actor1', 'Actor2', 'Actor3']
director_slots = ['Director1', 'Director2']

customer_encoder = OneHotEncoder(
    inputCol='CustomerIndex',
    outputCol='CustomerVec',
)
actors_encoder = OneHotEncoder(
    inputCols=[f'{slot}Index' for slot in actor_slots],
    outputCols=[f'{slot}Vec' for slot in actor_slots],
)
country_encoder = OneHotEncoder(
    inputCol='CountryIndex',
    outputCol='CountryVec',
)
directors_encoder = OneHotEncoder(
    inputCols=[f'{slot}Index' for slot in director_slots],
    outputCols=[f'{slot}Vec' for slot in director_slots],
)
genre1_encoder = OneHotEncoder(
    inputCol='Genre1Index',
    outputCol='Genre1Vec',
)

In [14]:
# Columns concatenated, in this order, into the single `features` vector:
# every one-hot output produced by the encoders plus the raw `Duration` column.
feature_columns = [
    'CustomerVec',
    'Actor1Vec', 'Actor2Vec', 'Actor3Vec',
    'CountryVec',
    'Director1Vec', 'Director2Vec',
    'Genre1Vec',
    'Duration',
]
assembler_encoder = VectorAssembler(inputCols=feature_columns, outputCol='features')

**Create the Logistic Regression Model and fit the data**

In [15]:
from pyspark.ml.classification import LogisticRegression

In [16]:
# Hyperparameters passed to LogisticRegression in the next cell:
#   iteration       -> maxIter
#   regParam        -> regParam (overall regularization strength)
#   elasticNetParam -> elasticNetParam (per the Spark docs: 0 = pure L2, 1 = pure L1)
# NOTE(review): 0.3 with a 0.8 mix is a heavy, mostly-L1 penalty. The validation
# F1 recorded at the end of this notebook (~0.17) is about what predicting one
# single rating for every row would score if that rating covered roughly a
# third of the data — TODO inspect the fitted coefficients and try a much
# smaller regParam before trusting this model.
iteration = 200
regParam = 0.3
elasticNetParam = 0.8

In [17]:
# Logistic regression over the assembled `features` vector with `Rating` as
# the label, configured from the hyperparameter cell above.
log_reg_movie = LogisticRegression(
    featuresCol='features',
    labelCol='Rating',
    maxIter=iteration,
    regParam=regParam,
    elasticNetParam=elasticNetParam,
)

In [20]:
# Setting ML Pipeline
from pyspark.ml import Pipeline

# Stage order matters: the encoders must run before the assembler that
# consumes their `*Vec` outputs, and the classifier goes last because it reads
# the assembled `features` column.
pipeline_stages = [
    customer_encoder,
    actors_encoder,
    country_encoder,
    directors_encoder,
    genre1_encoder,
    assembler_encoder,
    log_reg_movie,
]
pipeline = Pipeline(stages=pipeline_stages)

In [21]:
# Fit the whole pipeline (encoders, assembler, logistic regression) on the
# training split; the fitted pipeline model is reused for prediction below.
log_reg_model = pipeline.fit(training_data)

In [22]:
# Run the fitted pipeline over the validation split to obtain its predictions.
vali_pred = log_reg_model.transform(validation_data)

**Evaluating The Results**

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
# Evaluator comparing the `prediction` column against `Rating`. predictionCol
# and metricName are spelled out at their default values so it is obvious
# which metric the resulting score refers to.
my_eval = MulticlassClassificationEvaluator(
    labelCol='Rating',
    predictionCol='prediction',
    metricName='f1',
)

In [31]:
# Score the validation predictions. No metricName was given to the evaluator,
# so this is its default metric (F1 per the PySpark docs, hence the name).
f1 = my_eval.evaluate(vali_pred)

In [32]:
# Display the validation score as the cell output.
f1

0.17212975781284987