In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM at a named mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
# Install a headless OpenJDK 8 for Spark's JVM; -qq plus the redirect to
# /dev/null keeps the cell output empty.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

In [4]:
# Unpack the downloaded Spark tarball into the current working directory.
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [5]:
!pip install -q findspark

In [6]:
import os

# Export the JVM and Spark locations for the tools used in later cells.
# NOTE(review): SPARK_HOME assumes the tarball was unpacked under /content.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})

In [7]:
import findspark
# Initialise findspark so that `pyspark` can be imported in the cells below.
# NOTE(review): presumably relies on the SPARK_HOME set earlier — confirm.
findspark.init()

In [8]:
# Sanity check: display the Spark installation path that findspark resolved.
findspark.find()

'/content/spark-3.1.1-bin-hadoop2.7'

In [9]:
from pyspark.sql import SparkSession

# Configure a local-mode session named 'MovieLogReg' with the UI port set to
# 4050, then create it (or reuse an already-running one).
session_builder = (
    SparkSession.builder
    .master("local")
    .appName('MovieLogReg')
    .config('spark.ui.port', '4050')
)
spark = session_builder.getOrCreate()

In [10]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# from a trusted source.

# Training split: pandas pickle -> Spark DataFrame
train_df = pd.read_pickle('converted_training.pkl')
train_data = spark.createDataFrame(train_df)

# Test split: pandas pickle -> Spark DataFrame
test_df = pd.read_pickle('converted_test.pkl')
test_data = spark.createDataFrame(test_df)

In [11]:
# List the column names of the Spark training DataFrame.
train_data.columns

['Actor1',
 'Actor2',
 'Actor3',
 'Country',
 'Customer_ID',
 'Director1',
 'Director2',
 'Genre1',
 'Genre2',
 'Genre3',
 'Language',
 'Movie_ID',
 'Production_Company',
 'Title',
 'Writer1',
 'Writer2',
 'Year',
 'Duration',
 'Rating']

In [12]:
# Columns kept for modelling: the 15 feature columns plus the 'Rating' label.
# 'Customer_ID', 'Movie_ID' and 'Title' are left out.
# Defined once so the train and test projections cannot drift apart.
model_columns = [
    'Actor1',
    'Actor2',
    'Actor3',
    'Country',
    'Director1',
    'Director2',
    'Genre1',
    'Genre2',
    'Genre3',
    'Language',
    'Production_Company',
    'Writer1',
    'Writer2',
    'Year',
    'Duration',
    'Rating',
]

# Apply the identical projection to both splits.
training = train_data.select(model_columns)
test = test_data.select(model_columns)

In [14]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [15]:
# Input columns for the feature vector, in assembly order. 'Rating' is not
# listed here because it is the label.
assembler_inputs = [
    'Actor1', 'Actor2', 'Actor3',
    'Country',
    'Director1', 'Director2',
    'Genre1', 'Genre2', 'Genre3',
    'Language',
    'Production_Company',
    'Writer1', 'Writer2',
    'Year',
    'Duration',
]

# Combine the input columns into a single 'features' vector column.
assembler_encoder = VectorAssembler(inputCols=assembler_inputs, outputCol='features')

In [16]:
# Build the logistic-regression pipeline, fit it on the training split and
# score the test split.

# Hyperparameters
iteration = 200        # passed as maxIter
regParam = 0.3         # passed as regParam
elasticNetParam = 0.8  # passed as elasticNetParam

# Classifier reading the assembled 'features' column, with 'Rating' as label
log_reg_movie = LogisticRegression(
    featuresCol='features',
    labelCol='Rating',
    maxIter=iteration,
    regParam=regParam,
    elasticNetParam=elasticNetParam,
)

# Two-stage ML pipeline: feature assembly followed by the classifier
pipeline = Pipeline(stages=[assembler_encoder, log_reg_movie])

# Fit both stages on the training split
log_reg_model = pipeline.fit(training)

# Predictions for the test split
test_pred = log_reg_model.transform(test)

In [18]:
# Evaluating the results
from pyspark.ml.evaluation import (
    MulticlassClassificationEvaluator,
    RegressionEvaluator,
    RankingEvaluator,
)

# Accuracy of the predicted class against the 'Rating' label
my_acc_eval = MulticlassClassificationEvaluator(labelCol='Rating', metricName='accuracy')
# RMSE between prediction and 'Rating' ('rmse' is the evaluator's default
# metric, spelled out here for readability)
my_rmse_eval = RegressionEvaluator(labelCol='Rating', metricName='rmse')

acc = my_acc_eval.evaluate(test_pred)
rmse = my_rmse_eval.evaluate(test_pred)

# One metric per line: accuracy first, then RMSE
print(acc, rmse, sep='\n')

0.30569585551532547
1.222404728067127


In [20]:
# Bring the label/prediction pairs to the driver as a pandas DataFrame,
# then persist them as a pickle file.
label_and_prediction = test_pred.select(['Rating', 'prediction']).toPandas()
label_and_prediction.to_pickle('log_reg_predictions.pkl')

In [22]:
import numpy as np

# Collect the 'prediction' column to the driver; each collected Row holds just
# that one selected column.
prediction_rows = test_pred.select('prediction').collect()

# Store the collected predictions as a NumPy .npy file.
np.save('log_reg_predictions.npy', np.array(prediction_rows))