## Required installations

In [None]:
!pip install gdown
!pip install pyspark

## Imports

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Imputer
from sklearn.metrics import recall_score, precision_score, f1_score
from statistics import mean
import math
import numpy as np


## Spark Session


In [None]:
# Create (or reuse) the notebook-wide Spark session, running locally with
# master "local[*]" and 16g configured for both driver and executor memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Recommender")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

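## Load and format the ratings data

Each of the three cells below assigns `ratings_df`, so the last one executed determines which dataset the rest of the notebook uses. Run only the cell for the dataset size you want.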

### 100k

In [None]:
!gdown 1lwPW7OefaJnwsaqYBQs-wgcIGiatYLXb

def load_100k() :
    data = spark.read.option("delimiter", "\t")\
                    .option("header", "False")\
                    .csv('/kaggle/working/u.data')\
                    .select('_c0','_c1','_c2')\
                    .withColumnRenamed('_c0','userId')\
                    .withColumnRenamed('_c1', 'movieId') \
                    .withColumnRenamed('_c2', 'rating')
    data = data.select([F.col(c).cast("int") for c in data.columns])
    return data
    
ratings_df = load_100k()

### 1M

In [None]:
!gdown 18sHWE7Eu28hDqXib2PvesBYMea5AQmZs

def load_1m() :
    data = spark.read.option("delimiter", "::")\
                    .option("header", "False")\
                    .csv('/kaggle/working/ratings.dat')\
                    .select('_c0','_c1','_c2')\
                    .withColumnRenamed('_c0','userId')\
                    .withColumnRenamed('_c1', 'movieId') \
                    .withColumnRenamed('_c2', 'rating')
    data = data.select([F.col(c).cast("int") for c in data.columns])
    return data
    
ratings_df = load_1m()

### 10M

In [None]:
!gdown 1e064MFX83PYtPDcISjYQw4fTQtv-PG38

def load_10m() :
    data = spark.read.option("delimiter", "::")\
                    .option("header", "False")\
                    .csv('/kaggle/working/ratings.dat')\
                    .select('_c0','_c1','_c2')\
                    .withColumnRenamed('_c0','userId')\
                    .withColumnRenamed('_c1', 'movieId') \
                    .withColumnRenamed('_c2', 'rating')
    data = data.select([F.col(c).cast("int") for c in data.columns])
    return data
    
ratings_df = load_10m()

## Split/Train/Test

In [None]:
# One seed for every stochastic step so a fresh kernel reproduces the results.
SEED = 2345

# Hold out 10% of the ratings for testing.
train, test = ratings_df.randomSplit([0.9, 0.1], SEED)

als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        # Users/items unseen in training get NaN predictions; the evaluation
        # cell filters or imputes them.
        coldStartStrategy="nan",
        maxIter=10,
        regParam=.01,
        rank=8,
        # Without an explicit seed the factor initialisation may differ
        # between sessions, making the metrics non-reproducible.
        seed=SEED
)

# TRAIN
model = als.fit(train)

# TEST
prediction = model.transform(test)

## Evaluation

In [None]:
def create_binarised_output(ratings, threshold=None):
    """Convert numeric ratings into 0/1 relevance labels.

    Args:
        ratings: Iterable of numeric ratings (true ratings or predictions).
        threshold: Cut-off value; a rating greater than or equal to it maps
            to 1, anything lower to 0. When omitted, the notebook-level
            global ``treshold`` (sic) is used, which keeps existing
            single-argument calls working.

    Returns:
        A list of ints (0 or 1), one per input rating, in input order.

    Raises:
        NameError: If ``threshold`` is omitted and the global ``treshold``
            has not been defined yet.
    """
    if threshold is None:
        # Backward-compatible fallback to the global cut-off (note its spelling).
        threshold = treshold
    return [1 if rating >= threshold else 0 for rating in ratings]


# Ratings at or above this value count as a positive ("relevant") item.
# The misspelled name is kept because create_binarised_output reads it as a global.
treshold = 3.5

# Read label and prediction from the SAME rows in a single pass.
# model.transform() joins the input against the user/item factor tables, so the
# row order of `prediction` is not guaranteed to match `test`; collecting the two
# DataFrames separately and pairing by index can mismatch labels and predictions.
scored_rows = prediction.select('rating', 'prediction').collect()
y = [row['rating'] for row in scored_rows]
pred = [row['prediction'] for row in scored_rows]

# The classification metrics ignore rows without a usable prediction (NaN).
kept_pairs = [pair for pair in zip(y, pred) if not np.isnan(pair[1])]
filtered_y = [pair[0] for pair in kept_pairs]
filtered_pred = [pair[1] for pair in kept_pairs]

# RMSE, by contrast, is computed on all rows: missing predictions are replaced
# by the mean prediction and stored in a new `prediction_imputed` column.
imputer = Imputer(inputCols=["prediction"], outputCols=["prediction_imputed"])
imputer.setStrategy("mean")
prediction = imputer.fit(prediction).transform(prediction)

# EVALUATE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",  predictionCol="prediction_imputed")

y_binary = create_binarised_output(filtered_y)
pred_binary = create_binarised_output(filtered_pred)

# Calculate RMSE
rmse = evaluator.evaluate(prediction)
print(rmse)

# Calculate precision
precision = precision_score(y_binary, pred_binary)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_binary, pred_binary)
print("Recall:", recall)

# Calculate f1
f1 = f1_score(y_binary, pred_binary)
print("f1-score:", f1)