## Required installations

In [None]:
!pip install gdown
!pip install pyspark

## Required imports

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sklearn.metrics import recall_score, precision_score, f1_score,mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean
import math
import numpy as np
import pandas as pd

## Spark Session


In [None]:
# Create (or reuse) the SparkSession used by every loader cell below.
# Master "local[*]" keeps everything on this machine; driver and executor
# memory are both requested at 16g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Recommender")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

## Load and format the ratings data


### 100k

In [None]:
# Fetch the 100k ratings file with the gdown CLI (the argument is presumably a
# Google Drive file id — TODO confirm). load_100k() below expects the data at
# /kaggle/working/u.data.
!gdown 1lwPW7OefaJnwsaqYBQs-wgcIGiatYLXb

def load_100k(path='/kaggle/working/u.data'):
    """Load a tab-separated, header-less ratings file into a Spark DataFrame.

    Only the first three columns of the file are kept. They are renamed to
    ``userId``, ``movieId`` and ``rating`` and each one is cast to int.

    Args:
        path: Location of the ratings file. Defaults to the Kaggle working
            directory path that was previously hard-coded, so existing calls
            with no argument behave as before.

    Returns:
        A Spark DataFrame with integer columns userId, movieId and rating.
    """
    data = (
        spark.read
        .option("delimiter", "\t")
        .option("header", "False")
        .csv(path)
        # Without a header Spark names the columns _c0, _c1, ...
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    # The CSV reader yields strings here; convert every kept column to int.
    return data.select([F.col(c).cast("int") for c in data.columns])
    
# Bind the 100k ratings to ratings_df. The 1M and 10M cells below assign the
# same name, so the loader that ran last decides which dataset is evaluated.
ratings_df = load_100k()

### 1M

In [None]:
# Fetch the 1M ratings file with the gdown CLI (the argument is presumably a
# Google Drive file id — TODO confirm). load_1m() below expects the data at
# /kaggle/working/ratings.dat.
# NOTE(review): load_10m() reads the same path; if both downloads are saved as
# ratings.dat the later one presumably replaces the earlier — confirm.
!gdown 18sHWE7Eu28hDqXib2PvesBYMea5AQmZs

def load_1m(path='/kaggle/working/ratings.dat'):
    """Load a '::'-separated, header-less ratings file into a Spark DataFrame.

    Only the first three columns of the file are kept. They are renamed to
    ``userId``, ``movieId`` and ``rating`` and each one is cast to int.

    Args:
        path: Location of the ratings file. Defaults to the Kaggle working
            directory path that was previously hard-coded, so existing calls
            with no argument behave as before. Passing an explicit path also
            lets this file live next to another ``ratings.dat``.

    Returns:
        A Spark DataFrame with integer columns userId, movieId and rating.
    """
    data = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(path)
        # Without a header Spark names the columns _c0, _c1, ...
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    # The CSV reader yields strings here; convert every kept column to int.
    return data.select([F.col(c).cast("int") for c in data.columns])
    
# Bind the 1M ratings to ratings_df, replacing whatever an earlier loader cell
# assigned. The 10M cell below assigns the same name again.
ratings_df = load_1m()

### 10M

In [None]:
# Fetch the 10M ratings file with the gdown CLI (the argument is presumably a
# Google Drive file id — TODO confirm). load_10m() below expects the data at
# /kaggle/working/ratings.dat, the same path load_1m() reads.
!gdown 1e064MFX83PYtPDcISjYQw4fTQtv-PG38

def load_10m(path='/kaggle/working/ratings.dat'):
    """Load a '::'-separated, header-less ratings file into a Spark DataFrame.

    Only the first three columns of the file are kept and renamed to
    ``userId``, ``movieId`` and ``rating``. The two id columns are cast to int.
    The rating column is cast to double instead of int so that fractional
    ratings such as 3.5 keep their value (assumes this is the MovieLens 10M
    file, whose ratings use half-star steps — TODO confirm).

    Args:
        path: Location of the ratings file. Defaults to the Kaggle working
            directory path that was previously hard-coded, so existing calls
            with no argument still read the same file.

    Returns:
        A Spark DataFrame with integer userId and movieId columns and a
        double rating column.
    """
    data = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(path)
        # Without a header Spark names the columns _c0, _c1, ...
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    # Ids are whole numbers; ratings may be fractional, so they must not be
    # forced through an integer cast.
    return data.select(
        F.col('userId').cast("int"),
        F.col('movieId').cast("int"),
        F.col('rating').cast("double"),
    )
    
# Bind the 10M ratings to ratings_df, replacing whatever an earlier loader
# cell assigned. The split cell below works on whatever ratings_df holds.
ratings_df = load_10m()

## Split/Train/Test

In [None]:
def predict_rating(user_id, item_id, ratings_matrix, user_similarity):
    """Predict one rating as a similarity-weighted mean of other users' ratings.

    Args:
        user_id: User whose rating is being predicted.
        item_id: Item to predict the rating for.
        ratings_matrix: pandas DataFrame indexed by user with one column per
            item; missing ratings are null.
        user_similarity: pandas DataFrame of user-to-user similarity whose
            index and columns are the users of ``ratings_matrix``.

    Returns:
        The weighted average of the ratings given to ``item_id``, weighted by
        each rater's similarity to ``user_id`` (non-positive similarities
        count as 0). None when the user or the item is absent from the
        training data, or when every rater has zero weight.
    """
    # Cold start: nothing is known about this user or this item.
    if user_id not in user_similarity.index:
        return None
    if item_id not in ratings_matrix.columns:
        return None

    # Keep only the users who actually rated the item.
    item_ratings = ratings_matrix.loc[:, item_id]
    rated = item_ratings[item_ratings.notnull()]

    # Similarity of each rater to the target user, floored at 0.
    weights = np.array([s if s > 0 else 0 for s in user_similarity[user_id][rated.index]])
    if len(weights) == 0 or np.average(weights) == 0:
        # np.average cannot normalise weights that sum to zero.
        return None
    return np.average(np.array(rated.values), weights=weights)


# 90/10 random split; 2345 is the seed passed to randomSplit.
train, test = ratings_df.randomSplit([0.9,0.1],2345)

# User x movie rating matrix of the training split, built in pandas on the driver.
matrix = train.toPandas().pivot_table(index='userId', columns='movieId', values='rating')
# User-to-user cosine similarity, with missing ratings treated as 0.
cosine_sim_df = pd.DataFrame(cosine_similarity(matrix.fillna(0)) , index = matrix.index , columns = matrix.index)

# predictions[i] belongs to the i-th row returned by test.collect(); None marks
# a row for which no prediction could be made. Later code that pairs this list
# with the test ratings by position relies on test being collected in the same
# row order.
predictions = [
    predict_rating(row['userId'], row['movieId'], matrix, cosine_sim_df)
    for row in test.collect()
]


## Evaluate

In [None]:
def fillNa(lst):
    """Replace every None in ``lst`` with the mean of the other entries.

    Args:
        lst: List of numbers in which None marks a missing value.

    Returns:
        A new list with each None swapped for the arithmetic mean of the
        non-None entries. If there are no non-None entries the input list
        itself is returned unchanged.
    """
    observed = list(filter(lambda value: value is not None, lst))
    if not observed:
        # Nothing to average, so there is no fill value to use.
        return lst

    fill_value = sum(observed) / len(observed)
    filled = []
    for value in lst:
        filled.append(fill_value if value is None else value)
    return filled

def create_binarised_output(ratings, threshold=None):
    """Turn ratings into 0/1 labels by comparing them with a cut-off.

    Args:
        ratings: Iterable of numeric ratings.
        threshold: Cut-off to compare against. When omitted, the module-level
            variable ``treshold`` (spelt that way elsewhere in this notebook)
            is used, which is what the function always did before this
            parameter existed.

    Returns:
        A list with 1 for every rating >= the cut-off and 0 otherwise, in the
        same order as ``ratings``.

    Raises:
        NameError: If ``threshold`` is omitted and the global ``treshold`` has
            not been defined yet.
    """
    # Fall back to the notebook-level global only when no cut-off was passed.
    cutoff = treshold if threshold is None else threshold
    return [1 if rating >= cutoff else 0 for rating in ratings]


# Ratings at or above this value count as positive when binarising.
# The spelling is kept because create_binarised_output reads this global.
treshold = 3.5

# True ratings of the test split, collected to the driver as a flat list.
y = test.select('rating').rdd.flatMap(lambda x : x ).collect()

# y and predictions are paired by position, so they must be the same length.
assert len(y) == len(predictions), "test ratings and predictions differ in length"

# Keep only the rows for which a prediction exists.
filtered_y = []
filtered_pred = []

for actual, predicted in zip(y, predictions):
    if predicted is not None:
        filtered_y.append(actual)
        filtered_pred.append(predicted)

y_binary = create_binarised_output(filtered_y)
pred_binary = create_binarised_output(filtered_pred)

# Calculate RMSE
# Unlike the scores below, this uses every test row: missing predictions are
# replaced by fillNa with the mean of the available predictions.
rmse = np.sqrt(mean_squared_error(y, fillNa(predictions)))
print(rmse)

# Calculate precision
precision = precision_score(y_binary, pred_binary)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_binary, pred_binary)
print("Recall:", recall)

# Calculate f1
f1 = f1_score(y_binary, pred_binary)
print("f1-score:", f1)