## Required installations

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install gdown pyspark

## Imports

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sklearn.metrics import recall_score, precision_score, f1_score,mean_squared_error
from pyspark.sql.functions import mean,count, sum,col
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

## Spark Session


In [None]:
# Create (or reuse) a Spark session that runs locally on every available core,
# asking for 16 GB for both the driver and the executor.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Recommender")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

## Load and format the ratings data


### 100k

In [None]:
# Download the 100k ratings file by its Google Drive id.
# load_100k() below reads it from /kaggle/working/u.data.
!gdown 1lwPW7OefaJnwsaqYBQs-wgcIGiatYLXb

def load_100k(path='/kaggle/working/u.data'):
    """Read the tab-separated 100k ratings file into a Spark DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the ratings file. Defaults to the Kaggle working-directory
        path so existing ``load_100k()`` calls keep working; pass another path
        to run the notebook outside Kaggle.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``userId``, ``movieId`` and ``rating``, all cast to ``int``.
        Any further columns present in the file are dropped.
    """
    raw = (
        spark.read
        .option("delimiter", "\t")
        .option("header", "False")
        .csv(path)
    )
    # The file has no header, so Spark names the columns _c0, _c1, ...;
    # keep the first three, casting and renaming them in a single pass.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("int").alias('rating'),
    )


ratings_df = load_100k()

### 1M

In [None]:
# Download the 1M ratings file by its Google Drive id.
# load_1m() below reads it from /kaggle/working/ratings.dat.
!gdown 18sHWE7Eu28hDqXib2PvesBYMea5AQmZs

def load_1m(path='/kaggle/working/ratings.dat'):
    """Read the ``::``-separated 1M ratings file into a Spark DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the ratings file. Defaults to the Kaggle working-directory
        path so existing ``load_1m()`` calls keep working; pass another path
        to run the notebook outside Kaggle.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``userId``, ``movieId`` and ``rating``, all cast to ``int``.
        Any further columns present in the file are dropped.
    """
    raw = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(path)
    )
    # The file has no header, so Spark names the columns _c0, _c1, ...;
    # keep the first three, casting and renaming them in a single pass.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("int").alias('rating'),
    )


ratings_df = load_1m()

### 10M

In [None]:
# Download the 10M ratings file by its Google Drive id.
# load_10m() below reads it from /kaggle/working/ratings.dat.
# NOTE(review): that is the same path load_1m() reads, so presumably whichever
# download ran last determines what both loaders see -- confirm.
!gdown 1e064MFX83PYtPDcISjYQw4fTQtv-PG38

def load_10m(path='/kaggle/working/ratings.dat'):
    """Read the ``::``-separated 10M ratings file into a Spark DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the ratings file. Defaults to the Kaggle working-directory
        path so existing ``load_10m()`` calls keep working; pass another path
        to run the notebook outside Kaggle.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``userId`` and ``movieId`` cast to ``int`` and ``rating`` cast
        to ``double``. Any further columns present in the file are dropped.
    """
    raw = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(path)
    )
    # The file has no header, so Spark names the columns _c0, _c1, ...
    # The ids are integers, but the 10M ratings use half-star steps
    # (0.5, 1.0, ..., 5.0): casting them to int would turn 3.5 into 3 and
    # distort both the RMSE and the 3.5 like/dislike cut-off used later.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("double").alias('rating'),
    )


ratings_df = load_10m()

## Split/Train/Test

In [None]:
# Hold out 10% of the ratings for testing; the fixed seed keeps the split
# reproducible. `train` is reused on every iteration below, so cache it.
train, test = ratings_df.randomSplit([0.9, 0.1], 2345)
train = train.cache()

# Regularisation strengths to try: 0, 0.25, ..., 10.
lambdas = np.arange(0, 10.25, 0.25)
results = []

# ---- Work that does not depend on lambda is done once, outside the loop ----

# Global mean rating, taken from the training rows only so that no
# information from the held-out test rows leaks into the model.
mu = train.select(F.mean('rating')).first()[0]

# Per-movie rating sum and count; b_i for every lambda is derived from these.
movie_stats = train.groupBy('movieId').agg(
    F.sum('rating').alias('sum_rating'),
    F.count('rating').alias('count_rating'),
).cache()

# Exclude test rows whose user or movie never appears in training. Semi joins
# keep this inside Spark instead of collecting every id to the driver and
# passing the lists to `isin`.
train_user_ids = train.select('userId').distinct()
train_movie_ids = train.select('movieId').distinct()

# The evaluation step collects the true ratings from `test_filtered` and the
# predictions from the joined frame below. Spark gives no row-order guarantee
# across joins, so both are sorted on the same keys to keep the two collected
# lists aligned element by element. `rating` is part of the key so that even
# duplicate (userId, movieId) rows line up.
order_keys = ['userId', 'movieId', 'rating']
test_filtered = (
    test.join(train_user_ids, on='userId', how='left_semi')
        .join(train_movie_ids, on='movieId', how='left_semi')
        .select(*test.columns)
        .orderBy(*order_keys)
        .cache()
)

evaluator = RegressionEvaluator(labelCol='rating', predictionCol='pred', metricName='rmse')

for lam in lambdas:
    # Plain Python float: used in Column arithmetic and stored in `results`.
    reg = float(lam)

    # Movie bias: b_i = sum_u(r_ui - mu) / (n_i + lambda)
    b_i = movie_stats.select(
        'movieId',
        ((F.col('sum_rating') - mu * F.col('count_rating'))
         / (F.col('count_rating') + reg)).alias('b_i'),
    )

    # User bias: b_u = sum_i(r_ui - mu - b_i) / (n_u + lambda)
    b_u = (
        train.join(b_i, on='movieId')
        .groupBy('userId')
        .agg(
            F.sum('rating').alias('sum_rating'),
            F.count('rating').alias('count_rating'),
            F.sum('b_i').alias('sum_bi'),
        )
        .select(
            'userId',
            ((F.col('sum_rating') - mu * F.col('count_rating') - F.col('sum_bi'))
             / (F.col('count_rating') + reg)).alias('b_u'),
        )
    )

    # Baseline prediction mu + b_i + b_u for every retained test row, in the
    # same order as `test_filtered`. Cached because it is consumed twice
    # (RMSE and collect) and released afterwards.
    predicted_ratings = (
        test_filtered.join(b_i, on='movieId').join(b_u, on='userId')
        .withColumn('pred', mu + F.col('b_i') + F.col('b_u'))
        .orderBy(*order_keys)
        .select('rating', 'pred')
        .cache()
    )
    rmse = evaluator.evaluate(predicted_ratings)
    pred = [row['pred'] for row in predicted_ratings.select('pred').collect()]
    predicted_ratings.unpersist()

    # One entry per lambda: [lambda, rmse, predictions for the test rows].
    results.append([reg, rmse, pred])



## Evaluation

In [None]:
def create_binarised_output(ratings, threshold=None):
    """Turn ratings into binary labels using a cut-off.

    Parameters
    ----------
    ratings : iterable of numbers
        Ratings (true or predicted) to binarise.
    threshold : number, optional
        Cut-off to apply. When omitted, the notebook-level ``treshold``
        variable is read at call time, which is what this function always
        did before the parameter existed.

    Returns
    -------
    list of int
        ``1`` where the rating is greater than or equal to the cut-off,
        ``0`` otherwise, in the same order as ``ratings``.
    """
    if threshold is None:
        # Fall back to the notebook global (spelled `treshold` in this file).
        threshold = treshold
    return [1 if rating >= threshold else 0 for rating in ratings]

# Ratings at or above this value count as the positive class.
treshold = 3.5

# True ratings of the test rows that were scored.
# NOTE(review): the metrics below pair `y` and `pred` position by position, so
# they are only meaningful if both lists come out in the same row order --
# Spark does not guarantee that unless both frames are explicitly sorted.
y = test_filtered.select('rating').rdd.map(lambda row: row[0]).collect()

# Each entry of `results` is [lambda, rmse, predictions]; keep the lowest RMSE.
best_lambda, best_rmse, pred = min(results, key=lambda run: run[1])

# Binarise ground truth and predictions with the same cut-off.
y_binary, pred_binary = create_binarised_output(y), create_binarised_output(pred)

# Classification quality of the binarised predictions.
precision = precision_score(y_binary, pred_binary)
recall = recall_score(y_binary, pred_binary)
f1 = f1_score(y_binary, pred_binary)

# Report the selected lambda, its RMSE and the three classification metrics.
print("Best lambda : ",best_lambda,"\nBest RMSE : ",best_rmse)
print("Precision:", precision)
print("Recall:", recall)
print("f1-score:", f1)
