## Required installations

In [None]:
!pip install gdown
!pip install pyspark

## Imports

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
from pyspark.ml.evaluation import RegressionEvaluator

## Spark Session


In [None]:
# Build (or reuse) the notebook-wide Spark session.
spark = (
    SparkSession.builder
    .master("local[*]")                       # run locally on all available cores
    .appName("Recommender")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

## Load and format the ratings data

### 100k

In [None]:
# Download the two input files for this dataset with gdown.
# Presumably these are the u.data and u.item files that load_100k() reads
# from /kaggle/working — TODO confirm the IDs and the download directory.
!gdown 1lwPW7OefaJnwsaqYBQs-wgcIGiatYLXb
!gdown 1zCAfXEzy9uQYVQWrJ-BpPwQdEUqQaeSK

def load_100k(data_dir='/kaggle/working'):
    """Load the 100k ratings and join them with the per-movie columns.

    Reads the tab-separated, header-less ``u.data`` file and keeps its first
    three columns as ``userId``, ``movieId`` and ``rating``. Reads the
    pipe-separated, header-less ``u.item`` file, uses its first column as
    ``movieId`` and drops columns ``_c1``-``_c4``. The two frames are joined
    on ``movieId`` and every resulting column is cast to int.

    Args:
        data_dir: Directory (or URI prefix) containing ``u.data`` and
            ``u.item``. Defaults to ``/kaggle/working``, the location that
            was previously hard-coded.

    Returns:
        A Spark DataFrame with ``movieId``, ``userId``, ``rating`` and the
        remaining ``u.item`` columns, all as integers.
    """
    ratings = (
        spark.read
        .option("delimiter", "\t")
        .option("header", "False")
        .csv(f"{data_dir}/u.data")
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    movies = (
        spark.read
        .option("delimiter", "|")
        .option("header", "False")
        .csv(f"{data_dir}/u.item")
        .withColumnRenamed('_c0', 'movieId')
        .drop('_c1', '_c2', '_c3', '_c4')
    )

    joined = ratings.join(movies, on='movieId')
    # CSV columns are read as strings; the model needs numeric inputs.
    return joined.select([F.col(c).cast("int") for c in joined.columns])


ratings_df = load_100k()

### 1M

In [None]:
# Download the two input files for this dataset with gdown.
# Presumably these are the ratings.dat and movies.csv files that load_1m()
# reads from /kaggle/working — TODO confirm the IDs and the download directory.
!gdown 18sHWE7Eu28hDqXib2PvesBYMea5AQmZs
!gdown 1PtKj4n-sL1PjvbXUqJ_rC_W5MDzQbrgn

def load_1m(data_dir='/kaggle/working'):
    """Load the 1M ratings and join them with the per-movie columns.

    Reads the ``::``-separated, header-less ``ratings.dat`` file and keeps its
    first three columns as ``userId``, ``movieId`` and ``rating``. Reads the
    comma-separated ``movies.csv`` file (with header), renames ``movie_id`` to
    ``movieId`` and drops ``movie_title``, ``release_date`` and ``imdb_url``.
    The two frames are joined on ``movieId`` and every resulting column is
    cast to int.

    Args:
        data_dir: Directory (or URI prefix) containing ``ratings.dat`` and
            ``movies.csv``. Defaults to ``/kaggle/working``, the location
            that was previously hard-coded.

    Returns:
        A Spark DataFrame with ``movieId``, ``userId``, ``rating`` and the
        remaining ``movies.csv`` columns, all as integers.
    """
    ratings = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(f"{data_dir}/ratings.dat")
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    movies = (
        spark.read
        .option("delimiter", ",")
        .option("header", "True")
        .csv(f"{data_dir}/movies.csv")
        .withColumnRenamed('movie_id', 'movieId')
        .drop('movie_title', 'release_date', 'imdb_url')
    )

    joined = ratings.join(movies, on='movieId')
    # CSV columns are read as strings; the model needs numeric inputs.
    return joined.select([F.col(c).cast("int") for c in joined.columns])


ratings_df = load_1m()

### 10M

In [None]:
# Download the two input files for this dataset with gdown.
# Presumably these are the ratings.dat and movies.csv files that load_10m()
# reads from /kaggle/working — TODO confirm the IDs and the download directory.
# NOTE(review): load_1m() reads the same two file names from the same
# directory, so running both cells likely overwrites one dataset with the
# other — confirm.
!gdown 1e064MFX83PYtPDcISjYQw4fTQtv-PG38
!gdown 1lksdO8vXSpkE1DUlXQKsgtLiTQzShF2O

def load_10m(data_dir='/kaggle/working'):
    """Load the 10M ratings and join them with the per-movie columns.

    Reads the ``::``-separated, header-less ``ratings.dat`` file and keeps its
    first three columns as ``userId``, ``movieId`` and ``rating``. Reads the
    comma-separated ``movies.csv`` file (with header), renames ``movie_id`` to
    ``movieId`` and drops ``movie_title``, ``release_date`` and ``imdb_url``.
    The two frames are joined on ``movieId`` and every resulting column is
    cast to int.

    NOTE(review): ``rating`` is cast to int along with every other column; if
    this dataset contains fractional ratings they will not survive the cast
    unchanged — confirm against the data.

    Args:
        data_dir: Directory (or URI prefix) containing ``ratings.dat`` and
            ``movies.csv``. Defaults to ``/kaggle/working``, the location
            that was previously hard-coded.

    Returns:
        A Spark DataFrame with ``movieId``, ``userId``, ``rating`` and the
        remaining ``movies.csv`` columns, all as integers.
    """
    ratings = (
        spark.read
        .option("delimiter", "::")
        .option("header", "False")
        .csv(f"{data_dir}/ratings.dat")
        .select('_c0', '_c1', '_c2')
        .withColumnRenamed('_c0', 'userId')
        .withColumnRenamed('_c1', 'movieId')
        .withColumnRenamed('_c2', 'rating')
    )

    movies = (
        spark.read
        .option("delimiter", ",")
        .option("header", "True")
        .csv(f"{data_dir}/movies.csv")
        .withColumnRenamed('movie_id', 'movieId')
        .drop('movie_title', 'release_date', 'imdb_url')
    )

    joined = ratings.join(movies, on='movieId')
    # CSV columns are read as strings; the model needs numeric inputs.
    return joined.select([F.col(c).cast("int") for c in joined.columns])


ratings_df = load_10m()

## Split/Train/Test

In [None]:
# Seeded 90/10 split of the ratings into training and test sets.
train, test = ratings_df.randomSplit([0.9, 0.1], 2345)

# Every column except the label is used as an input feature.
columns = [col for col in ratings_df.columns if col != 'rating']

# Pack the feature columns into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=columns, outputCol="features", handleInvalid="keep")
assembled_df = assembler.transform(train).select("rating", "features")
assembled_testSet = assembler.transform(test).select("rating", "features")

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Spark ML only accepts maxDepth in [0, 30]; cap the value so datasets with
# many feature columns do not make the classifier reject its parameters.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol='rating',
    impurity="entropy",
    maxDepth=min(len(columns) + 2, 30),
)
# Keep the fitted model instead of discarding the result of fit().
dt_model = dt.fit(assembled_df)


## Evaluation

In [None]:
def create_binarised_output(ratings, threshold=None):
    """Convert ratings into binary labels.

    Args:
        ratings: Iterable of numeric ratings.
        threshold: Cut-off for the positive class. When omitted, the
            notebook-level ``treshold`` variable is used, so that variable
            must be defined before the function is called without this
            argument.

    Returns:
        A list with 1 for every rating greater than or equal to the
        threshold and 0 otherwise, in the same order as ``ratings``.
    """
    if threshold is None:
        # Backward-compatible fallback to the (misspelt) notebook global.
        threshold = treshold
    return [1 if rating >= threshold else 0 for rating in ratings]


# Fit on the training split only. Fitting on the test split and then scoring
# that same split leaks the labels into the model and inflates every metric.
pipeline = Pipeline(stages=[dt])
pipeline = pipeline.fit(assembled_df)
prediction = pipeline.transform(assembled_testSet)

# Collect labels and predictions from the same rows in a single pass so the
# two lists are guaranteed to be aligned element by element.
rows = prediction.select('rating', 'prediction').collect()
y = [row['rating'] for row in rows]
pred = [row['prediction'] for row in rows]

# Ratings at or above this value count as the positive class.
# The spelling is kept because create_binarised_output reads this global.
treshold = 3.5

y_binary = create_binarised_output(y)
pred_binary = create_binarised_output(pred)

# Calculate RMSE
rmse = evaluator.evaluate(prediction)
print("RMSE:", rmse)

# Calculate precision
precision = precision_score(y_binary, pred_binary)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_binary, pred_binary)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_binary, pred_binary)
print("F1 score:", f1)
