In [3]:
import os
import sys
import subprocess
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("MovieLensALS").config("spark.driver.host", "localhost").getOrCreate()
spark

# Apache Spark

In [4]:
ratings = spark.read.option("sep", "\t").option("inferSchema", "true").csv("ml-100k/u.data").toDF("userId", "movieId", "rating", "timestamp")

ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows


In [5]:
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=10,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop"
)

model = als.fit(train)

In [8]:
predictions = model.transform(test)

metrics = {}

for metric in ["rmse", "mae"]:
    evaluator = RegressionEvaluator(
        metricName=metric,
        labelCol="rating",
        predictionCol="prediction"
    )
    metrics[f"spark_{metric}"] = evaluator.evaluate(predictions)

metrics

{'spark_rmse': 0.9200193184406374, 'spark_mae': 0.728176449245273}

# Surprise library

## SVD

In [9]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    ratings.select("userId", "movieId", "rating").toPandas(),
    reader
)

algo = SVD(random_state=42)

cv_results = cross_validate(
    algo,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=False
)

surprise_metrics = {
    "surprise_rmse": cv_results["test_rmse"].mean(),
    "surprise_mae": cv_results["test_mae"].mean()
}

surprise_metrics

{'surprise_rmse': 0.9380461120899574, 'surprise_mae': 0.7393794987741529}

## KNN User-Based

In [16]:
from surprise import KNNBasic

sim_options = {
    "name": "cosine",
    "user_based": True
}

knn_user = KNNBasic(sim_options=sim_options)

knn_user_results = cross_validate(
    knn_user,
    data,
    measures=["RMSE", "MAE"],
    cv=3,
    verbose=False
)
knn_user_results= {
    "knn_user_rmse": knn_user_results["test_rmse"].mean(),
    "knn_user_mae": knn_user_results["test_mae"].mean()
}
knn_user_results

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'knn_user_rmse': 1.022054068759951, 'knn_user_mae': 0.8091106181274809}

## KNN Item-Based

In [18]:
sim_options = {
    "name": "cosine",
    "user_based": False
}

knn_item = KNNBasic(sim_options=sim_options)

knn_item_results = cross_validate(
    knn_item,
    data,
    measures=["RMSE", "MAE"],
    cv=3,
    verbose=False
)
knn_item_metrics= {
    "knn_item_rmse": knn_item_results["test_rmse"].mean(),
    "knn_item_mae": knn_item_results["test_mae"].mean()
}
knn_item_metrics

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'knn_item_rmse': 1.036685878694282, 'knn_item_mae': 0.822683719914772}

# Results Comparison

In [19]:
import pandas as pd

comparison = pd.DataFrame({
    "Model": ["Spark ALS", "Surprise SVD", "Surprise KNN User-Based", "Surprise KNN Item-Based"],
    "RMSE": [
        metrics["spark_rmse"],
        surprise_metrics["surprise_rmse"],
        knn_user_results["knn_user_rmse"].mean(),
        knn_item_metrics["knn_item_rmse"].mean()
    ],
    "MAE": [
        metrics["spark_mae"],
        surprise_metrics["surprise_mae"],
        knn_user_results["knn_user_mae"].mean(),
        knn_item_metrics["knn_item_mae"].mean()
    ]
})

comparison

Unnamed: 0,Model,RMSE,MAE
0,Spark ALS,0.920019,0.728176
1,Surprise SVD,0.938046,0.739379
2,Surprise KNN User-Based,1.022054,0.809111
3,Surprise KNN Item-Based,1.036686,0.822684
