In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd


import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType, StructType, StructField
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, CountVectorizer, VectorAssembler
from pyspark.sql.window import Window
import pyspark.sql.functions as F

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_random_split
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

In [13]:
import fun

In [3]:
#constants
COL_USER= "UserId"
COL_ITEM= "MovieId"
COL_RATING= "Rating"
COL_TITLE = "title"
COL_GENRES ="genres"
TOP_K = 10

In [4]:
MOVIELENS_DATA_SIZE = '100k'

# topk, user, item column names
TOP_K = 10

spark = start_or_get_spark("ALS PySpark", memory="16g")
spark.conf.set("spark.sql.crossJoin.enabled", "true")
spark

data_full = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='title',
    genres_col='genres'
)

data_full.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 14.6kKB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


## Dataset conversion

In [5]:
data_full_spark = fun.movielens_to_spark(data_full, schema = fun.get_movielens_schema())

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+------+-------+------+---------+------------+------+
|UserId|MovieId|Rating|Timestamp|       title|genres|
+------+-------+------+---------+------------+------+
|   196|    242|   3.0|881250949|Kolya (1996)|Comedy|
|    63|    242|   3.0|875747190|Kolya (1996)|Comedy|
|   226|    242|   5.0|883888671|Kolya (1996)|Comedy|
|   154|    242|   3.0|879138235|Kolya (1996)|Comedy|
|   306|    242|   5.0|876503793|Kolya (1996)|Comedy|
+------+-------+------+---------+------------+------+
only showing top 5 rows



In [10]:
data_full_reversed = fun.movielens_to_pandas(data_full_spark)

In [11]:
data_full_reversed.head(5)

Unnamed: 0,UserId,MovieId,Rating,Timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


In [6]:
feature_data = fun.create_feature_data(data_full_spark)

In [26]:
feature_data.show(5)

+-------+--------------------+
|MovieId|            features|
+-------+--------------------+
|    673|(1043,[169,690,10...|
|    879|(1043,[909,1026,1...|
|     66|(1043,[256,1025,1...|
|      9|(1043,[11,342,101...|
|    605|(1043,[754,848,94...|
+-------+--------------------+
only showing top 5 rows



## Suprise Train

In [7]:
data = data_full[['UserId', 'MovieId', 'Rating']]
train, test = python_random_split(data, 0.75)
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 7.443470682948828 seconds for training.


In [8]:
train_df_spark, test_df_spark, user_item = fun.split_spark(data_full_spark)
user_item_pd = fun.movielens_to_pandas(user_item)
predictions = predict(svd, user_item_pd, usercol='UserId', itemcol='MovieId')
pred_spark = fun.movielens_to_spark(predictions, schema = fun.get_predictions_schema())
pred_spark.show(5)

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|   148|    496| 3.4756436|
|   148|    471| 3.4078066|
|   148|    463|  4.163447|
|   148|    148| 3.0047324|
|   148|   1342| 3.2007117|
+------+-------+----------+
only showing top 5 rows

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|   148|    496| 3.4756436|
|   148|    471| 3.4078066|
|   148|    463|  4.163447|
|   148|    148| 3.0047324|
|   148|   1342| 3.2007117|
+------+-------+----------+
only showing top 5 rows



## Create Top K

In [9]:
top_k, top_all = fun.create_topk_topall(pred_df_spark = pred_spark , train_df_spark = train_df_spark,top_k = TOP_K )
top_k.show()

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|   148|    124|       5.0|
|   148|    129| 4.9517055|
|   148|     71| 4.6859555|
|   148|    511|  4.666545|
|   148|     12| 4.6350036|
|   148|    170|  4.633393|
|   148|   1240| 4.5800695|
|   148|    523| 4.5740476|
|   148|    199|  4.559443|
|   148|     64|  4.517385|
|   463|    183| 4.3592577|
|   463|     98| 4.3387527|
|   463|    887|  4.316803|
|   463|    511|  4.290304|
|   463|    275|  4.266462|
|   463|    515| 4.1848965|
|   463|    202| 4.1561847|
|   463|    199|    4.1174|
|   463|    169| 4.0986457|
|   463|    272|  4.072341|
+------+-------+----------+
only showing top 20 rows



## Get metrics

In [10]:
diversity_collaborative, diversity_content, ranking, rating = fun.get_metrics(train_df_spark, test_df_spark, top_k, top_all, feature_data, top_k = TOP_K)

## Display metrics

In [14]:
met_res = fun.display_metrics(diversity_collaborative, diversity_content, ranking, rating)

In [15]:
met_res

Unnamed: 0,Metric,Score,Range,Criteria
0,Collaborative Diversity,0.722865,"[0,1]",The closer to 1 the better
1,Collaborative Serendipity,0.78984,"[0,1]",The closer to 1 the better
2,Collaborative Novelty,9.26111,>=0,Inverse popularity. The higher the better
3,Content Diversity,0.869231,"[0,1]",The closer to 1 the better
4,Content Serendipity,0.880999,"[0,1]",The closer to 1 the better
5,Content Novelty,9.26111,>=0,Inverse popularity. The higher the better
6,RMSE,0.584161,>0,The smaller the better
7,MAE,0.424652,>=0,The smaller the better
8,R Squared,0.72836,<=1,The closer to 1 the better
9,Precision@k,0.205101,"[0,1]",The closer to 1 the better. Grows with k


In [20]:
met_res.to_csv("svd_20k_results.csv", index = False)