# SAR
https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/sar_deep_dive.ipynb

## Prepare

In [1]:
# set the environment path to find Recommenders
import sys

import itertools
import logging
import os

import numpy as np
import pandas as pd
import papermill as pm

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.sar.sar_singlenode import SARSingleNode

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.7.0 (default, Oct  9 2018, 10:31:47) 
[GCC 7.3.0]
Pandas version: 1.1.5


In [2]:
import fun
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType, StructType, StructField
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation

In [3]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

## Import and prepare data

In [4]:
data_full = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='Title',
    genres_col='Genres'
)

data_full_spark = fun.movielens_to_spark(data_full, schema = fun.get_movielens_schema())

data = data_full[['UserId', 'MovieId', 'Rating', 'Timestamp', 'Title']]

# Convert the float precision to 32-bit in order to reduce memory consumption 
data.loc[:, 'Rating'] = data['Rating'].astype(np.float32)

data.head()

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.78kKB/s]


Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+------+-------+------+---------+------------+------+
|UserId|MovieId|Rating|Timestamp|       title|genres|
+------+-------+------+---------+------------+------+
|   196|    242|   3.0|881250949|Kolya (1996)|Comedy|
|    63|    242|   3.0|875747190|Kolya (1996)|Comedy|
|   226|    242|   5.0|883888671|Kolya (1996)|Comedy|
|   154|    242|   3.0|879138235|Kolya (1996)|Comedy|
|   306|    242|   5.0|876503793|Kolya (1996)|Comedy|
+------+-------+------+---------+------------+------+
only showing top 5 rows



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title
0,196,242,3.0,881250949,Kolya (1996)
1,63,242,3.0,875747190,Kolya (1996)
2,226,242,5.0,883888671,Kolya (1996)
3,154,242,3.0,879138235,Kolya (1996)
4,306,242,5.0,876503793,Kolya (1996)


In [5]:
header = {
    "col_user": "UserId",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
    "col_prediction": "Prediction",
}

train, test = python_stratified_split(data, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42)

In [6]:
traintestschema = StructType(
    (StructField("UserId", IntegerType()),
    StructField("MovieId", IntegerType()),
    StructField("Rating", FloatType()),
    StructField("Timestamp", LongType())))

In [7]:
train_df_spark = fun.movielens_to_spark(train[["UserId", "MovieId", "Rating","Timestamp"]], schema = traintestschema)
test_df_spark = fun.movielens_to_spark(test[["UserId", "MovieId", "Rating","Timestamp"]], schema = traintestschema)

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|     1|    232|   3.0|878543196|
|     1|     66|   4.0|878543030|
|     1|    106|   4.0|875241390|
|     1|     97|   3.0|875073128|
|     1|     73|   3.0|876892774|
+------+-------+------+---------+
only showing top 5 rows

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|     1|     49|   3.0|878542478|
|     1|     69|   3.0|875072262|
|     1|    221|   5.0|887431921|
|     1|      5|   3.0|889751712|
|     1|    139|   3.0|878543216|
+------+-------+------+

In [7]:
feature_data = fun.create_feature_data(data_full_spark)

## Train and predict SAR 

In [8]:
model = SARSingleNode(
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    time_now=None, 
    timedecay_formula=True, 
    **header
)

In [9]:
model.fit(train)

In [10]:
top_k = model.recommend_k_items(test, remove_seen=True, top_k = TOP_K)

## Make topk reco with examples

In [11]:
top_k_with_info = (top_k.join(data_full[['MovieId', 'Title', 'Genres']].drop_duplicates().set_index('MovieId'), 
                                on='MovieId', 
                                how='inner').sort_values(by=['UserId', 'Prediction'], ascending=False))
display(top_k_with_info.head(30))

Unnamed: 0,UserId,MovieId,Prediction,Title,Genres
9420,943,82,21.313228,Jurassic Park (1993),Action|Adventure|Sci-Fi
9421,943,403,21.158839,Batman (1989),Action|Adventure|Crime|Drama
9422,943,568,20.962922,Speed (1994),Action|Romance|Thriller
9423,943,423,20.16217,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
9424,943,89,19.890513,Blade Runner (1982),Film-Noir|Sci-Fi
9425,943,393,19.832944,Mrs. Doubtfire (1993),Comedy
9426,943,11,19.570244,Seven (Se7en) (1995),Crime|Thriller
9427,943,71,19.553877,"Lion King, The (1994)",Animation|Children's|Musical
9428,943,202,19.422129,Groundhog Day (1993),Comedy|Romance
9429,943,238,19.115604,Raising Arizona (1987),Comedy


In [12]:
top_k_with_info.to_csv("sar_topk_visual_results.csv", index = False)

## Evaluate

### Use recommenders example approach for precision and recall

In [13]:
# all ranking metrics have the same arguments
args = [test, top_k]
kwargs = dict(col_user='UserId', 
              col_item='MovieId', 
              col_rating='Rating', 
              col_prediction='Prediction', 
              relevancy_method='top_k', 
              k=TOP_K)

eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [14]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 20
Precision@K:	 0.247826
Recall@K:	 0.253035


### Use our methods for diversity metrics,leave ratings as NA

In [15]:
topk_spark = fun.movielens_to_spark(top_k, schema = fun.get_predictions_schema())

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|     1|    238| 3.2744997|
|     1|     69| 3.2352996|
|     1|    423| 3.1427104|
|     1|    204| 3.1217399|
|     1|     56|  3.116177|
+------+-------+----------+
only showing top 5 rows



In [16]:
def get_diversity_results(diversity_eval):
    metrics = {
        "catalog_coverage":diversity_eval.catalog_coverage(),
        "distributional_coverage":diversity_eval.distributional_coverage(), 
        "novelty": diversity_eval.novelty(), 
        "diversity": diversity_eval.diversity(), 
        "serendipity": diversity_eval.serendipity()
    }
    return metrics

In [17]:
collaborative_diversity_eval = SparkDiversityEvaluation(
        train_df = train_df_spark, 
        reco_df = topk_spark,
        col_user = "UserId", 
        col_item = "MovieId"
    )
diversity_collaborative = get_diversity_results(collaborative_diversity_eval)

In [18]:
print("Collaborative diversity:\t%f" % diversity_collaborative['diversity'],
      "Collaborative serendipity:\t%f" % diversity_collaborative['serendipity'], 
     "Collaborative novelty:\t%f" % diversity_collaborative['novelty'], sep='\n')

Collaborative diversity:	0.571075
Collaborative serendipity:	0.697473
Collaborative novelty:	8.483059


Content based

In [19]:
content_diversity_eval = SparkDiversityEvaluation(
        train_df = train_df_spark, 
        reco_df = topk_spark,
        item_feature_df = feature_data, 
        item_sim_measure="item_feature_vector",
        col_user = "UserId", 
        col_item = "MovieId"
    )
diversity_content = get_diversity_results(content_diversity_eval)

In [20]:
print("Content diversity:\t%f" % diversity_content['diversity'],
      "Content serendipity:\t%f" % diversity_content['serendipity'], 
     "Content novelty:\t%f" % diversity_content['novelty'], sep='\n')

Content diversity:	0.858798
Content serendipity:	0.869371
Content novelty:	8.483059
