In [None]:
import sys
import logging
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

from recommenders.utils.timer import Timer
from recommenders.utils.python_utils import binarize
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.sar import SAR
from recommenders.evaluation.python_evaluation import (
    map,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var
)
from recommenders.utils.notebook_utils import store_metadata

# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Log the interpreter and core library versions alongside the results.
print(f"System version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

In [None]:
# Number of items to recommend per user.
TOP_K = 10

# Multiplier applied to the 'Purchase' column when it is combined with 'Click'
# into a single rating value.
PURCHASE_WEIGHT = 5

# Training users with more than this many distinct items are dropped by the
# "denoise" filter below. Only an upper bound is applied (no minimum count).
MAX_ITEMS_PER_USER = 30

# Load the raw interaction log.
# NOTE(review): expects columns UserId, ItemId, Click and Purchase — confirm
# against the CSV schema.
data = pd.read_csv('data/training_set.csv')

# Build the (user, item, rating) frame used by the rest of the notebook.
data['UserId'] = data['UserId'].astype('category')
data['ItemId'] = data['ItemId'].astype('category')
data['Rating'] = (data['Click'] + PURCHASE_WEIGHT * data['Purchase']).astype(np.float32)
data = data[['UserId', 'ItemId', 'Rating']]

# Distinct item count per user. `observed=True` is equivalent here because the
# categories were inferred from this very column, and it avoids the pandas
# FutureWarning about the changing default for categorical group keys.
user_list = data.groupby('UserId', observed=True)['ItemId'].nunique()
user_list_denoise = user_list[user_list <= MAX_ITEMS_PER_USER].index.to_list()

# The public test set is read as headerless: column names are supplied
# explicitly (one user id followed by 1000 item-id columns).
test_df = pd.read_csv(
    'data/public_testset.csv',
    names=['user_id'] + [f'item_id_{i}' for i in range(1, 1001)],
)
test_user_id = test_df['user_id'].values

# Users listed in the public test set are always kept, even when they exceed
# MAX_ITEMS_PER_USER; the set() removes ids present in both sources.
final_user_list = [*user_list_denoise, *test_user_id]
user_list_denoise = list(set(final_user_list))

# .copy() detaches the filtered frame from the original so that later column
# assignments on `data` do not trigger SettingWithCopyWarning.
data = data[data.UserId.isin(user_list_denoise)].copy()

In [None]:
# Split the filtered interactions into train / hold-out frames with a 0.9
# ratio; the fixed seed keeps the split reproducible across runs.
# NOTE(review): stratification is presumably per user (col_user) — confirm
# against the recommenders documentation.
train, test = python_stratified_split(
    data,
    ratio=0.9,
    col_user="UserId",
    col_item="ItemId",
    seed=42,
)

In [None]:
# SAR configuration. No timestamp column is passed and the time-decay formula
# is switched off: the prepared frame only carries UserId, ItemId and Rating.
sar_params = {
    "col_user": "UserId",
    "col_item": "ItemId",
    "col_rating": "Rating",
    "similarity_type": "jaccard",
    "timedecay_formula": False,
    "normalize": True,
}
model = SAR(**sar_params)

# Fit on the training split and measure how long it takes.
with Timer() as train_time:
    model.fit(train)

elapsed = train_time.interval
print("Took {} seconds for training.".format(elapsed))

In [None]:
import os

# Directory that receives this run's artifacts. Created (including parents)
# if missing; an already existing directory is not an error.
save_path = "runs/SAR"
os.makedirs(save_path, mode=0o777, exist_ok=True)

In [None]:
import pickle

# Serialize the fitted model into the run directory.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open(f"{save_path}/model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)