# Import

In [None]:
import sys
from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
    serendipity,
    diversity,
    novelty,
)

In [None]:
from surprise import Dataset, SVDpp, SVD, NormalPredictor, accuracy,  Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from recommenders.models.surprise.surprise_utils import (
    predict,
    # compute_ranking_predictions,
)
import surprise

import os
import sys
import cornac
import pandas as pd
# from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer


import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

import numpy as np
import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
def compute_ranking_predictions(
    algo,
    users,
    items,
    data,
    usercol='userID',
    itemcol='itemID',
    predcol='prediction',
    remove_seen=False,
):
    """Score every (user, item) combination with a fitted predictor.

    Args:
        algo: Object exposing ``predict(user, item)`` whose result has an
            ``est`` attribute (the estimated rating).
        users: Iterable of user ids to score.
        items: Iterable of item ids; each one is scored for every user.
        data (pandas.DataFrame): Known interactions. Only read when
            ``remove_seen`` is true; must then contain ``usercol`` and
            ``itemcol``.
        usercol (str): Name of the user column.
        itemcol (str): Name of the item column.
        predcol (str): Name of the output prediction column.
        remove_seen (bool): If true, drop the (user, item) pairs that
            already appear in ``data``.

    Returns:
        pandas.DataFrame: Columns ``[usercol, itemcol, predcol]``.
    """
    scored_pairs = [
        [user, item, algo.predict(user, item).est]
        for user in users
        for item in items
    ]
    all_predictions = pd.DataFrame(
        data=scored_pairs, columns=[usercol, itemcol, predcol]
    )

    if not remove_seen:
        return all_predictions

    # Flag every pair present in `data`. After the outer merge, the rows whose
    # flag is missing are exactly the pairs that were scored but never seen.
    seen_pairs = data[[usercol, itemcol]].assign(dummycol=1.0)
    merged = pd.merge(seen_pairs, all_predictions, on=[usercol, itemcol], how="outer")
    not_seen = merged["dummycol"].isnull()
    return merged[not_seen].drop("dummycol", axis=1)

In [None]:
def predict_ranking(
    model,
    users,
    items,
    data,
    usercol='userID',
    itemcol='itemID',
    predcol='prediction',
    remove_seen=False,
):
    """Build a (user, item, score) frame from a model that scores all items.

    Args:
        model: Object exposing ``score(user_idx)`` that returns an array-like
            with a ``tolist()`` method, one score per item.
        users: Iterable of ``(user_id, user_idx)`` pairs; ``user_id`` goes
            into the output and ``user_idx`` is passed to ``model.score``.
        items: Sequence of item ids. Scores are paired with it by position,
            so it must be in the same order as the scores ``model.score``
            returns.
        data (pandas.DataFrame): Known interactions. Only read when
            ``remove_seen`` is true; must then contain ``usercol`` and
            ``itemcol``.
        usercol (str): Name of the user column.
        itemcol (str): Name of the item column.
        predcol (str): Name of the output prediction column.
        remove_seen (bool): If true, drop the (user, item) pairs that
            already appear in ``data``.

    Returns:
        pandas.DataFrame: Columns ``[usercol, itemcol, predcol]``.
    """
    user_column, item_column, score_column = [], [], []

    for raw_uid, internal_idx in users:
        # One output row per item for this user.
        user_column.extend([raw_uid] * len(items))
        item_column.extend(items)
        score_column.extend(model.score(internal_idx).tolist())

    all_predictions = pd.DataFrame(
        data={usercol: user_column, itemcol: item_column, predcol: score_column}
    )

    if not remove_seen:
        return all_predictions

    # Flag every pair present in `data`. After the outer merge, the rows whose
    # flag is missing are exactly the pairs that were scored but never seen.
    seen_pairs = data[[usercol, itemcol]].assign(dummycol=1.0)
    merged = pd.merge(seen_pairs, all_predictions, on=[usercol, itemcol], how="outer")
    not_seen = merged["dummycol"].isnull()
    return merged[not_seen].drop("dummycol", axis=1)

# Parameters

In [None]:
# --- Input data ---------------------------------------------------------
# Paths of the train / test CSV files read by the "Load data" cell
# (left empty here).
train_file_path = ''
test_file_path = ''

# --- Evaluation settings ------------------------------------------------
TOP_K = 10  # passed as `k` / `top_k` to the ranking metrics and recommenders
threshold = 20  # passed as `threshold=` to the ranking metrics
relevancy_method = 'by_threshold'

# Training rows with rating >= implicit_threshold form the implicit-feedback
# subset (`impli_train`) used by BPR and LightGCN.
implicit_threshold = 5

# --- Surprise models ----------------------------------------------------
reader = Reader(rating_scale=(1, 10))

# SVD++
svdpp = SVDpp(
    n_factors=50,
    n_epochs=40,
    lr_all=0.007,
    reg_all=0.02,
    cache_ratings=True,
    verbose=True,
)

# SVD
svd = SVD(
    n_factors=50,
    n_epochs=40,
    lr_all=0.005,
    reg_all=0.02,
    verbose=True,
)

# Random baseline
random = NormalPredictor()

# --- Cornac BPR ---------------------------------------------------------
bpr = cornac.models.BPR(
    k=200,
    max_iter=100,
    learning_rate=0.01,
    lambda_reg=0.001,
)

# --- LightGCN -----------------------------------------------------------
hparams = prepare_hparams(
    model_type="lightgcn",
    embed_size=40,
    n_layers=7,
    batch_size=1024,
    epochs=40,
    decay=0.0001,
    learning_rate=0.003,
    eval_epoch=5,
    top_k=TOP_K,
    save_model=False,
    save_epoch=100,
    metrics=["recall", "ndcg", "precision", "map"],
    MODEL_DIR='./tests/resources/deeprec/lightgcn/model/lightgcn_model/',
)

# Load data

In [None]:
import pandas as pd

# Read the train and test interaction tables from the configured CSV paths.
train = pd.read_csv(train_file_path, sep=',')
test = pd.read_csv(test_file_path, sep=',')

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
train.info()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
test.info()

In [None]:
# Implicit-feedback subset: keep only the training rows whose rating reaches
# the implicit_threshold cut-off.
is_implicit_positive = train['rating'] >= implicit_threshold
impli_train = train[is_implicit_positive]

# SVD++

https://github.com/NicolasHug/Surprise/issues/123

In [None]:
# Surprise is given only the (user, item, rating) columns; the whole training
# frame becomes the trainset (build_full_trainset).
surprise_ratings = train[['userID', 'itemID', 'rating']]
trainset = Dataset.load_from_df(surprise_ratings, reader).build_full_trainset()

# Fit SVD++ on the full trainset.
svdpp.fit(trainset)

# Score the test frame with the recommenders `predict` helper.
predictions = predict(svdpp, test, usercol="userID", itemcol="itemID")


In [None]:
# --- Rating-prediction metrics on the test set --------------------------
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

print(
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
    sep="\n",
)

print("----")

# --- Ranking and beyond-accuracy metrics, computed per user chunk --------
# Each chunk scores len(chunk) * len(items) (user, item) pairs, so users are
# processed in groups of `user_chunk_size` (presumably to keep the per-chunk
# prediction frame small).
user_chunk_size = 200
unique_user_ids = train['userID'].unique()
user_chunks = [
    unique_user_ids[start:start + user_chunk_size]
    for start in range(0, len(unique_user_ids), user_chunk_size)
]

# Train / test rows of the users in each chunk, aligned with user_chunks.
train_chunks = [train[train['userID'].isin(chunk)] for chunk in user_chunks]
test_chunks = [test[test['userID'].isin(chunk)] for chunk in user_chunks]

eval_map_chunk = []
eval_ndcg_chunk = []
eval_precision_chunk = []
eval_recall_chunk = []

eval_serendipity_chunk = []
eval_diversity_chunk = []
eval_novelty_chunk = []

items = train.itemID.unique()

# Arguments shared by all ranking metrics; the values come from the
# parameters cell instead of being repeated as literals.
ranking_kwargs = dict(
    col_prediction="prediction",
    relevancy_method=relevancy_method,
    threshold=threshold,
    k=TOP_K,
)

for chunk_no, (users, user_data, test_data) in enumerate(
    zip(user_chunks, train_chunks, test_chunks), start=1
):
    all_predictions_chunk = compute_ranking_predictions(
        svdpp, users, items, user_data, usercol="userID", itemcol="itemID", remove_seen=True
    )

    eval_map_chunk.append(map_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_ndcg_chunk.append(ndcg_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_precision_chunk.append(precision_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_recall_chunk.append(recall_at_k(test_data, all_predictions_chunk, **ranking_kwargs))

    rec_df = get_top_k_items(all_predictions_chunk, col_rating='prediction', k=TOP_K)
    eval_serendipity_chunk.append(serendipity(user_data, rec_df))
    eval_diversity_chunk.append(diversity(user_data, rec_df))
    eval_novelty_chunk.append(novelty(user_data, rec_df))

    print(chunk_no)  # progress: number of chunks finished

# Aggregate results.
# NOTE(review): np.mean weights every chunk equally, so a shorter final chunk
# counts as much as a full one; confirm this is the intended aggregation.
eval_map = np.mean(eval_map_chunk)
eval_ndcg = np.mean(eval_ndcg_chunk)
eval_precision = np.mean(eval_precision_chunk)
eval_recall = np.mean(eval_recall_chunk)

eval_serendipity = np.mean(eval_serendipity_chunk)
eval_diversity = np.mean(eval_diversity_chunk)
eval_novelty = np.mean(eval_novelty_chunk)

# Print results
print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

print("----")

print(
    "Diversity:\t\t%f" % eval_diversity,
    "Novelty:\t\t%f" % eval_novelty,
    "Serendipity:\t\t%f" % eval_serendipity,
    sep="\n",
)

# SVD

In [None]:
# Surprise is given only the (user, item, rating) columns; the whole training
# frame becomes the trainset (build_full_trainset).
surprise_ratings = train[['userID', 'itemID', 'rating']]
trainset = Dataset.load_from_df(surprise_ratings, reader).build_full_trainset()

# Fit SVD on the full trainset.
svd.fit(trainset)

# Score the test frame with the recommenders `predict` helper.
predictions = predict(svd, test, usercol="userID", itemcol="itemID")



In [None]:
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Rating-prediction metrics on the test set --------------------------
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

print(
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
    sep="\n",
)

print("----")

# --- Ranking and beyond-accuracy metrics, computed per user chunk --------
# Each chunk scores len(chunk) * len(items) (user, item) pairs, so users are
# processed in groups of `user_chunk_size` (presumably to keep the per-chunk
# prediction frame small).
user_chunk_size = 200
unique_user_ids = train['userID'].unique()
user_chunks = [
    unique_user_ids[start:start + user_chunk_size]
    for start in range(0, len(unique_user_ids), user_chunk_size)
]

# Train / test rows of the users in each chunk, aligned with user_chunks.
train_chunks = [train[train['userID'].isin(chunk)] for chunk in user_chunks]
test_chunks = [test[test['userID'].isin(chunk)] for chunk in user_chunks]

eval_map_chunk = []
eval_ndcg_chunk = []
eval_precision_chunk = []
eval_recall_chunk = []

eval_serendipity_chunk = []
eval_diversity_chunk = []
eval_novelty_chunk = []

items = train.itemID.unique()

# Arguments shared by all ranking metrics; the values come from the
# parameters cell instead of being repeated as literals.
ranking_kwargs = dict(
    col_prediction="prediction",
    relevancy_method=relevancy_method,
    threshold=threshold,
    k=TOP_K,
)

for chunk_no, (users, user_data, test_data) in enumerate(
    zip(user_chunks, train_chunks, test_chunks), start=1
):
    all_predictions_chunk = compute_ranking_predictions(
        svd, users, items, user_data, usercol="userID", itemcol="itemID", remove_seen=True
    )

    eval_map_chunk.append(map_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_ndcg_chunk.append(ndcg_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_precision_chunk.append(precision_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_recall_chunk.append(recall_at_k(test_data, all_predictions_chunk, **ranking_kwargs))

    rec_df = get_top_k_items(all_predictions_chunk, col_rating='prediction', k=TOP_K)
    eval_serendipity_chunk.append(serendipity(user_data, rec_df))
    eval_diversity_chunk.append(diversity(user_data, rec_df))
    eval_novelty_chunk.append(novelty(user_data, rec_df))

    print(chunk_no)  # progress: number of chunks finished

# Aggregate results.
# NOTE(review): np.mean weights every chunk equally, so a shorter final chunk
# counts as much as a full one; confirm this is the intended aggregation.
eval_map = np.mean(eval_map_chunk)
eval_ndcg = np.mean(eval_ndcg_chunk)
eval_precision = np.mean(eval_precision_chunk)
eval_recall = np.mean(eval_recall_chunk)

eval_serendipity = np.mean(eval_serendipity_chunk)
eval_diversity = np.mean(eval_diversity_chunk)
eval_novelty = np.mean(eval_novelty_chunk)

# Print results
print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

print("----")

print(
    "Diversity:\t\t%f" % eval_diversity,
    "Novelty:\t\t%f" % eval_novelty,
    "Serendipity:\t\t%f" % eval_serendipity,
    sep="\n",
)

# Random

In [None]:
# Surprise is given only the (user, item, rating) columns; the whole training
# frame becomes the trainset (build_full_trainset).
surprise_ratings = train[['userID', 'itemID', 'rating']]
trainset = Dataset.load_from_df(surprise_ratings, reader).build_full_trainset()

# Fit the NormalPredictor baseline on the full trainset.
random.fit(trainset)

# Score the test frame with the recommenders `predict` helper.
predictions = predict(random, test, usercol='userID', itemcol="itemID")



<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x7ff20d7c5030>

In [None]:
# --- Rating-prediction metrics on the test set --------------------------
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

print(
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
    sep="\n",
)

print("----")

# --- Ranking and beyond-accuracy metrics, computed per user chunk --------
# Each chunk scores len(chunk) * len(items) (user, item) pairs, so users are
# processed in groups of `user_chunk_size` (presumably to keep the per-chunk
# prediction frame small).
user_chunk_size = 200
unique_user_ids = train['userID'].unique()
user_chunks = [
    unique_user_ids[start:start + user_chunk_size]
    for start in range(0, len(unique_user_ids), user_chunk_size)
]

# Train / test rows of the users in each chunk, aligned with user_chunks.
train_chunks = [train[train['userID'].isin(chunk)] for chunk in user_chunks]
test_chunks = [test[test['userID'].isin(chunk)] for chunk in user_chunks]

eval_map_chunk = []
eval_ndcg_chunk = []
eval_precision_chunk = []
eval_recall_chunk = []

eval_serendipity_chunk = []
eval_diversity_chunk = []
eval_novelty_chunk = []

items = train.itemID.unique()

# Arguments shared by all ranking metrics; the values come from the
# parameters cell instead of being repeated as literals.
ranking_kwargs = dict(
    col_prediction="prediction",
    relevancy_method=relevancy_method,
    threshold=threshold,
    k=TOP_K,
)

for chunk_no, (users, user_data, test_data) in enumerate(
    zip(user_chunks, train_chunks, test_chunks), start=1
):
    all_predictions_chunk = compute_ranking_predictions(
        random, users, items, user_data, usercol="userID", itemcol="itemID", remove_seen=True
    )

    eval_map_chunk.append(map_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_ndcg_chunk.append(ndcg_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_precision_chunk.append(precision_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_recall_chunk.append(recall_at_k(test_data, all_predictions_chunk, **ranking_kwargs))

    rec_df = get_top_k_items(all_predictions_chunk, col_rating='prediction', k=TOP_K)
    eval_serendipity_chunk.append(serendipity(user_data, rec_df))
    eval_diversity_chunk.append(diversity(user_data, rec_df))
    eval_novelty_chunk.append(novelty(user_data, rec_df))

    print(chunk_no)  # progress: number of chunks finished

# Aggregate results.
# NOTE(review): np.mean weights every chunk equally, so a shorter final chunk
# counts as much as a full one; confirm this is the intended aggregation.
eval_map = np.mean(eval_map_chunk)
eval_ndcg = np.mean(eval_ndcg_chunk)
eval_precision = np.mean(eval_precision_chunk)
eval_recall = np.mean(eval_recall_chunk)

eval_serendipity = np.mean(eval_serendipity_chunk)
eval_diversity = np.mean(eval_diversity_chunk)
eval_novelty = np.mean(eval_novelty_chunk)

# Print results
print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

print("----")

print(
    "Diversity:\t\t%f" % eval_diversity,
    "Novelty:\t\t%f" % eval_novelty,
    "Serendipity:\t\t%f" % eval_serendipity,
    sep="\n",
)

# BPR

In [None]:
# Select the (user, item, rating) columns explicitly, as the Surprise cells
# do, so the triplets handed to Cornac do not depend on the CSV's column
# order or on any extra columns it may contain.
uir_triplets = impli_train[['userID', 'itemID', 'rating']].itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_triplets)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

# Fit BPR on the implicit-feedback subset and report the elapsed time.
with Timer() as t:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(t))

In [None]:
import numpy as np

# BPR was fitted on impli_train only, so a train user without any rating
# >= implicit_threshold has no entry in the model's uid_map. Such users
# cannot be scored and are skipped instead of raising KeyError.
uid_map = bpr.train_set.uid_map
train_user_ids = train['userID'].unique()
unique_user_ids = [uid for uid in train_user_ids if uid in uid_map]
n_unknown_users = len(train_user_ids) - len(unique_user_ids)
if n_unknown_users:
    print("Skipping {} users that BPR was not trained on.".format(n_unknown_users))

# (external user id, Cornac internal user index) pairs, as predict_ranking expects.
cor_internal_user_ids = [uid_map[uid] for uid in unique_user_ids]
in_ex_unique_users_ids = list(zip(unique_user_ids, cor_internal_user_ids))

# Process users in groups of `user_chunk_size`; `user_chunks` carries the
# (id, index) pairs and `chunks` the matching external ids.
user_chunk_size = 200
chunk_starts = range(0, len(unique_user_ids), user_chunk_size)
user_chunks = [in_ex_unique_users_ids[s:s + user_chunk_size] for s in chunk_starts]
chunks = [unique_user_ids[s:s + user_chunk_size] for s in chunk_starts]

# Train / test rows of the users in each chunk, aligned with user_chunks.
train_chunks = [train[train['userID'].isin(chunk)] for chunk in chunks]
test_chunks = [test[test['userID'].isin(chunk)] for chunk in chunks]

# predict_ranking pairs model.score(...) output with `items` by position, so
# `items` must be the model's own item list. It must NOT be replaced by
# train.itemID.unique(): that list comes from the unfiltered train frame and
# can differ in length and order from the items BPR was fitted on.
items = list(bpr.train_set.iid_map.keys())

eval_map_chunk = []
eval_ndcg_chunk = []
eval_precision_chunk = []
eval_recall_chunk = []

eval_serendipity_chunk = []
eval_diversity_chunk = []
eval_novelty_chunk = []

# Arguments shared by all ranking metrics; the values come from the
# parameters cell instead of being repeated as literals.
ranking_kwargs = dict(
    col_prediction="prediction",
    relevancy_method=relevancy_method,
    threshold=threshold,
    k=TOP_K,
)

for chunk_no, (users, user_data, test_data) in enumerate(
    zip(user_chunks, train_chunks, test_chunks), start=1
):
    all_predictions_chunk = predict_ranking(
        bpr, users, items, user_data, usercol="userID", itemcol="itemID", remove_seen=True
    )

    eval_map_chunk.append(map_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_ndcg_chunk.append(ndcg_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_precision_chunk.append(precision_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_recall_chunk.append(recall_at_k(test_data, all_predictions_chunk, **ranking_kwargs))

    rec_df = get_top_k_items(all_predictions_chunk, col_rating='prediction', k=TOP_K)
    eval_serendipity_chunk.append(serendipity(user_data, rec_df))
    eval_diversity_chunk.append(diversity(user_data, rec_df))
    eval_novelty_chunk.append(novelty(user_data, rec_df))

    print(chunk_no)  # progress: number of chunks finished

# Aggregate results.
# NOTE(review): np.mean weights every chunk equally, so a shorter final chunk
# counts as much as a full one; confirm this is the intended aggregation.
eval_map = np.mean(eval_map_chunk)
eval_ndcg = np.mean(eval_ndcg_chunk)
eval_precision = np.mean(eval_precision_chunk)
eval_recall = np.mean(eval_recall_chunk)

eval_serendipity = np.mean(eval_serendipity_chunk)
eval_diversity = np.mean(eval_diversity_chunk)
eval_novelty = np.mean(eval_novelty_chunk)

# Print results
print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

print("----")

print(
    "Diversity:\t\t%f" % eval_diversity,
    "Novelty:\t\t%f" % eval_novelty,
    "Serendipity:\t\t%f" % eval_serendipity,
    sep="\n",
)

# LightGCN

In [None]:
# Wrap the implicit-feedback training subset and the test frame in the
# ImplicitCF data object that is handed to LightGCN below.
data = ImplicitCF(train= impli_train, test=test)

In [None]:
# Build the LightGCN model from the hyper-parameters defined in the
# parameters cell and the ImplicitCF data object.
model = LightGCN(hparams, data)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [None]:
# Train LightGCN and measure how long fitting takes.
with Timer() as train_time:
    model.fit()

# Report the elapsed time recorded by the Timer context manager.
print("Took {} seconds for training.".format(train_time.interval))

In [None]:
# --- Ranking and beyond-accuracy metrics, computed per user chunk --------
# Users are processed in groups of `user_chunk_size`.
user_chunk_size = 200
unique_user_ids = train['userID'].unique()
user_chunks = [
    unique_user_ids[start:start + user_chunk_size]
    for start in range(0, len(unique_user_ids), user_chunk_size)
]

# Train / test rows of the users in each chunk, aligned with user_chunks.
train_chunks = [train[train['userID'].isin(chunk)] for chunk in user_chunks]
test_chunks = [test[test['userID'].isin(chunk)] for chunk in user_chunks]

eval_map_chunk = []
eval_ndcg_chunk = []
eval_precision_chunk = []
eval_recall_chunk = []

eval_serendipity_chunk = []
eval_diversity_chunk = []
eval_novelty_chunk = []

# Arguments shared by all ranking metrics; the values come from the
# parameters cell instead of being repeated as literals.
ranking_kwargs = dict(
    col_prediction="prediction",
    relevancy_method=relevancy_method,
    threshold=threshold,
    k=TOP_K,
)

for chunk_no, (user_data, test_data) in enumerate(
    zip(train_chunks, test_chunks), start=1
):
    # The model produces its own top-k list for the users of this test chunk.
    # NOTE(review): the model was built on impli_train, so remove_seen here
    # presumably filters only those interactions, while the beyond-accuracy
    # metrics below compare against the full train chunk - confirm that
    # low-rated train items cannot end up in the recommendations.
    all_predictions_chunk = model.recommend_k_items(test_data, top_k=TOP_K, remove_seen=True)

    eval_map_chunk.append(map_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_ndcg_chunk.append(ndcg_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_precision_chunk.append(precision_at_k(test_data, all_predictions_chunk, **ranking_kwargs))
    eval_recall_chunk.append(recall_at_k(test_data, all_predictions_chunk, **ranking_kwargs))

    rec_df = get_top_k_items(all_predictions_chunk, col_rating='prediction', k=TOP_K)
    eval_serendipity_chunk.append(serendipity(user_data, rec_df))
    eval_diversity_chunk.append(diversity(user_data, rec_df))
    eval_novelty_chunk.append(novelty(user_data, rec_df))

    print(chunk_no)  # progress: number of chunks finished

# Aggregate results.
# NOTE(review): np.mean weights every chunk equally, so a shorter final chunk
# counts as much as a full one; confirm this is the intended aggregation.
eval_map = np.mean(eval_map_chunk)
eval_ndcg = np.mean(eval_ndcg_chunk)
eval_precision = np.mean(eval_precision_chunk)
eval_recall = np.mean(eval_recall_chunk)

eval_serendipity = np.mean(eval_serendipity_chunk)
eval_diversity = np.mean(eval_diversity_chunk)
eval_novelty = np.mean(eval_novelty_chunk)

# Print results
print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

print("----")

print(
    "Diversity:\t\t%f" % eval_diversity,
    "Novelty:\t\t%f" % eval_novelty,
    "Serendipity:\t\t%f" % eval_serendipity,
    sep="\n",
)