<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

<i>This notebook has been taken from Microsoft's recommender system library: [Source](https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/lightgcn_deep_dive.ipynb). It has been modified to fit the context of our study. The modifications include the addition of new evaluation methods, the ability to add new datasets, and cluster validation process.</i>

# LightGCN - simplified GCN model for recommendation

This notebook serves as an introduction to LightGCN [1], which is an simple, linear and neat Graph Convolution Network (GCN) [3] model for recommendation.

## 0 Global Settings and Imports

In [1]:
from recommenders.datasets.python_splitters import python_stratified_split
import sys
import os
import codecs
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages


from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
#from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    serendipity,
    user_serendipity,
    user_item_serendipity,
    catalog_coverage
)

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))
#os.chdir('../')

System version: 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)]
Pandas version: 2.2.2
Tensorflow version: 2.17.0


In [2]:
os.getcwd()

'C:\\Users\\clari\\Desktop\\M2 - Thesis\\Research\\Dr Jacques Bou Abdo\\Recommender System\\5 - Ensemble Learning Model\\Accuracy Metrics\\serendipity-main\\notebooks'

In [3]:
DEFAULT_SEED = None

In [4]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
#MOVIELENS_DATA_SIZE = 'framework-ml-25m-subset'
MOVIELENS_DATA_SIZE = 'framework-ml-25m-subset(3) (EL5)'
#MOVIELENS_DATA_SIZE = 'ml-25m-subset(3)-#5'
#MOVIELENS_DATA_SIZE = 'ml-25m-subset(3)-nf-1-2-3-4'
OG_DT = 'ml-25m'
# Model parameters
EPOCHS = 15
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

yaml_file = "./models/lightgcn/config/lightgcn.yaml"
user_file = "./models/lightgcn/output/tests/user_embeddings.csv"
item_file = "./models/lightgcn/output/item_embeddings.csv"

In [5]:

yaml_file = os.path.join(os.getcwd(), ".." ,"models", "lightgcn", "config", "lightgcn.yaml")
user_file = os.path.join(os.getcwd(), ".." ,"models","lightgcn","output_CH",MOVIELENS_DATA_SIZE,"user_embedding_CH.csv")
item_file = os.path.join(os.getcwd(), ".." ,"models","lightgcn","output_CH",MOVIELENS_DATA_SIZE,"item_embedding_CH.csv")
print(os.path.exists(yaml_file))
print(os.path.exists(user_file))
print(os.path.exists(item_file))



True
True
True


In [6]:
from sklearn.model_selection import train_test_split

ratio = 0.85
#dataset = 'ml-5m-#1'
dataset_name = MOVIELENS_DATA_SIZE
dataset_path = os.path.join('datasets', dataset_name)

#ratings_path = os.path.join(dataset_path, 'u.data')
#ratings_file = codecs.open(ratings_path, 'r', 'UTF-8')
#df = pd.read_csv(ratings_file, sep='\t', names=('userID', 'itemID', 'rating', 'timestamp'))
#dataset_name = 'ml-5m-#2'
#dataset = '5m'
dataset_path = os.path.join('datasets', dataset_name)
output_path_exp1 = './output/exp-4/'
#ratings_path = os.path.join(dataset_path, 'u.data')
#ratings_path = os.path.join(dataset_path, '//clean//ratings.csv')
#ratings_file = codecs.open(ratings_path, 'r', 'UTF-8')
ratings_path = "../datasets/"+dataset_name+ "/clean/ratings.csv"



#df = pd.read_csv(ratings_file, sep='\t', names=('userID', 'itemID', 'rating', 'timestamp'))

if (MOVIELENS_DATA_SIZE == 'ml-25m-subset-nf-1-2-3-3.1-4'):
    cos = ['itemId',	'userId', 'title',	'rating']
    df = pd.read_csv(ratings_path, usecols=cos).rename(columns = {'userId':'userID', 'itemId':'itemID'})
elif (MOVIELENS_DATA_SIZE == 'ml-25m-subset(3)-nf-1-2-3-4'):
    df = pd.read_csv(ratings_path).rename(columns = {'userId':'userID', 'itemId':'itemID'})
else:
    df = pd.read_csv(ratings_path).rename(columns = {'userId':'userID', 'movieId':'itemID'}).copy()

if '#2' in dataset_name:
     df = df[(df['isNoisy'] == "[0.]") | (df['isNoisy'] == 0)]
elif 'nf-1' in dataset_name:
     #df = df[(df['1&2&3&4 = 0'] == 0) & (df['1&2&3&4 = 1'] == "0")]
    df = df[(df['1&2&3&4 = 1'] == 0)]
    print(len(df))
#elif ('nf1' in dataset_name) or ('nf2' in dataset_name) or ('nf3' in dataset_name) or ('nf4' in dataset_name):
#    df = df
elif (not 'framework' in dataset_name) & ('ml-25m-subset(3)-' in dataset_name):
    df = df[df['isNoisy'] == 0] 
elif 'framework' in dataset_name:
    df = df[df['layer3_result'] == 0]
len(df['userID'].unique())
if 'itemID' not in df.columns:
    df = df.rename(columns = {'itemId':'itemID'})

In [7]:
df.columns

Index(['Unnamed: 0.1', 'Column1', 'userID', 'itemID', 'rating', 'timestamp',
       'user_cat', 'rating_group', 'item_cat', 'nf1', 'user_group',
       'coherence', 'title', 'genres', 'RND', 'thresh', 'nf2', 'Unnamed: 0',
       'prediction', 'nf3', 'nf4', 'noisedegree', '1&2', '1&3', '1&4', '2&3',
       '2&4', '3&4', '1&2&3', '2&3&4', '1&2&4', '1&3&4', '1&2&3&4 = 0',
       '1&2&3&4 = 1', 'isNoisy', 'FinalOutput', 'OptOut', 'layer3_result'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp,user_cat,rating_group,item_cat,nf1,...,1&2&3,2&3&4,1&2&4,1&3&4,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result
0,0,0,107576,3253,4.5,1544828239,Benevolent,Su,Strongly-preferred,0,...,0,1,0,0,0,0,0,0,0,0
1,1,1,116343,741,3.0,1106870586,Benevolent,Su,Strongly-preferred,0,...,0,1,0,0,0,0,0,0,0,0
2,2,2,87441,18,4.0,1496483617,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,92046,4378,4.0,1016643855,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,137299,94896,4.0,1489231026,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
len(df)

585253

In [10]:

# Normal train/test split (random portion) 
train, test = python_stratified_split(df, ratio=ratio)

#train2, test2 = train_test_split(df, test_size=(1 - ratio), random_state=42)
train.to_csv('../output/exp-4/'+dataset_name+'/train.csv')
test.to_csv('../output/exp-4/'+dataset_name+'/test.csv')
train_df = pd.read_csv('../output/exp-4/'+dataset_name+'/train.csv')
test_df = pd.read_csv('../output/exp-4/'+dataset_name+'/test.csv')

In [11]:
df

Unnamed: 0,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp,user_cat,rating_group,item_cat,nf1,...,2&3&4,1&2&4,1&3&4,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result,random
0,0,0,107576,3253,4.5,1544828239,Benevolent,Su,Strongly-preferred,0,...,1,0,0,0,0,0,0,0,0,0.374540
1,1,1,116343,741,3.0,1106870586,Benevolent,Su,Strongly-preferred,0,...,1,0,0,0,0,0,0,0,0,0.950714
2,2,2,87441,18,4.0,1496483617,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.731994
3,3,3,92046,4378,4.0,1016643855,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.598658
4,4,4,137299,94896,4.0,1489231026,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.156019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634336,634336,634336,111945,2786,3.0,980566705,Benevolent,Su,Weakly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.194559
634337,634337,634337,123473,10,3.5,1165210768,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.727604
634338,634338,634338,75587,45499,4.0,1271238048,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.310441
634339,634339,634339,82097,1265,4.0,1465691808,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.263769


In [12]:
clusters = pd.read_csv('../output/exp-4/'+dataset_name+'/means_output_clusters_CH.csv').drop(columns=['rating'])
train_clusters = train_df.reset_index().merge(clusters, left_on=['userID','itemID'], right_on=['userId','movieId']).drop(columns=['userId'])
total_groups = set(clusters.cluster.to_list())
print("Total groups:", len(total_groups))

Total groups: 100


In [13]:
train_clusters

Unnamed: 0,index,Unnamed: 0.2,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp_x,user_cat,rating_group,...,1&3&4,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result,movieId,timestamp_y,cluster
0,0,515412,515412,515412,107,8784,5.0,1112451972,Benevolent,Su,...,0,0,0,0,0,0,0,8784,1112451972,38
1,1,415668,415668,415668,107,2028,3.0,1146426880,Benevolent,Su,...,0,0,0,0,0,0,0,2028,1146426880,61
2,2,255383,255383,255383,107,318,4.5,1112452064,Benevolent,Su,...,0,0,0,0,0,0,0,318,1112452064,38
3,3,218367,218367,218367,107,1230,4.0,1112450308,Benevolent,Su,...,0,0,0,0,0,0,0,1230,1112450308,38
4,4,79228,79228,79228,107,27432,3.0,1162221547,Benevolent,Su,...,0,0,0,0,0,0,0,27432,1162221547,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497415,497415,333180,333180,333180,162488,1089,5.0,1202822376,Benevolent,Su,...,0,0,0,0,0,0,0,1089,1202822376,10
497416,497416,350236,350236,350236,162488,5418,5.0,1202822677,Benevolent,Su,...,0,0,0,0,0,0,0,5418,1202822677,10
497417,497417,447714,447714,447714,162488,2232,4.5,1202824481,Benevolent,Su,...,0,1,0,0,0,0,0,2232,1202824481,10
497418,497418,290277,290277,290277,162488,8633,4.0,1202824719,Benevolent,Su,...,0,0,0,0,0,0,0,8633,1202824719,10


In [14]:
# Target group cluster (we iterate over all of them in every run)
target_group = 4

# Train data
#target_group_df = train_clusters[train_clusters['group_clusters'] == target_group]
#train = target_group_df[['userID', 'itemID', 'rating', 'timestamp']]
target_group_df = train_clusters
if 'timestamp' not in target_group_df.columns:
    rt = pd.read_csv( r"C:\Users\clari\Desktop\M2 - Thesis\Research\Dr Jacques Bou Abdo\Recommender System\4 - Review\dataset\\"+OG_DT+"\\Full Data set (Used with NF3 First)\\ratings.csv"
).rename(columns = {'userId':'userID', 'movieId':'itemID'})
    target_group_df = target_group_df.merge(rt, on=['userID','itemID','rating'], how='left')
train = target_group_df[['userID', 'itemID', 'rating', 'timestamp']]



# Test data Choose only ratings that can be predicted
users_in_train = list(set(train.userID.to_list()))
test = test_df[test_df.userID.isin(users_in_train)]
# This gave a memory error, but it seems its not actually used
#test2 = test_df[test_df - test_df.userID.isin(users_in_train)]

In [15]:
len(users_in_train)

5624

In [16]:
target_group_df

Unnamed: 0,index,Unnamed: 0.2,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp_x,user_cat,rating_group,...,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result,movieId,timestamp_y,cluster,timestamp
0,0,515412,515412,515412,107,8784,5.0,1112451972,Benevolent,Su,...,0,0,0,0,0,0,8784,1112451972,38,1112451972
1,1,415668,415668,415668,107,2028,3.0,1146426880,Benevolent,Su,...,0,0,0,0,0,0,2028,1146426880,61,1146426880
2,2,255383,255383,255383,107,318,4.5,1112452064,Benevolent,Su,...,0,0,0,0,0,0,318,1112452064,38,1112452064
3,3,218367,218367,218367,107,1230,4.0,1112450308,Benevolent,Su,...,0,0,0,0,0,0,1230,1112450308,38,1112450308
4,4,79228,79228,79228,107,27432,3.0,1162221547,Benevolent,Su,...,0,0,0,0,0,0,27432,1162221547,66,1162221547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497415,497415,333180,333180,333180,162488,1089,5.0,1202822376,Benevolent,Su,...,0,0,0,0,0,0,1089,1202822376,10,1202822376
497416,497416,350236,350236,350236,162488,5418,5.0,1202822677,Benevolent,Su,...,0,0,0,0,0,0,5418,1202822677,10,1202822677
497417,497417,447714,447714,447714,162488,2232,4.5,1202824481,Benevolent,Su,...,1,0,0,0,0,0,2232,1202824481,10,1202824481
497418,497418,290277,290277,290277,162488,8633,4.0,1202824719,Benevolent,Su,...,0,0,0,0,0,0,8633,1202824719,10,1202824719


In [17]:
train

Unnamed: 0,userID,itemID,rating,timestamp
0,107,8784,5.0,1112451972
1,107,2028,3.0,1146426880
2,107,318,4.5,1112452064
3,107,1230,4.0,1112450308
4,107,27432,3.0,1162221547
...,...,...,...,...
497415,162488,1089,5.0,1202822376
497416,162488,5418,5.0,1202822677
497417,162488,2232,4.5,1202824481
497418,162488,8633,4.0,1202824719


In [18]:
print("total users in main dataset:", len(list(set(df.userID.to_list()))))
print("total users in train dataset:", len(list(set(train.userID.to_list()))))
print("total users in test dataset:", len(list(set(test.userID.to_list()))))
# train = train.set_index('index')
# test = test.set_index('index')

total users in main dataset: 5624
total users in train dataset: 5624
total users in test dataset: 5624


In [19]:
test

Unnamed: 0,Unnamed: 0.2,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp,user_cat,rating_group,item_cat,...,1&2&3,2&3&4,1&2&4,1&3&4,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result
0,471944,471944,471944,107,7323,3.0,1112452135,Benevolent,Su,Strongly-preferred,...,0,1,0,0,0,0,0,0,0,0
1,415641,415641,415641,107,6571,3.0,1112452211,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
2,205922,205922,205922,107,6385,5.0,1116042148,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
3,111872,111872,111872,107,1172,3.5,1116042312,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
4,6087,6087,6087,107,6879,2.5,1113518113,Benevolent,Au,Strongly-preferred,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87828,73700,73700,73700,162488,1214,5.0,1202821802,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
87829,499496,499496,499496,162488,7373,4.5,1202824760,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
87830,185849,185849,185849,162488,1222,4.0,1202823756,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0
87831,167963,167963,167963,162488,1285,5.0,1202822575,Benevolent,Su,Strongly-preferred,...,0,0,0,0,0,0,0,0,0,0


In [20]:
train

Unnamed: 0,userID,itemID,rating,timestamp
0,107,8784,5.0,1112451972
1,107,2028,3.0,1146426880
2,107,318,4.5,1112452064
3,107,1230,4.0,1112450308
4,107,27432,3.0,1162221547
...,...,...,...,...
497415,162488,1089,5.0,1202822376
497416,162488,5418,5.0,1202822677
497417,162488,2232,4.5,1202824481
497418,162488,8633,4.0,1202824719


In [21]:
set(train_clusters[train_clusters['cluster'] == target_group].cluster.to_list())

{4}

### 2.2 Process data

`ImplicitCF` is a class that intializes and loads data for the training process. During the initialization of this class, user IDs and item IDs are reindexed, ratings greater than zero are converted into implicit positive interaction, and adjacency matrix $R$ of user-item graph is created. Some important methods of `ImplicitCF` are:

`get_norm_adj_mat`, load normalized adjacency matrix of user-item graph if it already exists in `adj_dir`, otherwise call `create_norm_adj_mat` to create the matrix and save the matrix if `adj_dir` is not `None`. This method will be called during the initialization process of LightGCN model.

`create_norm_adj_mat`, create normalized adjacency matrix of user-item graph by calculating $D^{-\frac{1}{2}} A D^{-\frac{1}{2}}$, where $\mathbf{A}=\left(\begin{array}{cc}\mathbf{0} & \mathbf{R} \\ \mathbf{R}^{T} & \mathbf{0}\end{array}\right)$.

`train_loader`, generate a batch of training data — sample a batch of users and then sample one positive item and one negative item for each user. This method will be called before each epoch of the training process.


In [22]:
data = ImplicitCF(train=train, test=test, seed=SEED)

### 2.3 Prepare hyper-parameters

Important parameters of `LightGCN` model are:

`data`, initialized LightGCNDataset object.

`epochs`, number of epochs for training.

`n_layers`, number of layers of the model.

`eval_epoch`, if it is not None, evaluation metrics will be calculated on test set every "eval_epoch" epochs. In this way, we can observe the effect of the model during the training process.

`top_k`, the number of items to be recommended for each user when calculating ranking metrics.

A complete list of parameters can be found in `yaml_file`. We use `prepare_hparams` to read the yaml file and prepare a full set of parameters for the model. Parameters passed as the function's parameters will overwrite yaml settings.

In [23]:
hparams = prepare_hparams(
    yaml_file,
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.015,
    eval_epoch=5,
    top_k=TOP_K,
)

### 2.4 Create and train model

With data and parameters prepared, we can create the LightGCN model.

To train the model, we simply need to call the `fit()` method.

In [24]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [25]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)53.1s: train loss = 0.17562 = (mf)0.17467 + (embed)0.00095
Epoch 2 (train)60.1s: train loss = 0.12796 = (mf)0.12611 + (embed)0.00185
Epoch 3 (train)58.6s: train loss = 0.11312 = (mf)0.11045 + (embed)0.00267
Epoch 4 (train)58.8s: train loss = 0.09810 = (mf)0.09450 + (embed)0.00360


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 5 (train)54.8s + (eval)4.6s: train loss = 0.08666 = (mf)0.08208 + (embed)0.00459, recall = 0.03265, ndcg = 0.04248, precision = 0.03910, map = 0.01439
Epoch 6 (train)58.9s: train loss = 0.07891 = (mf)0.07339 + (embed)0.00552
Epoch 7 (train)59.3s: train loss = 0.07411 = (mf)0.06775 + (embed)0.00635
Epoch 8 (train)59.7s: train loss = 0.06974 = (mf)0.06261 + (embed)0.00714
Epoch 9 (train)55.0s: train loss = 0.06535 = (mf)0.05750 + (embed)0.00785


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 10 (train)59.6s + (eval)3.3s: train loss = 0.06318 = (mf)0.05467 + (embed)0.00851, recall = 0.03292, ndcg = 0.04261, precision = 0.03951, map = 0.01444
Epoch 11 (train)60.3s: train loss = 0.06154 = (mf)0.05244 + (embed)0.00910
Epoch 12 (train)56.5s: train loss = 0.05932 = (mf)0.04965 + (embed)0.00967
Epoch 13 (train)52.7s: train loss = 0.05829 = (mf)0.04809 + (embed)0.01021
Epoch 14 (train)57.7s: train loss = 0.05632 = (mf)0.04562 + (embed)0.01070


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 15 (train)58.9s + (eval)3.2s: train loss = 0.05618 = (mf)0.04504 + (embed)0.01114, recall = 0.03127, ndcg = 0.04148, precision = 0.03787, map = 0.01423
Took 875.1254412999842 seconds for training.


### 2.5 Recommendation and Evaluation

Recommendation and evaluation have been performed on the specified test set during training. After training, we can also use the model to perform recommendation and evalution on other data. Here we still use `test` as test data, but `test` can be replaced by other data with similar data structure.

#### 2.5.1 Recommendation

We can call `recommend_k_items` to recommend k items for each user passed in this function. We set `remove_seen=True` to remove the items already seen by the user. The function returns a dataframe, containing each user and top k items recommended to them and the corresponding ranking scores.

In [26]:
topk_scores = model.recommend_k_items(train, top_k=TOP_K, remove_seen=True)
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,107,4226,11.136578
1,107,6620,10.579941
2,107,44694,10.358736
3,107,3083,10.241936
4,107,1172,9.942299


#### 2.5.2 Evaluation

With `topk_scores` predicted by the model, we can evaluate how LightGCN performs on this test set.

In [27]:
columns_to_keep = ['userID', 'itemID', 'rating', 'timestamp']

# Create a new DataFrame with only the selected columns
#test1 = test[columns_to_keep].astype(float)


In [28]:
'''
import json

eval_map = map_at_k(test1, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test1, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test1, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test1, topk_scores, k=TOP_K)
#eval_serendipity = serendipity(train, topk_scores)
#eval_coverage = catalog_coverage(train, topk_scores)
eval_serendipity = serendipity(train, topk_scores)
eval_coverage = catalog_coverage(train, topk_scores)

metric_results = {
    'MAP': eval_map,
    'NDCG': eval_ndcg,
    'Precision': eval_precision,
    'Recall': eval_recall,
    'User Serendipity': eval_serendipity,
    'Coverage': eval_coverage
}

print(json.dumps(metric_results, indent=4))
with open("../output/exp-4/"+dataset_name+"/metric_results.txt", "w") as fp:
    json.dump(metric_results, fp, indent=4)
'''

'\nimport json\n\neval_map = map_at_k(test1, topk_scores, k=TOP_K)\neval_ndcg = ndcg_at_k(test1, topk_scores, k=TOP_K)\neval_precision = precision_at_k(test1, topk_scores, k=TOP_K)\neval_recall = recall_at_k(test1, topk_scores, k=TOP_K)\n#eval_serendipity = serendipity(train, topk_scores)\n#eval_coverage = catalog_coverage(train, topk_scores)\neval_serendipity = serendipity(train, topk_scores)\neval_coverage = catalog_coverage(train, topk_scores)\n\nmetric_results = {\n    \'MAP\': eval_map,\n    \'NDCG\': eval_ndcg,\n    \'Precision\': eval_precision,\n    \'Recall\': eval_recall,\n    \'User Serendipity\': eval_serendipity,\n    \'Coverage\': eval_coverage\n}\n\nprint(json.dumps(metric_results, indent=4))\nwith open("../output/exp-4/"+dataset_name+"/metric_results.txt", "w") as fp:\n    json.dump(metric_results, fp, indent=4)\n'

In [29]:
train.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,107,8784,5.0,1112451972
1,107,2028,3.0,1146426880
2,107,318,4.5,1112452064
3,107,1230,4.0,1112450308
4,107,27432,3.0,1162221547


In [30]:
#test2.head()

In [31]:
# load clusters if not previously loaded
# clusters = pd.read_csv('./output/exp-2/group_clusters.csv', usecols=['user_id', 'group'])

# get per-user serendipity score
#eval_serendipity = user_serendipity(train, topk_scores)
eval_serendipity = user_serendipity(train, topk_scores)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

# calculate per-cluster serendipity score
eval_serendipity_clulsters = clusters.merge(eval_serendipity, left_on=['userId'], right_on=['userID']).drop(columns=['userID'])
cluster_serendipity = eval_serendipity_clulsters.groupby('cluster')
cluster_serendipity_df = cluster_serendipity['user_serendipity'].agg('mean').reset_index()
cluster_serendipity_df.to_csv('../output/exp-4/'+dataset_name+'/cluster_serendipity2.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


In [32]:
len(train['userID'].unique())

5624

In [33]:
len(eval_serendipity['userID'].unique())

5624

In [34]:
len(cluster_serendipity['userId'].unique())

100

In [35]:
eval_serendipity.to_csv('../output/exp-4/'+dataset_name+'/user_serendipity3.csv', index=False)

### 2.6 Infer embeddings

With `infer_embedding` method of LightGCN model, we can export the embeddings of users and items in the training set to CSV files for future use.

In [36]:
model.infer_embedding(user_file, item_file)

## 3. Compare LightGCN with SAR and NCF

Here there are the performances of LightGCN compared to [SAR](../00_quick_start/sar_movielens.ipynb) and [NCF](../00_quick_start/ncf_movielens.ipynb) on MovieLens dataset of 100k and 1m. The method of data loading and splitting is the same as that described above and the GPU used was a GeForce GTX 1080Ti.

Settings common to the three models: `epochs=15, seed=42`.

Settings for LightGCN: `embed_size=64, n_layers=3, batch_size=1024, decay=0.0001, learning_rate=0.015 `.

Settings for SAR: `similarity_type="jaccard", time_decay_coefficient=30, time_now=None, timedecay_formula=True`.

Settings for NCF: `n_factors=4, layer_sizes=[16, 8, 4], batch_size=1024, learning_rate=0.001`.

| Data Size | Model    | Training time | Recommending time | MAP@10   | nDCG@10  | Precision@10 | Recall@10 |
| --------- | -------- | ------------- | ----------------- | -------- | -------- | ------------ | --------- |
| 100k      | LightGCN | 27.8865       | 0.6445            | 0.129236 | 0.436297 | 0.381866     | 0.205816  |
| 100k      | SAR      | 0.4895        | 0.1144            | 0.110591 | 0.382461 | 0.330753     | 0.176385  |
| 100k      | NCF      | 116.3174      | 7.7660            | 0.105725 | 0.387603 | 0.342100     | 0.174580  |
| 1m        | LightGCN | 396.7298      | 1.4343            | 0.075012 | 0.377501 | 0.345679     | 0.128096  |
| 1m        | SAR      | 4.5593        | 2.8357            | 0.060579 | 0.299245 | 0.270116     | 0.104350  |
| 1m        | NCF      | 1601.5846     | 85.4567           | 0.062821 | 0.348770 | 0.320613     | 0.108121  |

From the above results, we can see that LightGCN performs better than the other two models.

### References: 
1. Xiangnan He, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang & Meng Wang, LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation, 2020, https://arxiv.org/abs/2002.02126
2. LightGCN implementation [TensorFlow]: https://github.com/kuandeng/lightgcn
3. Thomas N. Kipf and Max Welling, Semi-Supervised Classification with Graph Convolutional Networks, ICLR, 2017, https://arxiv.org/abs/1609.02907
4. Xiang Wang, Xiangnan He, Meng Wang, Fuli Feng, and Tat-Seng Chua, Neural Graph Collaborative Filtering, SIGIR, 2019, https://arxiv.org/abs/1905.08108
5. Y. Koren, R. Bell and C. Volinsky, "Matrix Factorization Techniques for Recommender Systems", in Computer, vol. 42, no. 8, pp. 30-37, Aug. 2009, doi: 10.1109/MC.2009.263.  url: https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf

### Group Validation on System Metrics
Step 1: Clustering is already done
Step 2: All predictions file
Step 3: generating the values not done as well

In [37]:
# Let's generate the all_predictions file using lightGCN

import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd
from surprise import Dataset, Reader
from joblib import Parallel, delayed
from timeit import default_timer as timer
#import dask.dataframe as dd
#from dask.distributed import Client
from contextlib import contextmanager

from recommenders.utils.timer import Timer
# from recommenders.datasets import movielens -- I commented this line because it gave errors on the library; and its not used // error: module pandera has no attribute 'SchemasModel'
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

In [38]:
# model.predict(test_data, usercol='userId', itemcol='movieId')
#predictions = model.predict(test, usercol='userId', itemcol='movieId')
#predictions.head()

In [39]:
df.head()

Unnamed: 0,Unnamed: 0.1,Column1,userID,itemID,rating,timestamp,user_cat,rating_group,item_cat,nf1,...,2&3&4,1&2&4,1&3&4,1&2&3&4 = 0,1&2&3&4 = 1,isNoisy,FinalOutput,OptOut,layer3_result,random
0,0,0,107576,3253,4.5,1544828239,Benevolent,Su,Strongly-preferred,0,...,1,0,0,0,0,0,0,0,0,0.37454
1,1,1,116343,741,3.0,1106870586,Benevolent,Su,Strongly-preferred,0,...,1,0,0,0,0,0,0,0,0,0.950714
2,2,2,87441,18,4.0,1496483617,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.731994
3,3,3,92046,4378,4.0,1016643855,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.598658
4,4,4,137299,94896,4.0,1489231026,Benevolent,Su,Strongly-preferred,0,...,0,0,0,0,0,0,0,0,0,0.156019


In [40]:
train.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,107,8784,5.0,1112451972
1,107,2028,3.0,1146426880
2,107,318,4.5,1112452064
3,107,1230,4.0,1112450308
4,107,27432,3.0,1162221547


In [41]:
clustered_df = clusters
print(clustered_df)
# group clusters into another dataframe with different representation
grouped_clusters = clustered_df.groupby('cluster')['userId'].apply(list).reset_index(name='users_list')
grouped_clusters['users_per_cluster'] = grouped_clusters.apply(lambda x: list(set(x.users_list)), axis=1)
grouped_clusters = grouped_clusters[['cluster', 'users_per_cluster']]

        userId  movieId   timestamp  cluster
0          107     1299  1116042277       54
1          107     2599  1112450353       38
2          107     6218  1112450576       38
3          107     7096  1116042392       54
4          107      337  1116042439       54
...        ...      ...         ...      ...
585248  162488     3793  1202822956       10
585249  162488     6448  1202824032       10
585250  162488     1027  1202821393       10
585251  162488      180  1202822988       10
585252  162488     2858  1202822410       10

[585253 rows x 4 columns]


In [42]:
grouped_clusters

Unnamed: 0,cluster,users_per_cluster
0,0,"[98816, 89091, 65029, 125961, 93195, 57356, 94..."
1,1,"[61953, 34819, 60931, 90115, 48646, 66057, 153..."
2,2,"[51200, 89091, 58884, 119816, 76813, 134163, 6..."
3,3,"[55808, 89091, 151558, 66057, 86031, 44048, 68..."
4,4,"[52736, 65538, 89091, 76292, 87556, 98818, 302..."
...,...,...
95,95,"[77828, 92172, 32788, 59418, 98334, 38943, 532..."
96,96,"[114177, 106498, 89091, 66057, 103948, 126477,..."
97,97,"[55808, 89091, 27141, 151558, 22535, 66057, 13..."
98,98,"[51200, 52736, 89091, 87556, 25095, 19981, 105..."


In [43]:
def ndcg_at_k(df_cluster, topk_scores, k=10):
    # Calculate NDCG for the top-k items
    dcg = 0.0
    idcg = 0.0
    for i in range(k):
        dcg += df_cluster["hit"].iloc[i] / np.log2(i + 2)  # Discounted Cumulative Gain (DCG)
        idcg += df_cluster["actual"].iloc[i] / np.log2(i + 2)  # Ideal DCG (IDCG)

    return dcg / idcg if idcg != 0 else 0

def precision_at_k(df_cluster, topk_scores, k=10):
    # Precision at k
    hits_at_k = df_cluster["hit"].iloc[:k].sum()
    return hits_at_k / k

def recall_at_k(df_cluster, topk_scores, k=10):
    # Recall at k
    relevant_items = df_cluster["actual"].iloc[:k].sum()
    return relevant_items / df_cluster["actual"].sum() if df_cluster["actual"].sum() > 0 else 0


In [46]:
group_metric = {}
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
all_users = len(set(df.userID.to_list()))

for cluster_id, cluster in enumerate(all_clusters_list):
    n_cluster_users = len(cluster)
    n_cluster_users_equiv = all_users - n_cluster_users

    # Get predicted scores for users in the current cluster
    predicted_scores_cluster = model.predict(cluster, user_embeddings, item_embeddings)

    # Get actual recommendations for users in the cluster
    df_ndcg_cluster = df_ndcg.loc[df_ndcg["userId"].isin(cluster)]
    df_ndcg_cluster_equiv = df_ndcg.loc[~df_ndcg['userId'].isin(cluster)]

    # Group metrics for the cluster
    cluster_ndcg = (df_ndcg_cluster["dcg"] / df_ndcg_cluster["idcg"]).sum() / n_cluster_users
    cluster_precision = (df_ndcg_cluster["hit"] / k).sum() / n_cluster_users
    cluster_recall = (df_ndcg_cluster["hit"] / df_ndcg_cluster["actual"]).sum() / n_cluster_users

    # Group metrics for the equivalent group
    cluster_ndcg_equiv = (df_ndcg_cluster_equiv["dcg"] / df_ndcg_cluster_equiv["idcg"]).sum() / n_cluster_users_equiv
    cluster_precision_equiv = (df_ndcg_cluster_equiv["hit"] / k).sum() / n_cluster_users_equiv
    cluster_recall_equiv = (df_ndcg_cluster_equiv["hit"] / df_ndcg_cluster_equiv["actual"]).sum() / n_cluster_users_equiv

    # Store the metrics for the current cluster
    group_metric[cluster_id] = [
        cluster_ndcg,
        cluster_ndcg_equiv,
        cluster_precision,
        cluster_precision_equiv,
        cluster_recall,
        cluster_recall_equiv
    ]

AttributeError: 'LightGCN' object has no attribute 'predict'

In [None]:
group_metric_df = pd.DataFrame.from_dict(group_metric, orient='index')\
    .reset_index()\
    .rename({
        'index': 'cluster',
        0: 'cluster-nDCG',
        1: 'cluster-nDCG-eq',
        2: 'cluster-precision',
        3: 'cluster-precision-eq',
        4: 'cluster-recall',
        5: 'cluster-recall-eq'
        }, axis=1)
group_metric_df['ndcg'] = eval_ndcg
group_metric_df['precision'] = eval_precision
group_metric_df['recall'] = eval_recall

# save results in csv
group_metric_df.to_csv('../output/exp-4/'+dataset_name+'/gv6.csv', index=False)