<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

<i>This notebook has been taken from Microsoft's recommender system library: [Source](https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/lightgcn_deep_dive.ipynb). It has been modified to fit the context of our study. The modifications include the addition of new evaluation methods, the ability to add new datasets, and cluster validation process.</i>

# LightGCN - simplified GCN model for recommendation

This notebook serves as an introduction to LightGCN [1], which is an simple, linear and neat Graph Convolution Network (GCN) [3] model for recommendation.

## 0 Global Settings and Imports

In [1]:
from recommenders.datasets.python_splitters import python_stratified_split
import sys
import os
import codecs
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages


from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
#from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    serendipity,
    user_serendipity,
    user_item_serendipity,
    catalog_coverage
)

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))
#os.chdir('../')

System version: 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)]
Pandas version: 2.2.2
Tensorflow version: 2.17.0


In [2]:
os.getcwd()

'C:\\Users\\clari\\Desktop\\M2 - Thesis\\Research\\Dr Jacques Bou Abdo\\Recommender System\\5 - Ensemble Learning Model\\Accuracy Metrics\\serendipity-main\\notebooks'

In [3]:
DEFAULT_SEED = None

In [4]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
#MOVIELENS_DATA_SIZE = 'framework-ml-25m-subset'
MOVIELENS_DATA_SIZE = 'ml-25m-subset(3)-#4.1'
OG_DT = 'ml-25m'
# Model parameters
EPOCHS = 20
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

yaml_file = "./models/lightgcn/config/lightgcn.yaml"
user_file = "./models/lightgcn/output/tests/user_embeddings.csv"
item_file = "./models/lightgcn/output/item_embeddings.csv"

In [5]:

yaml_file = os.path.join(os.getcwd(), ".." ,"models", "lightgcn", "config", "lightgcn.yaml")
user_file = os.path.join(os.getcwd(), ".." ,"models","lightgcn","output_CH",MOVIELENS_DATA_SIZE,"user_embedding_CH.csv")
item_file = os.path.join(os.getcwd(), ".." ,"models","lightgcn","output_CH",MOVIELENS_DATA_SIZE,"item_embedding_CH.csv")
print(os.path.exists(yaml_file))
print(os.path.exists(user_file))
print(os.path.exists(item_file))



True
True
True


In [6]:
from sklearn.model_selection import train_test_split

ratio = 0.85
#dataset = 'ml-5m-#1'
dataset_name = MOVIELENS_DATA_SIZE
dataset_path = os.path.join('datasets', dataset_name)

#ratings_path = os.path.join(dataset_path, 'u.data')
#ratings_file = codecs.open(ratings_path, 'r', 'UTF-8')
#df = pd.read_csv(ratings_file, sep='\t', names=('userID', 'itemID', 'rating', 'timestamp'))
#dataset_name = 'ml-5m-#2'
#dataset = '5m'
dataset_path = os.path.join('datasets', dataset_name)
output_path_exp1 = './output/exp-3/'
#ratings_path = os.path.join(dataset_path, 'u.data')
#ratings_path = os.path.join(dataset_path, '//clean//ratings.csv')
#ratings_file = codecs.open(ratings_path, 'r', 'UTF-8')
ratings_path = "../datasets/"+dataset_name+ "/clean/ratings.csv"



#df = pd.read_csv(ratings_file, sep='\t', names=('userID', 'itemID', 'rating', 'timestamp'))

if MOVIELENS_DATA_SIZE == 'ml-25m-subset-nf-1-2-3-3.1-4':
    cos = ['itemId',	'userId', 'title',	'rating']
    df = pd.read_csv(ratings_path, usecols=cos).rename(columns = {'userId':'userID', 'itemId':'itemID'})
else:
    df = pd.read_csv(ratings_path).rename(columns = {'userId':'userID', 'movieId':'itemID'}).copy()

if '#2' in dataset_name:
     df = df[df['isNoisy'] == "[0.]"]
elif 'nf-1' in dataset_name:
    df =  df[(df['1&2&3&4 = 0'] == 0) & (df['1&2&3&4 = 1'] == 0)]
elif not 'framework' in dataset_name:
    df = df[df['isNoisy'] == 0]    
len(df['userID'].unique())
if 'itemID' not in df.columns:
    df = df.rename(columns = {'itemId':'itemID'})

In [7]:
df.columns

Index(['userID', 'itemID', 'rating', 'timestamp', 'genres_bin', 'isNoisy'], dtype='object')

In [8]:

# Normal train/test split (random portion) 
train, test = python_stratified_split(df, ratio=ratio)

#train2, test2 = train_test_split(df, test_size=(1 - ratio), random_state=42)
train.to_csv('../output/exp-3/'+dataset_name+'/train.csv')
test.to_csv('../output/exp-3/'+dataset_name+'/test.csv')
train_df = pd.read_csv('../output/exp-3/'+dataset_name+'/train.csv')
test_df = pd.read_csv('../output/exp-3/'+dataset_name+'/test.csv')

In [9]:
df

Unnamed: 0,userID,itemID,rating,timestamp,genres_bin,isNoisy,random
0,107576,3253,4.5,1544828239,69793636673,0,0.374540
1,116343,741,3.0,1106870586,221463938504,0,0.950714
2,87441,18,4.0,1496483617,69793636673,0,0.731994
3,92046,4378,4.0,1016643855,216359445792,0,0.598658
4,137299,94896,4.0,1489231026,216359462241,0,0.156019
...,...,...,...,...,...,...,...
531753,111945,2786,3.0,980566705,69793636673,0,0.198208
531754,123473,10,3.5,1165210768,216898399982,0,0.823946
531755,75587,45499,4.0,1271238048,221465980648,0,0.033212
531756,82097,1265,4.0,1465691808,216099447257,0,0.460556


In [10]:
clusters = pd.read_csv('../output/exp-3/'+dataset_name+'/means_output_clusters_CH.csv', usecols=['userId', 'cluster_shc', 'group_clusters'])
train_clusters = train_df.reset_index().merge(clusters, left_on='userID', right_on='userId').drop(columns=['userId'])
total_groups = set(clusters.group_clusters.to_list())
print("Total groups:", len(total_groups))

Total groups: 20


In [11]:
train_clusters

Unnamed: 0.1,index,Unnamed: 0,userID,itemID,rating,timestamp,genres_bin,isNoisy,cluster_shc,group_clusters
0,0,455334,107,2579,3.0,1120911156,215858163513,0,0.666695,16
1,1,53955,107,8970,5.0,1112452270,69256806688,0,0.666695,16
2,2,149115,107,337,3.5,1116042439,69256806688,0,0.666695,16
3,3,379717,107,42004,3.5,1162221330,218506945007,0,0.666695,16
4,4,183376,107,2677,4.0,1146426915,215319291901,0,0.666695,16
...,...,...,...,...,...,...,...,...,...,...
426680,426680,229407,162488,2021,5.0,1202824734,221463897262,0,0.675700,9
426681,426681,234578,162488,7076,4.5,1202823943,218509062120,0,0.675700,9
426682,426682,464699,162488,592,5.0,1202822963,217972125672,0,0.675700,9
426683,426683,279556,162488,1089,5.0,1202822376,215858163513,0,0.675700,9


In [12]:
# Target group cluster (we iterate over all of them in every run)
target_group = 4

# Train data
#target_group_df = train_clusters[train_clusters['group_clusters'] == target_group]
#train = target_group_df[['userID', 'itemID', 'rating', 'timestamp']]
target_group_df = train_clusters
if 'timestamp' not in target_group_df.columns:
    rt = pd.read_csv( r"C:\Users\clari\Desktop\M2 - Thesis\Research\Dr Jacques Bou Abdo\Recommender System\4 - Review\dataset\\"+OG_DT+"\\Full Data set (Used with NF3 First)\\ratings.csv"
).rename(columns = {'userId':'userID', 'movieId':'itemID'})
    target_group_df = target_group_df.merge(rt, on=['userID','itemID','rating'], how='left')
train = target_group_df[['userID', 'itemID', 'rating', 'timestamp']]



# Test data Choose only ratings that can be predicted
users_in_train = list(set(train.userID.to_list()))
test = test_df[test_df.userID.isin(users_in_train)]
# This gave a memory error, but it seems its not actually used
#test2 = test_df[test_df - test_df.userID.isin(users_in_train)]

In [13]:
len(users_in_train)

5624

In [14]:
target_group_df

Unnamed: 0.1,index,Unnamed: 0,userID,itemID,rating,timestamp,genres_bin,isNoisy,cluster_shc,group_clusters
0,0,455334,107,2579,3.0,1120911156,215858163513,0,0.666695,16
1,1,53955,107,8970,5.0,1112452270,69256806688,0,0.666695,16
2,2,149115,107,337,3.5,1116042439,69256806688,0,0.666695,16
3,3,379717,107,42004,3.5,1162221330,218506945007,0,0.666695,16
4,4,183376,107,2677,4.0,1146426915,215319291901,0,0.666695,16
...,...,...,...,...,...,...,...,...,...,...
426680,426680,229407,162488,2021,5.0,1202824734,221463897262,0,0.675700,9
426681,426681,234578,162488,7076,4.5,1202823943,218509062120,0,0.675700,9
426682,426682,464699,162488,592,5.0,1202822963,217972125672,0,0.675700,9
426683,426683,279556,162488,1089,5.0,1202822376,215858163513,0,0.675700,9


In [15]:
train

Unnamed: 0,userID,itemID,rating,timestamp
0,107,2579,3.0,1120911156
1,107,8970,5.0,1112452270
2,107,337,3.5,1116042439
3,107,42004,3.5,1162221330
4,107,2677,4.0,1146426915
...,...,...,...,...
426680,162488,2021,5.0,1202824734
426681,162488,7076,4.5,1202823943
426682,162488,592,5.0,1202822963
426683,162488,1089,5.0,1202822376


In [16]:
print("total users in main dataset:", len(list(set(df.userID.to_list()))))
print("total users in train dataset:", len(list(set(train.userID.to_list()))))
print("total users in test dataset:", len(list(set(test.userID.to_list()))))
# train = train.set_index('index')
# test = test.set_index('index')

total users in main dataset: 5624
total users in train dataset: 5624
total users in test dataset: 5623


In [17]:
test

Unnamed: 0.1,Unnamed: 0,userID,itemID,rating,timestamp,genres_bin,isNoisy
0,348542,107,6571,3.0,1112452211,69256806688,0
1,66498,107,27432,3.0,1162221547,69256806688,0
2,115797,107,5377,3.5,1146426679,216367882721,0
3,34747,107,8254,4.0,1136781359,216636318201,0
4,93876,107,1172,3.5,1116042312,69256806688,0
...,...,...,...,...,...,...,...
75301,411435,162488,1704,4.5,1202823491,215294124512,0
75302,307421,162488,1036,5.0,1202823585,217972125672,0
75303,267431,162488,6305,5.0,1202822076,219853325600,0
75304,171426,162488,180,4.5,1202822988,215831011777,0


In [18]:
train

Unnamed: 0,userID,itemID,rating,timestamp
0,107,2579,3.0,1120911156
1,107,8970,5.0,1112452270
2,107,337,3.5,1116042439
3,107,42004,3.5,1162221330
4,107,2677,4.0,1146426915
...,...,...,...,...
426680,162488,2021,5.0,1202824734
426681,162488,7076,4.5,1202823943
426682,162488,592,5.0,1202822963
426683,162488,1089,5.0,1202822376


In [19]:
set(train_clusters[train_clusters['group_clusters'] == target_group].cluster_shc.to_list())

{0.6002271524894276}

### 2.2 Process data

`ImplicitCF` is a class that intializes and loads data for the training process. During the initialization of this class, user IDs and item IDs are reindexed, ratings greater than zero are converted into implicit positive interaction, and adjacency matrix $R$ of user-item graph is created. Some important methods of `ImplicitCF` are:

`get_norm_adj_mat`, load normalized adjacency matrix of user-item graph if it already exists in `adj_dir`, otherwise call `create_norm_adj_mat` to create the matrix and save the matrix if `adj_dir` is not `None`. This method will be called during the initialization process of LightGCN model.

`create_norm_adj_mat`, create normalized adjacency matrix of user-item graph by calculating $D^{-\frac{1}{2}} A D^{-\frac{1}{2}}$, where $\mathbf{A}=\left(\begin{array}{cc}\mathbf{0} & \mathbf{R} \\ \mathbf{R}^{T} & \mathbf{0}\end{array}\right)$.

`train_loader`, generate a batch of training data — sample a batch of users and then sample one positive item and one negative item for each user. This method will be called before each epoch of the training process.


In [20]:
data = ImplicitCF(train=train, test=test, seed=SEED)

### 2.3 Prepare hyper-parameters

Important parameters of `LightGCN` model are:

`data`, initialized LightGCNDataset object.

`epochs`, number of epochs for training.

`n_layers`, number of layers of the model.

`eval_epoch`, if it is not None, evaluation metrics will be calculated on test set every "eval_epoch" epochs. In this way, we can observe the effect of the model during the training process.

`top_k`, the number of items to be recommended for each user when calculating ranking metrics.

A complete list of parameters can be found in `yaml_file`. We use `prepare_hparams` to read the yaml file and prepare a full set of parameters for the model. Parameters passed as the function's parameters will overwrite yaml settings.

In [21]:
hparams = prepare_hparams(
    yaml_file,
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.01,
    eval_epoch=5,
    top_k=TOP_K,
)

### 2.4 Create and train model

With data and parameters prepared, we can create the LightGCN model.

To train the model, we simply need to call the `fit()` method.

In [22]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [23]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)24.1s: train loss = 0.20863 = (mf)0.20801 + (embed)0.00062
Epoch 2 (train)23.8s: train loss = 0.14586 = (mf)0.14463 + (embed)0.00123
Epoch 3 (train)23.7s: train loss = 0.13162 = (mf)0.12994 + (embed)0.00169
Epoch 4 (train)24.7s: train loss = 0.12012 = (mf)0.11799 + (embed)0.00213


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 5 (train)23.7s + (eval)2.3s: train loss = 0.10882 = (mf)0.10622 + (embed)0.00259, recall = 0.03523, ndcg = 0.04197, precision = 0.03758, map = 0.01449
Epoch 6 (train)23.6s: train loss = 0.10149 = (mf)0.09841 + (embed)0.00308
Epoch 7 (train)22.5s: train loss = 0.09282 = (mf)0.08919 + (embed)0.00362
Epoch 8 (train)22.7s: train loss = 0.08474 = (mf)0.08057 + (embed)0.00417
Epoch 9 (train)22.5s: train loss = 0.07916 = (mf)0.07447 + (embed)0.00469


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 10 (train)22.3s + (eval)1.7s: train loss = 0.07549 = (mf)0.07034 + (embed)0.00515, recall = 0.03792, ndcg = 0.04396, precision = 0.03941, map = 0.01527
Epoch 11 (train)23.0s: train loss = 0.07108 = (mf)0.06549 + (embed)0.00559
Epoch 12 (train)22.7s: train loss = 0.06767 = (mf)0.06166 + (embed)0.00601
Epoch 13 (train)23.2s: train loss = 0.06589 = (mf)0.05950 + (embed)0.00639
Epoch 14 (train)24.8s: train loss = 0.06300 = (mf)0.05622 + (embed)0.00678


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 15 (train)22.6s + (eval)1.8s: train loss = 0.06043 = (mf)0.05329 + (embed)0.00714, recall = 0.03608, ndcg = 0.04362, precision = 0.03857, map = 0.01531
Epoch 16 (train)22.8s: train loss = 0.05921 = (mf)0.05172 + (embed)0.00749
Epoch 17 (train)22.7s: train loss = 0.05766 = (mf)0.04982 + (embed)0.00785
Epoch 18 (train)22.6s: train loss = 0.05592 = (mf)0.04773 + (embed)0.00819
Epoch 19 (train)23.1s: train loss = 0.05457 = (mf)0.04607 + (embed)0.00850


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


Epoch 20 (train)22.7s + (eval)1.9s: train loss = 0.05352 = (mf)0.04471 + (embed)0.00881, recall = 0.03577, ndcg = 0.04226, precision = 0.03752, map = 0.01478
Took 471.1930119000026 seconds for training.


### 2.5 Recommendation and Evaluation

Recommendation and evaluation have been performed on the specified test set during training. After training, we can also use the model to perform recommendation and evalution on other data. Here we still use `test` as test data, but `test` can be replaced by other data with similar data structure.

#### 2.5.1 Recommendation

We can call `recommend_k_items` to recommend k items for each user passed in this function. We set `remove_seen=True` to remove the items already seen by the user. The function returns a dataframe, containing each user and top k items recommended to them and the corresponding ranking scores.

In [24]:
topk_scores = model.recommend_k_items(train, top_k=TOP_K, remove_seen=True)
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,107,6620,11.488093
1,107,296,10.400379
2,107,4973,10.39462
3,107,4226,10.098305
4,107,3911,9.91567


#### 2.5.2 Evaluation

With `topk_scores` predicted by the model, we can evaluate how LightGCN performs on this test set.

In [25]:
columns_to_keep = ['userID', 'itemID', 'rating', 'timestamp']

# Create a new DataFrame with only the selected columns
#test1 = test[columns_to_keep].astype(float)


In [26]:
'''
import json

eval_map = map_at_k(test1, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test1, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test1, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test1, topk_scores, k=TOP_K)
#eval_serendipity = serendipity(train, topk_scores)
#eval_coverage = catalog_coverage(train, topk_scores)
eval_serendipity = serendipity(train, topk_scores)
eval_coverage = catalog_coverage(train, topk_scores)

metric_results = {
    'MAP': eval_map,
    'NDCG': eval_ndcg,
    'Precision': eval_precision,
    'Recall': eval_recall,
    'User Serendipity': eval_serendipity,
    'Coverage': eval_coverage
}

print(json.dumps(metric_results, indent=4))
with open("../output/exp-3/"+dataset_name+"/metric_results.txt", "w") as fp:
    json.dump(metric_results, fp, indent=4)
'''

'\nimport json\n\neval_map = map_at_k(test1, topk_scores, k=TOP_K)\neval_ndcg = ndcg_at_k(test1, topk_scores, k=TOP_K)\neval_precision = precision_at_k(test1, topk_scores, k=TOP_K)\neval_recall = recall_at_k(test1, topk_scores, k=TOP_K)\n#eval_serendipity = serendipity(train, topk_scores)\n#eval_coverage = catalog_coverage(train, topk_scores)\neval_serendipity = serendipity(train, topk_scores)\neval_coverage = catalog_coverage(train, topk_scores)\n\nmetric_results = {\n    \'MAP\': eval_map,\n    \'NDCG\': eval_ndcg,\n    \'Precision\': eval_precision,\n    \'Recall\': eval_recall,\n    \'User Serendipity\': eval_serendipity,\n    \'Coverage\': eval_coverage\n}\n\nprint(json.dumps(metric_results, indent=4))\nwith open("../output/exp-3/"+dataset_name+"/metric_results.txt", "w") as fp:\n    json.dump(metric_results, fp, indent=4)\n'

In [27]:
train.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,107,2579,3.0,1120911156
1,107,8970,5.0,1112452270
2,107,337,3.5,1116042439
3,107,42004,3.5,1162221330
4,107,2677,4.0,1146426915


In [28]:
#test2.head()

In [29]:
# load clusters if not previously loaded
# clusters = pd.read_csv('./output/exp-2/group_clusters.csv', usecols=['user_id', 'group'])

# get per-user serendipity score
#eval_serendipity = user_serendipity(train, topk_scores)
eval_serendipity = user_serendipity(train, topk_scores)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

# calculate per-cluster serendipity score
eval_serendipity_clulsters = clusters.merge(eval_serendipity, left_on='userId', right_on='userID').drop(columns=['userID'])
cluster_serendipity = eval_serendipity_clulsters.groupby('cluster')
cluster_serendipity[['user_serendipity']].to_csv('../output/exp-3/'+dataset_name+'/cluster_serendipity2.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


In [30]:
len(train['userID'].unique())

5624

In [31]:
len(eval_serendipity['userID'].unique())

5624

In [32]:
len(cluster_serendipity['userId'].unique())

5624

In [33]:
eval_serendipity.to_csv('../output/exp-3/'+dataset_name+'/user_serendipity3.csv', index=False)

### 2.6 Infer embeddings

With `infer_embedding` method of LightGCN model, we can export the embeddings of users and items in the training set to CSV files for future use.

In [34]:
model.infer_embedding(user_file, item_file)

## 3. Compare LightGCN with SAR and NCF

Here there are the performances of LightGCN compared to [SAR](../00_quick_start/sar_movielens.ipynb) and [NCF](../00_quick_start/ncf_movielens.ipynb) on MovieLens dataset of 100k and 1m. The method of data loading and splitting is the same as that described above and the GPU used was a GeForce GTX 1080Ti.

Settings common to the three models: `epochs=15, seed=42`.

Settings for LightGCN: `embed_size=64, n_layers=3, batch_size=1024, decay=0.0001, learning_rate=0.015 `.

Settings for SAR: `similarity_type="jaccard", time_decay_coefficient=30, time_now=None, timedecay_formula=True`.

Settings for NCF: `n_factors=4, layer_sizes=[16, 8, 4], batch_size=1024, learning_rate=0.001`.

| Data Size | Model    | Training time | Recommending time | MAP@10   | nDCG@10  | Precision@10 | Recall@10 |
| --------- | -------- | ------------- | ----------------- | -------- | -------- | ------------ | --------- |
| 100k      | LightGCN | 27.8865       | 0.6445            | 0.129236 | 0.436297 | 0.381866     | 0.205816  |
| 100k      | SAR      | 0.4895        | 0.1144            | 0.110591 | 0.382461 | 0.330753     | 0.176385  |
| 100k      | NCF      | 116.3174      | 7.7660            | 0.105725 | 0.387603 | 0.342100     | 0.174580  |
| 1m        | LightGCN | 396.7298      | 1.4343            | 0.075012 | 0.377501 | 0.345679     | 0.128096  |
| 1m        | SAR      | 4.5593        | 2.8357            | 0.060579 | 0.299245 | 0.270116     | 0.104350  |
| 1m        | NCF      | 1601.5846     | 85.4567           | 0.062821 | 0.348770 | 0.320613     | 0.108121  |

From the above results, we can see that LightGCN performs better than the other two models.

### References: 
1. Xiangnan He, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang & Meng Wang, LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation, 2020, https://arxiv.org/abs/2002.02126
2. LightGCN implementation [TensorFlow]: https://github.com/kuandeng/lightgcn
3. Thomas N. Kipf and Max Welling, Semi-Supervised Classification with Graph Convolutional Networks, ICLR, 2017, https://arxiv.org/abs/1609.02907
4. Xiang Wang, Xiangnan He, Meng Wang, Fuli Feng, and Tat-Seng Chua, Neural Graph Collaborative Filtering, SIGIR, 2019, https://arxiv.org/abs/1905.08108
5. Y. Koren, R. Bell and C. Volinsky, "Matrix Factorization Techniques for Recommender Systems", in Computer, vol. 42, no. 8, pp. 30-37, Aug. 2009, doi: 10.1109/MC.2009.263.  url: https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf

### Group Validation on System Metrics
Step 1: Clustering is already done
Step 2: All predictions file
Step 3: generating the values not done as well

In [35]:
# Let's generate the all_predictions file using lightGCN

import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd
from surprise import Dataset, Reader
from joblib import Parallel, delayed
from timeit import default_timer as timer
#import dask.dataframe as dd
#from dask.distributed import Client
from contextlib import contextmanager

from recommenders.utils.timer import Timer
# from recommenders.datasets import movielens -- I commented this line because it gave errors on the library; and its not used // error: module pandera has no attribute 'SchemasModel'
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

In [36]:
# model.predict(test_data, usercol='userId', itemcol='movieId')
#predictions = model.predict(test, usercol='userId', itemcol='movieId')
#predictions.head()

In [37]:
df.head()

Unnamed: 0,userID,itemID,rating,timestamp,genres_bin,isNoisy,random
0,107576,3253,4.5,1544828239,69793636673,0,0.37454
1,116343,741,3.0,1106870586,221463938504,0,0.950714
2,87441,18,4.0,1496483617,69793636673,0,0.731994
3,92046,4378,4.0,1016643855,216359445792,0,0.598658
4,137299,94896,4.0,1489231026,216359462241,0,0.156019


In [38]:
train.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,107,2579,3.0,1120911156
1,107,8970,5.0,1112452270
2,107,337,3.5,1116042439
3,107,42004,3.5,1162221330
4,107,2677,4.0,1146426915


In [39]:
clustered_df = clusters
print(clustered_df)
# group clusters into another dataframe with different representation
grouped_clusters = clustered_df.groupby('group_clusters')['userId'].apply(list).reset_index(name='users_list')
grouped_clusters['users_per_cluster'] = grouped_clusters.apply(lambda x: list(set(x.users_list)), axis=1)
grouped_clusters = grouped_clusters[['group_clusters', 'users_per_cluster']]

      userId  cluster_shc  group_clusters
0        107     0.666695              16
1        120     0.667395              15
2        125     0.667716              12
3        177     0.670654              16
4        318     0.678580              12
...      ...          ...             ...
5619  162375     0.682684               1
5620  162380     0.682345               9
5621  162447     0.678252              16
5622  162476     0.676434              16
5623  162488     0.675700               9

[5624 rows x 3 columns]


In [40]:
grouped_clusters

Unnamed: 0,group_clusters,users_per_cluster
0,0,"[89091, 126477, 86031, 65554, 68114, 72210, 18..."
1,1,"[22529, 8208, 77842, 124948, 153621, 38935, 28..."
2,2,[25857]
3,3,[30820]
4,4,[103199]
5,5,"[6118, 152107, 92046, 32855, 150333, 30111]"
6,6,[108989]
7,7,[94154]
8,8,[79293]
9,9,"[10242, 34819, 77828, 90115, 151558, 22535, 10..."


In [41]:
#print(grouped_clusters)
group_metric = {}
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
#grouped_clusters.users_per_cluster.to_list()
all_users = len(set(df.userID.to_list()))
#print(all_clusters_list)
for index, row in grouped_clusters.iterrows():
    cluster_id = row['group_clusters']
    users_list = row['users_per_cluster']
    # users in the cluster vs. users in the equiv group
    #print('clust'+users_list)
    n_cluster_users = len(users_list)
    print('n_cluster_users' + str(n_cluster_users))
    n_cluster_users_equiv = all_users - n_cluster_users
    cluster_scores = topk_scores[topk_scores['userID'].isin(users_list)]
    
   
    cluster_df = df[df['userID'].isin(users_list)]
    
    #df_ndcg_cluster = ndcg_at_k(cluster_df, cluster_scores, k=TOP_K)
    #df_ndcg_cluster_equiv = df_ndcg_cluster.loc[~df_ndcg_cluster['userID'].isin(cluster)]

    #print(df_ndcg_cluster)

    
    # group metrics
    cluster_ndcg = ndcg_at_k(cluster_df, topk_scores, k=TOP_K)
    cluster_precision = precision_at_k(cluster_df, topk_scores, k=TOP_K)
    cluster_recall = recall_at_k(cluster_df, topk_scores, k=TOP_K)

    # group equiv. metrics
    cluster_ndcg_equiv = (cluster_ndcg*n_cluster_users) / n_cluster_users_equiv
    cluster_precision_equiv = (cluster_precision*n_cluster_users) / n_cluster_users_equiv
    cluster_recall_equiv = (cluster_recall*n_cluster_users) / n_cluster_users_equiv
    
    group_metric[cluster_id] = [
        cluster_ndcg,
        cluster_ndcg_equiv,
        cluster_precision,
        cluster_precision_equiv,
        cluster_recall,
        cluster_recall_equiv
    ]

n_cluster_users143


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users916


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users1


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users6


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users1


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users951


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users777


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users885


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users1


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users247


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1687


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1
n_cluster_users1


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


n_cluster_users1


  df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
  rating_true_common.groupby(col_user, as_index=False)[col_user].agg(


In [42]:
group_metric_df = pd.DataFrame.from_dict(group_metric, orient='index')\
    .reset_index()\
    .rename({
        'index': 'cluster',
        0: 'cluster-nDCG',
        1: 'cluster-nDCG-eq',
        2: 'cluster-precision',
        3: 'cluster-precision-eq',
        4: 'cluster-recall',
        5: 'cluster-recall-eq'
        }, axis=1)
group_metric_df['ndcg'] = eval_ndcg
group_metric_df['precision'] = eval_precision
group_metric_df['recall'] = eval_recall

# save results in csv
group_metric_df.to_csv('../output/exp-3/'+dataset_name+'/gv5.csv', index=False)