In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from tqdm.auto import tqdm
import random
from sklearn.metrics.pairwise import cosine_similarity
import ast

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

import tensorflow as tf

import itertools

import time
import sys

import matplotlib.pyplot as plt

In [None]:
# Specify the datasets we want to use
# Only `lingorank` is read by the cells visible in this notebook section.
# NOTE(review): ml_100k / goodreads / tomplay are presumably consumed by loaders
# defined elsewhere — confirm before removing them.
lingorank = True
ml_100k = False
goodreads = False
tomplay = False

# Create the cold start test dataset
cold_start = False

# Content-based
# When True, load_LingoRank loads both the "ada" and the "bert" item embeddings.
content_based = False

# CF
# One on/off flag plus one parameter holder per collaborative-filtering model.
als = False
als_params = None

bpr = False
bpr_params = None

lmf = False
lmf_params = None

cf = als or bpr or lmf # If at least one, we do the CF

# LightGCN
# ada / bert: start LightGCN from the precomputed item embeddings of that model.
# xavier_ada / xavier_bert: same embedding size, but Xavier (random) initialisation.
ada = True
bert = False
xavier_ada = True
xavier_bert = False
lightgcn = ada or bert or xavier_ada or xavier_bert # If at least one, we do the LightGCN 

# Hyper-parameter grid explored by the LightGCN cell (cartesian product):
#   n_layers: number of propagation layers
#   n:        number of most recent train items averaged into a user's initial
#             embedding (only used with precomputed embeddings)
#   lr:       learning rate
lightgcn_params = {
    'n_layers': list(range(1,21)),
    'n': [1],
    'lr': [0.001]
}

In [None]:
def split_user_data(group, test_size):
    """Split one user's interactions into (train, test, other) parts.

    Args:
        group (pd.DataFrame): Interactions of a single user; the last rows are
            the ones held out, so the caller decides the ordering.
        test_size (float): Fraction of the rows to hold out for testing.

    Returns:
        tuple: ``(train, test, other)`` DataFrames. A user with exactly one
        interaction is returned entirely in ``other`` (train and test are empty
        frames); otherwise at least one row goes to ``test`` and ``other`` is
        an empty frame.
    """
    n_rows = len(group)

    # A single interaction cannot be split: route it to the "other" bucket.
    if n_rows == 1:
        return pd.DataFrame(), pd.DataFrame(), group

    # Always hold out at least one row.
    holdout = max(1, int(round(n_rows * test_size)))
    cut = -holdout
    return group.iloc[:cut], group.iloc[cut:], pd.DataFrame()

In [None]:
def train_test_split(df, n=None, test_size=0.2, one_core=False):
    """Split interactions into chronological per-user train / test sets.

    For every user the interactions are ordered by ``timestamp`` and the most
    recent ``test_size`` share (at least one row) is held out for testing.
    Users with a single interaction end up in a separate ``other`` frame.
    Summary statistics about the split are printed.

    Args:
        df (pd.DataFrame): Interactions with at least ``user_id``, ``item_id``
            and ``timestamp`` columns. It is sorted IN PLACE by
            ``(user_id, timestamp)``; callers keep using the sorted frame.
        n: Unused; kept for backward compatibility.
        test_size (float): Fraction of each user's interactions held out.
        one_core (bool): Unused; kept for backward compatibility.

    Returns:
        tuple: ``(train_data, test_data, other_data)`` DataFrames that keep the
        original row index of ``df``.
    """
    # Sort by user_id and timestamp (in place: this side effect is part of the contract)
    df.sort_values(by=['user_id', 'timestamp'], ascending=[True, True], inplace=True)

    tqdm.pandas(desc="Splitting data")
    splits = df.groupby('user_id').progress_apply(lambda x: split_user_data(x, test_size))

    def _gather(position):
        # Skip the column-less placeholder frames; when nothing is left, fall
        # back to an empty frame that still carries df's columns so the column
        # lookups below cannot raise KeyError.
        parts = [split[position] for split in splits if not split[position].empty]
        return pd.concat(parts) if parts else df.iloc[0:0]

    train_data = _gather(0)
    test_data = _gather(1)
    other_data = _gather(2)

    # Find number of articles in test_data but not in train_data
    unique_test_articles = set(test_data['item_id'])
    unique_train_articles = set(train_data['item_id'])
    num_unique_articles_in_test_not_train = len(unique_test_articles - unique_train_articles)

    # Calculate number of unique user_ids in test_data but not in train_data
    unique_test_user_ids = set(test_data['user_id'])
    unique_train_user_ids = set(train_data['user_id'])
    num_unique_users_in_test_not_train = len(unique_test_user_ids - unique_train_user_ids)

    num_unique_users_in_other_data = other_data['user_id'].nunique() if not other_data.empty else 0
    num_unique_articles_in_other_data_not_train = (
        other_data.loc[~other_data['item_id'].isin(unique_train_articles), 'item_id'].nunique()
        if not other_data.empty else 0
    )

    print(f"Number of articles in test_data but not in train_data: {num_unique_articles_in_test_not_train}")
    print(f"Number of user_ids in test_data but not in train_data: {num_unique_users_in_test_not_train}")
    print(f"Number of user_ids in other_data: {num_unique_users_in_other_data}")
    print(f"Number of articles in other_data but not in train_data: {num_unique_articles_in_other_data_not_train}")

    # Calculate size of test data compared to train + test data
    size_test_data = len(test_data)
    size_data = len(train_data) + len(test_data)

    def _percentage(count):
        # Guard against an empty train + test split (every user in other_data).
        return count / size_data * 100 if size_data else 0.0

    print(f"Size of test data: {_percentage(size_test_data):.2f}%")
    print(f"Size of other data: {_percentage(len(other_data)):.2f}%")
    print(f"Size of all test data: {_percentage(size_test_data + len(other_data)):.2f}%")
    return train_data, test_data, other_data

In [None]:
def map_ids(data, train_data, test_data):
    """Map raw user / item ids to consecutive integer ids starting at 0.

    Ids seen in ``train_data`` are numbered first, in order of first
    appearance. Ids that only occur in ``test_data`` get the next free
    integers, in order of first appearance.

    Args:
        data (pd.DataFrame | None): Optional full interaction frame. When
            given, it receives nullable ``Int64`` ``mapped_user_id`` /
            ``mapped_item_id`` columns (``<NA>`` for ids in neither split).
            When ``None``, it is rebuilt as ``train_data`` + ``test_data``.
        train_data (pd.DataFrame): Needs ``user_id`` and ``item_id`` columns.
        test_data (pd.DataFrame): Needs ``user_id`` and ``item_id`` columns.

    Returns:
        tuple: ``(data, train_data, test_data, user_id_mapping, item_id_mapping)``.
        All three frames are modified in place (new columns, index reset).
    """
    # Create mapping for userIds and itemIds based on the train split
    user_id_mapping = {user_id: idx for idx, user_id in enumerate(train_data['user_id'].unique())}
    item_id_mapping = {item_id: idx for idx, item_id in enumerate(train_data['item_id'].unique())}

    # Map userIds and itemIds in the train split to new consecutive IDs
    train_data['mapped_user_id'] = train_data['user_id'].map(user_id_mapping)
    train_data['mapped_item_id'] = train_data['item_id'].map(item_id_mapping)

    def map_and_update_id(id_value, current_mapping):
        # Values are always 0..len-1, so len() is the next free id. This is
        # O(1) per id and also works when the mapping is still empty.
        return current_mapping.setdefault(id_value, len(current_mapping))

    # Ids that only exist in the test split extend the mappings on the fly
    test_data['mapped_user_id'] = test_data['user_id'].apply(lambda x: map_and_update_id(x, user_id_mapping))
    test_data['mapped_item_id'] = test_data['item_id'].apply(lambda x: map_and_update_id(x, item_id_mapping))

    if data is not None:
        data['mapped_user_id'] = data['user_id'].map(user_id_mapping).astype('Int64')
        data['mapped_item_id'] = data['item_id'].map(item_id_mapping).astype('Int64')
    else:
        data = pd.concat([train_data, test_data])

    data.reset_index(drop=True, inplace=True)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return data, train_data, test_data, user_id_mapping, item_id_mapping

In [None]:
def load_LingoRank(strategy: int = 2):
    """Load the LingoRank interactions, split them and attach item embeddings.

    Reads ``../results/recommendation/Zeegu/strategy{strategy}.csv``, drops
    articles that never received a positive rating, keeps the non-zero
    ratings, splits them per user into train / test and maps ids to
    consecutive integers. Depending on the module-level flags ``ada``,
    ``bert`` and ``content_based``, the matching item embeddings are loaded
    from ``../results/recommendation/embeddings_strategy{strategy}.csv.gz``.

    Args:
        strategy (int): Suffix selecting the interaction and embedding files.

    Returns:
        dict: ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no
        embedding flag is set, otherwise ``{model_name: DataFrame}`` with
        columns ``item_id``, ``mapped_item_id`` and ``embedding``).
    """
    data_full = pd.read_csv(f"../results/recommendation/Zeegu/strategy{strategy}.csv")

    # Rename data_full.article_id as data_full.item_id
    data_full.rename(columns={"article_id": "item_id"}, inplace=True)

    ## Remove the articles for which there is no positive rating
    max_rating_per_item = data_full.groupby('item_id')['rating'].max()
    articles_to_remove = max_rating_per_item[max_rating_per_item <= 0].index.tolist()
    articles_to_remove.append(2223234) # Cannot use it for content-based because content is wrong. So we don't consider it at all
    data_full = data_full[~data_full['item_id'].isin(articles_to_remove)]

    data = data_full[(data_full['rating'] != 0)].copy()

    train_data, test_data, other_data = train_test_split(data)
    print(f"Strategy {strategy} - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")

    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Compute the number of unique users and items and print
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    # Print the number of interactions
    print(f"Number of interactions: {len(data)}")
    # Compute sparsity
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}
        embeddings_file = f"../results/recommendation/embeddings_strategy{strategy}.csv.gz"

        # Content-based recommendation needs both embedding models.
        if content_based:
            embedding_models = {"ada": True, "bert": True}
        else:
            embedding_models = {"ada": ada, "bert": bert}

        df = pd.read_csv(embeddings_file)
        df = df.rename(columns={'id': 'item_id'})

        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue

            embedding_key = embedding_model + "_embedding"
            has_embedding = df[embedding_key].notnull()

            # Items without an embedding for this model must already be absent
            # from the interactions (they were removed above).
            ids_without_embedding = df.loc[~has_embedding, 'item_id']
            assert data[data['item_id'].isin(ids_without_embedding)].empty # Already removed the article
            assert data_full[data_full['item_id'].isin(ids_without_embedding)].empty # Already removed the article

            df2 = df[has_embedding].rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################

    return {
            'data': data,
            'train_data': train_data,
            'test_data': test_data,
            'user_id_mapping': user_id_mapping,
            'item_id_mapping': item_id_mapping,
            'items_embeddings': items_embeddings
        }

In [None]:
def ndcg_at_k(recommendations, real_items, k):
    """
    Compute NDCG at rank k with binary relevance.

    A recommended item is relevant when it appears in ``real_items``. The ideal
    ranking puts ``min(k, len(real_items))`` relevant items first. Returns 0
    when the ideal DCG is 0 (no relevant item or k == 0).
    """
    def _dcg(gains):
        # Discounted cumulative gain of a list of 0/1 relevance values.
        return sum((2 ** gain - 1) / np.log2(rank + 2) for rank, gain in enumerate(gains))

    hits = [int(item_id in real_items) for item_id in recommendations]
    n_ideal_hits = min(k, len(real_items))
    ideal = [1] * n_ideal_hits + [0] * (k - n_ideal_hits)

    idcg = _dcg(ideal[:k])
    if idcg > 0:
        return _dcg(hits[:k]) / idcg
    return 0

def mrr_at_k(recommendations, real_items, k):
    """
    Compute the reciprocal rank at k.

    Returns 1 / rank of the first recommended item (within the top k) that is
    in ``real_items``, or 0 when none of the top k is relevant.
    """
    hits = [1 if item_id in real_items else 0 for item_id in recommendations]
    first_hit = next((pos for pos, hit in enumerate(hits[:k]) if hit > 0), None)
    return 0 if first_hit is None else 1 / (first_hit + 1)

def precision_at_k(recommendations, real_items, k):
    """
    Compute Precision at rank k.

    The number of relevant items among the first k recommendations is divided
    by ``min(k, len(recommendations))``, so a list shorter than k is not
    penalised for the missing slots. Returns 0 when there is nothing to score
    (no recommendations or k == 0) instead of raising ZeroDivisionError.
    """
    denominator = min(k, len(recommendations))
    if denominator <= 0:
        return 0

    hits = sum(1 for item_id in recommendations[:k] if item_id in real_items)
    return hits / denominator

def recall_at_k(recommendations, real_items, k):
    """
    Compute Recall at rank k.

    The number of relevant items among the first k recommendations is divided
    by the number of relevant items. Returns 0 when ``real_items`` is empty
    (consistent with ``ndcg_at_k``) instead of raising ZeroDivisionError.
    """
    n_relevant = len(real_items)
    if n_relevant == 0:
        return 0

    hits = sum(1 for item_id in recommendations[:k] if item_id in real_items)
    return hits / n_relevant

def f1_at_k(precision_at_k, recall_at_k):
    """
    Compute F1 at rank k from already computed precision and recall values.

    Returns 0 when both inputs are 0 (the harmonic mean is undefined there).
    """
    if not (precision_at_k != 0 or recall_at_k != 0):
        return 0

    # Harmonic mean of precision and recall
    product = precision_at_k * recall_at_k
    total = precision_at_k + recall_at_k
    return 2 * product / total

def average_precision_at_k(recommendations, test_items, k):
    """
    Compute Average Precision at rank k.

    Sums precision@i at every rank i (within the top k) where a relevant item
    is retrieved, then divides by ``min(k, len(test_items))``. Returns 0 when
    that denominator is 0 (no relevant items or k == 0) instead of raising
    ZeroDivisionError.
    """
    denominator = min(k, len(test_items))
    if denominator <= 0:
        return 0

    num_relevant = 0     # relevant items found so far
    sum_precisions = 0   # running sum of precision@rank at each hit
    for rank, item_id in enumerate(recommendations[:k], start=1):
        if item_id in test_items:
            num_relevant += 1
            sum_precisions += num_relevant / rank

    return sum_precisions / denominator

In [None]:
def evaluate(test_data, data_full, data_strategy, RS_strategy_name, RS_strategy, model_name, model, dataset, k=5, progress=True):
    """Evaluate top-k recommendations for every user of ``test_data``.

    Args:
        test_data (pd.DataFrame): Held-out interactions with ``mapped_user_id``
            and ``mapped_item_id`` columns.
        data_full: Unused; kept for backward compatibility.
        data_strategy (str): Only ``"implicit"`` is supported for ``"CF"``.
        RS_strategy_name (str): ``"CF"``, ``"content-based"`` or ``"graph-based"``.
        RS_strategy (dict): For ``"CF"``, must hold ``'user_item_train_data'``
            (user-item matrix indexable by mapped user id).
        model_name: Unused; kept for backward compatibility.
        model (dict): ``{'model': ...}`` for CF / graph-based,
            ``{'sim_matrix': DataFrame}`` (users x items) for content-based.
        dataset: Unused; kept for backward compatibility.
        k (int): Cut-off rank.
        progress (bool): Show a tqdm progress bar over users.

    Returns:
        dict: Mean metrics (``mndcg``, ``mrr``, ``map``, ``mrecall``,
        ``mprecision``, ``mf1``), the per-user lists they were averaged from,
        and the per-user ``all_recommendations`` / ``all_really_in_test`` lists.

    Raises:
        ValueError: If the strategy / data-strategy combination is unsupported.
    """
    # Pick the recommendation routine once, so an unsupported strategy fails
    # loudly instead of silently reusing stale or unbound recommendations.
    if RS_strategy_name == "CF" and data_strategy == "implicit":
        def recommend(user):
            return model['model'].recommend(user, RS_strategy['user_item_train_data'][user], k)[0]
    elif RS_strategy_name == "content-based":
        def recommend(user):
            ranked_items = model['sim_matrix'].loc[user].sort_values(ascending=False).index.tolist()
            return np.array(ranked_items[:k])
    elif RS_strategy_name == "graph-based":
        def recommend(user):
            recommended = model['model'].recommend_k_items(pd.DataFrame({'userID': [user]}), top_k=k, remove_seen=True)
            return recommended['itemID'].tolist()
    else:
        raise ValueError(
            f"Unsupported recommendation strategy: {RS_strategy_name!r} with data strategy {data_strategy!r}"
        )

    # Group the held-out items per user once instead of filtering the whole
    # frame for every user; row order within a user is preserved.
    test_items_by_user = {}
    for user_id, item_id in zip(test_data['mapped_user_id'].tolist(), test_data['mapped_item_id'].tolist()):
        test_items_by_user.setdefault(user_id, []).append(item_id)

    ndcgs = []
    rrs = []
    aps = []
    precisions = []
    recalls = []
    f1s = []
    all_recommendations = []
    all_really_in_test = []

    user_ids = test_data['mapped_user_id'].unique()
    user_ids_for_loop = tqdm(user_ids, desc="Users") if progress else user_ids

    for user in user_ids_for_loop:
        top_k_recommendations = recommend(user)
        user_test_items = test_items_by_user.get(user, [])

        precision = precision_at_k(top_k_recommendations, user_test_items, k)
        recall = recall_at_k(top_k_recommendations, user_test_items, k)

        ndcgs.append(ndcg_at_k(top_k_recommendations, user_test_items, k))
        rrs.append(mrr_at_k(top_k_recommendations, user_test_items, k))
        aps.append(average_precision_at_k(top_k_recommendations, user_test_items, k))
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1_at_k(precision, recall))

        # These lists are part of the returned dict and were previously left empty.
        all_recommendations.append(list(top_k_recommendations))
        all_really_in_test.append(user_test_items)

    return {
        'mndcg': np.mean(ndcgs),
        'mrr': np.mean(rrs),
        'map': np.mean(aps),
        'mrecall': np.mean(recalls),
        'mprecision': np.mean(precisions),
        'mf1': np.mean(f1s),
        'ndcgs': ndcgs,
        'rrs': rrs,
        'aps': aps,
        'recalls': recalls,
        'precisions': precisions,
        'f1s': f1s,
        'all_recommendations': all_recommendations,
        'all_really_in_test': all_really_in_test
    }


In [None]:
datasets = {}

# 1. LingoRank
if lingorank:
    datasets['LingoRank'] = {}
    datasets['LingoRank']['implicit'] = load_LingoRank()

    # Sanity check: mapped ids must be exactly 0..max with no gap.
    for id_column, error_message in (
        ('mapped_user_id', "User IDs are not continuous and ordered."),
        ('mapped_item_id', "Item IDs are not continuous and ordered."),
    ):
        loaded_data = datasets['LingoRank']['implicit']['data']
        observed_ids = set(loaded_data.dropna(subset=[id_column])[id_column].astype(int))
        expected_ids = set(range(int(loaded_data[id_column].max()) + 1))
        assert observed_ids == expected_ids, error_message

In [None]:
# Display the structure of our dict
def display_keys(d, indent=0):
    """Print the keys of a nested dict as an indented tree.

    Nested dicts are expanded recursively with two extra spaces per level,
    except the ``user_id_mapping`` / ``item_id_mapping`` entries, whose
    contents are not listed.
    """
    collapsed_keys = ("user_id_mapping", "item_id_mapping")
    prefix = '  ' * indent
    for key in d:
        print(prefix + str(key))
        if key in collapsed_keys:
            continue
        value = d[key]
        if isinstance(value, dict):
            display_keys(value, indent + 1)

# Print the nested key layout of every dataset loaded so far.
display_keys(datasets)

In [None]:
# Give every dataset an empty container that the later cells fill with one
# entry per recommendation strategy ('CF', 'graph-based', ...).
for key_dataset, dataset in tqdm(datasets.items()):
    dataset['implicit']['RS_strategy'] = {}

In [None]:
if cf:
    # Create user-item sparse matrix
    for key_dataset, dataset in tqdm(datasets.items()):
        # Use a dedicated name: rebinding `cf` here would overwrite the boolean
        # configuration flag tested above with a dict.
        cf_strategy = {}
        dataset['implicit']['RS_strategy']['CF'] = cf_strategy
        train_data = dataset['implicit']['train_data']

        # Rows are mapped user ids, columns are mapped item ids, values are ratings.
        cf_strategy['user_item_train_data'] = sparse.csr_matrix(
            (train_data['rating'].astype(float), (train_data['mapped_user_id'], train_data['mapped_item_id']))
        )

In [None]:
# User embeddings
def compute_user_embeddings(train_data, embeddings_df, n):
    """Build one embedding per user by averaging recent item embeddings.

    Args:
        train_data (pd.DataFrame): Needs ``mapped_user_id``, ``mapped_item_id``
            and ``timestamp`` columns.
        embeddings_df (pd.DataFrame): Needs ``mapped_item_id`` and ``embedding``
            (one equally sized vector per row) columns.
        n (int | None): Number of most recent interactions per user to average;
            ``None`` uses all of them.

    Returns:
        tuple: ``(user_embeddings, real_nb_neigh)``. ``user_embeddings`` has
        columns ``mapped_user_id``, ``embedding`` and ``num_items`` (number of
        embeddings averaged), ordered by ``mapped_user_id``.
        ``real_nb_neigh`` is the mean number of interactions selected per user
        (0.0 when there are no users).

    Raises:
        ValueError: If none of a user's selected items has an embedding.
    """
    # Most recent interactions first within each user
    train_data_sorted = train_data.sort_values(by=['mapped_user_id', 'timestamp'], ascending=[True, False])

    # Collect rows and build the frame once at the end: growing a DataFrame
    # row by row is quadratic in the number of users.
    records = []
    selected_counts = []
    for user_id, user_data in train_data_sorted.groupby('mapped_user_id'):
        # Get the last n articles the user interacted with
        item_ids = user_data['mapped_item_id'].head(n).tolist()
        selected_counts.append(len(item_ids))

        # Get the embeddings for these articles
        matched = embeddings_df.loc[embeddings_df['mapped_item_id'].isin(item_ids), 'embedding']
        if len(matched) == 0:
            raise ValueError(
                f"No item embedding found for user {user_id} (selected items: {item_ids})"
            )
        item_embeddings = np.stack(matched.tolist())

        records.append({
            'mapped_user_id': user_id,
            'embedding': np.mean(item_embeddings, axis=0),
            'num_items': len(item_embeddings),
        })

    user_embeddings = pd.DataFrame(records, columns=['mapped_user_id', 'embedding', 'num_items'])
    real_nb_neigh = sum(selected_counts) / len(selected_counts) if selected_counts else 0.0

    return user_embeddings, real_nb_neigh

def compute_similarity_matrix(items_embeddings, users_embeddings):
    """Cosine similarity between every user and every item embedding.

    Args:
        items_embeddings (pd.DataFrame): Needs ``mapped_item_id`` and
            ``embedding`` columns.
        users_embeddings (pd.DataFrame): Needs ``mapped_user_id`` and
            ``embedding`` columns.

    Returns:
        pd.DataFrame: Similarities with mapped user ids as index and mapped
        item ids as columns.
    """
    def _as_matrix(frame):
        # Stack the per-row embedding vectors into one 2-D array.
        return np.vstack(frame['embedding'].apply(np.array))

    user_matrix = _as_matrix(users_embeddings)
    item_matrix = _as_matrix(items_embeddings)

    return pd.DataFrame(
        cosine_similarity(user_matrix, item_matrix),
        index=users_embeddings['mapped_user_id'],
        columns=items_embeddings['mapped_item_id'],
    )



In [None]:
def new_init(self, hparams, data, initial_embeddings=None ,seed=None):
    
    """Initializing the model. Create parameters, placeholders, embeddings and loss function.

    Replacement for ``LightGCN.__init__`` that additionally stores
    ``initial_embeddings`` on the instance before ``self._init_weights()`` runs,
    so the weight initialisation can start from precomputed embeddings.

    Args:
        hparams (HParams): A HParams object, hold the entire set of hyperparameters.
        data (object): A recommenders.models.deeprec.DataModel.ImplicitCF object, load and process data.
        initial_embeddings (dict | None): Optional precomputed start values, read by
            ``_init_weights`` under the keys ``'item_embedding'`` and ``'user_embedding'``.
            ``None`` means no precomputed values are supplied.
        seed (int): Seed.

    Raises:
        ValueError: If ``hparams.metrics`` contains a metric other than
            "map", "ndcg", "precision" or "recall".

    """

    tf.compat.v1.set_random_seed(seed)
    np.random.seed(seed)

    # Copy the hyper-parameters onto the instance.
    self.data = data
    self.epochs = hparams.epochs
    self.lr = hparams.learning_rate
    self.emb_dim = hparams.embed_size
    self.batch_size = hparams.batch_size
    self.n_layers = hparams.n_layers
    self.decay = hparams.decay
    self.eval_epoch = hparams.eval_epoch
    self.top_k = hparams.top_k
    self.save_model = hparams.save_model
    self.save_epoch = hparams.save_epoch
    self.metrics = hparams.metrics
    self.model_dir = hparams.MODEL_DIR
    # Must be set before self._init_weights() is called below.
    self.initial_embeddings = initial_embeddings

    metric_options = ["map", "ndcg", "precision", "recall"]
    for metric in self.metrics:
        if metric not in metric_options:
            raise ValueError(
                "Wrong metric(s), please select one of this list: {}".format(
                    metric_options
                )
            )

    self.norm_adj = data.get_norm_adj_mat()

    self.n_users = data.n_users
    self.n_items = data.n_items

    # Placeholders for a batch of (user, positive item, negative item) ids.
    self.users = tf.compat.v1.placeholder(tf.int32, shape=(None,))
    self.pos_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))
    self.neg_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))

    self.weights = self._init_weights()
    self.ua_embeddings, self.ia_embeddings = self._create_lightgcn_embed()

    # Batch lookups into the embeddings returned by _create_lightgcn_embed().
    self.u_g_embeddings = tf.nn.embedding_lookup(
        params=self.ua_embeddings, ids=self.users
    )
    self.pos_i_g_embeddings = tf.nn.embedding_lookup(
        params=self.ia_embeddings, ids=self.pos_items
    )
    self.neg_i_g_embeddings = tf.nn.embedding_lookup(
        params=self.ia_embeddings, ids=self.neg_items
    )
    # Batch lookups into the raw weight variables from _init_weights().
    self.u_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["user_embedding"], ids=self.users
    )
    self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["item_embedding"], ids=self.pos_items
    )
    self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["item_embedding"], ids=self.neg_items
    )

    # Scores of every batch user against every item passed as pos_items.
    self.batch_ratings = tf.matmul(
        self.u_g_embeddings,
        self.pos_i_g_embeddings,
        transpose_a=False,
        transpose_b=True,
    )

    self.mf_loss, self.emb_loss = self._create_bpr_loss(
        self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings
    )
    self.loss = self.mf_loss + self.emb_loss

    self.opt = tf.compat.v1.train.AdamOptimizer(learning_rate=self.lr).minimize(
        self.loss
    )
    self.saver = tf.compat.v1.train.Saver(max_to_keep=1)

    # allow_growth: allocate GPU memory on demand instead of all at once.
    gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
    self.sess = tf.compat.v1.Session(
        config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
    )
    self.sess.run(tf.compat.v1.global_variables_initializer())

In [None]:
def new_init_weights(self):
    """Initialize user and item embeddings.

    When ``self.initial_embeddings`` is set, both variables start from its
    ``'item_embedding'`` and ``'user_embedding'`` entries (cast to float32).
    Otherwise they are drawn from a uniform, fan-average variance-scaling
    initializer with shapes ``[n_users, emb_dim]`` and ``[n_items, emb_dim]``.
    A message stating which path was taken is printed.

    Returns:
        dict: With keys `user_embedding` and `item_embedding`, embeddings of all users and items.

    """
    all_weights = dict()
    # Only used by the else branch below.
    initializer = tf.compat.v1.keras.initializers.VarianceScaling(
        scale=1.0, mode="fan_avg", distribution="uniform"
    )

    

    if self.initial_embeddings is not None:
        # NOTE(review): presumably the arrays must have n_items / n_users rows
        # ordered by mapped id — nothing here checks it; confirm with the caller.
        all_weights["item_embedding"] = tf.Variable(self.initial_embeddings['item_embedding'], dtype=tf.float32, name="item_embedding")
        all_weights["user_embedding"] = tf.Variable(self.initial_embeddings['user_embedding'], dtype=tf.float32, name="user_embedding")
        print("Using pretrained embeddings.")
    else:
        all_weights["user_embedding"] = tf.Variable(
            initializer([self.n_users, self.emb_dim]), name="user_embedding"
        )
        all_weights["item_embedding"] = tf.Variable(
            initializer([self.n_items, self.emb_dim]), name="item_embedding"
        )
        print("Using xavier initialization.")

    return all_weights

In [None]:
# Monkeypatch the library class so that every LightGCN created from here on
# accepts `initial_embeddings` and can start from precomputed embeddings.
LightGCN.__init__ = new_init
LightGCN._init_weights = new_init_weights

In [None]:
if lightgcn:
    # Initialisation methods enabled for each embedding model. Keying this by
    # model keeps e.g. `xavier_bert` from enabling 'Xavier' for "ada" (or `ada`
    # from enabling 'precomputed' for "bert"). 'Xavier' is listed first to keep
    # the original execution order.
    lightgcn_methods = {
        "ada": {"Xavier": xavier_ada, "precomputed": ada},
        "bert": {"Xavier": xavier_bert, "precomputed": bert},
    }
    # Embedding dimension expected for each embedding model.
    expected_embed_sizes = {"ada": 1536, "bert": 768}

    for key_dataset, dataset in tqdm(datasets.items()):

        dataset['implicit']['RS_strategy']['graph-based'] = {}
        graph_based = dataset['implicit']['RS_strategy']['graph-based']
        graph_based['models'] = {}
        graph_based['models']['LightGCN'] = {}
        lightgcn_entry = graph_based['models']['LightGCN']
        lightgcn_entry['embedding_model'] = {}

        print("="*100)
        print(key_dataset)
        print("="*100)

        # One list per metric and per "<embedding_model>_<method>" key; each list
        # gets one value per hyper-parameter configuration, in grid order.
        metrics = {}

        embedding_models = [name for name, flags in lightgcn_methods.items() if any(flags.values())]
        for embedding_model in embedding_models:
            lightgcn_entry['embedding_model'][embedding_model] = {'method': {}}
            print("-"*100)
            print(embedding_model)
            print("-"*100)

            enabled_methods = [name for name, enabled in lightgcn_methods[embedding_model].items() if enabled]
            for method in enabled_methods:
                print("*"*50)
                print(method)
                print("*"*50)

                lightgcn_entry['embedding_model'][embedding_model]['method'][method] = {}

                train_data = dataset['implicit']['train_data'].copy()
                train_data = train_data.rename(columns = {'mapped_user_id':'userID', 'mapped_item_id':'itemID'})

                test_data = dataset['implicit']['test_data'].copy()
                test_data = test_data.rename(columns = {'mapped_user_id':'userID', 'mapped_item_id':'itemID'})

                lightgcn_entry['data'] = ImplicitCF(train=train_data, test=test_data, seed=my_seed, col_user='userID', col_item='itemID', col_rating='rating')
                data = lightgcn_entry['data']

                # Take the embedding size from the loaded embeddings when they are
                # available, otherwise use the known size of the embedding model.
                loaded_embeddings = (dataset['implicit'].get('items_embeddings') or {}).get(embedding_model)
                if loaded_embeddings is not None and len(loaded_embeddings) > 0:
                    embed_size = len(loaded_embeddings['embedding'].iloc[0])
                else:
                    embed_size = expected_embed_sizes[embedding_model]
                assert embed_size == expected_embed_sizes[embedding_model], "Embedding size is not correct"
                print(f"Embedding size: {embed_size}")

                if method=="precomputed":
                    item_embeddings = dataset['implicit']['items_embeddings'][embedding_model]

                    # Keep only items of the graph, ordered by mapped id so that row i
                    # of the stacked matrix is the embedding of item i.
                    graph_item_ids = set(train_data.itemID).union(set(test_data.itemID))
                    item_embeddings = item_embeddings[item_embeddings.mapped_item_id.isin(graph_item_ids)]
                    item_embeddings = item_embeddings.sort_values(by='mapped_item_id', ascending=True)
                    assert item_embeddings.mapped_item_id.is_monotonic_increasing, 'not ordered by item_id'

                    initial_embeddings = {}
                    initial_embeddings['item_embedding'] = np.vstack(item_embeddings.embedding.values)
                    ns = lightgcn_params['n']
                elif method=="Xavier":
                    initial_embeddings = None
                    ns = [None]

                total_configs = len(lightgcn_params['n_layers'])*len(ns)*len(lightgcn_params['lr'])
                for n_layers, n, lr in tqdm(itertools.product(lightgcn_params['n_layers'], ns , lightgcn_params['lr']), total=total_configs):

                    print("^"*50)
                    print(f"n_layers: {n_layers} - n: {n} - lr: {lr}")
                    print("^"*50)
                    if method=="precomputed":
                        user_embeddings, real_nb_neigh = compute_user_embeddings(dataset['implicit']['train_data'], item_embeddings, n)
                        initial_embeddings['user_embedding'] = np.vstack(user_embeddings[user_embeddings.mapped_user_id.isin(set(train_data.userID))].embedding.values)

                    # epochs=1: each model.fit() call below trains exactly one epoch;
                    # the huge eval_epoch keeps fit() from evaluating on its own.
                    hparams = prepare_hparams(
                                learning_rate=lr,
                                eval_epoch=10000000,
                                top_k=5,
                                save_model=False,
                                epochs=1,
                                save_epoch=1,
                                model_type="lightgcn",
                                embed_size=embed_size,
                                n_layers=n_layers,
                                batch_size=1024,
                                decay=0.0001,
                                metrics=["recall", "ndcg", "precision", "map"],
                                MODEL_DIR="./tests/resources/deeprec/lightgcn/model/lightgcn_model/"
                                )

                    tf.compat.v1.set_random_seed(my_seed)
                    tf.random.set_seed(my_seed)
                    np.random.seed(my_seed)
                    random.seed(my_seed)
                    model = LightGCN(hparams, data, initial_embeddings=initial_embeddings, seed=my_seed)
                    save_path = "../results/lightgcn_model/best_model"
                    patience_max = 10

                    # Early-stopping state must be reset for every configuration:
                    # otherwise `patience` is undefined on the first run and carries
                    # over from the previous run afterwards.
                    best_ndcg = 0
                    best_epoch = None
                    patience = 0

                    with Timer() as train_time:
                        for epoch in range(sys.maxsize):
                            model.fit()
                            eval_start = time.time()
                            recall, ndcg, precision, map_score = model.run_eval()
                            eval_end = time.time()
                            eval_time = eval_end - eval_start
                            print(f"Evaluation time: {eval_time:.1f}s")
                            print(f"Epoch {epoch} - Recall@5: {recall:.4f} - NDCG@5: {ndcg:.4f} - Precision@5: {precision:.4f} - MAP@5: {map_score:.4f}")
                            print("------------------------")
                            if ndcg > best_ndcg:
                                patience = 0
                                best_ndcg = ndcg
                                best_epoch = epoch
                                model.saver.save(model.sess, save_path)
                            else:
                                patience += 1

                            # `>=` so the loop always terminates once patience runs out.
                            if patience >= patience_max:
                                print("="*25 + f"Best NDCG: {best_ndcg:.4f} at epoch {best_epoch}" + "="*25)
                                break
                    print("Took {} seconds for training.".format(train_time.interval))

                    # Restore the best checkpoint only if this run saved one; otherwise
                    # a stale checkpoint from a previous configuration would be loaded.
                    if best_epoch is not None:
                        model.load(save_path)
                    eval_metrics =  evaluate(dataset['implicit']['test_data'], None, 'implicit', 'graph-based', dataset['implicit']['RS_strategy']['graph-based'], 'LightGCN', {'model': model}, key_dataset, k=5)

                    storing_key = f'{embedding_model}_{method}'
                    if storing_key not in metrics:
                        metrics[storing_key] = {"Recall": [], "Precision": [], "F1": [], "NDCG": [], "MRR": [], "MAP": []}
                    metrics[storing_key]['Recall'].append(eval_metrics['mrecall'])
                    metrics[storing_key]['Precision'].append(eval_metrics['mprecision'])
                    metrics[storing_key]['F1'].append(eval_metrics['mf1'])
                    metrics[storing_key]['NDCG'].append(eval_metrics['mndcg'])
                    metrics[storing_key]['MRR'].append(eval_metrics['mrr'])
                    metrics[storing_key]['MAP'].append(eval_metrics['map'])

                    # Release the session and start the next configuration from an empty graph.
                    model.sess.close()
                    tf.compat.v1.reset_default_graph()
                

In [None]:
# metrics

In [None]:
# NDCG@5 of each initialisation method, one value per number of layers
xavier_ada_ndcg = metrics['ada_Xavier']['NDCG']
pretrained_ada_ndcg = metrics['ada_precomputed']['NDCG']
layers = np.arange(1, len(xavier_ada_ndcg) + 1)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per initialisation method (dashed: Xavier, solid: pretrained)
ax.plot(layers, xavier_ada_ndcg, marker='x', linestyle='--', label='Xavier ADA', color='black')
ax.plot(layers, pretrained_ada_ndcg, marker='x', linestyle='-', label='Pretrained ADA', color='black')

# Highlight the best NDCG value of each series with a filled circle
best_xavier_index = np.argmax(xavier_ada_ndcg)
best_pretrained_index = np.argmax(pretrained_ada_ndcg)
for best_index, ndcg_series in (
    (best_xavier_index, xavier_ada_ndcg),
    (best_pretrained_index, pretrained_ada_ndcg),
):
    ax.scatter(best_index + 1, ndcg_series[best_index], color='black',
               s=100, edgecolor='black', zorder=5, marker='o')

# Set x-axis ticks to only display integers
ax.set_xticks(layers)
ax.set_xlabel('Number of Layers')
ax.set_ylabel('NDCG@5')
ax.set_title('LightGCN Performance by Number of Layers')
ax.legend()
ax.grid(False)

# Save the figure to a PDF file
fig.savefig('ndcg_ada.pdf', format='pdf', bbox_inches='tight')

# Show the plot
plt.show()