In [None]:
!poetry run pip install lightfm=="1.17"
!poetry run pip install scikit-surprise=="1.1.3"
# !poetry add recommenders@"1.1.1"
!poetry run pip install recommenders=="1.1.1"

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit
from tqdm.auto import tqdm
import random
from sklearn.metrics.pairwise import cosine_similarity
import ast

my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

from huggingface_hub import snapshot_download

import tensorflow as tf

import copy
import itertools

import time
import sys

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: every switch a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# tqdm logging options (presumably consumed by later cells — TODO confirm).
save_tqdm_to_file = False
tqdm_file = "tqdm_file.txt"

# Output paths for the tuned hyper-parameters and the evaluation metrics.
# Disabled alternative kept for reference: assign None to both names.
# best_hparams_file = None
# evaluation_results_file = None
best_hparams_file = '../results/best_hparams_zeegu.csv'
evaluation_results_file = '../results/evaluation_results_zeegu.csv'

# Dataset switches: each True flag makes the loading cell call the matching load_* function.
lingorank = True  # Zeegu dataset
ml_100k = False
goodreads = False
tomplay = False

# Content-based recommender switch (also forces both embedding models to be loaded by the load_* functions).
content_based = False

# Collaborative-filtering switches, each followed by its candidate hyper-parameter values
# (presumably explored as a grid by a later tuning cell — TODO confirm).
als = False
als_params = {
    'factors': [20,50,100],
    'regularization': [0.001, 0.01, 0.05, 0.1],
    'alpha': [1,5,10,40,50]
}

bpr = False
bpr_params = {
    'factors': [20,50,100],
    'regularization': [0.001, 0.01, 0.05, 0.1],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'verify_negative_samples': [True, False]
}

lmf = False
lmf_params = {
    'factors': [20,50,100],
    'regularization': [0.05, 0.1, 0.5, 0.6, 1],
    'learning_rate': [0.05, 0.1, 0.5, 1],
    'neg_prop': [10, 30, 50]
}

# True as soon as one CF model is enabled.
# NOTE(review): the CF matrix cell later rebinds the name `cf` to a dict — see that cell.
cf = als or bpr or lmf # If at least one, we do the CF

# LightGCN variants; `ada` / `bert` also select which item embeddings the load_* functions read.
ada = False
bert = False
xavier_ada = False
xavier_bert = False
lightgcn = ada or bert or xavier_ada or xavier_bert # If at least one, we do the LightGCN
lightgcn_params = {
    'n_layers': [1,2,3,4,5,6,7,8,9,10],
    'n': [1,2,3,5,20,30,50],
    'lr': [0.0005,0.001,0.01,0.05,0.1]
}   

In [None]:
# Download the dataset files used by the load_* functions.
# Only paths matching "recommendation/*" are fetched from the "main" revision of the
# Hugging Face dataset repo, and they are written under ../results (the load_* functions
# read from ../results/recommendation/...).
snapshot_download(
    repo_id="OloriBern/FLDE",
    allow_patterns=["recommendation/*"],
    local_dir=os.path.join("..", "results"),
    revision="main",
    repo_type="dataset"
)

In [None]:
def split_user_data(group, test_size):
    """Split one user's interactions into (train, test, leftover) frames.

    Args:
        group: DataFrame holding the rows of a single user, already ordered so that
            the rows to hold out come last.
        test_size: Fraction of the rows to hold out; at least one row is always held out.

    Returns:
        A 3-tuple ``(train, test, other)``. A user with exactly one row cannot be
        split, so that row is returned as ``other`` and train/test are empty frames.
        Otherwise ``other`` is an empty frame.
    """
    row_count = len(group)
    if row_count == 1:
        return pd.DataFrame(), pd.DataFrame(), group

    held_out = max(1, int(round(row_count * test_size)))
    train_part = group.iloc[:-held_out]
    test_part = group.iloc[-held_out:]
    return train_part, test_part, pd.DataFrame()

In [None]:
def train_test_split(df, n=None, test_size=0.2, one_core=False):
    """Chronological per-user split into train, test and leftover interactions.

    Each user's rows are ordered by timestamp and handed to ``split_user_data``,
    which holds out the most recent ``test_size`` fraction. Users with a single
    interaction end up in the leftover frame. Summary statistics are printed.

    Args:
        df: Interactions with at least ``user_id``, ``item_id`` and ``timestamp``
            columns. NOTE: the frame is sorted IN PLACE by (user_id, timestamp).
        n: Unused; kept so existing callers keep working.
        test_size: Fraction of each user's interactions to hold out.
        one_core: Unused; kept so existing callers keep working.

    Returns:
        ``(train_data, test_data, other_data)`` DataFrames.
    """
    # In-place sort is deliberate: callers keep using `df` afterwards and observe this order.
    df.sort_values(by=['user_id', 'timestamp'], ascending=[True, True], inplace=True)

    tqdm.pandas(desc="Splitting data")
    splits = df.groupby('user_id').progress_apply(lambda x: split_user_data(x, test_size))

    # Concatenate the per-user pieces
    train_data = pd.concat([split[0] for split in splits])
    test_data = pd.concat([split[1] for split in splits])
    other_data = pd.concat([split[2] for split in splits])

    def _distinct(frame, column):
        # A frame built only from empty placeholders has no columns at all.
        return set() if frame.empty else set(frame[column])

    # Items / users that appear in the test split but were never seen in training
    unique_train_articles = _distinct(train_data, 'item_id')
    num_unique_articles_in_test_not_train = len(_distinct(test_data, 'item_id') - unique_train_articles)
    num_unique_users_in_test_not_train = len(_distinct(test_data, 'user_id') - _distinct(train_data, 'user_id'))

    num_unique_users_in_other_data = len(other_data['user_id'].unique()) if not other_data.empty else 0
    num_unique_articles_in_other_data_not_train = len(other_data[~ other_data['item_id'].isin(unique_train_articles)]['item_id'].unique()) if not other_data.empty else 0

    print(f"Number of articles in test_data but not in train_data: {num_unique_articles_in_test_not_train}")
    print(f"Number of user_ids in test_data but not in train_data: {num_unique_users_in_test_not_train}")
    print(f"Number of user_ids in other_data: {num_unique_users_in_other_data}")
    print(f"Number of articles in other_data but not in train_data: {num_unique_articles_in_other_data_not_train}")

    # Relative sizes, expressed against train + test
    size_test_data = len(test_data)
    size_data = len(train_data) + len(test_data)

    def _percentage(count):
        # Avoid ZeroDivisionError when no user could be split at all.
        return (count / size_data) * 100 if size_data else float('nan')

    print(f"Size of test data: {_percentage(size_test_data):.2f}%")
    print(f"Size of other data: {_percentage(len(other_data)):.2f}%")
    print(f"Size of all test data: {_percentage(size_test_data + len(other_data)):.2f}%")
    return train_data, test_data, other_data

In [None]:
def map_ids(data, train_data, test_data):
    """Map raw user/item ids to dense consecutive integer ids.

    Ids seen in ``train_data`` are numbered first (order of first appearance,
    starting at 0). Ids that only occur in ``test_data`` are appended after them,
    also in order of first appearance.

    Args:
        data: Full interaction frame to annotate, or ``None`` to build it by
            concatenating the annotated train and test frames.
        train_data: Training interactions; gains ``mapped_user_id`` / ``mapped_item_id`` in place.
        test_data: Test interactions; gains the same two columns in place.

    Returns:
        ``(data, train_data, test_data, user_id_mapping, item_id_mapping)``. All
        three frames have their index reset. In ``data`` the mapped columns use the
        nullable ``Int64`` dtype because some ids may be absent from both splits.
    """
    def _extend_and_map(raw_ids, mapping):
        # The mapping is always dense (0..len-1), so the next free index is len(mapping).
        # This avoids rescanning every value per new id and also works for an empty mapping.
        for raw_id in raw_ids.unique():
            if raw_id not in mapping:
                mapping[raw_id] = len(mapping)
        return raw_ids.map(mapping)

    # Dense ids for everything seen during training
    user_id_mapping = {user_id: idx for idx, user_id in enumerate(train_data['user_id'].unique())}
    item_id_mapping = {item_id: idx for idx, item_id in enumerate(train_data['item_id'].unique())}

    train_data['mapped_user_id'] = train_data['user_id'].map(user_id_mapping)
    train_data['mapped_item_id'] = train_data['item_id'].map(item_id_mapping)

    # Test-only ids extend the mappings
    test_data['mapped_user_id'] = _extend_and_map(test_data['user_id'], user_id_mapping)
    test_data['mapped_item_id'] = _extend_and_map(test_data['item_id'], item_id_mapping)

    if data is not None:
        data['mapped_user_id'] = data['user_id'].map(user_id_mapping).astype('Int64')
        data['mapped_item_id'] = data['item_id'].map(item_id_mapping).astype('Int64')
    else:
        data = pd.concat([train_data, test_data])

    data.reset_index(drop=True, inplace=True)
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    return data, train_data, test_data, user_id_mapping, item_id_mapping

In [None]:
def load_ml_100k():
    """Load MovieLens-100k as implicit feedback and split it per user.

    Ratings below 4 are dropped and the remaining ones are shifted down by 3.
    The data is split with ``train_test_split`` and re-indexed with ``map_ids``.
    Item embeddings are loaded only when the module-level ``ada``, ``bert`` or
    ``content_based`` flags ask for them.

    Returns:
        dict with keys ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no embedding
        model is enabled, else a dict of DataFrames keyed by model name).
    """
    data = pd.read_csv("../results/recommendation/ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])

    # .copy() detaches the filtered rows, so the column assignment below (and the
    # in-place sort inside train_test_split) act on an independent frame, not a slice.
    data = data[data['rating'] >= 4].copy()

    # Subtract 3 from all the ratings
    data['rating'] = data['rating'] - 3

    train_data, test_data, other_data = train_test_split(data)
    print(f"ml-100k - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")

    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Dataset statistics
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    print(f"Number of interactions: {len(data)}")
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}

        embeddings_file = "../results/recommendation/embeddings_ml-100k.csv.gz"

        # The content-based recommender needs both embedding models.
        embedding_models = {"ada": ada, "bert": bert}
        if content_based:
            embedding_models = {"ada": True, "bert": True}

        df = pd.read_csv(embeddings_file)
        # Rename movie id to item_id
        df = df.rename(columns={'movie id': 'item_id'})
        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue
            embedding_key = embedding_model + "_embedding"
            # Every item used in train or test must have an embedding.
            missing_ids = df[df[embedding_key].isnull()]['item_id']
            assert train_data[train_data['item_id'].isin(missing_ids)].empty
            assert test_data[test_data['item_id'].isin(missing_ids)].empty
            df2 = df[~ df[embedding_key].isnull()]
            df2 = df2.rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################

    return {
        'data': data,
        'train_data': train_data,
        'test_data': test_data,
        'user_id_mapping': user_id_mapping,
        'item_id_mapping': item_id_mapping,
        'items_embeddings': items_embeddings
    }

In [None]:
def load_LingoRank(strategy: int = 2):
    """Load the Zeegu (LingoRank) interactions for one rating strategy and split them.

    Articles whose best rating is <= 0 are discarded, together with one article
    whose content is known to be wrong. Only rows with a non-zero rating are kept
    for the split. Item embeddings are loaded only when the module-level ``ada``,
    ``bert`` or ``content_based`` flags ask for them.

    Args:
        strategy: Suffix selecting ``strategy{strategy}.csv`` and the matching embeddings file.

    Returns:
        dict with keys ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no embedding
        model is enabled, else a dict of DataFrames keyed by model name).
    """
    # Cannot be used for content-based because its content is wrong, so it is dropped everywhere.
    excluded_article_id = 2223234

    data_full = pd.read_csv(f"../results/recommendation/Zeegu/strategy{strategy}.csv")
    data_full = data_full.rename(columns={"article_id": "item_id"})

    # Articles that never received a positive rating
    best_rating_per_article = data_full.groupby('item_id')['rating'].max()
    articles_to_remove = best_rating_per_article[best_rating_per_article <= 0].index.tolist()
    articles_to_remove.append(excluded_article_id)

    # Remove these articles from data_full
    data_full = data_full[~data_full['item_id'].isin(articles_to_remove)]

    data = data_full[(data_full['rating'] != 0)].copy()

    train_data, test_data, other_data = train_test_split(data)
    print(f"Strategy {strategy} - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")

    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Dataset statistics
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    print(f"Number of interactions: {len(data)}")
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}
        embeddings_file = f"../results/recommendation/embeddings_strategy{strategy}.csv.gz"

        # The content-based recommender needs both embedding models.
        embedding_models = {"ada": ada, "bert": bert}
        if content_based:
            embedding_models = {"ada": True, "bert": True}

        df = pd.read_csv(embeddings_file)
        df = df.rename(columns={'id': 'item_id'})
        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue

            embedding_key = embedding_model + "_embedding"
            # No remaining interaction may reference an article without an embedding
            # (such articles are expected to have been removed above).
            missing_ids = df[df[embedding_key].isnull()]['item_id']
            assert data[data['item_id'].isin(missing_ids)].empty
            assert data_full[data_full['item_id'].isin(missing_ids)].empty

            df2 = df[~ df[embedding_key].isnull()]
            df2 = df2.rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################

    return {
            'data': data,
            'train_data': train_data,
            'test_data': test_data,
            'user_id_mapping': user_id_mapping,
            'item_id_mapping': item_id_mapping,
            'items_embeddings': items_embeddings
        }

In [None]:
def load_goodreads(subset: str = "children"):
    """Load a Goodreads review subset as implicit feedback and split it per user.

    Ratings below 4 are dropped and the remaining ones are shifted down by 3;
    ``date_added`` becomes a datetime ``timestamp``. Item embeddings are loaded
    only when the module-level ``ada``, ``bert`` or ``content_based`` flags ask for them.

    Args:
        subset: Name fragment selecting ``goodreads_reviews_{subset}.json.gz`` and
            the matching embeddings file.

    Returns:
        dict with keys ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no embedding
        model is enabled, else a dict of DataFrames keyed by model name).
    """
    file_path = f'../results/recommendation/Goodreads/goodreads_reviews_{subset}.json.gz'

    # Load the data
    data = pd.read_json(file_path, lines=True, compression='gzip')

    print("JSON LOADED")

    # Select and rename columns (non in-place, so no write ever targets a slice)
    data = data[['user_id', 'book_id', 'rating', 'date_added']].rename(
        columns={'book_id': 'item_id', 'date_added': 'timestamp'}
    )

    print("COLUMNS SELECTED AND RENAMED")

    # .copy() detaches the filtered rows before the column assignments below.
    data = data[data['rating'] >= 4].copy()

    # Subtract 3 from each rating
    data['rating'] = data['rating'] - 3

    print("RATINGS FILTERED")

    # Convert 'timestamp' column to datetime
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    print("TIMESTAMP CONVERTED")

    unique_user_ids = data['user_id'].unique()
    print(f"Number of users: {len(unique_user_ids)}")
    unique_item_ids = data['item_id'].unique()
    print(f"Number of items: {len(unique_item_ids)}")

    train_data, test_data, other_data = train_test_split(data, test_size=0.2, one_core=True)
    print(f"Goodreads {subset} - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")
    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Dataset statistics
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    print(f"Number of interactions: {len(data)}")
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}

        embeddings_file = f"../results/recommendation/embeddings_goodreads_{subset}.csv.gz"

        # The content-based recommender needs both embedding models.
        embedding_models = {"ada": ada, "bert": bert}
        if content_based:
            embedding_models = {"ada": True, "bert": True}

        df = pd.read_csv(embeddings_file)
        df = df.rename(columns={'book_id': 'item_id'})

        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue
            embedding_key = embedding_model + "_embedding"
            # No interaction may reference an item without an embedding.
            assert data[data['item_id'].isin(df[df[embedding_key].isnull()]['item_id'])].empty
            df2 = df[~ df[embedding_key].isnull()]
            df2 = df2.rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################

    return {
        'data': data,
        'train_data': train_data,
        'test_data': test_data,
        'user_id_mapping': user_id_mapping,
        'item_id_mapping': item_id_mapping,
        'items_embeddings': items_embeddings
    }

In [None]:
def sparsify_data(data, n_items_per_user):
    """Keep only each user's most recent interactions and report the resulting sparsity.

    Args:
        data: Interactions with ``user_id``, ``item_id`` and ``timestamp`` columns.
        n_items_per_user: Maximum number of rows kept per user; values below 1 are treated as 1.

    Returns:
        ``(sparsified_data, sparsity)`` where ``sparsity`` is
        ``1 - interactions / (unique users * unique items)`` of the reduced frame.
    """
    keep_per_user = max(1, n_items_per_user)

    # Newest interaction first inside every user, then take the head of each group
    newest_first = data.sort_values(by=['user_id', 'timestamp'], ascending=[True, False])
    sparsified_data = newest_first.groupby('user_id').head(keep_per_user)

    user_count = sparsified_data['user_id'].nunique()
    item_count = sparsified_data['item_id'].nunique()
    interaction_count = len(sparsified_data)
    sparsity = 1 - interaction_count / (user_count * item_count)

    for label, value in (
        ("Number of unique users", user_count),
        ("Number of unique items", item_count),
        ("Number of interactions", interaction_count),
    ):
        print(f"{label}: {value}")
    print(f"Sparsity: {sparsity:.6%}")

    return sparsified_data, sparsity

In [None]:
def load_tomplay():
    """Load the Tomplay interactions as implicit feedback and split them per user.

    NOTE(review): a later cell of this notebook defines ``load_tomplay`` again;
    once that cell has run, the later definition shadows this one.

    Every interaction gets rating 1, duplicate (user, item) pairs keep only their
    most recent row, and each user is capped to 30 interactions by ``sparsify_data``.
    Item embeddings are loaded only when the module-level ``ada``, ``bert`` or
    ``content_based`` flags ask for them.

    Returns:
        dict with keys ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no embedding
        model is enabled, else a dict of DataFrames keyed by model name).
    """
    file_path = f'../results/recommendation/Tomplay/interactions.csv'
    data = pd.read_csv(file_path, usecols=["USER_ID", "ITEM_ID", "TIMESTAMP"])
    # Rename the columns to lowercase
    data.rename(columns=lambda x: x.lower(), inplace=True)

    # Implicit feedback: every logged interaction counts as a positive
    data['rating'] = 1

    # Sort the DataFrame by timestamp in descending order
    data = data.sort_values(by='timestamp', ascending=False)

    # Remove duplicates, keeping the first occurrence (the most recent one, given the sort above)
    data = data.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

    # Keep at most 30 interactions per user; the returned sparsity value is not needed here
    data, _ = sparsify_data(data, 30)

    unique_user_ids = data['user_id'].unique()
    print(f"Number of users: {len(unique_user_ids)}")
    unique_item_ids = data['item_id'].unique()
    print(f"Number of items: {len(unique_item_ids)}")

    train_data, test_data, other_data = train_test_split(data)
    print(f"Tomplay - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")

    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Compute the number of unique users and items and print
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    # Print the number of interactions
    print(f"Number of interactions: {len(data)}")
    # Compute sparsity
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}
        
        embeddings_file = f"../results/recommendation/embeddings_tomplay.csv.gz"

        # The content-based setting switches both embedding models on
        embedding_models = {"ada": ada, "bert": bert}
        if content_based:
            embedding_models = {"ada": True, "bert": True}

        df = pd.read_csv(embeddings_file)
        df = df.rename(columns={'ITEM_ID': 'item_id'})
        
        # Iterate over the dictionary
        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue
            embedding_key = embedding_model + "_embedding"
            # No interaction may reference an item whose embedding is missing
            assert data[data['item_id'].isin(df[df[embedding_key].isnull()]['item_id'])].empty # Already removed the article
            df2 = df[~ df[embedding_key].isnull()]
            df2 = df2.rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################    


    return {
            'data': data,
            'train_data': train_data,
            'test_data': test_data,
            'user_id_mapping': user_id_mapping,
            'item_id_mapping': item_id_mapping,
            'items_embeddings': items_embeddings
        }

In [None]:
def ndcg_at_k(recommendations, real_items, k):
    """
    Compute NDCG at rank k with binary relevance.

    The ideal ranking places ``min(k, len(real_items))`` relevant items first.
    Returns 0 when the ideal DCG is 0 (no relevant items).
    """
    def _dcg(relevances):
        return sum((2 ** rel - 1) / np.log2(position + 2) for position, rel in enumerate(relevances))

    hits = [1 if item_id in real_items else 0 for item_id in recommendations]
    ideal_hits = min(k, len(real_items))
    ideal_ranking = [1] * ideal_hits + [0] * (k - ideal_hits)

    achieved = _dcg(hits[:k])
    best_possible = _dcg(ideal_ranking[:k])
    if best_possible > 0:
        return achieved / best_possible
    return 0

def mrr_at_k(recommendations, real_items, k):
    """
    Compute the reciprocal rank at cut-off k.

    Returns 1 / rank of the first relevant item among the top k recommendations,
    or 0 when none of them is relevant.
    """
    hits = [1 if item_id in real_items else 0 for item_id in recommendations]
    return next(
        (1 / (position + 1) for position, rel in enumerate(hits[:k]) if rel > 0),
        0,
    )

def precision_at_k(recommendations, real_items, k):
    """
    Compute Precision at rank k.

    The denominator is ``min(k, len(recommendations))`` so that a recommender
    returning fewer than k items is not penalised for the missing slots.
    Returns 0 when that denominator is not positive (no recommendations or k <= 0)
    instead of raising ZeroDivisionError.
    """
    considered = min(k, len(recommendations))
    if considered <= 0:
        return 0
    hits = sum(1 for item_id in recommendations[:k] if item_id in real_items)
    return hits / considered

def recall_at_k(recommendations, real_items, k):
    """
    Compute Recall at rank k.

    Returns the share of ``real_items`` found among the top k recommendations,
    or 0 when ``real_items`` is empty (instead of raising ZeroDivisionError).
    """
    relevant_total = len(real_items)
    if relevant_total == 0:
        return 0
    hits = sum(1 for item_id in recommendations[:k] if item_id in real_items)
    return hits / relevant_total

def f1_at_k(precision_at_k, recall_at_k):
    """
    Compute F1 at rank k as the harmonic mean of the given precision and recall.

    Returns 0 when both inputs are 0.
    """
    if precision_at_k != 0 or recall_at_k != 0:
        numerator = 2 * (precision_at_k * recall_at_k)
        return numerator / (precision_at_k + recall_at_k)
    return 0

def average_precision_at_k(recommendations, test_items, k):
    """
    Compute Average Precision at rank k.

    Precision is accumulated at every rank (within the top k) where a relevant
    item is retrieved, then normalised by ``min(k, len(test_items))``.
    Returns 0 when that normaliser is not positive (no test items or k <= 0)
    instead of raising ZeroDivisionError.
    """
    normaliser = min(k, len(test_items))
    if normaliser <= 0:
        return 0

    relevant_found = 0
    sum_precisions = 0
    for rank, item_id in enumerate(recommendations[:k], start=1):
        if item_id in test_items:
            relevant_found += 1
            # Precision at the rank where this relevant item was retrieved
            sum_precisions += relevant_found / rank

    return sum_precisions / normaliser

In [None]:
def evaluate(test_data, data_full, data_strategy, RS_strategy_name, RS_strategy, model_name, model, dataset, k=5, progress=True):
    """Compute top-k ranking metrics for every user present in ``test_data``.

    Args:
        test_data: Frame with ``mapped_user_id`` and ``mapped_item_id`` columns (ground truth).
        data_full: Not used by this function; kept for interface compatibility.
        data_strategy: The string ``"implicit"`` for the CF branch; for the
            content-based branch a dict exposing a ``'train_data'`` frame.
        RS_strategy_name: ``"CF"``, ``"content-based"`` or ``"graph-based"``.
        RS_strategy: For CF, a dict holding the ``'user_item_train_data'`` sparse matrix.
        model_name: Not used by this function; kept for interface compatibility.
        model: Dict holding ``'model'`` (CF / graph-based) or ``'sim_matrix'`` (content-based).
        dataset: Not used by this function; kept for interface compatibility.
        k: Cut-off rank.
        progress: Show a tqdm progress bar over users.

    Returns:
        dict with the mean metrics (``mndcg``, ``mrr``, ``map``, ``mrecall``,
        ``mprecision``, ``mf1``), the per-user metric lists, and the per-user
        recommendations / ground-truth lists (``all_recommendations``,
        ``all_really_in_test``), all aligned on the order of users in ``test_data``.

    Raises:
        ValueError: if the strategy combination matches none of the supported branches.
    """
    ndcgs = []
    rrs = []
    aps = []
    precisions = []
    recalls = []
    f1s = []
    all_recommendations = []
    all_really_in_test = []

    user_ids = test_data['mapped_user_id'].unique()

    # Ground truth per user, built once instead of re-filtering test_data for every user.
    test_items_by_user = (
        test_data.groupby('mapped_user_id', sort=False)['mapped_item_id'].agg(list).to_dict()
    )

    user_ids_for_loop = tqdm(user_ids, desc="Users") if progress else user_ids

    for user in user_ids_for_loop:
        user_test_items = test_items_by_user.get(user, [])

        if RS_strategy_name == "CF" and data_strategy == "implicit":
            top_k_recommendations = model['model'].recommend(user, RS_strategy['user_item_train_data'][user], k)[0]

        elif RS_strategy_name == "content-based":
            # Items the user already interacted with in training must not be recommended again.
            train_frame = data_strategy['train_data']
            train_data_items = train_frame[train_frame['mapped_user_id'] == user]['mapped_item_id'].tolist()
            filtered_sim_scores = model['sim_matrix'].loc[user].drop(train_data_items)
            # Most similar remaining items first
            top_k_recommendations = np.array(filtered_sim_scores.sort_values(ascending=False).index.tolist()[:k])

        elif RS_strategy_name == "graph-based":
            top_k_recommendations = model['model'].recommend_k_items(pd.DataFrame({'userID': [user]}), top_k=k, remove_seen=True)
            top_k_recommendations = top_k_recommendations['itemID'].tolist()

        else:
            # Fail loudly rather than hitting a NameError or reusing the previous user's recommendations.
            raise ValueError(
                f"Unsupported strategy: RS_strategy_name={RS_strategy_name!r}, data_strategy={data_strategy!r}"
            )

        precision = precision_at_k(top_k_recommendations, user_test_items, k)
        recall = recall_at_k(top_k_recommendations, user_test_items, k)

        ndcgs.append(ndcg_at_k(top_k_recommendations, user_test_items, k))
        rrs.append(mrr_at_k(top_k_recommendations, user_test_items, k))
        aps.append(average_precision_at_k(top_k_recommendations, user_test_items, k))
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1_at_k(precision, recall))

        # These lists are part of the returned dict, so they are filled here.
        all_recommendations.append(list(top_k_recommendations))
        all_really_in_test.append(user_test_items)

    return {
        'mndcg': np.mean(ndcgs),
        'mrr': np.mean(rrs),
        'map': np.mean(aps),
        'mrecall': np.mean(recalls),
        'mprecision': np.mean(precisions),
        'mf1': np.mean(f1s),
        'ndcgs': ndcgs,
        'rrs': rrs,
        'aps': aps,
        'recalls': recalls,
        'precisions': precisions,
        'f1s': f1s,
        'all_recommendations': all_recommendations,
        'all_really_in_test': all_really_in_test
    }


In [None]:
def load_tomplay():
    """Load the Tomplay interactions as implicit feedback and split them per user.

    Every interaction gets rating 1, duplicate (user, item) pairs keep only their
    most recent row, and each user is capped to 30 interactions by ``sparsify_data``.
    Item embeddings are loaded only when the module-level ``ada``, ``bert`` or
    ``content_based`` flags ask for them.

    Returns:
        dict with keys ``data``, ``train_data``, ``test_data``, ``user_id_mapping``,
        ``item_id_mapping`` and ``items_embeddings`` (``None`` when no embedding
        model is enabled, else a dict of DataFrames keyed by model name).
    """
    file_path = '../results/recommendation/Tomplay/interactions.csv'
    data = pd.read_csv(file_path, usecols=["USER_ID", "ITEM_ID", "TIMESTAMP"])
    # Rename the columns to lowercase
    data.rename(columns=lambda x: x.lower(), inplace=True)

    # Implicit feedback: every logged interaction counts as a positive
    data['rating'] = 1

    # Most recent first, so drop_duplicates keeps the latest row of each (user, item) pair
    data = data.sort_values(by='timestamp', ascending=False)
    data = data.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

    data, _ = sparsify_data(data, 30)

    unique_user_ids = data['user_id'].unique()
    print(f"Number of users: {len(unique_user_ids)}")
    unique_item_ids = data['item_id'].unique()
    print(f"Number of items: {len(unique_item_ids)}")

    # train_test_split returns exactly three frames (train, test, leftover).
    train_data, test_data, other_data = train_test_split(data)
    print(f"Tomplay - Proportion of positive ratings affected to test set: {round(len(test_data)/(len(test_data)+len(train_data))*100,2)} %")

    data, train_data, test_data, user_id_mapping, item_id_mapping = map_ids(data, train_data, test_data)

    # Dataset statistics
    num_unique_users = data['mapped_user_id'].nunique()
    num_unique_items = data['mapped_item_id'].nunique()
    print(f"Number of unique users: {num_unique_users}")
    print(f"Number of unique items: {num_unique_items}")
    print(f"Number of interactions: {len(data)}")
    sparsity = 1 - len(data) / (num_unique_users * num_unique_items)
    print(f"Sparsity: {sparsity:.2%}")

    ##### Load embeddings #####

    items_embeddings = None
    if ada or bert or content_based:
        items_embeddings = {}

        embeddings_file = "../results/recommendation/embeddings_tomplay.csv.gz"

        # The content-based recommender needs both embedding models.
        embedding_models = {"ada": ada, "bert": bert}
        if content_based:
            embedding_models = {"ada": True, "bert": True}

        df = pd.read_csv(embeddings_file)
        df = df.rename(columns={'ITEM_ID': 'item_id'})

        for embedding_model, is_enabled in embedding_models.items():
            if not is_enabled:
                continue
            embedding_key = embedding_model + "_embedding"
            # No interaction may reference an item without an embedding.
            assert data[data['item_id'].isin(df[df[embedding_key].isnull()]['item_id'])].empty
            df2 = df[~ df[embedding_key].isnull()]
            df2 = df2.rename(columns={embedding_key: 'embedding'})
            df2['embedding'] = df2['embedding'].apply(ast.literal_eval) #Convert list stored as a str to real list type
            df2['mapped_item_id'] = df2['item_id'].map(item_id_mapping).astype('Int64')
            df2 = df2.dropna(subset=['mapped_item_id']) # If NA, item is not in train set nor test set
            items_embeddings[embedding_model] = df2[['item_id', 'mapped_item_id', 'embedding']]

    ###########################

    return {
            'data': data,
            'train_data': train_data,
            'test_data': test_data,
            'user_id_mapping': user_id_mapping,
            'item_id_mapping': item_id_mapping,
            'items_embeddings': items_embeddings
        }

In [None]:
def _assert_dense_ids(frame, column, message):
    """Assert that the non-null values of ``column`` are exactly 0..max with no gap."""
    observed = set(frame.dropna(subset=[column])[column].astype(int))
    expected = set(range(int(frame[column].max()) + 1))
    assert observed == expected, message


datasets = {}

# Order: 1. Zeegu, 2. MovieLens-100k, 3. Goodreads, 4. Tomplay
for _enabled, _dataset_name, _loader in (
    (lingorank, 'LingoRank', load_LingoRank),
    (ml_100k, 'ml-100k', load_ml_100k),
    (goodreads, 'Goodreads', load_goodreads),
    (tomplay, 'Tomplay', load_tomplay),
):
    if not _enabled:
        continue
    datasets[_dataset_name] = {}
    datasets[_dataset_name]['implicit'] = _loader()
    _loaded_frame = datasets[_dataset_name]['implicit']['data']
    _assert_dense_ids(_loaded_frame, 'mapped_user_id', "User IDs are not continuous and ordered.")
    _assert_dense_ids(_loaded_frame, 'mapped_item_id', "Item IDs are not continuous and ordered.")

In [None]:
# Display the structure of our dict
def display_keys(d, indent=0):
    """Print the nested key structure of ``d``, one key per line.

    Every nesting level is indented by two extra spaces. Values that are dicts
    are walked recursively, except those stored under the keys
    ``"user_id_mapping"`` and ``"item_id_mapping"``: those keys are printed but
    their contents are not expanded.

    Args:
        d (dict): Mapping whose keys are printed.
        indent (int): Nesting depth of ``d`` (number of two-space indents).
    """
    not_expanded = ("user_id_mapping", "item_id_mapping")
    pad = '  ' * indent
    for name, child in d.items():
        print(pad + str(name))
        if not isinstance(child, dict):
            continue
        if name in not_expanded:
            continue
        display_keys(child, indent + 1)

# Print the key hierarchy of everything loaded so far
display_keys(datasets)

In [None]:
# Give every dataset an empty 'RS_strategy' registry under its implicit view;
# the cells below add one entry per recommender family they train.
for key_dataset, dataset in tqdm(datasets.items()):
    implicit_view = dataset['implicit']
    implicit_view['RS_strategy'] = dict()

In [None]:
if cf:
    # Create user-item sparse matrix
    for key_dataset, dataset in tqdm(datasets.items()):
        # Deliberately NOT named `cf`: `cf` is the boolean switch from the
        # configuration cell and is tested again further down the notebook.
        # Rebinding it to this dict only worked because a non-empty dict is truthy.
        cf_store = {}
        dataset['implicit']['RS_strategy']['CF'] = cf_store
        train_data = dataset['implicit']['train_data']

        # Rows are mapped user IDs, columns are mapped item IDs, values are the
        # ratings as floats. The shape is inferred from the largest IDs in train_data.
        cf_store['user_item_train_data'] = sparse.csr_matrix(
            (train_data['rating'].astype(float), (train_data['mapped_user_id'], train_data['mapped_item_id']))
        )

In [None]:
# User embeddings
def compute_user_embeddings(train_data, embeddings_df, n):
    """Represent each user by the mean embedding of their ``n`` most recent items.

    Args:
        train_data (pd.DataFrame): Interactions with columns ``mapped_user_id``,
            ``mapped_item_id`` and ``timestamp``.
        embeddings_df (pd.DataFrame): Item table with columns ``mapped_item_id``
            and ``embedding`` (one vector per row).
        n (int): Maximum number of interactions kept per user, taken from the
            ones with the largest ``timestamp``.

    Returns:
        tuple: ``(user_embeddings, real_nb_neigh)``. ``user_embeddings`` is a
        DataFrame with one row per user and the columns ``mapped_user_id``,
        ``embedding`` (mean of the selected item vectors) and ``num_items``
        (number of rows of ``embeddings_df`` that were averaged).
        ``real_nb_neigh`` is the average number of interactions actually
        selected per user, or NaN when ``train_data`` contains no user.

    Raises:
        ValueError: Raised by ``np.stack`` when none of a user's selected items
            has a row in ``embeddings_df``.
    """
    # Within each user, most recent interaction first, so that head(n) keeps the latest n
    newest_first = train_data.sort_values(by=['mapped_user_id', 'timestamp'], ascending=[True, False])

    # Collect plain lists and build the DataFrame once at the end: appending with
    # `.loc[len(df)]` re-allocates the frame on every user (quadratic overall).
    user_ids, mean_vectors, items_averaged, items_selected = [], [], [], []
    for user_id, user_data in tqdm(newest_first.groupby('mapped_user_id'), desc="Compute user embeddings"):
        # Get the last n articles the user interacted with
        item_ids = user_data['mapped_item_id'].head(n).tolist()
        items_selected.append(len(item_ids))

        # Get the embeddings for these articles (items without an embedding row drop out here)
        item_embeddings = np.stack(embeddings_df[embeddings_df['mapped_item_id'].isin(item_ids)]['embedding'])

        user_ids.append(user_id)
        mean_vectors.append(np.mean(item_embeddings, axis=0))
        items_averaged.append(len(item_embeddings))

    user_embeddings = pd.DataFrame(
        {'mapped_user_id': user_ids, 'embedding': mean_vectors, 'num_items': items_averaged},
        columns=['mapped_user_id', 'embedding', 'num_items'],
    )

    # Guard the average against an empty training set instead of dividing by zero
    real_nb_neigh = sum(items_selected) / len(items_selected) if items_selected else float('nan')

    return user_embeddings, real_nb_neigh

def compute_similarity_matrix(items_embeddings, users_embeddings, user=None):
    """Cosine similarity between user vectors and item vectors, as a labelled table.

    Args:
        items_embeddings (pd.DataFrame): Columns ``mapped_item_id`` and ``embedding``.
        users_embeddings (pd.DataFrame): Columns ``mapped_user_id`` and ``embedding``.
        user: When given, only the rows of ``users_embeddings`` whose
            ``mapped_user_id`` equals this value are scored.

    Returns:
        pd.DataFrame: One row per (selected) user and one column per item;
        the index holds the user IDs and the columns hold the item IDs.
    """
    selected_users = users_embeddings
    if user is not None:
        # Keep only the requested user's row(s)
        selected_users = users_embeddings[users_embeddings['mapped_user_id'] == user]

    # Stack the per-row vectors into 2-D matrices (items first, then users)
    item_matrix = np.vstack(items_embeddings['embedding'].apply(np.array))
    user_matrix = np.vstack(selected_users['embedding'].apply(np.array))

    # Rows follow the user table, columns follow the item table
    return pd.DataFrame(
        cosine_similarity(user_matrix, item_matrix),
        index=selected_users['mapped_user_id'],
        columns=items_embeddings['mapped_item_id'],
    )



In [None]:
def new_init(self, hparams, data, initial_embeddings=None ,seed=None):
    
    """Initializing the model. Create parameters, placeholders, embeddings and loss function.

    Intended as a replacement for ``LightGCN.__init__`` (it is assigned to that
    attribute further down in this notebook). Compared with a plain constructor
    it additionally stores ``initial_embeddings`` on the instance *before*
    ``self._init_weights()`` is called, so the weight initialiser can use them.

    Args:
        hparams (HParams): A HParams object, hold the entire set of hyperparameters.
        data (object): A recommenders.models.deeprec.DataModel.ImplicitCF object, load and process data.
        initial_embeddings (dict or None): Stored as ``self.initial_embeddings`` and not
            otherwise used here. ``new_init_weights`` in this notebook reads its
            ``'user_embedding'`` and ``'item_embedding'`` entries; ``None`` selects
            random initialisation there.
        seed (int): Seed.

    Raises:
        ValueError: If ``hparams.metrics`` contains a name other than
            ``"map"``, ``"ndcg"``, ``"precision"`` or ``"recall"``.

    """

    # Seed both TensorFlow and NumPy's global RNG with the same value
    tf.compat.v1.set_random_seed(seed)
    np.random.seed(seed)

    # Copy the data handle and the hyperparameters onto the instance
    self.data = data
    self.epochs = hparams.epochs
    self.lr = hparams.learning_rate
    self.emb_dim = hparams.embed_size
    self.batch_size = hparams.batch_size
    self.n_layers = hparams.n_layers
    self.decay = hparams.decay
    self.eval_epoch = hparams.eval_epoch
    self.top_k = hparams.top_k
    self.save_model = hparams.save_model
    self.save_epoch = hparams.save_epoch
    self.metrics = hparams.metrics
    self.model_dir = hparams.MODEL_DIR
    # Must be set before self._init_weights() is called below
    self.initial_embeddings = initial_embeddings

    # Reject unknown metric names up front, before any graph is built
    metric_options = ["map", "ndcg", "precision", "recall"]
    for metric in self.metrics:
        if metric not in metric_options:
            raise ValueError(
                "Wrong metric(s), please select one of this list: {}".format(
                    metric_options
                )
            )

    # Adjacency matrix as returned by the data object
    self.norm_adj = data.get_norm_adj_mat()

    self.n_users = data.n_users
    self.n_items = data.n_items

    # 1-D int32 inputs of unspecified length: user ids, positive item ids, negative item ids
    self.users = tf.compat.v1.placeholder(tf.int32, shape=(None,))
    self.pos_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))
    self.neg_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))

    # Raw embedding tables, then the embeddings produced by _create_lightgcn_embed()
    self.weights = self._init_weights()
    self.ua_embeddings, self.ia_embeddings = self._create_lightgcn_embed()

    # Batch lookups in the embeddings returned by _create_lightgcn_embed() ...
    self.u_g_embeddings = tf.nn.embedding_lookup(
        params=self.ua_embeddings, ids=self.users
    )
    self.pos_i_g_embeddings = tf.nn.embedding_lookup(
        params=self.ia_embeddings, ids=self.pos_items
    )
    self.neg_i_g_embeddings = tf.nn.embedding_lookup(
        params=self.ia_embeddings, ids=self.neg_items
    )
    # ... and the same lookups in the raw tables held in self.weights
    self.u_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["user_embedding"], ids=self.users
    )
    self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["item_embedding"], ids=self.pos_items
    )
    self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(
        params=self.weights["item_embedding"], ids=self.neg_items
    )

    # Score matrix: looked-up user embeddings times the transposed looked-up item embeddings
    self.batch_ratings = tf.matmul(
        self.u_g_embeddings,
        self.pos_i_g_embeddings,
        transpose_a=False,
        transpose_b=True,
    )

    # Total loss is the sum of the two terms returned by _create_bpr_loss()
    self.mf_loss, self.emb_loss = self._create_bpr_loss(
        self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings
    )
    self.loss = self.mf_loss + self.emb_loss

    self.opt = tf.compat.v1.train.AdamOptimizer(learning_rate=self.lr).minimize(
        self.loss
    )
    # Keep only the most recent checkpoint
    self.saver = tf.compat.v1.train.Saver(max_to_keep=1)

    # Open a session with GPU allow_growth enabled and initialise all variables
    gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
    self.sess = tf.compat.v1.Session(
        config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
    )
    self.sess.run(tf.compat.v1.global_variables_initializer())

In [None]:
def new_init_weights(self):
    """Initialize user and item embeddings.

    When ``self.initial_embeddings`` is set, its ``'item_embedding'`` and
    ``'user_embedding'`` entries become the initial values of the trainable
    variables (cast to float32). Otherwise both tables are drawn from a
    variance-scaling initializer (``fan_avg``, uniform) with shapes
    ``[n_users, emb_dim]`` and ``[n_items, emb_dim]``.

    Returns:
        dict: With keys `user_embedding` and `item_embedding`, embeddings of all users and items.

    Raises:
        ValueError: If a pretrained matrix is not 2-D, does not have one row per
            user / item, or if the two matrices have different widths.
        KeyError: If ``self.initial_embeddings`` lacks one of the two entries.
    """
    all_weights = dict()

    if self.initial_embeddings is not None:
        # Validate before touching TensorFlow: a wrong row count or width would
        # otherwise only surface later as an opaque shape error inside the graph.
        expected_rows = {"item_embedding": self.n_items, "user_embedding": self.n_users}
        shapes = {
            name: tuple(np.shape(self.initial_embeddings[name])) for name in expected_rows
        }
        for name, shape in shapes.items():
            if len(shape) != 2 or shape[0] != expected_rows[name]:
                raise ValueError(
                    f"initial_embeddings['{name}'] has shape {shape}, "
                    f"expected ({expected_rows[name]}, <embedding dim>)"
                )
        if shapes["item_embedding"][1] != shapes["user_embedding"][1]:
            raise ValueError(
                "initial_embeddings['item_embedding'] and initial_embeddings['user_embedding'] "
                f"have different widths: {shapes['item_embedding'][1]} vs {shapes['user_embedding'][1]}"
            )

        all_weights["item_embedding"] = tf.Variable(self.initial_embeddings['item_embedding'], dtype=tf.float32, name="item_embedding")
        all_weights["user_embedding"] = tf.Variable(self.initial_embeddings['user_embedding'], dtype=tf.float32, name="user_embedding")
        print("Using pretrained embeddings.")
    else:
        # Only needed on this branch; constructing it here adds nothing to the graph
        initializer = tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )
        all_weights["user_embedding"] = tf.Variable(
            initializer([self.n_users, self.emb_dim]), name="user_embedding"
        )
        all_weights["item_embedding"] = tf.Variable(
            initializer([self.n_items, self.emb_dim]), name="item_embedding"
        )
        print("Using xavier initialization.")

    return all_weights

In [None]:
# Monkey-patch the imported LightGCN class so that every instance built below uses
# the notebook's constructor (which accepts `initial_embeddings`) and weight initialiser.
LightGCN.__init__ = new_init
LightGCN._init_weights = new_init_weights

In [None]:
def _fit_lightgcn_with_early_stopping(model, save_path, ndcg_to_beat, patience_max=10):
    """Train ``model`` one ``fit()`` call at a time until NDCG stops improving.

    After every ``model.fit()`` the model is evaluated with ``model.run_eval()``.
    Whenever the NDCG exceeds ``ndcg_to_beat`` the session is checkpointed to
    ``save_path`` and the bar is raised to that value. Training stops once
    ``patience_max`` consecutive evaluations failed to improve on this model's
    own best NDCG.

    Args:
        model: A LightGCN instance (already built, with an open session).
        save_path (str): Checkpoint prefix passed to ``model.saver.save``.
        ndcg_to_beat (float): Best NDCG reached by any earlier configuration.
        patience_max (int): Evaluations without improvement tolerated before stopping.

    Returns:
        float: ``ndcg_to_beat``, raised to the best NDCG this model reached if
        that was higher (in which case a checkpoint was written).
    """
    best_ndcg = 0
    best_epoch = None
    # Reset for every model. Without this, the first non-improving epoch either
    # raised NameError or continued from the previous configuration's count,
    # which could step past `patience_max` and never stop.
    patience = 0
    for epoch in itertools.count():
        model.fit()
        eval_start = time.time()
        # `map_score` rather than `map`, so the builtin is not shadowed notebook-wide
        recall, ndcg, precision, map_score = model.run_eval()
        eval_time = time.time() - eval_start
        print(f"Evaluation time: {eval_time:.1f}s")
        print(f"Epoch {epoch} - Recall@5: {recall:.4f} - NDCG@5: {ndcg:.4f} - Precision@5: {precision:.4f} - MAP@5: {map_score:.4f}")
        print("------------------------")
        if ndcg > ndcg_to_beat:
            ndcg_to_beat = ndcg
            model.saver.save(model.sess, save_path)
        if ndcg > best_ndcg:
            patience = 0
            best_ndcg = ndcg
            best_epoch = epoch
        else:
            patience += 1

        if patience == patience_max:
            print("="*25 + f"Best NDCG: {best_ndcg:.4f} at epoch {best_epoch}" + "="*25)
            break
    return ndcg_to_beat


if lightgcn:
    # Vector width of each text-embedding backbone: used as a fallback when no item
    # embeddings are loaded, and to validate them when they are.
    lightgcn_embed_sizes = {"ada": 1536, "bert": 768}

    # Initialisation methods requested for each backbone by the configuration flags.
    # Selected per backbone, so that e.g. `ada` + `xavier_bert` no longer also runs
    # Xavier for ada and precomputed for bert.
    lightgcn_methods = {
        "ada": [name for name, wanted in (("Xavier", xavier_ada), ("precomputed", ada)) if wanted],
        "bert": [name for name, wanted in (("Xavier", xavier_bert), ("precomputed", bert)) if wanted],
    }

    # Open the optional tqdm log once for the whole cell and always close it
    f = open(tqdm_file, "w") if save_tqdm_to_file else None
    try:
        for key_dataset, dataset in tqdm(datasets.items()):
            lightgcn_store = {'embedding_model': {}}
            dataset['implicit']['RS_strategy']['graph-based'] = {'models': {'LightGCN': lightgcn_store}}

            print("="*100)
            print(key_dataset)
            print("="*100)

            for embedding_model, methods in lightgcn_methods.items():
                if not methods:
                    continue
                lightgcn_store['embedding_model'][embedding_model] = {'method': {}}
                print("-"*100)
                print(embedding_model)
                print("-"*100)

                for method in methods:
                    print("*"*50)
                    print(method)
                    print("*"*50)

                    method_store = {}
                    lightgcn_store['embedding_model'][embedding_model]['method'][method] = method_store

                    # ImplicitCF is built with the column names 'userID' / 'itemID'
                    train_data = dataset['implicit']['train_data'].copy()
                    train_data = train_data.rename(columns = {'mapped_user_id':'userID', 'mapped_item_id':'itemID'})

                    test_data = dataset['implicit']['test_data'].copy()
                    test_data = test_data.rename(columns = {'mapped_user_id':'userID', 'mapped_item_id':'itemID'})

                    data = ImplicitCF(train=train_data, test=test_data, seed=my_seed, col_user='userID', col_item='itemID', col_rating='rating')
                    lightgcn_store['data'] = data

                    expected_embed_size = lightgcn_embed_sizes[embedding_model]
                    try:
                        embed_size = len(dataset['implicit']['items_embeddings'][embedding_model]['embedding'].iloc[0])
                    except Exception:
                        # No usable item embeddings for this backbone (e.g. a Xavier-only
                        # run): fall back to the backbone's nominal width.
                        embed_size = expected_embed_size
                    assert embed_size == expected_embed_size, "Embedding size is not correct"
                    print(f"Embedding size: {embed_size}")

                    if method=="precomputed":
                        item_embeddings = dataset['implicit']['items_embeddings'][embedding_model]

                        # Keep only items that occur in train or test, ordered by mapped item id
                        seen_items = set(train_data.itemID).union(set(test_data.itemID))
                        item_embeddings = item_embeddings[item_embeddings.mapped_item_id.isin(seen_items)]
                        item_embeddings = item_embeddings.sort_values(by='mapped_item_id', ascending=True)
                        assert item_embeddings.mapped_item_id.is_monotonic_increasing, 'not ordered by item_id'

                        item_matrix = np.vstack(item_embeddings.embedding.values)
                        ns = lightgcn_params['n']
                    elif method=="Xavier":
                        item_matrix = None
                        ns = [None]

                    # User matrices depend only on `n`: compute each once and reuse it
                    # across every (n_layers, lr) combination.
                    user_matrix_by_n = {}

                    all_hparams_best_ndcg = 0
                    best_hparams = None
                    best_hparams_for_init = None
                    best_initial_embeddings = None

                    save_path = f"../results/lightgcn_model_{embedding_model}_{method}/best_model"
                    os.makedirs(os.path.dirname(save_path), exist_ok=True)
                    patience_max = 10

                    total_configs = len(lightgcn_params['n_layers'])*len(ns)*len(lightgcn_params['lr'])

                    for n_layers, n, lr in tqdm(itertools.product(lightgcn_params['n_layers'], ns , lightgcn_params['lr']), total=total_configs, file=f):

                        print("^"*50)
                        print(f"n_layers: {n_layers} - n: {n} - lr: {lr}")
                        print("^"*50)
                        if method=="precomputed":
                            if n not in user_matrix_by_n:
                                user_embeddings, real_nb_neigh = compute_user_embeddings(dataset['implicit']['train_data'], item_embeddings, n)
                                user_matrix_by_n[n] = np.vstack(user_embeddings[user_embeddings.mapped_user_id.isin(set(train_data.userID))].embedding.values)
                            # Fresh dict per configuration, so the one remembered as
                            # "best" is never overwritten by a later configuration.
                            initial_embeddings = {'item_embedding': item_matrix, 'user_embedding': user_matrix_by_n[n]}
                        else:
                            initial_embeddings = None

                        hparams = prepare_hparams(
                                    learning_rate=lr,
                                    eval_epoch=10000000,
                                    top_k=5,
                                    save_model=False,
                                    epochs=1,
                                    save_epoch=1,
                                    model_type="lightgcn",
                                    embed_size=embed_size,
                                    n_layers=n_layers,
                                    batch_size=1024,
                                    decay=0.0001,
                                    metrics=["recall", "ndcg", "precision", "map"],
                                    MODEL_DIR="./tests/resources/deeprec/lightgcn/model/lightgcn_model/"
                                    )

                        # Re-seed everything so each configuration starts from the same RNG state
                        tf.compat.v1.set_random_seed(my_seed)
                        tf.random.set_seed(my_seed)
                        np.random.seed(my_seed)
                        random.seed(my_seed)
                        model = LightGCN(hparams, data, initial_embeddings=initial_embeddings, seed=my_seed)

                        with Timer() as train_time:
                            config_best_ndcg = _fit_lightgcn_with_early_stopping(model, save_path, all_hparams_best_ndcg, patience_max=patience_max)
                        print("Took {} seconds for training.".format(train_time.interval))

                        # A higher value means this configuration wrote the current checkpoint
                        if config_best_ndcg > all_hparams_best_ndcg:
                            all_hparams_best_ndcg = config_best_ndcg
                            best_hparams = {'n_layers': n_layers, 'n': n, 'lr': lr}
                            best_hparams_for_init = hparams
                            best_initial_embeddings = initial_embeddings

                        model.sess.close()  # Close the existing session
                        # Reset the default graph
                        tf.compat.v1.reset_default_graph()

                    if best_hparams_for_init is None:
                        raise RuntimeError(
                            f"LightGCN ({embedding_model}, {method}) on {key_dataset}: no configuration "
                            "reached an NDCG above 0, so there is no checkpoint to reload."
                        )

                    # Rebuild the best architecture and restore its checkpointed weights
                    model = LightGCN(best_hparams_for_init, data, initial_embeddings=best_initial_embeddings, seed=my_seed)
                    model.load(save_path)
                    method_store['model'] = model
                    # The model's session is left open on purpose: the stored model is evaluated later
                    # Reset the default graph
                    tf.compat.v1.reset_default_graph()
                    model = None
    finally:
        if f is not None:
            f.close()

In [None]:
if content_based:
    # Numbers of most recent items a user profile is averaged over
    neighbourhood_sizes = (1, 2, 3, 4, 5, 10, 20, 30, 50, 100)

    for key_dataset, dataset in tqdm(datasets.items()):
        dataset['implicit']['RS_strategy']['content-based'] = {}
        dataset['implicit']['RS_strategy']['content-based']['models'] = {}
        content_models = dataset['implicit']['RS_strategy']['content-based']['models']

        for embedding_model in ['ada', 'bert']:
            content_models[embedding_model] = {}
            content_models[embedding_model]['k'] = {size: {} for size in neighbourhood_sizes}

            train_data = dataset['implicit']['train_data']

            cb_embed = content_models[embedding_model]
            for key in cb_embed['k']:
                # One user-embedding table per neighbourhood size, stored together with
                # the average number of items actually used per user.
                users_table, mean_items_used = compute_user_embeddings(train_data, dataset['implicit']['items_embeddings'][embedding_model], key)
                cb_embed['k'][key]['users_embeddings'] = users_table
                cb_embed['k'][key]['mean_k'] = mean_items_used

            

In [None]:
def _grid_search_implicit_model(model_name, model_class, param_grid, cf_store, dataset, progress_file=None, patience_max=10, **fixed_kwargs):
    """Grid-search one `implicit` model with per-configuration early stopping.

    Every combination of the values in ``param_grid`` (in the dict's key order)
    is trained one iteration at a time. After each iteration the model is scored
    with ``evaluate(...)`` and training stops once ``patience_max`` consecutive
    evaluations failed to improve that configuration's best ``'mndcg'``.

    Args:
        model_name (str): Model key handed to ``evaluate`` ('ALS', 'BPR' or 'LMF').
        model_class: Model constructor. Called with the hyperparameters,
            ``iterations=1``, ``random_state=my_seed``, ``use_gpu=False`` and
            ``fixed_kwargs``.
        param_grid (dict): Hyperparameter name -> list of candidate values. The
            names must be keyword arguments of ``model_class``.
        cf_store (dict): The dataset's CF entry; must hold ``'user_item_train_data'``.
        dataset (dict): The dataset entry (provides ``['implicit']['test_data']``).
        progress_file: Optional file object for the tqdm bar.
        patience_max (int): Evaluations without improvement tolerated per configuration.
        **fixed_kwargs: Extra constructor arguments shared by all configurations.

    Returns:
        tuple: ``(best_model, best_hparams)``. A deep copy of the model at its
        best-scoring evaluation across all configurations and the hyperparameters
        that produced it; ``(None, None)`` if no evaluation scored above 0.
    """
    # The training matrix is the same for every configuration: cast it once
    training_data = cf_store['user_item_train_data'].astype('double')
    test_data = dataset['implicit']['test_data']

    overall_best_model = None
    overall_best_ndcg = 0
    overall_best_hparams = None

    param_names = list(param_grid)
    total_configs = int(np.prod([len(candidates) for candidates in param_grid.values()]))

    for values in tqdm(itertools.product(*param_grid.values()), total=total_configs, file=progress_file):
        hparams = dict(zip(param_names, values))
        model = model_class(**hparams, iterations=1, random_state=my_seed, use_gpu=False, **fixed_kwargs)

        best_ndcg = 0
        best_epoch = None  # stays None if this configuration never scores above 0
        patience = 0
        for epoch in itertools.count():
            model.fit(training_data, show_progress=False)
            # Evaluate the model
            results = evaluate(test_data, None, 'implicit', 'CF', cf_store, model_name, {'model': model}, dataset, k=5, progress=False)
            ndcg = results['mndcg']

            if ndcg > overall_best_ndcg:
                overall_best_ndcg = ndcg
                overall_best_hparams = hparams
                # Snapshot: the live model keeps training in the following iterations
                overall_best_model = copy.deepcopy(model)
            if ndcg > best_ndcg:
                patience = 0
                best_ndcg = ndcg
                best_epoch = epoch
            else:
                patience += 1

            if patience == patience_max:
                print("-"*25 + f"hparams: {hparams}" + "-"*25)
                print("="*25 + f"Best NDCG: {best_ndcg:.4f} at epoch {best_epoch}" + "="*25)
                break

    return overall_best_model, overall_best_hparams


if cf:
    # (model key, enabled flag, constructor, hyperparameter grid, extra constructor arguments)
    cf_model_specs = [
        ('ALS', als, implicit.als.AlternatingLeastSquares, als_params, {'calculate_training_loss': True}),
        ('BPR', bpr, implicit.bpr.BayesianPersonalizedRanking, bpr_params, {}),
        ('LMF', lmf, implicit.lmf.LogisticMatrixFactorization, lmf_params, {}),
    ]

    f = open(tqdm_file, "w") if save_tqdm_to_file else None
    try:
        for key_dataset, dataset in tqdm(datasets.items(), desc="Datasets"):
            # Not named `cf`, so the boolean switch of the same name is left intact
            cf_store = dataset['implicit']['RS_strategy']['CF']
            cf_store['models'] = {}

            for model_name, enabled, model_class, param_grid, extra_kwargs in cf_model_specs:
                if not enabled:
                    continue
                # Register the entry before training, as evaluate() receives `cf_store`
                cf_store['models'][model_name] = {'config': {}}

                all_hparams_best_model, best_hparams = _grid_search_implicit_model(
                    model_name, model_class, param_grid, cf_store, dataset,
                    progress_file=f, patience_max=10, **extra_kwargs
                )

                cf_store['models'][model_name]['model'] = all_hparams_best_model
                cf_store['models'][model_name]['config'] = best_hparams
    finally:
        if f is not None:
            f.close()

In [None]:
# Print the key hierarchy again, now including whatever the training cells above added
display_keys(datasets)

In [None]:
# Evaluate every trained model on its test split and store the metrics next to the model.
# Three layouts are handled, told apart by the keys of the model entry:
#   * 'embedding_model' -> one trained model per embedding model and init method
#   * 'k'               -> content-based: one user-embedding table per neighbourhood size
#   * anything else     -> a single model stored directly in the entry
for key_dataset, dataset in datasets.items():
    print(key_dataset)
    for key_data_strategy, data_strategy in dataset.items():
        print('\t' + key_data_strategy)
        strategy_test_data = data_strategy['test_data']
        strategy_full_data = data_strategy.get('data_full', None)
        for key_RS_strategy, RS_strategy in data_strategy['RS_strategy'].items():
            print('\t\t' + key_RS_strategy)
            for key_model, model in RS_strategy['models'].items():
                print('\t\t\t' + key_model)
                if model.get('embedding_model', None):
                    for key_embedding_model, embedding_model in model['embedding_model'].items():
                        for method, trained in embedding_model['method'].items():
                            print('\t\t\t\t' + key_embedding_model)
                            trained['metrics'] = evaluate(strategy_test_data, strategy_full_data, key_data_strategy, key_RS_strategy, RS_strategy, key_model, trained, key_dataset, k=5)
                elif model.get('k', None):
                    for nb_neigh, neighbourhood in model['k'].items():
                        # Placeholder kept from the original: evaluate() is defined elsewhere
                        # and receives this dict, so its keys are left as they were.
                        neighbourhood['metrics'] = {}
                        # The similarity matrix is only needed during evaluation: build it,
                        # use it, and always release it again (even if evaluate() fails).
                        neighbourhood['sim_matrix'] = compute_similarity_matrix(dataset['implicit']['items_embeddings'][key_model], neighbourhood['users_embeddings'])
                        try:
                            # Third argument is the strategy *key*, as in every other evaluate()
                            # call of this notebook (the dict itself used to be passed here).
                            neighbourhood['metrics'] = evaluate(strategy_test_data, strategy_full_data, key_data_strategy, key_RS_strategy, RS_strategy, key_model, neighbourhood, key_dataset, k=5)
                        finally:
                            del neighbourhood['sim_matrix']
                else:
                    model['metrics'] = evaluate(strategy_test_data, strategy_full_data, key_data_strategy, key_RS_strategy, RS_strategy, key_model, model, key_dataset, k=5)

In [None]:
# Print the key hierarchy once more; the evaluation cell above added 'metrics' entries
display_keys(datasets)

In [None]:
# Save best hparams to a CSV file
# `best_hparams` only exists if a grid-search cell ran, and it is None when no
# configuration ever scored above 0; in both cases there is nothing to write.
# NOTE(review): this is whatever the most recently executed grid search left in
# `best_hparams`, so with several models enabled only the last one is saved.
best_hparams_to_save = globals().get('best_hparams')
if best_hparams_to_save is not None and best_hparams_file is not None:
    best_hparams_df = pd.DataFrame(list(best_hparams_to_save.items()), columns=['Parameter', 'Value'])
    best_hparams_df.to_csv(best_hparams_file, index=False)

In [None]:
# Cut-off used in the metric column names (the evaluation above was run with k=5)
k=5

# (column label, key in the metrics dict) for every reported metric, in column order
metric_columns = (
    ('Recall', 'mrecall'),
    ('Precision', 'mprecision'),
    ('F1', 'mf1'),
    ('NDCG', 'mndcg'),
    ('MRR', 'mrr'),
    ('MAP', 'map'),
)

# Column order of the results table. 'embeddind_model' is misspelt on purpose:
# it is the column name already written to the results CSV.
result_columns = [
    'dataset', 'data_strategy', 'RS_strategy', 'model', 'embeddind_model',
    'nb_neighbours_asked', 'nb_neighbours_mean',
] + [f'{label}@{k}' for label, _ in metric_columns]


def _evaluation_row(key_dataset, key_data_strategy, key_RS_strategy, key_model, metrics,
                    embedding_model=None, nb_neighbours_asked=None, nb_neighbours_mean=None):
    """Build one row of the results table.

    Args:
        key_dataset, key_data_strategy, key_RS_strategy, key_model: Keys locating
            the model inside ``datasets``.
        metrics (dict): Metrics of the model; must hold a number under each of
            'mrecall', 'mprecision', 'mf1', 'mndcg', 'mrr' and 'map'.
        embedding_model: Value of the 'embeddind_model' column.
        nb_neighbours_asked: Value of the 'nb_neighbours_asked' column.
        nb_neighbours_mean: Value of the 'nb_neighbours_mean' column.

    Returns:
        dict: Column name -> value; metrics are formatted with four decimals.
    """
    row = {
        'dataset': key_dataset,
        'data_strategy': key_data_strategy,
        'RS_strategy': key_RS_strategy,
        'model': key_model,
        'embeddind_model': embedding_model,
        'nb_neighbours_asked': nb_neighbours_asked,
        'nb_neighbours_mean': nb_neighbours_mean,
    }
    for label, metric_key in metric_columns:
        row[f'{label}@{k}'] = f"{metrics.get(metric_key):.4f}"
    return row


# Extracting evaluation results
data = []

for key_dataset, dataset in datasets.items():
    print(key_dataset)
    for key_data_strategy, data_strategy in dataset.items():
        for key_RS_strategy, RS_strategy in data_strategy['RS_strategy'].items():
            for key_model, model in RS_strategy['models'].items():
                location = (key_dataset, key_data_strategy, key_RS_strategy, key_model)

                if model.get('k', None):
                    # Content-based: one row per neighbourhood size; the model key is the embedding model
                    for nb_neigh, neighbourhood in model['k'].items():
                        data.append(_evaluation_row(
                            *location, neighbourhood['metrics'],
                            embedding_model=key_model,
                            nb_neighbours_asked=int(nb_neigh),
                            nb_neighbours_mean=neighbourhood['mean_k'],
                        ))
                elif model.get('embedding_model', None):
                    # One row per (embedding model, init method); the method name is
                    # reported in the 'nb_neighbours_mean' column, as before.
                    for key_embedding_model, embedding_model in model['embedding_model'].items():
                        for method, trained in embedding_model['method'].items():
                            data.append(_evaluation_row(
                                *location, trained['metrics'],
                                embedding_model=key_embedding_model,
                                nb_neighbours_mean=method,
                            ))
                else:
                    data.append(_evaluation_row(*location, model['metrics']))


# Convert to DataFrame (explicit columns, so an empty result list still yields the full schema)
evaluation_results = pd.DataFrame(data, columns=result_columns)
evaluation_results['nb_neighbours_asked'] = evaluation_results['nb_neighbours_asked'].astype('Int64')

# Function to highlight the best row
def highlight_best(row):
    """Styler callback: colour a row light blue if it has the best NDCG@k of its group.

    A group is the set of rows sharing ``dataset``, ``data_strategy``,
    ``RS_strategy`` and ``model`` with ``row``.

    Args:
        row (pd.Series): One row of ``evaluation_results``.

    Returns:
        list: One CSS string per cell of ``row``.
    """
    subset = evaluation_results[(evaluation_results['dataset'] == row['dataset']) & (evaluation_results['data_strategy'] == row['data_strategy']) & (evaluation_results['RS_strategy'] == row['RS_strategy']) & (evaluation_results['model'] == row['model'])]
    # The metrics are stored as formatted strings: compare them as numbers
    max_ndcg = subset[f'NDCG@{k}'].astype(float).max()
    if float(row[f'NDCG@{k}']) == max_ndcg:
        return ['background-color: lightblue']*len(row)
    return ['']*len(row)

# Apply the highlight function (the styled table is displayed by the next cell)
styled_evaluation_results = evaluation_results.style.apply(highlight_best, axis=1)

# Save the DataFrame to a CSV file (skipped when no output path is configured)
if evaluation_results_file is not None:
    evaluation_results.to_csv(evaluation_results_file, index=False)


In [None]:
# Display the results table with the best-NDCG row of each model highlighted
styled_evaluation_results