In [1]:
import os
import sys
from pathlib import Path

# setting proper working directory
PROJECT_DIRECTORY = Path(os.path.abspath('')).resolve().parents[0]
sys.path.extend([str(PROJECT_DIRECTORY)])

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.12.5 (tags/v3.12.5:ff3bc82, Aug  6 2024, 20:45:27) [MSC v.1940 64 bit (AMD64)] on win32
Project directory:  C:\Users\s8347434\Documents\RecBole-GNN


In [2]:
import pandas as pd
import numpy as np
import argparse
import ast
import matplotlib.pyplot as plt
from itertools import product
import seaborn as sns
from recbole_gnn.config import Config
from recbole_gnn.utils import create_dataset
from recbole_gnn.data.dataset_metrics import GraphDatasetEvaluator
from sklearn.linear_model import LinearRegression
from matplotlib.colors import LinearSegmentedColormap
import statsmodels.formula.api as sm
from tqdm import tqdm

In [3]:
file_path = "../eval/log/Benchmark/TotalEvaluationData-RO.csv"
if not os.path.isfile(file_path):
    print("File does not exist, please run all the lines above..")
else:
    print(f"File exists! Load file from {file_path}")
    # Load the existing DataFrame
    final_eval_df = pd.read_csv(file_path, sep='\t')
    final_eval_df = process_user_columns(final_eval_df)

    # Convert the user_popularity column back to dictionaries
    if 'user_popularity' in final_eval_df.columns:
        final_eval_df['user_popularity'] = final_eval_df['user_popularity'].apply(ast.literal_eval)

File exists! Load file from ../eval/log/Benchmark/TotalEvaluationData-RO.csv


NameError: name 'process_user_columns' is not defined

In [None]:
dataset_characteristics = pd.read_csv("../eval/log/Dataset/dataset_eval.csv", sep="\t")

In [None]:
dataset_characteristics['density'] = 1 - dataset_characteristics['sparsity']

In [None]:
# Loop through columns that start with "average_clustering"
for col in dataset_characteristics.columns:
    if col.startswith('average_clustering') or col.startswith('density'):
        # Calculate log10 and create a new column with "_log" suffix
        dataset_characteristics[f"{col}_log"] = np.log10(dataset_characteristics[col])

dataset_characteristics['dataset'] = dataset_characteristics['dataset'].str.extract(r'-(\d+)$').astype(int)

In [None]:
#dataset_characteristics.keys()

## Read and Transform Benchark Datasets

In [6]:
#als_df = pd.read_csv("log/Benchmark/old/ALS-Benchmark-RO.csv", sep="\t")
asym_user_df = pd.read_csv("log/Benchmark/old/RO/AsymKNNUser-Benchmark-RO.csv", sep="\t")
asym_item_df = pd.read_csv("log/Benchmark/old/RO/AsymKNNItem-Benchmark-RO.csv", sep="\t")
#bpr_df = pd.read_csv("log/Benchmark/old/BPR-Benchmark-RO.csv", sep="\t")
ngcf_df = pd.read_csv("log/Benchmark/RO/NGCF-Benchmark-RO.csv", sep="\t")
lightgcn_df = pd.read_csv("log/Benchmark/RO/LightGCN-Benchmark-RO.csv", sep="\t")
sgl_df = pd.read_csv("log/Benchmark/RO/SGL-Benchmark-RO.csv", sep="\t")
xsimgcl_df = pd.read_csv("log/Benchmark/RO/XSimGCL-Benchmark-RO.csv", sep="\t")
pop_df = pd.read_csv("log/Benchmark/RO/Pop-Benchmark-RO.csv", sep="\t")

In [7]:
overall_df = pd.concat([pop_df, asym_user_df, asym_item_df, ngcf_df, lightgcn_df, sgl_df, xsimgcl_df], ignore_index=True)
overall_df['dataset'] = overall_df['dataset'].str.extract(r'-(\d+)$').astype(int)

In [8]:
print(overall_df.iloc[1])

Model                                                                      Pop
dataset                                                                      2
precision@10                                                            0.0313
hit@10                                                                   0.202
mrr@10                                                                  0.0806
ndcg@10                                                                 0.0382
map@10                                                                  0.0162
itemcoverage@10                                                         0.0018
averagepopularity@10                                                   93.7833
tailpercentage@10                                                          0.0
best_user_precision@[10]     [{'206': '0.6'}, {'157': '0.5'}, {'428': '0.5'...
best_user_hit@[10]           [{'184': '1'}, {'461': '1'}, {'460': '1'}, {'7...
best_user_mrr@[10]           [{'184': '1.0'}, {'137'

In [9]:
# Function to process columns starting with 'best_user_' or 'worst_user_'
def process_user_columns(df):
    for col in df.columns:
        if col.startswith('best_user_') or col.startswith('worst_user_'):
            # Apply transformation for each row in the selected columns
            df[col] = df[col].apply(ast.literal_eval)            
    return df

In [10]:
overall_df = process_user_columns(overall_df)

In [11]:
print(overall_df.head(2)['best_user_precision@[10]'])

0    [{'937': '0.8'}, {'24': '0.5'}, {'743': '0.5'}...
1    [{'206': '0.6'}, {'157': '0.5'}, {'428': '0.5'...
Name: best_user_precision@[10], dtype: object


#### Calculate the best and worst users topological characterisitcs

In [12]:
def get_users_topological_chars(data, file_path, num_datasets):    
    
    dataset_eval_list = []
    for i in tqdm(range(113,114), total=num_datasets, unit='datasets'):
        # mapping recbole ID -> local ID (as recbole resets the index after filtering the datasets)
        config = Config(model="BPR", dataset=f"real-life-atomic-100000-{i+1}", config_file_list=["config_files/datasets.yaml"])
        dataset = create_dataset(config)

        print(i)

        # Extract unique user IDs from best_user_ and worst_user_ columns
        best_user_ids = set()  # Use a set to ensure uniqueness
        worst_user_ids = set()  # Use a set to ensure uniqueness
        for index, row in data[data['dataset']==i+1].iterrows():
            for col in data.columns:
                if col.startswith('best_user_'):
                    # Extract user IDs from the list of dictionaries
                    for entry in row[col]:  # Assuming each entry is a list of dictionaries
                        best_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
                if col.startswith('worst_user_'):
                    # Extract user IDs from the list of dictionaries
                    for entry in row[col]:  # Assuming each entry is a list of dictionaries
                        worst_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set

        print(best_user_ids)
        print(worst_user_ids)

        # calculate dataset metrics
        dataset_evaluator = GraphDatasetEvaluator(config, dataset)
        # some metrics need a connected graph and thus drops nodes which are in smallest partition
        #if not dataset_evaluator.connected:
        #    continue
        dataset_eval_dict = {"dataset": i+1}
        dataset_eval_dict.update(dataset_evaluator.evaluate_best_worst_users(best_user_ids, worst_user_ids))
        dataset_eval_list.append(dataset_eval_dict)

        df = pd.DataFrame(dataset_eval_list)
        df.to_csv(file_path, sep='\t', index=False)

    return df

In [13]:
# HINT: takes approx. 5h
file_path = "log/Dataset/user_topological_characteristics3.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's topology characteristics for each dataset..")
    user_topologies_df = get_users_topological_chars(overall_df, file_path, num_datasets=1)
else:
    print(f"File exists! Load file from {file_path}")
    user_topologies_df = pd.read_csv(file_path, sep='\t')
    user_topologies_df = process_user_columns(user_topologies_df)

File does not exist, calculate all user's topology characteristics for each dataset..


  0%|          | 0/1 [00:00<?, ?datasets/s]

113
{1, 524, 1038, 534, 30, 38, 551, 567, 569, 572, 576, 579, 580, 70, 583, 584, 585, 77, 592, 595, 599, 602, 605, 608, 610, 612, 613, 615, 616, 620, 623, 1136, 632, 633, 122, 637, 638, 640, 643, 645, 646, 648, 650, 652, 660, 159, 161, 677, 167, 1193, 683, 1196, 687, 688, 699, 187, 189, 190, 723, 215, 749, 750, 751, 752, 753, 754, 242, 756, 757, 758, 760, 761, 762, 763, 768, 770, 771, 263, 775, 776, 778, 780, 781, 782, 783, 784, 785, 787, 788, 789, 276, 791, 1307, 796, 797, 296, 298, 299, 815, 817, 819, 310, 835, 362, 365, 376, 892, 393, 917, 410, 425, 430, 444, 964, 969, 458, 976, 977, 472, 984, 994}
{513, 515, 1028, 1029, 516, 1031, 518, 1033, 1034, 523, 1035, 1037, 1040, 530, 532, 535, 536, 538, 539, 1052, 540, 543, 1056, 545, 546, 547, 1058, 1061, 1055, 1064, 557, 1073, 562, 564, 1077, 566, 570, 572, 574, 577, 1089, 581, 1097, 587, 588, 1099, 1102, 590, 592, 594, 1107, 597, 598, 1109, 600, 601, 1111, 603, 604, 1116, 608, 1121, 609, 611, 612, 1120, 1127, 619, 1134, 1137, 1149, 639, 

  0%|          | 0/1 [06:59<?, ?datasets/s]


KeyError: 'Key 1031 not found'

#### Translate the userIDs into the userIDs out of the original dataset
- global ID: the userID which holds through all splits
- local ID: the userID which is only valid within one split
- recbole ID: the userID which is assigned after the filtering

In [None]:
def translate_userids(data, num_rows):      
    # Extract columns ending with @[10]
    columns_to_process = [col for col in data.columns if col.endswith("@[10]")]
    
    for index, row in tqdm(data.iloc[:num_rows].iterrows(), total=num_rows, unit='rows'):
        # configurations initialization
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{row['dataset']}/real-life-atomic-100000-{row['dataset']}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # mapping recbole ID -> local ID (as recbole resets the index after filtering the datasets)
        config = Config(model="BPR", dataset=f"real-life-atomic-100000-{row['dataset']}", config_file_list=["config_files/datasets.yaml"])
        dataset = create_dataset(config)
        flipped_dict = {v: k for k, v in dataset.field2token_id['user_id'].items()}
                
        # mapping local ID -> global ID
        translation_dict = dict(zip(db["user_id:token"], db["userID:token"]))
                
        # Process each column
        for col in columns_to_process:
            # Parse the string entries and extract the user IDs
            #print(row['Model'], row['dataset'], col)
            new_entry = [
                    {translation_dict[int(flipped_dict[int(list(entry.keys())[0])])] : float(list(entry.values())[0])}
                    for entry in row[col]
                ]
            data.at[index, col] = new_entry
            
    return data

In [None]:
# HINT: takes approx. 20-30min.
file_path = "log/Benchmark/Overall-Benchmark-RO.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's popularity for each dataset..")
    overall_df = translate_userids(overall_df, num_rows=overall_df.shape[0])
    overall_df.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    overall_df = pd.read_csv(file_path, sep='\t')
    overall_df = process_user_columns(overall_df)

In [None]:
print(overall_df.head(2)['best_user_precision@[10]'])

#### Calculate the popularity of each user's interactions

In [None]:
def get_all_users_popularity(num_datasets):    
    
    # Initialize DataFrame with pre-defined size and columns
    df = pd.DataFrame({'dataset': range(1, num_datasets + 1)})        
    df['user_popularity'] = [{} for _ in range(num_datasets)]     
    
    # Loop through each dataset and compute user popularity
    for i in tqdm(range(num_datasets)):
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{i+1}/real-life-atomic-100000-{i+1}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # Filter users with less than 20 interactions
        db['interaction_count'] = db.groupby("userID:token")["itemID:token"].transform('count')
        filtered_df = db[db['interaction_count'] >= 20].copy()
        
        # Calculate item popularity
        item_popularity = filtered_df.groupby("itemID:token")["userID:token"].nunique()
        
        # Map item popularity back to filtered_df safely
        filtered_df.loc[:, 'item_popularity'] = filtered_df["itemID:token"].map(item_popularity)
        
        # Compute user average popularity
        user_avg_popularity = filtered_df.groupby("userID:token")['item_popularity'].agg(['mean', 'median'])
        
        # Compute user popularity dictionary
        all_users_popularity_dict = {
            user_id: (row['mean'], row['median']) 
            for user_id, row in user_avg_popularity.iterrows()
        }
        
        # Assign the dictionary to the DataFrame
        df.at[i,'user_popularity'] = all_users_popularity_dict
    
    return df

In [None]:
# HINT: takes approx.
file_path = "../asset/data/real-life-atomic-splits/user_popularity.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's popularity for each dataset..")
    popularity_dict = get_all_users_popularity(num_datasets=177)
    popularity_dict.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    # Load the existing DataFrame
    df = pd.read_csv(file_path, sep='\t')
    # Convert the user_popularity column back to dictionaries
    if 'user_popularity' in df.columns:
        df['user_popularity'] = df['user_popularity'].apply(ast.literal_eval)

In [None]:
print(popularity_dict.head())

In [None]:
def get_user_popularity(data, all_users_popularity_dict, num_rows):
    
    df = data.copy()
    
    # Initialize new columns with empty lists
    df['best_users_mean_popularity_dict'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_mean'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_max'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_min'] = [{} for _ in range(len(df))]    
    df['best_users_median_popularity_dict'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['best_users_node_degree'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_median'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_min'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_max'] = [{} for _ in range(len(df))]

    df['worst_users_mean_popularity_dict'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_mean'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_max'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_min'] = [{} for _ in range(len(df))]    
    df['worst_users_median_popularity_dict'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_median'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_min'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_max'] = [{} for _ in range(len(df))]

    df['all_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['all_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['all_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['all_users_median_node_degree_median'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_max'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_min'] = [{} for _ in range(len(df))]    
     
    # Loop through each row
    for index, row in tqdm(df.iloc[:num_rows].iterrows(), total=num_rows, unit='rows'):

        # configurations initialization
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{row['dataset']}/real-life-atomic-100000-{row['dataset']}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # Filter users with less than 20 node_degree
        db['interaction_count'] = db.groupby("userID:token")["itemID:token"].transform('count')
        filtered_df = db[db['interaction_count'] >= 20].copy()
        all_users_node_degree_dict = filtered_df.groupby("userID:token")['interaction_count'].first()
                        
        # Extract unique user IDs from best_user_ and worst_user_ columns
        best_user_ids = set()  # Use a set to ensure uniqueness
        worst_user_ids = set()  # Use a set to ensure uniqueness
        for col in df.columns:
            if col.startswith('best_user_'):
                # Extract user IDs from the list of dictionaries
                for entry in row[col]:  # Assuming each entry is a list of dictionaries
                    best_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
            if col.startswith('worst_user_'):
                # Extract user IDs from the list of dictionaries
                for entry in row[col]:  # Assuming each entry is a list of dictionaries
                    worst_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
                                         
        # Prepare best users
        best_users_mean_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][0] for user_id in best_user_ids}
        best_users_median_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][1] for user_id in best_user_ids}
        best_users_node_degree_dict = {user_id: all_users_node_degree_dict[user_id] for user_id in best_user_ids}     
        best_users_mean_values_list = list(best_users_mean_popularity_dict.values())
        best_users_median_values_list = list(best_users_median_popularity_dict.values())
        best_users_node_degree_list = list(best_users_node_degree_dict.values())

        worst_users_mean_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][0] for user_id in worst_user_ids}
        worst_users_median_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][1] for user_id in worst_user_ids}
        worst_users_node_degree_dict = {user_id: all_users_node_degree_dict[user_id] for user_id in worst_user_ids}     
        worst_users_mean_values_list = list(worst_users_mean_popularity_dict.values())
        worst_users_median_values_list = list(worst_users_median_popularity_dict.values())
        worst_users_node_degree_list = list(worst_users_node_degree_dict.values())
        
        all_users_mean_values_list = [entry[0] for entry in list(all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'].values())]
        all_users_median_values_list = [entry[1] for entry in list(all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'].values())]

        # Assign
        df.at[index, 'best_users_mean_popularity_dict'] = best_users_mean_popularity_dict
        df.at[index, 'best_users_mean_popularity_mean'] = np.mean(best_users_mean_values_list)
        df.at[index, 'best_users_mean_popularity_max'] = np.max(best_users_mean_values_list)
        df.at[index, 'best_users_mean_popularity_min'] = np.min(best_users_mean_values_list)
        df.at[index, 'best_users_median_popularity_dict'] = best_users_median_popularity_dict
        df.at[index, 'best_users_median_popularity_mean'] = np.mean(best_users_median_values_list)
        df.at[index, 'best_users_median_popularity_max'] = np.max(best_users_median_values_list)
        df.at[index, 'best_users_median_popularity_min'] = np.min(best_users_median_values_list)
        df.at[index, 'best_users_node_degree'] = best_users_node_degree_dict
        df.at[index, 'best_users_node_degree_mean'] = np.mean(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_median'] = np.median(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_min'] = np.min(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_max'] = np.max(best_users_node_degree_list)

        df.at[index, 'worst_users_mean_popularity_dict'] = worst_users_mean_popularity_dict
        df.at[index, 'worst_users_mean_popularity_mean'] = np.mean(worst_users_mean_values_list)
        df.at[index, 'worst_users_mean_popularity_max'] = np.max(worst_users_mean_values_list)
        df.at[index, 'worst_users_mean_popularity_min'] = np.min(worst_users_mean_values_list)
        df.at[index, 'worst_users_median_popularity_dict'] = worst_users_median_popularity_dict
        df.at[index, 'worst_users_median_popularity_mean'] = np.mean(worst_users_median_values_list)
        df.at[index, 'worst_users_median_popularity_max'] = np.max(worst_users_median_values_list)
        df.at[index, 'worst_users_median_popularity_min'] = np.min(worst_users_median_values_list)
        df.at[index, 'worst_users_node_degree'] = worst_users_node_degree_dict
        df.at[index, 'worst_users_node_degree_mean'] = np.mean(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_median'] = np.median(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_min'] = np.min(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_max'] = np.max(worst_users_node_degree_list)

        df.at[index, 'all_users_mean_popularity_mean'] = np.mean(all_users_mean_values_list)
        df.at[index, 'all_users_mean_popularity_max'] = np.max(all_users_mean_values_list)
        df.at[index, 'all_users_mean_popularity_min'] = np.min(all_users_mean_values_list)
        df.at[index, 'all_users_median_popularity_mean'] = np.mean(all_users_median_values_list)
        df.at[index, 'all_users_median_popularity_max'] = np.max(all_users_median_values_list)
        df.at[index, 'all_users_median_popularity_min'] = np.min(all_users_median_values_list)
        df.at[index, 'all_users_node_degree_mean'] = all_users_node_degree_dict.mean()
        df.at[index, 'all_users_median_node_degree_median'] = all_users_node_degree_dict.median()
        df.at[index, 'all_users_node_degree_max'] = all_users_node_degree_dict.max()
        df.at[index, 'all_users_node_degree_min'] = all_users_node_degree_dict.min()
        
    return df

In [None]:
#overall_df = pd.read_csv('log/Benchmark/Overall-Benchmark.csv', sep='\t')

In [None]:
# NOTE: takes approx. 2-3 min
file_path = "log/Dataset/user_classical_characteristics.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's popularity for each dataset..")
    overall_df = get_user_popularity(overall_df, popularity_dict, num_rows = overall_df.shape[0])
    overall_df.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    overall_df = pd.read_csv(file_path, sep='\t')
    overall_df = process_user_columns(overall_df)

## Evaluation
- merge all processed data frames together

In [None]:
merged_df = pd.merge(dataset_characteristics, overall_df, on='dataset', how='inner')  # you can change how to 'left', 'right', or 'outer'
merged_df = merged_df[merged_df['dataset'] != 177]

In [None]:
# Function to plot metrics by dataset with model averages
def plot_metrics_by_dataset(df, metric, dataset_range=None, models=None, model_names=None, save_fig=False):
    # Ensure the 'metric' column exists in the dataframe
    if metric not in df.columns:
        raise ValueError(f"Metric '{metric}' not found in dataframe columns.")
    
    if dataset_range:
        lower_bound, upper_bound = dataset_range
        df = df[(df['dataset'] >= lower_bound) & (df['dataset'] <= upper_bound)]

    if model_names is None:
        model_names = models
    
    # Check if models list is provided
    if models:
        # Filter DataFrame to include only specified models and set ordering
        df = df[df['Model'].isin(models)]
        df['Model'] = pd.Categorical(df['Model'], categories=model_names, ordered=True)

    # Calculate average metric values for each model
    model_averages = df.groupby('Model', observed=False)[metric].mean()

    # Set global font size and padding
    plt.rc('font', size=30)            
    plt.rc('axes', titlesize=30, labelsize=30)
    plt.rc('axes', labelpad=30) 
    plt.rc('xtick', labelsize=14)
    plt.rc('ytick', labelsize=18) 

    # Define custom colors
    tu_dd_blue = (0 / 255, 48 / 255, 93 / 255)
    bu_green1 = (138 / 255, 203 / 255, 193 / 255)
    bu_green2 = (0 / 255, 172 / 255, 169 / 255)
    bu_green3 = (0 / 255, 131 / 255, 141 / 255)
    ing_blue1 = (132 / 255, 207 / 255, 237 / 255)
    ing_blue2 = (0 / 255, 161 / 255, 217 / 255)
    ing_blue3 = (0 / 255, 119 / 255, 174 / 255)
    ing_blue4 = (0 / 255, 105 / 255, 180 / 255)

    colors = [tu_dd_blue, ing_blue4, bu_green1]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=len(df['Model'].unique()))
    palette = [cmap(i / (len(df['Model'].unique()) - 1)) for i in range(len(df['Model'].unique()))]
    
    plt.figure(figsize=(12, 6))
    lineplot = sns.lineplot(x='dataset', y=metric, hue='Model', data=df, marker='o', markersize=6, palette=palette)

    # Retrieve the colors used for each model from the seaborn plot
    handles, labels = lineplot.get_legend_handles_labels()
    color_map = {label: handle.get_color() for label, handle in zip(labels, handles)}

    # Add horizontal lines for model averages with the corresponding model color
    for model, avg in model_averages.items():
        plt.axhline(avg, linestyle='--', color=color_map[model])

    # Customize plot
    plt.xlabel('Dataset #')
    plt.ylabel("")
    plt.legend(title='Model')
    
    # Resize and position legend
    plt.legend(
        title='Model',
        title_fontsize=16,           # Font size for the legend title
        fontsize=14,                 # Font size for the legend labels (text)
        #bbox_to_anchor=(1.05, 1),  # Place legend to the right of the plot
        loc='upper right',          # Align legend at the top-left relative to bbox
        borderaxespad=0.5,         # Padding between legend and axes
        markerscale=1.5,           # Scale marker size in legend
        labelspacing=0.5,          # Space between legend entries
        handlelength=2.0           # Length of legend line handles
    )

    if dataset_range and dataset_range[1] >= 176:
        xticks = [1] + list(range(10, 176, 10)) + [176]
        plt.xticks(xticks)  # Adjust step size (e.g., 10) for better label visibility

    # Display grid
    plt.grid(True)
    
    if save_fig:
        plt.savefig(f"../asset/plots/results_{metric}.png", dpi=300, transparent=True)

    # Display the plot
    plt.show()


In [None]:
# Example usage: Plot MR@10 and NDCG@10
model_list = ['AsymKNNUser', 'ALS', 'XSimGCL']
model_name_list = ['User-KNN', 'ALS', 'XSimGCL']
plot_metrics_by_dataset(merged_df, "ndcg@10", dataset_range=[1,176], models=model_list, model_names=model_name_list, save_fig = True)

In [None]:
def plot_metrics_by_dataset_with_boxplot(df, metric, dataset_range=None, models=None, model_names=None, save_fig = False):
    # Ensure the 'metric' column exists in the dataframe
    if metric not in df.columns:
        raise ValueError(f"Metric '{metric}' not found in dataframe columns.")
    
    if dataset_range:
        lower_bound, upper_bound = dataset_range
        df = df[(df['dataset'] >= lower_bound) & (df['dataset'] <= upper_bound)]

    if model_names is None:
        model_names = models
    
    # Check if models list is provided
    if models:
        # Filter DataFrame to include only specified models and set ordering
        df = df[df['Model'].isin(models)]
        df['Model'] = pd.Categorical(df['Model'], categories=model_names, ordered=True)

    # Set global font size and padding
    plt.rc('font', size=30)            
    plt.rc('axes', titlesize=30, labelsize=30)
    plt.rc('axes', labelpad=30) 
    plt.rc('xtick', labelsize=16)
    plt.rc('ytick', labelsize=18) 

    # Define custom colors
    tu_dd_blue = (0 / 255, 48 / 255, 93 / 255)
    bu_green1 = (138 / 255, 203 / 255, 193 / 255)
    bu_green2 = (0 / 255, 172 / 255, 169 / 255)
    bu_green3 = (0 / 255, 131 / 255, 141 / 255)
    ing_blue1 = (132 / 255, 207 / 255, 237 / 255)
    ing_blue2 = (0 / 255, 161 / 255, 217 / 255)
    ing_blue3 = (0 / 255, 119 / 255, 174 / 255)
    ing_blue4 = (0 / 255, 105 / 255, 180 / 255)

    colors = [bu_green1, ing_blue1, ing_blue4]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors, N=len(df['Model'].unique()))
    palette = [cmap(i / (len(df['Model'].unique()) - 1)) for i in range(len(df['Model'].unique()))]
    
    # Create the plot using seaborn boxplot
    plt.figure(figsize=(12, 6))
    boxplot = sns.boxplot(
        x='Model', y=metric, data=df, showmeans=True, meanline=True, 
        meanprops={"linestyle": "--", "color": "black"}, 
        palette=palette, hue='Model', dodge=False, legend=False
    )
    
    # Customize plot
    plt.xlabel("")  # Turn off x-axis label
    plt.ylabel(f'{metric}')
    plt.ylabel("")
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Display the plot
    plt.tight_layout()
    
    if save_fig: plt.savefig(f"../asset/plots/boxplot_{metric}.png", dpi=300, transparent=True)

    plt.show()

In [None]:
# Example usage: Plot MR@10 and NDCG@10
model_list = ['Pop', 'AsymKNNUser', 'AsymKNNItem', 'ALS', 'BPR', 'NGCF', 'LightGCN', 'SGL', 'XSimGCL']
model_name_list = ['Pop', 'AsymKNNUser', 'AsymKNNItem', 'ALS', 'BPR', 'NGCF', 'LightGCN', 'SGL', 'XSimGCL']
plot_metrics_by_dataset_with_boxplot(merged_df, "ndcg@10", dataset_range=[1,176], models=model_list, model_names=model_name_list, save_fig = True)

In [None]:
def plot_user_characteristics_per_dataset(df, metrics, dataset_range=None):
    # Validate metrics list
    valid_metrics = [
        'best_users_mean_popularity_mean', 'best_users_median_popularity_mean',
        'worst_users_mean_popularity_mean', 'worst_users_median_popularity_mean',
        'best_users_node_degree_mean', 'best_users_node_degree_median',
        'worst_users_node_degree_mean', 'worst_users_node_degree_median'
    ]
    for metric in metrics:
        if metric not in valid_metrics:
            raise ValueError(f"Invalid metric '{metric}'. Choose a valid metric.")

    # Filter dataframe based on dataset range if provided
    if dataset_range:
        integer1, integer2 = dataset_range
        df = df[(df['dataset'] >= integer1) & (df['dataset'] <= integer2)]

    # Initialize plot
    plt.figure(figsize=(12, 6))

    # Loop through each metric to plot
    for metric in metrics:
        # Ensure the metric column and min/max columns are numeric
        df[metric] = pd.to_numeric(df[metric], errors='coerce')

        # Check for missing or invalid values in the metric column
        if df[metric].isnull().any():
            raise ValueError(f"The metric column '{metric}' contains NaN values, which are not allowed.")

        # Initialize min_value and max_value based on the metric
        if metric == 'best_users_mean_popularity_mean':
            min_value = 'best_users_mean_popularity_min'
            max_value = 'best_users_mean_popularity_max'
        elif metric == 'best_users_median_popularity_mean':
            min_value = 'best_users_median_popularity_min'
            max_value = 'best_users_median_popularity_max'
        elif metric == 'worst_users_mean_popularity_mean':
            min_value = 'worst_users_mean_popularity_min'
            max_value = 'worst_users_mean_popularity_max'
        elif metric == 'worst_users_median_popularity_mean':
            min_value = 'worst_users_median_popularity_min'
            max_value = 'worst_users_median_popularity_max'
        elif metric in ['best_users_node_degree_mean', 'best_users_node_degree_median']:
            min_value = 'best_users_node_degree_min'
            max_value = 'best_users_node_degree_max'
        elif metric in ['worst_users_node_degree_mean', 'worst_users_node_degree_median']:
            min_value = 'worst_users_node_degree_min'
            max_value = 'worst_users_node_degree_max'

        # Ensure the min/max columns are numeric
        df[min_value] = pd.to_numeric(df[min_value], errors='coerce')
        df[max_value] = pd.to_numeric(df[max_value], errors='coerce')


        # Plot the selected metric
        sns.lineplot(x='dataset', 
                     y=metric, 
                     data=df, 
                     marker='o', 
                     markersize=4, 
                     errorbar=lambda x: (x.min(), x.max()),
                     label=metric.replace('_', ' ').title())


    # Customize plot
    plt.xlabel('Dataset #')
    plt.ylabel("")

    # Resize and position legend
    plt.legend(
        title='Metrics',
        title_fontsize=16,           # Font size for the legend title
        fontsize=14,                 # Font size for the legend labels (text)
        loc='upper right',          # Align legend at the top-right relative to bbox
        borderaxespad=0.5,         # Padding between legend and axes
        markerscale=1.5,           # Scale marker size in legend
        labelspacing=0.5,          # Space between legend entries
        handlelength=2.0           # Length of legend line handles
    )

    # Adjust x-axis ticks based on the dataset range if necessary
    if dataset_range and dataset_range[1] >= 176:
        xticks = [1] + list(range(15, 176, 10)) + [176]
        plt.xticks(xticks)  # Adjust step size (e.g., 10) for better label visibility

    # Display grid
    plt.grid(True)

    # Display the plot
    plt.show()


In [None]:
plot_user_characteristics_per_dataset(merged_df, metrics=['best_users_mean_popularity_mean', 'worst_users_mean_popularity_mean'], dataset_range=[1,176])

In [None]:
def plot_regression(df, models, dataset_range=None, model_names=None, metric='best_users_mean_popularity_mean', target_metric='ndgc@10', transformation='log'):
    # Ensure the metric exists in the dataframe
    if metric not in df.columns:
        raise ValueError(f"Metric '{metric}' not found in dataframe columns.")
    
    # Extract metric values and the target metric values
    metric_values = df[metric].dropna()  # Avoid NaNs in the metric
    target_values = df[target_metric].dropna()  # Avoid NaNs in the target metric
    
    if len(metric_values) == 0 or len(target_values) == 0:
        raise ValueError(f"No valid data found for the regression between '{metric}' and '{target_metric}'.")

    # Filter the dataframe by the dataset range if provided
    if dataset_range:
        lower_bound, upper_bound = dataset_range
        df = df[(df['dataset'] >= lower_bound) & (df['dataset'] <= upper_bound)]

    # Set model names if not provided
    if model_names is None:
        model_names = models

    # Ensure models are provided and filter the dataframe accordingly
    if models:
        df = df[df['Model'].isin(models)]  # Filter by the models in the 'models' list
        df['Model'] = pd.Categorical(df['Model'], categories=model_names, ordered=True)  # Set model categories

    # Prepare a plot for each model
    plt.figure(figsize=(12, 6))

    for model_name in model_names:
        model_df = df[df['Model'] == model_name]  # Filter for the current model

        # Extract metric values and target values for the current model
        model_metric_values = pd.to_numeric(model_df[metric].dropna())
        model_target_values = pd.to_numeric(model_df[target_metric].dropna())

        if len(model_metric_values) == 0 or len(model_target_values) == 0:
            continue  # Skip this model if no valid data points are found

        # Apply data transformation if needed
        if transformation == 'log':
            # Apply log transformation (ensure no zero or negative values)
            model_metric_values = np.log1p(model_metric_values)
        elif transformation == 'sqrt':
            # Apply square root transformation (ensure no negative values)
            model_metric_values = np.sqrt(model_metric_values)
        
        # Ensure the metric and target have matching valid data points
        model_combined_df = pd.DataFrame({metric: model_metric_values, target_metric: model_target_values})
        model_combined_df = model_combined_df.dropna()

        # Perform linear regression for the current model
        X = model_combined_df[[metric]].values.reshape(-1, 1)  # Features (metric)
        y = model_combined_df[target_metric].values  # Target (e.g., ndgc@10)

        # Perform linear regression
        model_reg = LinearRegression()
        model_reg.fit(X, y)

        # Predictions from the model
        y_pred = model_reg.predict(X)

        # Scatter plot of the data for this model
        sns.scatterplot(x=model_combined_df[metric], y=model_combined_df[target_metric], label=f'{model_name} Data', s=100)

        # Plot the regression line for this model
        plt.plot(model_combined_df[metric], y_pred, label=f'{model_name} Regression Line', linewidth=2)

    # Customize the plot
    plt.xlabel(f'{metric}')
    plt.ylabel(f'{target_metric}')
    plt.legend()

    # Display grid
    plt.grid(True)

    # Show the plot
    plt.show()

    return model_reg.coef_, model_reg.intercept_


In [None]:
plot_regression(example, models=['LightGCN'], dataset_range=[1,176], metric='best_users_mean_popularity_mean', target_metric='ndcg@10', transformation='sqrt')


In [None]:
def plot_correlation_matrix(df, models, dataset_range=None, model_names=None, metrics=['best_users_mean_popularity_mean'], target_metrics=['ndgc@10'], transformation='log'):
    # Ensure the metric exists in the dataframe
    if target_metrics is None:
        target_metrics = [col for col in df.columns if col.endswith('@10')]
    
    # Extract metric values and the target metric values
    metric_values = df[metrics].dropna()  # Avoid NaNs in the metric
    target_values = df[target_metrics].dropna()  # Avoid NaNs in the target metric
    
    # Filter the dataframe by the dataset range if provided
    if dataset_range:
        lower_bound, upper_bound = dataset_range
        df = df[(df['dataset'] >= lower_bound) & (df['dataset'] <= upper_bound)]

    # Set model names if not provided
    if model_names is None:
        model_names = models

    # Ensure models are provided and filter the dataframe accordingly
    if models:
        df = df[df['Model'].isin(models)]  # Filter by the models in the 'models' list
        df['Model'] = pd.Categorical(df['Model'], categories=model_names, ordered=True)  # Set model categories

    # Set global font size and padding
    plt.rc('font', size=12)            
    plt.rc('axes', titlesize=30, labelsize=30)
    plt.rc('axes', labelpad=10) 
    plt.rc('xtick', labelsize=12)
    plt.rc('ytick', labelsize=12) 

    # Define custom colors
    tu_dd_blue = (0 / 255, 48 / 255, 93 / 255)
    bu_green1 = (138 / 255, 203 / 255, 193 / 255)
    bu_green2 = (0 / 255, 172 / 255, 169 / 255)
    bu_green3 = (0 / 255, 131 / 255, 141 / 255)
    ing_blue1 = (132 / 255, 207 / 255, 237 / 255)
    ing_blue2 = (0 / 255, 161 / 255, 217 / 255)
    ing_blue3 = (0 / 255, 119 / 255, 174 / 255)
    ing_blue4 = (0 / 255, 105 / 255, 180 / 255)

    colors = [bu_green1, ing_blue4, tu_dd_blue]
    cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)

    # Prepare a plot for each model
    plt.figure(figsize=(12, 6))

    for model_name in model_names:
        model_df = df[df['Model'] == model_name].reset_index()  # Filter for the current model

        # Extract metric values and target values for the current model
        model_metric_values = model_df[metrics].apply(pd.to_numeric)
        model_target_values = model_df[target_metrics].apply(pd.to_numeric)

        if len(model_metric_values) == 0 or len(model_target_values) == 0:
            continue  # Skip this model if no valid data points are found

        # Apply data transformation if needed
        if transformation == 'log':
            # Apply log transformation (ensure no zero or negative values)
            model_metric_values = model_metric_values.apply(np.log1p)
        elif transformation == 'sqrt':
            # Apply square root transformation (ensure no negative values)
            model_metric_values =  model_metric_values.apply(np.sqrt)

        correlation_data = pd.concat([model_metric_values, model_target_values], axis=1)
    
        # Compute the correlation matrix
        correlation_matrix = correlation_data.corr()
    
        # Plot the heatmap
        plt.figure(figsize=(12, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap=cmap, center=0, fmt='.3f', linewidths=0.5)

    # Customize the plot
    plt.tight_layout()
    
    # Show the plot
    plt.show()

In [None]:
plot_correlation_matrix(example, models=['LightGCN'], dataset_range=[1,176], metrics=['best_users_mean_popularity_mean'], target_metrics=None,  transformation='log')

#### For Debugging

In [None]:
config = Config(model="BPR", dataset="real-life-atomic-100000-1", config_file_list=["config_files/datasets.yaml"])
dataset = create_dataset(config)
original_dict = dataset.field2token_id['user_id']
flipped_dict = {v: k for k, v in dataset.field2token_id['user_id'].items()}

#print(original_dict)
print(flipped_dict[937])

In [None]:
FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{1}/real-life-atomic-100000-{1}.inter"
db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")

# Filter users with less than 20 interactions
db['interaction_count'] = db.groupby("userID:token")["itemID:token"].transform('count')
filtered_df = db[db['interaction_count'] >= 20].copy()
all_users_interactions_dict = filtered_df.groupby("userID:token")['interaction_count'].first()

print(all_users_interactions_dict[896])

#### Statistical Significance

In [None]:
def significance_test(df, characteristics, models, metrics, apply_log = True):
    # Center the data (subtract the mean of each feature)
    data = df[df['Model'].isin(models)].reset_index(drop=True).copy()
    sanitized_columns = {col: col.split('@')[0] for col in df.columns}
    data = data.rename(columns=sanitized_columns)
    
    data[characteristics] = data[characteristics].apply(pd.to_numeric)
    data[metrics] = data[metrics].apply(pd.to_numeric)

    if apply_log:
        log_characteristics = [characteristic for characteristic in characteristics if not characteristic.startswith('degree')]
        data[log_characteristics] = data[log_characteristics].apply(np.log10)

    data[characteristics] = data[characteristics].apply(lambda x: x - x.mean())


    # Iterate over the metrics to create the new columns
    df_new = pd.DataFrame()
    
    for model in models:
        for metric in metrics:
            df_new[f"{model}_{metric}"] = data[data['Model']==model][metric].reset_index(drop=True)
        for characteristic in characteristics:
            df_new[characteristic] = data[data['Model']==model][characteristic].reset_index(drop=True)

    # Randomly split data into training and test sets (90% training, 10% test)
    #msk = np.random.rand(len(df_new)) < 0.9
    #test = df_new[~msk]
    train = df_new

    # List to store the regression results
    models_results = []

    # Iterate over each metric
    for metric in metrics:
        # Iterate over each model
        for idx, model in enumerate(models):
            X = train[characteristics]
            y = train[model + '_' + metric]  # Use the newly created column
            
            # Define the formula for OLS regression
            formula_str_ml = y.name + ' ~ ' + '+'.join(characteristics)
            
            # Perform OLS regression using statsmodels
            model_ml = sm.ols(formula=formula_str_ml, data=train[characteristics+[model + '_' + metric]])
            fitted_ml = model_ml.fit(cov_type='HC1')  # Robust standard errors (HC1)

            print(fitted_ml.summary())
            print("Parameters: ", fitted_ml.params)
            print("R2: ", fitted_ml.rsquared)
            
            # Append the results to models_results
            models_results.append({
                'model': model,
                'metric': metric,
                'score': fitted_ml.rsquared,
                'adjusted_score': fitted_ml.rsquared_adj,
                **fitted_ml.params.to_dict(),  # Coefficients
                **fitted_ml.pvalues.rename(lambda x: 'p_' + x).to_dict()  # p-values
            })

            pred_ols = fitted_ml.get_prediction()
            iv_l = pred_ols.summary_frame()["obs_ci_lower"]
            iv_u = pred_ols.summary_frame()["obs_ci_upper"]

            assert len(iv_l) == len(X), f"Length of 'iv_l' ({len(iv_l)}) does not match length of X ({len(X)})"

            if X.shape[1] == 1:
                fig, ax = plt.subplots(figsize=(8, 6))
                ax.plot(X, y, "o", label="data")
                ax.plot(X, fitted_ml.fittedvalues, "r--.", label="OLS")
                ax.plot(X, iv_u, "r--")
                ax.plot(X, iv_l, "r--")
                ax.legend(loc="best")
            
        # Convert the results to a DataFrame
        df_results = pd.DataFrame.from_dict(models_results)
    
        df_results.to_csv(f"../eval/log/dataset/significance_test_{metric}.csv", sep='\t', index=None)
    
    # Regression plots: https://www.statsmodels.org/dev/examples/notebooks/generated/regression_plots.html


df = pd.read_csv('myFile', delim_whitespace = True, header = None)
df.columns = ['column1', 'column2']
y, X = ps.dmatrices('column1 ~ column2',data = df, return_type = 'dataframe')
model = sm.OLS(y,X)
results = model.fit()
predictedValues = results.predict()
#print predictedValues
yData = df.as_matrix(columns = ['column1'])
res = yData - predictedValues

In [None]:
#merged_df.keys()[:50]

In [None]:
# Example usage:
# Define your characteristics, models, and metrics
characteristics = ['best_users_mean_popularity_mean', 'degree_assort_user', 'average_degree', 'gini_user']
# same characteristics as in https://dl.acm.org/doi/pdf/10.1145/3640457.3688070
characteristics = ['space_size', 'shape', 'density', 'gini_user', 'gini_item', 'average_degree_user', 'average_degree_item', 'average_clustering_coef_dot_user', 'average_clustering_coef_dot_item', 'degree_assort_user', 'degree_assort_item']
characteristics = ['space_size', 'shape', 'density', 'gini_user', 'gini_item', 'average_degree_user', 'average_degree_item', 'average_clustering_coef_dot_user', 'average_clustering_coef_dot_item', 'degree_assort_item']

models = ['LightGCN']
metrics = ['ndcg']

# Assuming 'df' is your DataFrame
significance_test(merged_df, characteristics, models, metrics, apply_log=True)