In [25]:
import os
import sys
from pathlib import Path

# setting proper working directory
PROJECT_DIRECTORY = Path(os.path.abspath('')).resolve().parents[0]
sys.path.extend([str(PROJECT_DIRECTORY)])

print(f'Python {sys.version} on {sys.platform}')
print('Project directory: ', PROJECT_DIRECTORY)

Python 3.12.5 (tags/v3.12.5:ff3bc82, Aug  6 2024, 20:45:27) [MSC v.1940 64 bit (AMD64)] on win32
Project directory:  C:\Users\s8347434\Documents\RecBole-GNN


In [50]:
import pandas as pd
import numpy as np
import argparse
import ast
import json
import matplotlib.pyplot as plt
from itertools import product
import seaborn as sns
from recbole_gnn.config import Config
from recbole_gnn.utils import create_dataset
from recbole_gnn.data.dataset_metrics import GraphDatasetEvaluator
from sklearn.linear_model import LinearRegression
from matplotlib.colors import LinearSegmentedColormap
import statsmodels.formula.api as sm
from tqdm import tqdm
from scipy.stats import gaussian_kde

#### Read Evaluation Data
- model evaluation from test runs
- data set evaluation (topological characteristics)

In [27]:
dataset_split_characteristics_df = pd.read_csv("../eval/log/Dataset/dataset_eval.csv", sep="\t")
print(dataset_split_characteristics_df.shape)

(177, 37)


In [28]:
dataset_split_characteristics_df['density'] = 1 - dataset_split_characteristics_df['sparsity']

In [29]:
EVAL_RUN = "RO"
#als_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/ALS-Benchmark-{EVAL_RUN}.csv", sep="\t")
asym_user_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/AsymKNNUser-Benchmark-{EVAL_RUN}.csv", sep="\t")
asym_item_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/AsymKNNItem-Benchmark-{EVAL_RUN}.csv", sep="\t")
#bpr_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/BPR-Benchmark-{EVAL_RUN}.csv", sep="\t")
ngcf_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/NGCF-Benchmark-{EVAL_RUN}.csv", sep="\t")
lightgcn_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/LightGCN-Benchmark-{EVAL_RUN}.csv", sep="\t")
sgl_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/SGL-Benchmark-{EVAL_RUN}.csv", sep="\t")
xsimgcl_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/XSimGCL-Benchmark-{EVAL_RUN}.csv", sep="\t")
pop_df = pd.read_csv(f"log/Benchmark/{EVAL_RUN}/Pop-Benchmark-{EVAL_RUN}.csv", sep="\t")

In [30]:
model_evaluation_df = pd.concat([pop_df, asym_user_df, asym_item_df, ngcf_df, lightgcn_df, sgl_df, xsimgcl_df], ignore_index=True)
model_evaluation_df['dataset'] = model_evaluation_df['dataset'].str.extract(r'-(\d+)$').astype(int)
model_evaluation_df["eval_type"] = [EVAL_RUN] * len(model_evaluation_df)
print(model_evaluation_df.shape)

(1240, 20)


In [31]:
# Function to process columns
def process_user_columns(df):
    for col in df.columns:
        if col.startswith('best_user_') or col.startswith('worst_user_') or col.startswith('clustering_coefficients'):
            # Apply transformation for each row in the selected columns
            df[col] = df[col].apply(ast.literal_eval)            
    return df

In [32]:
model_evaluation_df = process_user_columns(model_evaluation_df)

In [33]:
evaluation_dataset_characteristics_df = pd.merge(model_evaluation_df, dataset_split_characteristics_df, on='dataset', how='left')
print(evaluation_dataset_characteristics_df.shape)

(1240, 57)


#### Create necessary Data for Evaluation
- classical characteristics per best / worst user (popularity, interactions)
- topological characteristics per best / worst user
- match userID with global userID

In [34]:
def get_users_topological_chars(data, file_path, num_datasets):    
    
    dataset_eval_list = []
    for i in tqdm(range(num_datasets), total=num_datasets, unit='datasets'):        
        # mapping recbole ID -> local ID (as recbole resets the index after filtering the datasets)
        config = Config(model="BPR", dataset=f"real-life-atomic-100000-{i+1}", config_file_list=["config_files/datasets.yaml"])
        dataset = create_dataset(config)

        # Extract unique user IDs from best_user_ and worst_user_ columns
        best_user_ids = set()  # Use a set to ensure uniqueness
        worst_user_ids = set()  # Use a set to ensure uniqueness
        for index, row in data[data['dataset']==i+1].iterrows():
            for col in data.columns:
                if col.startswith('best_user_'):
                    # Extract user IDs from the list of dictionaries
                    for entry in row[col]:  # Assuming each entry is a list of dictionaries
                        best_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
                if col.startswith('worst_user_'):
                    # Extract user IDs from the list of dictionaries
                    for entry in row[col]:  # Assuming each entry is a list of dictionaries
                        worst_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set

        # calculate dataset metrics
        dataset_evaluator = GraphDatasetEvaluator(config, dataset)
        # some metrics need a connected graph and thus drops nodes which are in smallest partition
        dataset_eval_dict = {"dataset": i+1}
        dataset_eval_dict.update(dataset_evaluator.evaluate_best_worst_users(best_user_ids, worst_user_ids))
        dataset_eval_list.append(dataset_eval_dict)

        df = pd.DataFrame(dataset_eval_list)
        df.to_csv(file_path, sep='\t', index=False)

    return df

In [35]:
# HINT: takes approx. 5h
file_path = "log/Dataset/user_topological_characteristics.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's topology characteristics for each dataset..")
    user_topologies_df = get_users_topological_chars(evaluation_dataset_characteristics_df, file_path, num_datasets=177)
else:
    print(f"File exists! Load file from {file_path}")
    user_topologies_df = pd.read_csv(file_path, sep='\t')
    user_topologies_df = process_user_columns(user_topologies_df)

File exists! Load file from log/Dataset/user_topological_characteristics.csv


In [36]:
user_topologies_df

Unnamed: 0,dataset,degree_assort_best_users,degree_assort_worst_users,average_clustering_coef_dot_best_users,average_clustering_coef_dot_worst_users
0,1,-0.093766,0.083721,0.046294,0.023263
1,2,-0.079205,0.104290,0.057149,0.025817
2,3,-0.036276,0.132727,0.057840,0.029270
3,4,-0.057325,0.018219,0.042298,0.023618
4,5,-0.103245,-0.025815,0.046922,0.023108
...,...,...,...,...,...
172,173,-0.109842,0.126600,0.052375,0.023652
173,174,-0.092176,0.133401,0.048152,0.026208
174,175,-0.065099,-0.083633,0.041568,0.019093
175,176,-0.091919,0.061838,0.057624,0.022613


### Merge all together

In [37]:
evaluation_dataset_characteristics_user_topologies_df = pd.merge(evaluation_dataset_characteristics_df, user_topologies_df, on='dataset', how='left')  

In [38]:
print(evaluation_dataset_characteristics_user_topologies_df.shape)

(1240, 61)


#### Translate the userIDs into the userIDs out of the original dataset
- global ID: the userID which holds through all splits
- local ID: the userID which is only valid within one split
- recbole ID: the userID which is assigned after the filtering

In [41]:
def translate_userids(df, num_rows):      
    # Extract columns ending with @[10]
    data = df.copy()
    columns_to_process = [col for col in data.columns if col.endswith("@[10]")]
    
    for index, row in tqdm(data.iloc[:num_rows].iterrows(), total=num_rows, unit='rows'):
        # configurations initialization
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{row['dataset']}/real-life-atomic-100000-{row['dataset']}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # mapping recbole ID -> local ID (as recbole resets the index after filtering the datasets)
        config = Config(model="BPR", dataset=f"real-life-atomic-100000-{row['dataset']}", config_file_list=["config_files/datasets.yaml"])
        dataset = create_dataset(config)
        flipped_dict = {v: k for k, v in dataset.field2token_id['user_id'].items()}

        # mapping local ID -> global ID
        translation_dict = dict(zip(db["user_id:token"], db["userID:token"]))

        # Process each column
        for col in columns_to_process:
            # Parse the string entries and extract the user IDs
            #print(row['Model'], row['dataset'], col)
            new_entry = {str(translation_dict[int(flipped_dict[int(list(entry.keys())[0])])]) : float(list(entry.values())[0]) for entry in row[col]}
            data.at[index, col] = new_entry
            
    return data

In [42]:
# HINT: takes approx. 20-30min.
file_path = "log/Benchmark/Overall-Benchmark-RO.csv"
file_path = "log/Benchmark/Overall-Benchmark-RO-subset.csv"
if not os.path.isfile(file_path):
    print("File does not exist, translate userIDs for each dataset..")
    translated_ids_df = translate_userids(evaluation_dataset_characteristics_user_topologies_df, num_rows=evaluation_dataset_characteristics_user_topologies_df.shape[0])
    translated_ids_df.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    translated_ids_df = pd.read_csv(file_path, sep='\t')
    translated_ids_df = translated_ids_df[translated_ids_df['dataset'] != 177]
    translated_ids_df = translated_ids_df.fillna({})
    translated_ids_df = process_user_columns(translated_ids_df)

File does not exist, translate userIDs for each dataset..


100%|██████████| 1240/1240 [40:42<00:00,  1.97s/rows]


In [46]:
# Save DataFrame to a Parquet file
file_path = "../eval/log/Benchmark/translated_df.parquet"

translated_ids_df.to_parquet(file_path, engine="pyarrow", index=False)

In [47]:
loaded_df = pd.read_parquet(file_path, engine="pyarrow")

In [56]:
example = translated_ids_df.copy()
example['best_user_precision@[10]'] = example['best_user_precision@[10]'].apply(json.dumps)

In [57]:
file_path = "../eval/log/Benchmark/example_json.parquet"
example.to_parquet(file_path, engine="pyarrow", index=False)
example = pd.read_parquet(file_path, engine="pyarrow")
example['best_user_precision@[10]'] = example['best_user_precision@[10]'].apply(json.loads)

In [58]:
example

Unnamed: 0,Model,dataset,precision@10,hit@10,mrr@10,ndcg@10,map@10,itemcoverage@10,averagepopularity@10,tailpercentage@10,...,average_clustering_coef_min_item,average_clustering_coef_max,average_clustering_coef_max_user,average_clustering_coef_max_item,density,degree_assort_best_users,degree_assort_worst_users,average_clustering_coef_dot_best_users,average_clustering_coef_dot_worst_users,eval_type
0,Pop,1,0.0426,0.2676,0.1078,0.0513,0.0217,0.0020,120.4951,0.0000,...,0.673293,0.267778,0.042524,0.279763,0.004025,-0.093766,0.083721,0.046294,0.023263,RO
1,Pop,2,0.0313,0.2020,0.0806,0.0382,0.0162,0.0018,93.7833,0.0000,...,0.745253,0.347611,0.052145,0.362474,0.003342,-0.079205,0.104290,0.057149,0.025817,RO
2,Pop,3,0.0473,0.2945,0.1274,0.0636,0.0301,0.0029,150.6464,0.0000,...,0.700724,0.272593,0.058749,0.289451,0.003679,-0.036276,0.132727,0.057840,0.029270,RO
3,Pop,4,0.0528,0.3015,0.1404,0.0690,0.0334,0.0023,146.2831,0.0000,...,0.688358,0.289853,0.043892,0.303190,0.003699,-0.057325,0.018219,0.042298,0.023618,RO
4,Pop,5,0.0343,0.2308,0.0805,0.0407,0.0170,0.0021,100.1591,0.0000,...,0.686097,0.280193,0.045598,0.292586,0.003955,-0.103245,-0.025815,0.046922,0.023108,RO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,XSimGCL,173,0.1576,0.6140,0.3319,0.2030,0.1251,0.1073,57.3223,0.0040,...,0.725371,0.318710,0.050125,0.335126,0.003670,-0.109842,0.126600,0.052375,0.023652,RO
1236,XSimGCL,174,0.1567,0.6431,0.3480,0.2039,0.1208,0.1160,67.9777,0.0052,...,0.742498,0.300826,0.051739,0.319921,0.003596,-0.092176,0.133401,0.048152,0.026208,RO
1237,XSimGCL,175,0.1404,0.5764,0.3079,0.1796,0.1099,0.1103,43.4651,0.0090,...,0.711805,0.286854,0.038564,0.300418,0.003866,-0.065099,-0.083633,0.041568,0.019093,RO
1238,XSimGCL,176,0.1472,0.5805,0.3085,0.1867,0.1152,0.1085,58.8952,0.0095,...,0.742387,0.321873,0.050143,0.337916,0.003431,-0.091919,0.061838,0.057624,0.022613,RO


In [55]:
translated_ids_df

Unnamed: 0,Model,dataset,precision@10,hit@10,mrr@10,ndcg@10,map@10,itemcoverage@10,averagepopularity@10,tailpercentage@10,...,average_clustering_coef_min_item,average_clustering_coef_max,average_clustering_coef_max_user,average_clustering_coef_max_item,density,degree_assort_best_users,degree_assort_worst_users,average_clustering_coef_dot_best_users,average_clustering_coef_dot_worst_users,eval_type
0,Pop,1,0.0426,0.2676,0.1078,0.0513,0.0217,0.0020,120.4951,0.0000,...,0.673293,0.267778,0.042524,0.279763,0.004025,-0.093766,0.083721,0.046294,0.023263,RO
1,Pop,2,0.0313,0.2020,0.0806,0.0382,0.0162,0.0018,93.7833,0.0000,...,0.745253,0.347611,0.052145,0.362474,0.003342,-0.079205,0.104290,0.057149,0.025817,RO
2,Pop,3,0.0473,0.2945,0.1274,0.0636,0.0301,0.0029,150.6464,0.0000,...,0.700724,0.272593,0.058749,0.289451,0.003679,-0.036276,0.132727,0.057840,0.029270,RO
3,Pop,4,0.0528,0.3015,0.1404,0.0690,0.0334,0.0023,146.2831,0.0000,...,0.688358,0.289853,0.043892,0.303190,0.003699,-0.057325,0.018219,0.042298,0.023618,RO
4,Pop,5,0.0343,0.2308,0.0805,0.0407,0.0170,0.0021,100.1591,0.0000,...,0.686097,0.280193,0.045598,0.292586,0.003955,-0.103245,-0.025815,0.046922,0.023108,RO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,XSimGCL,173,0.1576,0.6140,0.3319,0.2030,0.1251,0.1073,57.3223,0.0040,...,0.725371,0.318710,0.050125,0.335126,0.003670,-0.109842,0.126600,0.052375,0.023652,RO
1236,XSimGCL,174,0.1567,0.6431,0.3480,0.2039,0.1208,0.1160,67.9777,0.0052,...,0.742498,0.300826,0.051739,0.319921,0.003596,-0.092176,0.133401,0.048152,0.026208,RO
1237,XSimGCL,175,0.1404,0.5764,0.3079,0.1796,0.1099,0.1103,43.4651,0.0090,...,0.711805,0.286854,0.038564,0.300418,0.003866,-0.065099,-0.083633,0.041568,0.019093,RO
1238,XSimGCL,176,0.1472,0.5805,0.3085,0.1867,0.1152,0.1085,58.8952,0.0095,...,0.742387,0.321873,0.050143,0.337916,0.003431,-0.091919,0.061838,0.057624,0.022613,RO


In [72]:
def reformat_list_to_dicts(data, num_rows):
    
    df = data.copy()
    
    columns_to_process = [col for col in df.columns if col.endswith("@[10]")]
    
    for index, row in tqdm(df.iloc[:num_rows].iterrows(), total=num_rows, unit='rows'):
        # Process each column
        for col in columns_to_process:
            # Parse the string entries and extract the user IDs
            #print(row['Model'], row['dataset'], col)
            print([entry for entry in row[col].values()])
            new_entry = {str(list(entry.keys())[0]) : float(list(entry.values())[0]) for entry in row[col]}
            
            df.at[index, col] = new_entry
            
    return df

In [73]:
df = reformat_list_to_dicts(translated_ids_df, num_rows=len(translated_ids_df))

  0%|          | 0/1594 [00:00<?, ?rows/s]

[0.8, 0.5, 0.5, 0.5, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4]





AttributeError: 'int' object has no attribute 'keys'

In [57]:
# Save DataFrame to a Parquet file
file_path = "../eval/log/Benchmark/translated_df.parquet"

df.to_parquet(file_path, engine="pyarrow", index=False)

ArrowTypeError: ("Expected dict key of type str or bytes, got 'int'", 'Conversion failed for column best_user_precision@[10] with type object')

#### Calculate the popularity of each user's interactions

In [88]:
def get_all_users_popularity(num_datasets):    
    
    # Initialize DataFrame with pre-defined size and columns
    df = pd.DataFrame({'dataset': range(1, num_datasets + 1)})        
    df['user_popularity'] = [{} for _ in range(num_datasets)]     
    
    # Loop through each dataset and compute user popularity
    for i in tqdm(range(num_datasets)):
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{i+1}/real-life-atomic-100000-{i+1}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # Filter users with less than 20 interactions
        db['interaction_count'] = db.groupby("userID:token")["itemID:token"].transform('count')
        filtered_df = db[db['interaction_count'] >= 20].copy()
        
        # Calculate item popularity
        item_popularity = filtered_df.groupby("itemID:token")["userID:token"].nunique()
        
        # Map item popularity back to filtered_df safely
        filtered_df.loc[:, 'item_popularity'] = filtered_df["itemID:token"].map(item_popularity)
        
        # Compute user average popularity
        user_avg_popularity = filtered_df.groupby("userID:token")['item_popularity'].agg(['mean', 'median'])
        
        # Compute user popularity dictionary
        all_users_popularity_dict = {
            user_id: (row['mean'], row['median']) 
            for user_id, row in user_avg_popularity.iterrows()
        }
        
        # Assign the dictionary to the DataFrame
        df.at[i,'user_popularity'] = all_users_popularity_dict
    
    return df

In [89]:
# HINT: takes approx. 
file_path = "../asset/data/real-life-atomic-splits/user_popularity.csv"
if not os.path.isfile(file_path):
    print("File does not exist, calculate all user's popularity for each dataset..")
    popularity_dict = get_all_users_popularity(num_datasets=177)
    popularity_dict.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    # Load the existing DataFrame
    popularity_dict = pd.read_csv(file_path, sep='\t') 
    # Convert the user_popularity column back to dictionaries
    if 'user_popularity' in popularity_dict.columns:
        popularity_dict['user_popularity'] = popularity_dict['user_popularity'].apply(ast.literal_eval) 

File exists! Load file from ../asset/data/real-life-atomic-splits/user_popularity.csv


In [90]:
print(popularity_dict.head())

   dataset                                    user_popularity
0        1  {1: (9.698447893569845, 4.0), 2: (3.44, 2.0), ...
1        2  {1: (21.978021978021978, 3.0), 2: (23.41304347...
2        3  {1: (16.871382636655948, 3.0), 2: (16.64761904...
3        4  {1: (7.957446808510638, 4.0), 5: (5.28125, 4.0...
4        5  {1: (34.275, 4.0), 4: (11.0, 3.0), 5: (22.3975...


In [91]:
def get_user_popularity(data, all_users_popularity_dict, num_rows):
    
    df = data.copy()
    
    # Initialize new columns with empty lists
    df['best_users_mean_popularity_dict'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_mean'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_max'] = [{} for _ in range(len(df))]
    df['best_users_mean_popularity_min'] = [{} for _ in range(len(df))]    
    df['best_users_median_popularity_dict'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['best_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['best_users_node_degree'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_median'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_min'] = [{} for _ in range(len(df))]
    df['best_users_node_degree_max'] = [{} for _ in range(len(df))]

    df['worst_users_mean_popularity_dict'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_mean'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_max'] = [{} for _ in range(len(df))]
    df['worst_users_mean_popularity_min'] = [{} for _ in range(len(df))]    
    df['worst_users_median_popularity_dict'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['worst_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_median'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_min'] = [{} for _ in range(len(df))]
    df['worst_users_node_degree_max'] = [{} for _ in range(len(df))]

    df['all_users_median_popularity_mean'] = [{} for _ in range(len(df))]
    df['all_users_median_popularity_max'] = [{} for _ in range(len(df))]
    df['all_users_median_popularity_min'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_mean'] = [{} for _ in range(len(df))]
    df['all_users_median_node_degree_median'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_max'] = [{} for _ in range(len(df))]
    df['all_users_node_degree_min'] = [{} for _ in range(len(df))]    
     
    # Loop through each row
    for index, row in tqdm(df.iloc[:num_rows].iterrows(), total=num_rows, unit='rows'):

        # configurations initialization
        FILENAME = PROJECT_DIRECTORY / f"asset/data/real-life-atomic-splits/real-life-atomic-100000-{row['dataset']}/real-life-atomic-100000-{row['dataset']}.inter"
        db = pd.read_csv(FILENAME, sep="\t", encoding="utf-8")
        
        # Filter users with less than 20 node_degree
        db['interaction_count'] = db.groupby("userID:token")["itemID:token"].transform('count')
        filtered_df = db[db['interaction_count'] >= 20].copy()
        all_users_node_degree_dict = filtered_df.groupby("userID:token")['interaction_count'].first()
                        
        # Extract unique user IDs from best_user_ and worst_user_ columns
        best_user_ids = set()  # Use a set to ensure uniqueness
        worst_user_ids = set()  # Use a set to ensure uniqueness
        for col in df.columns:
            if col.startswith('best_user_'):
                # Extract user IDs from the list of dictionaries
                for entry in row[col]:  # Assuming each entry is a list of dictionaries
                    best_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
            if col.startswith('worst_user_'):
                # Extract user IDs from the list of dictionaries
                for entry in row[col]:  # Assuming each entry is a list of dictionaries
                    worst_user_ids.add(int(list(entry.keys())[0]))  # Add user ID (the key) to the set
                                         
        # Prepare best users
        best_users_mean_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][0] for user_id in best_user_ids}
        best_users_median_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][1] for user_id in best_user_ids}
        best_users_node_degree_dict = {user_id: all_users_node_degree_dict[user_id] for user_id in best_user_ids}     
        best_users_mean_values_list = list(best_users_mean_popularity_dict.values())
        best_users_median_values_list = list(best_users_median_popularity_dict.values())
        best_users_node_degree_list = list(best_users_node_degree_dict.values())

        worst_users_mean_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][0] for user_id in worst_user_ids}
        worst_users_median_popularity_dict = {user_id: all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'][user_id][1] for user_id in worst_user_ids}
        worst_users_node_degree_dict = {user_id: all_users_node_degree_dict[user_id] for user_id in worst_user_ids}     
        worst_users_mean_values_list = list(worst_users_mean_popularity_dict.values())
        worst_users_median_values_list = list(worst_users_median_popularity_dict.values())
        worst_users_node_degree_list = list(worst_users_node_degree_dict.values())
        
        all_users_mean_values_list = [entry[0] for entry in list(all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'].values())]
        all_users_median_values_list = [entry[1] for entry in list(all_users_popularity_dict.loc[row['dataset']-1,'user_popularity'].values())]

        # Assign
        df.at[index, 'best_users_mean_popularity_dict'] = best_users_mean_popularity_dict
        df.at[index, 'best_users_mean_popularity_mean'] = np.mean(best_users_mean_values_list)
        df.at[index, 'best_users_mean_popularity_max'] = np.max(best_users_mean_values_list)
        df.at[index, 'best_users_mean_popularity_min'] = np.min(best_users_mean_values_list)
        df.at[index, 'best_users_median_popularity_dict'] = best_users_median_popularity_dict
        df.at[index, 'best_users_median_popularity_mean'] = np.mean(best_users_median_values_list)
        df.at[index, 'best_users_median_popularity_max'] = np.max(best_users_median_values_list)
        df.at[index, 'best_users_median_popularity_min'] = np.min(best_users_median_values_list)
        df.at[index, 'best_users_node_degree'] = best_users_node_degree_dict
        df.at[index, 'best_users_node_degree_mean'] = np.mean(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_median'] = np.median(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_min'] = np.min(best_users_node_degree_list)
        df.at[index, 'best_users_node_degree_max'] = np.max(best_users_node_degree_list)

        df.at[index, 'worst_users_mean_popularity_dict'] = worst_users_mean_popularity_dict
        df.at[index, 'worst_users_mean_popularity_mean'] = np.mean(worst_users_mean_values_list)
        df.at[index, 'worst_users_mean_popularity_max'] = np.max(worst_users_mean_values_list)
        df.at[index, 'worst_users_mean_popularity_min'] = np.min(worst_users_mean_values_list)
        df.at[index, 'worst_users_median_popularity_dict'] = worst_users_median_popularity_dict
        df.at[index, 'worst_users_median_popularity_mean'] = np.mean(worst_users_median_values_list)
        df.at[index, 'worst_users_median_popularity_max'] = np.max(worst_users_median_values_list)
        df.at[index, 'worst_users_median_popularity_min'] = np.min(worst_users_median_values_list)
        df.at[index, 'worst_users_node_degree'] = worst_users_node_degree_dict
        df.at[index, 'worst_users_node_degree_mean'] = np.mean(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_median'] = np.median(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_min'] = np.min(worst_users_node_degree_list)
        df.at[index, 'worst_users_node_degree_max'] = np.max(worst_users_node_degree_list)

        df.at[index, 'all_users_mean_popularity_mean'] = np.mean(all_users_mean_values_list)
        df.at[index, 'all_users_mean_popularity_max'] = np.max(all_users_mean_values_list)
        df.at[index, 'all_users_mean_popularity_min'] = np.min(all_users_mean_values_list)
        df.at[index, 'all_users_median_popularity_mean'] = np.mean(all_users_median_values_list)
        df.at[index, 'all_users_median_popularity_max'] = np.max(all_users_median_values_list)
        df.at[index, 'all_users_median_popularity_min'] = np.min(all_users_median_values_list)
        df.at[index, 'all_users_node_degree_mean'] = all_users_node_degree_dict.mean()
        df.at[index, 'all_users_median_node_degree_median'] = all_users_node_degree_dict.median()
        df.at[index, 'all_users_node_degree_max'] = all_users_node_degree_dict.max()
        df.at[index, 'all_users_node_degree_min'] = all_users_node_degree_dict.min()
        
    return df

In [92]:
# NOTE: takes approx. 2-3 min
file_path = "log/Dataset/user_classical_characteristics.csv"
if not os.path.isfile(file_path):
    print("File does not exist, assign all user's popularity to each dataset..")
    classical_user_characteristics_df = get_user_popularity(translated_ids_df, popularity_dict, num_rows = translated_ids_df.shape[0])
    classical_user_characteristics_df.to_csv(file_path, sep='\t', index=False)
else:
    print(f"File exists! Load file from {file_path}")
    classical_user_characteristics_df = pd.read_csv(file_path, sep='\t')
    classical_user_characteristics_df = process_user_columns(classical_user_characteristics_df)

File exists! Load file from log/Dataset/user_classical_characteristics.csv


In [93]:
file_path = "../eval/log/Benchmark/TotalEvaluationData-RO.csv"
final_eval_df = pd.merge(translated_ids_df, classical_user_characteristics_df, on='dataset', how='left')
final_eval_df = final_eval_df[final_eval_df['dataset'] != 177]
final_eval_df.to_csv(file_path, sep='\t', index=False)

In [None]:
file_path = "../eval/log/Benchmark/TotalEvaluationData-RO.csv"
if not os.path.isfile(file_path):
    print("File does not exist, please run all the lines above..")
else:
    print(f"File exists! Load file from {file_path}")
    # Load the existing DataFrame
    final_eval_df = pd.read_csv(file_path, sep='\t') 
    final_eval_df = process_user_columns(final_eval_df)
    
    # Convert the user_popularity column back to dictionaries
    if 'user_popularity' in final_eval_df.columns:
        final_eval_df['user_popularity'] = final_eval_df['user_popularity'].apply(ast.literal_eval) 

In [None]:
# Save DataFrame to a Parquet file
file_path = "../eval/log/Benchmark/data.parquet"

final_eval_df.to_parquet(file_path, engine="pyarrow", index=False)

In [None]:
# Load the Parquet file
final_eval_df = pd.read_parquet(file_path, engine="pyarrow")

print(final_eval_df.dtypes)