# Split and raw data statistics

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# import sys
# sys.path.append('../')

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
data_dir = '../data/statistics/preprocessed'

def process_raw_prepr(data_dir):
    dataframes = []

    for filename in os.listdir(data_dir):
        if filename.endswith('.csv'):  
            df = pd.read_csv(os.path.join(data_dir, filename))
            dataset_name = os.path.splitext(filename)[0]
            df['dataset'] = dataset_name
            df = df[[df.columns[-1]] + df.columns[:-1].tolist()]
            dataframes.append(df)

    final_dataframe = pd.concat(dataframes, ignore_index=True)
    return final_dataframe

In [6]:
def process_splitted(data_dir):
    dataframes = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            
            # Read the file into a DataFrame (assuming CSV format)
            df = pd.read_csv(file_path)
            current_path =  root.split('/')[4:]
            df['subset'] = file.split('_')[0]
            df['split_type'] = current_path[0]
            df['dataset'] = current_path[1]
            if len(current_path) > 2:
                df['validation_type'] = current_path[1]
                df['dataset'] = current_path[2]
                df['quantile'] = current_path[3]
                
            dataframes.append(df)
    final_df = pd.concat(dataframes, ignore_index=True)
    return final_df

In [7]:
final_cols_ints = ['n_users', 'n_items', 'n_interactions', 'timestamp_range_in_days']
final_cols_floats = ['density', 'avg_seq_length']
final_cols_cat = ['dataset']
round_2 =['avg_seq_length']

def convert_cols(df, final_cols_ints, final_cols_floats, round_2):
    for col in final_cols_ints:
        if col in df.columns:
            df[col] = df[col].astype(int)
    for col in final_cols_floats:
        if col in df.columns:
            df[col] = df[col].astype(float).round(6)
    for col in round_2:
        if col in df.columns:
            df[col] = df[col].round(2)
    return df

def style_ints(df, columns):
    styled_df = df.copy()
    for col in columns:
        if col in styled_df.columns:
            styled_df[col] = styled_df[col].apply(
                lambda x: f"{round(x):,}".replace(",", " "))

    return styled_df

def style_percent(df, columns, ndigits=0):
    styled_df = df.copy()
    for col in columns:
        if col in styled_df.columns:
            styled_df[col] = (styled_df[col] * 100).round(2)

    return styled_df
    
    

In [8]:
final_col_names = {'timestamp_range_in_days' : "#Days",
 'timestamp_range_in_days,%' : "#Days,%",
 'n_users' : "#Users",
 'n_users,%' : "#Users,%",
 'n_interactions' : "#Interact.",
 'avg_seq_length' : "Seq. len.",
 'avg_hold_len' : "Holdout len.",
 'density' : "Density (%)",
 'global_timesplit' : "GTS",
 'leave-one-out' : "LOO",
 'val_by_time': "GT",
 'val_by_user': "UB",
 'val_last_train_item': "LTI",
 "median_delta": 'Median delta',
 'dataset': 'Dataset',
 'density': 'Density (%)',
 'mean_user_lifetime,%': 'Lifetime (%)',
 'full_df': 'Full data',
 'preprocessed_median_delta': 'Prepr. data'
 }

def process_one_name(name, final_col_names):
    if name in final_col_names:
        return final_col_names[name]
    elif name.split('_')[0] in ['last', 'first', 'random', 'successive', 'LOO']:
        return name.split('_')[0].capitalize() if not name.split('_')[0] == 'LOO' else 'LOO'
    else:
        return name

def rename_cols_flat(df, final_col_names):
    new_names = []
    for name in df.columns:
        new_names.append(process_one_name(name, final_col_names))
    return new_names


def rename_cols_nested(df, final_col_names):
    new_names = []
    for col in df.columns.values:
        new_names.append([])
        for part_ in col:
            new_names[-1].append(process_one_name(part_, final_col_names))
    return new_names

# Raw data

In [9]:
data_dir = '../data/statistics/raw'
df_raw = process_raw_prepr(data_dir)


In [10]:
df_raw.head(3)

Unnamed: 0,dataset,n_users,n_items,n_interactions,density,avg_seq_length,seq_len_mean,seq_len_std,seq_len_min,seq_len_max,seq_len_median,item_occurrence_mean,item_occurrence_std,item_occurrence_min,item_occurrence_max,item_occurrence_median,user_activity_mean,user_activity_std,user_activity_min,user_activity_max,user_activity_median,max_timestamp,min_timestamp,timestamp_range_in_days,mean_user_duration,median_user_duration,raw_mean_delta,raw_median_delta
0,Movielens-1m,6040,3706,1000209,0.044684,165.597517,165.597517,192.747029,20,2314,96.0,269.889099,384.047838,1,3428,123.5,165.597517,192.747029,20,2314,96.0,1046455000.0,956703900.0,1038.780764,94.997423,0.047089,49865.74,0.0
1,Movielens-20m,138493,26744,20000263,0.0054,144.41353,144.41353,230.267257,20,9254,68.0,747.841123,3085.818268,1,67310,18.0,144.41353,230.267257,20,9254,68.0,1427784000.0,789652000.0,7385.787014,196.592449,0.039329,118437.8,11.0
2,Beauty,22363,12101,198502,0.000734,8.876358,8.876358,8.163819,5,204,6.0,16.403768,23.609928,5,431,9.0,8.876358,8.163819,5,204,6.0,1406074000.0,1023840000.0,4424.0,548.581809,365.0,6017688.0,345600.0


#### Sanity checks

In [11]:
pd.to_datetime(df_raw.sort_values(["dataset"]).drop_duplicates()["max_timestamp"], unit='s').head(3)

2   2014-07-23 00:00:00.000000000
3   2012-01-11 10:10:56.000000000
4   2016-06-01 00:19:59.894000128
Name: max_timestamp, dtype: datetime64[ns]

In [12]:
pd.to_datetime(df_raw.sort_values(["dataset"]).drop_duplicates()["min_timestamp"], unit='s').head(3)

2   2002-06-12 00:00:00.000000000
3   1996-08-22 00:00:01.000000000
4   2016-01-01 00:00:05.082000128
Name: min_timestamp, dtype: datetime64[ns]

In [13]:
final_cols_raw = ['dataset', 'n_interactions', 'n_users', 'n_items', 'avg_seq_length', 'density','timestamp_range_in_days']
df_raw_to_print = convert_cols(df_raw[final_cols_raw], final_cols_ints, final_cols_floats, round_2)
df_raw_to_print.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   dataset                  8 non-null      object 
 1   n_interactions           8 non-null      int64  
 2   n_users                  8 non-null      int64  
 3   n_items                  8 non-null      int64  
 4   avg_seq_length           8 non-null      float64
 5   density                  8 non-null      float64
 6   timestamp_range_in_days  8 non-null      int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 576.0+ bytes


In [14]:
df_raw_to_print.columns=rename_cols_flat(df_raw_to_print, final_col_names)
style_percent(style_ints(df_raw_to_print.sort_values(["Dataset"]), final_cols_ints), ['Density (%)'])

Unnamed: 0,Dataset,#Interact.,#Users,n_items,Seq. len.,Density (%),#Days
2,Beauty,198502,22363,12 101,8.88,0.07,4424
3,BeerAdvocate,1475412,14635,22 074,100.81,0.46,5620
4,Diginetica,1235380,310324,122 993,3.98,0.0,152
0,Movielens-1m,1000209,6040,3 706,165.6,4.47,1038
1,Movielens-20m,20000263,138493,26 744,144.41,0.54,7385
5,Sports,296337,35598,18 357,8.32,0.05,4521
7,YooChoose,7142670,2000000,41 048,3.57,0.01,181
6,Zvuk,10867482,20000,391 322,543.37,0.14,91


# Preprocessed

In [15]:
data_dir = '../data/statistics/preprocessed'
df_prepr = process_raw_prepr(data_dir)
df_prepr.head(3)

Unnamed: 0,dataset,n_users,n_items,n_interactions,density,avg_seq_length,seq_len_mean,seq_len_std,seq_len_min,seq_len_max,seq_len_median,item_occurrence_mean,item_occurrence_std,item_occurrence_min,item_occurrence_max,item_occurrence_median,user_activity_mean,user_activity_std,user_activity_min,user_activity_max,user_activity_median,max_timestamp,min_timestamp,timestamp_range_in_days,mean_user_duration,median_user_duration,preprocessed_mean_delta,preprocessed_median_delta
0,Movielens-1m,6040,3416,999611,0.048448,165.49851,165.49851,192.543909,18,2277,96.0,292.626171,391.674786,5,3428,146.0,165.49851,192.543909,18,2277,96.0,1046455000.0,956703900.0,1038.780764,94.991579,0.047089,49892.69,0.0
1,Movielens-20m,138493,18345,19984024,0.007866,144.296275,144.296275,229.239975,16,8540,68.0,1089.344454,3675.700761,5,67310,80.0,144.296275,229.239975,16,8540,68.0,1427784000.0,789652000.0,7385.787014,196.550497,0.039317,118509.5,11.0
2,Beauty,22363,12101,198502,0.000734,8.876358,8.876358,8.163819,5,204,6.0,16.403768,23.609928,5,431,9.0,8.876358,8.163819,5,204,6.0,1406074000.0,1023840000.0,4424.0,548.581809,365.0,6017688.0,345600.0


In [16]:
df_prepr['mean_user_lifetime,%']= (df_prepr['mean_user_duration'] * 100 / df_prepr['timestamp_range_in_days'].round(0)).round(2)
df_prepr['median_delta']=df_prepr['preprocessed_median_delta'].astype(int)

### Table 2

In [17]:
final_cols_prepr = ['dataset', 'n_interactions', 'n_users', 'n_items', 'avg_seq_length', 'density', 'timestamp_range_in_days']
df_prepr_to_print = convert_cols(df_prepr[final_cols_prepr], final_cols_ints, final_cols_floats, round_2)

In [18]:
tbl_dataset_stats = style_percent(style_ints(df_prepr_to_print[final_cols_prepr]
                                             .sort_values(["dataset"]), final_cols_ints), ['density'])
tbl_dataset_stats.columns=rename_cols_flat(tbl_dataset_stats, final_col_names)
tbl_dataset_stats

Unnamed: 0,Dataset,#Interact.,#Users,n_items,Seq. len.,Density (%),#Days
2,Beauty,198 502,22 363,12 101,8.88,0.07,4 424
3,BeerAdvocate,1 475 412,14 635,22 074,100.81,0.46,5 620
4,Diginetica,485 903,61 279,25 593,7.93,0.03,152
0,Movielens-1m,999 611,6 040,3 416,165.5,4.84,1 038
1,Movielens-20m,19 984 024,138 493,18 345,144.3,0.79,7 385
5,Sports,296 337,35 598,18 357,8.32,0.05,4 521
7,YooChoose,2 792 229,335 203,20 758,8.33,0.04,181
6,Zvuk,8 087 953,19 267,150 206,419.78,0.28,91


In [19]:
# tbl_dataset_stats.to_csv('dataset_stats.csv', index=False)

### Table 3 and 4 for `Full data` 

In [20]:
tbl_dataset_stats = style_ints(df_prepr[['dataset', 'timestamp_range_in_days', 'mean_user_lifetime,%', 'n_users', 'avg_seq_length', 'median_delta']]
                                             .sort_values(["dataset"]), final_cols_ints)
tbl_dataset_stats.columns=rename_cols_flat(tbl_dataset_stats, final_col_names)
tbl_dataset_stats.set_index('Dataset').sort_index().T


Dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Movielens-20m,Sports,YooChoose,Zvuk
#Days,4 424,5 620,152,1 039,7 386,4 521,182,92
Lifetime (%),12.4,11.56,0.01,9.14,2.66,12.0,0.01,43.47
#Users,22 363,14 635,61 279,6 040,138 493,35 598,335 203,19 267
Seq. len.,8.876358,100.813939,7.929356,165.49851,144.296275,8.324541,8.329964,419.782685
Median delta,345600,73182,58,0,11,172800,59,14


### Raw with Preprocessed

In [21]:
raw_prepr = pd.merge(df_raw[final_cols_raw], df_prepr[final_cols_prepr], on="dataset", suffixes=("_raw", "_pr")).sort_values(["dataset"])

In [22]:
for col in final_cols_prepr:
    if col != "dataset":
        raw_prepr[col + "_rc"] = (raw_prepr[col + "_pr"] / raw_prepr[col + "_raw"]).round(4)

In [23]:
raw_prepr[[raw_prepr.columns[0]] + raw_prepr.columns[1:].sort_values().to_list()]

Unnamed: 0,dataset,avg_seq_length_pr,avg_seq_length_raw,avg_seq_length_rc,density_pr,density_raw,density_rc,n_interactions_pr,n_interactions_raw,n_interactions_rc,n_items_pr,n_items_raw,n_items_rc,n_users_pr,n_users_raw,n_users_rc,timestamp_range_in_days_pr,timestamp_range_in_days_raw,timestamp_range_in_days_rc
2,Beauty,8.876358,8.876358,1.0,0.000734,0.000734,1.0,198502,198502,1.0,12101,12101,1.0,22363,22363,1.0,4424.0,4424.0,1.0
3,BeerAdvocate,100.813939,100.813939,1.0,0.004567,0.004567,1.0,1475412,1475412,1.0,22074,22074,1.0,14635,14635,1.0,5620.424248,5620.424248,1.0
4,Diginetica,7.929356,3.980936,1.9918,0.00031,3.2e-05,9.5722,485903,1235380,0.3933,25593,122993,0.2081,61279,310324,0.1975,152.013511,152.013829,1.0
0,Movielens-1m,165.49851,165.597517,0.9994,0.048448,0.044684,1.0842,999611,1000209,0.9994,3416,3706,0.9217,6040,6040,1.0,1038.780764,1038.780764,1.0
1,Movielens-20m,144.296275,144.41353,0.9992,0.007866,0.0054,1.4567,19984024,20000263,0.9992,18345,26744,0.6859,138493,138493,1.0,7385.787014,7385.787014,1.0
5,Sports,8.324541,8.324541,1.0,0.000453,0.000453,1.0,296337,296337,1.0,18357,18357,1.0,35598,35598,1.0,4521.0,4521.0,1.0
7,YooChoose,8.329964,3.571335,2.3325,0.000401,8.7e-05,4.6123,2792229,7142670,0.3909,20758,41048,0.5057,335203,2000000,0.1676,181.999525,181.999595,1.0
6,Zvuk,419.782685,543.3741,0.7725,0.002795,0.001389,2.0127,8087953,10867482,0.7442,150206,391322,0.3838,19267,20000,0.9634,91.999869,91.999869,1.0


# Splitted

In [24]:
data_dir = '../data/statistics/splitted'
all_splitted_stats = process_splitted(data_dir)
splitted_cols = ['dataset', 'subset', 'split_type', 'validation_type', 'quantile']
all_splitted_stats = all_splitted_stats[splitted_cols + [col for col in all_splitted_stats.columns if col not in splitted_cols]]
all_splitted_stats['validation_type'] = all_splitted_stats.apply(lambda row: 'leave-one-out' if row['split_type'] == 'leave-one-out' else row['validation_type'], axis=1)


## Training

In [25]:
df_train = all_splitted_stats[all_splitted_stats['subset'] == 'train']
df_train = df_train[(df_train['split_type'] == 'leave-one-out') | (df_train['quantile'] == "q09")]
df_train.drop(columns=['quantile', 'subset'], inplace=True)

In [26]:
final_cols_ints = ['n_users', 'n_items', 'n_interactions', 'timestamp_range_in_days', 'train_mean_delta', 'train_median_delta']
final_cols_floats = ['density', 'avg_seq_length']
final_cols_cat = ['dataset', 'validation_type', 'split_type']
final_cols_split = ['dataset', 'split_type', 'validation_type', 'n_interactions', 'n_users', 'n_items', 'avg_seq_length', 'density','timestamp_range_in_days', 'train_mean_delta', 'train_median_delta']
assert set(final_cols_split) == set(final_cols_ints + final_cols_floats + final_cols_cat)


df_train = convert_cols(df_train, final_cols_ints, final_cols_floats, round_2)

df_train[final_cols_split].info()

# df_train[final_cols_split].sort_values(["dataset"]).head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32 entries, 0 to 165
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   dataset                  32 non-null     object 
 1   split_type               32 non-null     object 
 2   validation_type          32 non-null     object 
 3   n_interactions           32 non-null     int64  
 4   n_users                  32 non-null     int64  
 5   n_items                  32 non-null     int64  
 6   avg_seq_length           32 non-null     float64
 7   density                  32 non-null     float64
 8   timestamp_range_in_days  32 non-null     int64  
 9   train_mean_delta         32 non-null     int64  
 10  train_median_delta       32 non-null     int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 3.0+ KB


In [27]:
df_prepr["split_type"] = "full_df"
df_prepr["validation_type"] = "full_df"

train_prepr = pd.merge(df_train[final_cols_split], df_prepr[final_cols_prepr], on="dataset", suffixes=("", "__pr")).sort_values(["dataset"])
for col in final_cols_prepr:
    if col not in final_cols_cat:
        train_prepr[col + ",%"] = (train_prepr[col] / train_prepr[col + "__pr"]).round(4)
train_prepr[train_prepr.columns[0:3].to_list() + train_prepr.columns[3:].sort_values().to_list()].head(5)

Unnamed: 0,dataset,split_type,validation_type,avg_seq_length,"avg_seq_length,%",avg_seq_length__pr,density,"density,%",density__pr,n_interactions,"n_interactions,%",n_interactions__pr,n_items,"n_items,%",n_items__pr,n_users,"n_users,%",n_users__pr,timestamp_range_in_days,"timestamp_range_in_days,%",timestamp_range_in_days__pr,train_mean_delta,train_median_delta
8,Beauty,leave-one-out,leave-one-out,6.88,0.7751,8.876358,0.00057,0.7771,0.000734,153776,0.7747,198502,12068,0.9973,12101,22363,1.0,22363,4424,1.0,4424.0,6066561,345600
9,Beauty,global_timesplit,val_by_time,7.87,0.8866,8.876358,0.000667,0.9093,0.000734,160178,0.8069,198502,11801,0.9752,12101,20341,0.9096,22363,4292,0.9702,4424.0,6400002,345600
10,Beauty,global_timesplit,val_last_train_item,7.42,0.8359,8.876358,0.000623,0.8493,0.000734,156529,0.7886,198502,11909,0.9841,12101,21085,0.9429,22363,4353,0.984,4424.0,6209717,345600
11,Beauty,global_timesplit,val_by_user,8.32,0.9373,8.876358,0.000696,0.9488,0.000734,170099,0.8569,198502,11958,0.9882,12101,20436,0.9138,22363,4353,0.984,4424.0,6230110,345600
15,BeerAdvocate,global_timesplit,val_by_user,95.91,0.9514,100.813939,0.0045,0.9853,0.004567,1220757,0.8274,1475412,21315,0.9656,22074,12728,0.8697,14635,4928,0.8768,5620.424248,557251,72580


### Percents

In [28]:
import itertools
# cols_to_leave=["#days", "#interact", "#users", "#items", "avg_seq_len"]
cols_to_leave=["n_interactions", "n_users", "timestamp_range_in_days", "avg_seq_length"]
# cols_to_leave_list = list(itertools.chain.from_iterable([(f'{col}__tr', f'{col}_rc') for col in cols_to_leave]))
cols_to_leave_list = [f'{col},%' for col in cols_to_leave]
# cols_to_leave_list=cols_to_leave


In [29]:
train_prepr = convert_cols(train_prepr[final_cols_split + cols_to_leave_list], final_cols_ints, final_cols_floats, round_2)

In [30]:
concat_ = pd.concat([df_prepr[final_cols_split[:-2]], train_prepr[final_cols_split[:3] + cols_to_leave_list]])
concat_['avg_seq_length'] = concat_['avg_seq_length'].round(2)
concat_.head(5)

Unnamed: 0,dataset,split_type,validation_type,n_interactions,n_users,n_items,avg_seq_length,density,timestamp_range_in_days,"n_interactions,%","n_users,%","timestamp_range_in_days,%","avg_seq_length,%"
0,Movielens-1m,full_df,full_df,999611.0,6040.0,3416.0,165.5,0.048448,1038.780764,,,,
1,Movielens-20m,full_df,full_df,19984024.0,138493.0,18345.0,144.3,0.007866,7385.787014,,,,
2,Beauty,full_df,full_df,198502.0,22363.0,12101.0,8.88,0.000734,4424.0,,,,
3,BeerAdvocate,full_df,full_df,1475412.0,14635.0,22074.0,100.81,0.004567,5620.424248,,,,
4,Diginetica,full_df,full_df,485903.0,61279.0,25593.0,7.93,0.00031,152.013511,,,,


In [31]:
tbl = pd.pivot_table(concat_, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
               values=cols_to_leave+cols_to_leave_list)

In [32]:

train_tbl = tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)

In [33]:
int_cols = train_tbl.columns[1:4]
percent_cols = [col for col in train_tbl.columns if col[2].find(",%") != -1]


In [34]:
stats_percent = style_percent(style_ints(train_tbl, int_cols), percent_cols)
stats_percent.columns = pd.MultiIndex.from_tuples(rename_cols_nested(stats_percent, final_col_names))
stats_percent

Unnamed: 0_level_0,Full data,Full data,Full data,Full data,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,LOO,LOO,LOO,LOO
Unnamed: 0_level_1,Full data,Full data,Full data,Full data,GT,GT,GT,GT,UB,UB,UB,UB,LTI,LTI,LTI,LTI,LOO,LOO,LOO,LOO
Unnamed: 0_level_2,Seq. len.,#Interact.,#Users,#Days,"avg_seq_length,%","n_interactions,%","#Users,%","#Days,%","avg_seq_length,%","n_interactions,%","#Users,%","#Days,%","avg_seq_length,%","n_interactions,%","#Users,%","#Days,%","avg_seq_length,%","n_interactions,%","#Users,%","#Days,%"
dataset,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
Beauty,8.88,198 502,22 363,4 424,88.66,80.69,90.96,97.02,93.73,85.69,91.38,98.4,83.59,78.86,94.29,98.4,77.51,77.47,100.0,100.0
BeerAdvocate,100.81,1 475 412,14 635,5 620,93.77,80.99,86.37,93.96,95.14,82.74,86.97,87.68,95.23,89.06,93.52,96.74,98.01,98.02,100.0,99.99
Diginetica,7.93,485 903,61 279,152,99.76,80.97,81.16,87.49,100.01,88.35,88.32,94.07,87.4,78.65,89.99,94.07,74.79,74.78,100.0,99.99
Movielens-1m,165.5,999 611,6 040,1 039,89.72,81.0,90.28,21.37,90.85,75.01,82.57,23.87,89.86,89.4,99.49,23.87,98.79,98.79,100.0,99.92
Movielens-20m,144.3,19 984 024,138 493,7 386,98.82,81.0,81.97,74.24,99.79,89.3,89.49,85.1,99.07,89.37,90.22,85.1,98.62,98.61,100.0,99.99
Sports,8.32,296 337,35 598,4 521,88.29,80.76,91.42,96.55,93.58,87.27,93.2,98.05,82.65,78.17,94.54,98.05,75.92,75.97,100.0,100.0
YooChoose,8.33,2 792 229,335 203,182,99.52,81.0,81.4,81.87,99.76,89.69,89.96,90.11,87.76,79.16,90.27,90.11,75.99,75.99,100.0,99.45
Zvuk,419.78,8 087 953,19 267,92,88.38,81.0,91.65,82.61,93.47,84.35,90.24,90.22,93.97,89.77,95.54,90.22,99.52,99.52,100.0,98.91


### Abs values

In [35]:
cols_to_leave_list=["n_interactions", "n_users", "timestamp_range_in_days", "avg_seq_length"]

In [None]:
concat_ = pd.concat([df_prepr[final_cols_split[:-2]], train_prepr[final_cols_split[:3] + cols_to_leave_list]])
concat_['avg_seq_length'] = concat_['avg_seq_length'].round(2)
concat_.head(5)

Unnamed: 0,dataset,split_type,validation_type,n_interactions,n_users,n_items,avg_seq_length,density,timestamp_range_in_days
0,Movielens-1m,full_df,full_df,999611,6040,3416.0,165.5,0.048448,1038.780764
1,Movielens-20m,full_df,full_df,19984024,138493,18345.0,144.3,0.007866,7385.787014
2,Beauty,full_df,full_df,198502,22363,12101.0,8.88,0.000734,4424.0
3,BeerAdvocate,full_df,full_df,1475412,14635,22074.0,100.81,0.004567,5620.424248
4,Diginetica,full_df,full_df,485903,61279,25593.0,7.93,0.00031,152.013511


In [37]:
tbl = pd.pivot_table(concat_, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
               values=cols_to_leave_list)
train_tbl = tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)

In [38]:
stats_abs = style_ints(train_tbl, [col for col in train_tbl.columns if col[2].find("seq_len") == -1])
stats_abs.columns = pd.MultiIndex.from_tuples(rename_cols_nested(stats_abs, final_col_names))
stats_abs

Unnamed: 0_level_0,Full data,Full data,Full data,Full data,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,LOO,LOO,LOO,LOO
Unnamed: 0_level_1,Full data,Full data,Full data,Full data,GT,GT,GT,GT,UB,UB,UB,UB,LTI,LTI,LTI,LTI,LOO,LOO,LOO,LOO
Unnamed: 0_level_2,Seq. len.,#Interact.,#Users,#Days,Seq. len.,#Interact.,#Users,#Days,Seq. len.,#Interact.,#Users,#Days,Seq. len.,#Interact.,#Users,#Days,Seq. len.,#Interact.,#Users,#Days
dataset,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
Beauty,8.88,198 502,22 363,4 424,7.87,160 178,20 341,4 292,8.32,170 099,20 436,4 353,7.42,156 529,21 085,4 353,6.88,153 776,22 363,4 424
BeerAdvocate,100.81,1 475 412,14 635,5 620,94.53,1 194 889,12 640,5 281,95.91,1 220 757,12 728,4 928,96.01,1 313 978,13 686,5 437,98.81,1 446 142,14 635,5 620
Diginetica,7.93,485 903,61 279,152,7.91,393 431,49 731,133,7.93,429 284,54 124,143,6.93,382 160,55 145,143,5.93,363 345,61 279,152
Movielens-1m,165.5,999 611,6 040,1 039,148.48,809 685,5 453,222,150.35,749 774,4 987,248,148.72,893 637,6 009,248,163.5,987 531,6 040,1 038
Movielens-20m,144.3,19 984 024,138 493,7 386,142.59,16 187 042,113 523,5 483,143.99,17 845 064,123 931,6 285,142.95,17 860 644,124 947,6 285,142.3,19 707 038,138 493,7 385
Sports,8.32,296 337,35 598,4 521,7.35,239 317,32 543,4 365,7.79,258 625,33 179,4 433,6.88,231 649,33 656,4 433,6.32,225 141,35 598,4 521
YooChoose,8.33,2 792 229,335 203,182,8.29,2 261 693,272 872,149,8.31,2 504 440,301 553,164,7.31,2 210 421,302 575,164,6.33,2 121 823,335 203,181
Zvuk,419.78,8 087 953,19 267,92,371.01,6 551 232,17 658,76,392.38,6 821 989,17 386,83,394.46,7 260 740,18 407,83,417.78,8 049 419,19 267,91


## Validation

### Table 3. Holdout statistics for different splits: Validation

In [39]:
df_val = all_splitted_stats[all_splitted_stats['subset'] == 'validation']

In [40]:
df_val = df_val[(df_val['split_type'] == 'leave-one-out') | (df_val['quantile'] == "q09")]
df_val['avg_hold_len'] = df_val['holdout_seq_len_mean']
df_val.drop(columns=['quantile', 'subset'], inplace=True)

In [41]:
final_cols_floats = ['density', 'avg_seq_length', "avg_hold_len"]
final_cols_ints = ['n_users', 'n_items', 'n_interactions', 'timestamp_range_in_days']
round_2 =['avg_seq_length', 'avg_hold_len']
final_cols_cat = ['dataset', 'validation_type', 'split_type']
final_cols_split = ['dataset', 'split_type', 'validation_type', 'n_interactions', 'n_users', 'n_items', 'avg_seq_length', 'avg_hold_len', 'density','timestamp_range_in_days']
assert set(final_cols_split) == set(final_cols_ints + final_cols_floats + final_cols_cat)

df_val = convert_cols(df_val, final_cols_ints, final_cols_floats, round_2)

In [42]:
val_prepr = pd.merge(df_val[final_cols_split], df_prepr[final_cols_prepr], on="dataset", suffixes=("", "__pr")).sort_values(["dataset"])
for col in final_cols_prepr:
    if col not in final_cols_cat:
        val_prepr[col + ",%"] = (val_prepr[col + ""] / val_prepr[col + "__pr"]).round(4)

#### relative

In [43]:
cols_to_leave=["n_users", "timestamp_range_in_days"]
cols_to_leave_list = [f'{col},%' for col in cols_to_leave]

In [44]:
df_prepr['avg_hold_len']=np.nan
val_prepr['avg_seq_len'] = np.nan
prepr_val_concat = pd.concat([df_prepr[final_cols_split], val_prepr[final_cols_split + cols_to_leave_list]])
prepr_val_concat_to_print = convert_cols(prepr_val_concat, final_cols_ints, final_cols_floats, round_2)

In [45]:
tbl = pd.pivot_table(prepr_val_concat_to_print, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
               values=cols_to_leave + cols_to_leave_list + ['avg_hold_len', 'avg_seq_length'])

val_tbl =tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)


In [46]:
int_cols =  [col for col in val_tbl.columns if (col[2].find("_len") == -1) and (col[2].find(",%") == -1)]
percent_cols = [col for col in val_tbl.columns if col[2].find(",%") != -1]
stats_percent = style_percent(style_ints(val_tbl, int_cols), percent_cols)
stats_percent = stats_percent[list(stats_percent.columns[:3].values) + [col for col in stats_percent.columns[3:] if (col[2].find("%") != -1) or (col[2].find("_len") != -1)]]
stats_percent.columns = pd.MultiIndex.from_tuples(rename_cols_nested(stats_percent, final_col_names))
stats_percent = stats_percent[list(stats_percent.columns[:3].values) + list(stats_percent.columns[-3:].values) + list(stats_percent.columns[3:-3].values)]
stats_percent.T

Unnamed: 0,Unnamed: 1,dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Movielens-20m,Sports,YooChoose,Zvuk
Full data,Full data,Seq. len.,8.88,100.81,7.93,165.5,144.3,8.32,8.33,419.78
Full data,Full data,#Users,22 363,14 635,61 279,6 040,138 493,35 598,335 203,19 267
Full data,Full data,#Days,4 424,5 620,152,1 038,7 385,4 521,181,91
LOO,LOO,Seq. len.,7.88,99.81,6.93,164.5,143.3,7.32,7.33,418.78
LOO,LOO,"#Users,%",100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
LOO,LOO,"#Days,%",100.0,99.99,99.99,99.92,99.99,100.0,99.45,98.91
GTS,GT,Holdout len.,2.84,25.63,7.47,86.01,108.88,2.73,8.45,90.66
GTS,GT,Seq. len.,10.59,165.89,7.75,206.16,250.15,8.99,8.46,719.25
GTS,GT,"#Users,%",28.01,35.39,9.55,17.32,11.93,27.22,8.87,41.67
GTS,GT,"#Days,%",98.4,96.74,6.58,23.68,71.81,98.05,8.24,90.22


In [47]:
# stats_percent.to_csv('stats_percent_val.csv', index=True)

#### absolute

In [48]:
cols_to_leave=["n_users", "timestamp_range_in_days", 'avg_hold_len', 'avg_seq_length']
cols_to_leave_list = [f'{col}' for col in cols_to_leave]

In [49]:
df_val_copy = df_val.copy()
df_val_copy['avg_seq_len'] = np.nan
prepr_val_concat = pd.concat([df_prepr[final_cols_split], df_val_copy[final_cols_split]])
prepr_val_concat = convert_cols(prepr_val_concat, final_cols_ints, final_cols_floats, round_2)

In [50]:
tbl = pd.pivot_table(prepr_val_concat, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
            #    values=train_prepr.columns[3:].sort_values().to_list())
               values=cols_to_leave_list)

val_tbl = tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)
int_cols =  [col for col in val_tbl.columns if (col[2].find("_len") == -1) and (col[2].find(",%") == -1)]
val_stats_abs = style_ints(val_tbl, int_cols)
val_stats_abs.columns = pd.MultiIndex.from_tuples(rename_cols_nested(val_stats_abs, final_col_names))
val_stats_abs = val_stats_abs[list(val_stats_abs.columns[:3].values) + list(val_stats_abs.columns[-3:].values) + list(val_stats_abs.columns[3:-3].values)]
val_stats_abs

Unnamed: 0_level_0,Full data,Full data,Full data,LOO,LOO,LOO,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS,GTS
Unnamed: 0_level_1,Full data,Full data,Full data,LOO,LOO,LOO,GT,GT,GT,GT,UB,UB,UB,LTI,LTI,LTI
Unnamed: 0_level_2,Seq. len.,#Users,#Days,Seq. len.,#Users,#Days,Holdout len.,Seq. len.,#Users,#Days,Seq. len.,#Users,#Days,Seq. len.,#Users,#Days
dataset,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Beauty,8.88,22 363,4 424,7.88,22 363,4 424,2.84,10.59,6 263,4 353,8.07,1 024,3 788,8.31,21 460,4 353
BeerAdvocate,100.81,14 635,5 620,99.81,14 635,5 620,25.63,165.89,5 180,5 437,104.53,1 024,5 437,96.55,13 752,5 437
Diginetica,7.93,61 279,152,6.93,61 279,152,7.47,7.75,5 855,10,7.84,1 024,137,7.93,55 148,143
Movielens-1m,165.5,6 040,1 038,164.5,6 040,1 038,86.01,206.16,1 046,246,146.36,1 024,247,149.67,6 011,248
Movielens-20m,144.3,138 493,7 385,143.3,138 493,7 385,108.88,250.15,16 519,5 304,137.25,1 024,5 871,143.94,124 955,6 285
Sports,8.32,35 598,4 521,7.32,35 598,4 521,2.73,8.99,9 691,4 433,7.59,1 024,3 308,7.79,34 203,4 433
YooChoose,8.33,335 203,181,7.33,335 203,181,8.45,8.46,29 727,15,8.36,1 024,164,8.31,302 577,164
Zvuk,419.78,19 267,91,418.78,19 267,91,90.66,719.25,8 029,83,446.45,1 024,83,395.39,18 410,83


In [51]:
# val_stats_abs.to_csv('val_stats_abs.csv', index=True)

### Table 5 Median time intervals for targets: Validation


In [52]:
df_val = all_splitted_stats[(all_splitted_stats['subset'] == 'validation')]

In [53]:
AGG_FN = 'median'
delta_cols = [col for col in df_val.columns if f'_{AGG_FN}_delta' in col]
full_data_deltas = df_prepr[['dataset', f'preprocessed_{AGG_FN}_delta']]
merged_deltas = pd.merge(df_val[['dataset', 'validation_type']+delta_cols], full_data_deltas, on='dataset')
for col in ['first', 'last', 'successive', 'random']:
    merged_deltas[f'{col}_delta,%'] = (merged_deltas[f'{col}_{AGG_FN}_delta'] / merged_deltas[f'preprocessed_{AGG_FN}_delta']).round(2)
    
merged_deltas['validation_type'].where(merged_deltas['validation_type']!='', 'leave-one-out', inplace=True)

In [54]:
rel_delta = merged_deltas[
    ['dataset', 'validation_type', f'preprocessed_{AGG_FN}_delta'] +
    [f'{col}_delta,%' for col in ['first', 'random', 'last', 'successive']]]

In [55]:
merged_deltas['preprocessed_median_delta'] = np.nan
full_data_deltas['validation_type']='preprocessed_median_delta'
# full_data_deltas['subset']='preprocessed'
full_deltas = pd.concat([full_data_deltas, merged_deltas])

In [56]:
tbl = pd.pivot_table(full_deltas, 
               index=['dataset'], 
               columns=["validation_type"], 
               values=['preprocessed_median_delta', 'last_median_delta']
               )

test_tbl =tbl.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)
test_tbl

validation_type,leave-one-out,preprocessed_median_delta,val_by_time,val_by_user,val_last_train_item
Unnamed: 0_level_1,last_median_delta,preprocessed_median_delta,last_median_delta,last_median_delta,last_median_delta
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Beauty,172800.0,345600.0,1015200.0,604800.0,604800.0
BeerAdvocate,360900.0,73182.0,379420.875,691188.0,690794.0
Diginetica,63.0,58.0,69.0,70.0,70.0
Movielens-1m,18.0,0.0,55.5,15.0,15.0
Movielens-20m,17.0,11.0,39.75,19.0,21.0
Sports,86400.0,172800.0,1252800.0,518400.0,518400.0
YooChoose,59.0,59.0,68.0,65.0,65.0
Zvuk,98.0,14.0,89.25,78.5,68.5


In [57]:
test_tbl_order = [
    ('preprocessed_median_delta', 'preprocessed_median_delta'),
    ('leave-one-out', 'last_median_delta'),
    ('val_by_user', 'last_median_delta'),
    ('val_last_train_item', 'last_median_delta'),
    ('val_by_time', 'last_median_delta'),
    ]
test_tbl=test_tbl[test_tbl_order]

In [58]:
test_tbl=style_ints(test_tbl, test_tbl.columns[0:])
test_tbl.columns = pd.MultiIndex.from_tuples(rename_cols_nested(test_tbl, final_col_names))
test_tbl.droplevel(level=1, axis=1).T


dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Movielens-20m,Sports,YooChoose,Zvuk
Prepr. data,345 600,73 182,58,0,11,172 800,59,14
LOO,172 800,360 900,63,18,17,86 400,59,98
UB,604 800,691 188,70,15,19,518 400,65,78
LTI,604 800,690 794,70,15,21,518 400,65,68
GT,1 015 200,379 421,69,56,40,1 252 800,68,89


In [59]:
# test_tbl.to_csv('val_delta_median.csv', index=True)

## Test

In [60]:
df_test = all_splitted_stats[all_splitted_stats['subset'] == 'test']
df_test = df_test.query("(validation_type=='val_by_time') or split_type=='leave-one-out'")
df_test['avg_hold_len'] = df_test['holdout_seq_len_mean']
df_test.drop(columns=["timestamp_range_in_days", 'subset'], inplace=True)
df_test.rename(columns={"range_gt_tmstmp_delta_in_days": "timestamp_range_in_days"}, inplace=True)
df_test = convert_cols(df_test, final_cols_ints, final_cols_floats, round_2)

In [61]:
final_cols_split = ['dataset',
 'split_type',
 'validation_type',
 'quantile',
 'n_interactions',
 'n_users',
 'n_items',
 'avg_seq_length',
 'avg_hold_len',
 'density',
 'timestamp_range_in_days']
final_cols_cat.append('quantile')

### Table 3. Holdout statistics for different splits: Test

In [62]:
df_prepr['quantile']='full_df'

In [63]:
test_prepr = pd.merge(df_test[final_cols_split], df_prepr[final_cols_split], on="dataset", suffixes=("", "__pr")).sort_values(["dataset"])
for col in final_cols_split:
    if col not in final_cols_cat:
        test_prepr[col + ",%"] = (test_prepr[col] / test_prepr[col + "__pr"]).round(4)
test_prepr['avg_seq_len'] = np.nan

#### relative

In [64]:
cols_to_leave=["n_users", "timestamp_range_in_days"]
cols_to_leave_list = [f'{col},%' for col in cols_to_leave]

In [65]:
prepr_test_concat = pd.concat([df_prepr[final_cols_split], test_prepr[final_cols_split[:4] + cols_to_leave_list + ['avg_hold_len']]])
main_table = prepr_test_concat[(prepr_test_concat["split_type"].isin(["full_df", "leave-one-out"])) | (prepr_test_concat["quantile"]=='q09')]
main_table['avg_seq_length'] = main_table['avg_seq_length'].round(2)

In [66]:
tbl = pd.pivot_table(main_table, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
            #    values=train_prepr.columns[3:].sort_values().to_list())
               values=cols_to_leave + cols_to_leave_list + ['avg_hold_len', 'avg_seq_length'])

test_tbl =tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)

In [67]:
int_cols =  [col for col in test_tbl.columns if (col[2].find("_len") == -1) and (col[2].find(",%") == -1)]
percent_cols = [col for col in test_tbl.columns if col[2].find(",%") != -1]
stats_percent = style_percent(style_ints(test_tbl, int_cols), percent_cols)
stats_percent.columns = pd.MultiIndex.from_tuples(rename_cols_nested(stats_percent, final_col_names))
stats_percent.T

Unnamed: 0,Unnamed: 1,dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Movielens-20m,Sports,YooChoose,Zvuk
Full data,Full data,Seq. len.,8.88,100.81,7.93,165.5,144.3,8.32,8.33,419.78
Full data,Full data,#Users,22 363,14 635,61 279,6 040,138 493,35 598,335 203,19 267
Full data,Full data,#Days,4 424,5 620,152,1 039,7 386,4 521,182,92
GTS,GT,Holdout len.,3.25,28.83,7.66,82.68,107.66,2.89,8.55,95.92
GTS,GT,"#Users,%",27.32,34.97,10.36,20.02,13.4,28.66,9.74,43.76
GTS,GT,"#Days,%",1.6,3.26,5.92,76.05,14.89,1.95,9.34,8.7
LOO,LOO,"#Users,%",100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
LOO,LOO,"#Days,%",84.04,66.92,99.99,99.92,94.44,68.06,99.45,98.91


In [68]:
# stats_percent.to_csv('stats_percent_test.csv', index=True)

#### absolute

In [69]:
cols_to_leave=["n_users", "timestamp_range_in_days", 'avg_hold_len', 'avg_seq_length']
cols_to_leave_list = [f'{col}' for col in cols_to_leave]

In [70]:
df_test_copy = df_test.copy()
df_test_copy['avg_seq_len'] = np.nan
prepr_test_concat = pd.concat([df_prepr[final_cols_split], df_test_copy[final_cols_split]])
main_table = prepr_test_concat[(prepr_test_concat["split_type"].isin(["full_df", "leave-one-out"])) | (prepr_test_concat["quantile"]=='q09')]
main_table['avg_seq_length'] = main_table['avg_seq_length'].round(2)

In [71]:
tbl = pd.pivot_table(main_table, 
               index=['dataset'], 
               columns=["split_type", "validation_type"], 
            #    values=train_prepr.columns[3:].sort_values().to_list())
               values=cols_to_leave_list)

test_tbl = tbl.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1).sort_index(axis=1, level=0)

In [72]:
int_cols =  [col for col in test_tbl.columns if (col[2].find("_len") == -1) and (col[2].find(",%") == -1)]
test_tbl = style_ints(test_tbl, int_cols)
test_tbl.columns = pd.MultiIndex.from_tuples(rename_cols_nested(test_tbl, final_col_names))
test_tbl.droplevel(level=1, axis=1).T

Unnamed: 0,dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Movielens-20m,Sports,YooChoose,Zvuk
Full data,Seq. len.,8.88,100.81,7.93,165.5,144.3,8.32,8.33,419.78
Full data,#Users,22 363,14 635,61 279,6 040,138 493,35 598,335 203,19 267
Full data,#Days,4 424,5 620,152,1 039,7 386,4 521,182,92
GTS,Holdout len.,3.25,28.83,7.66,82.68,107.66,2.89,8.55,95.92
GTS,Seq. len.,11.5,185.23,7.9,301.95,234.67,9.78,8.56,756.59
GTS,#Users,6 110,5 118,6 346,1 209,18 562,10 203,32 657,8 432
GTS,#Days,71,183,9,790,1 100,88,17,8
LOO,Seq. len.,8.88,100.81,7.93,165.5,144.3,8.32,8.33,419.78
LOO,#Users,22 363,14 635,61 279,6 040,138 493,35 598,335 203,19 267
LOO,#Days,3 718,3 761,152,1 038,6 975,3 077,181,91


### Table 4: Test subset statistics for GTS for different quantiles

In [73]:
prepr_test_concat = pd.concat([df_prepr[final_cols_split], test_prepr[final_cols_split[:4] + cols_to_leave_list]])
quantile_table = prepr_test_concat[(prepr_test_concat["split_type"]!="leave-one-out")]

In [74]:
tbl = pd.pivot_table(
    quantile_table,
    index=['dataset'],
    columns=['quantile'],
    values=cols_to_leave_list)

quant_tbl = tbl.sort_index(axis=1, level=0, ascending=False)
quant_tbl['avg_seq_length'] = quant_tbl['avg_seq_length'].round(2)


In [75]:
cols_order = [('avg_seq_length', 'full_df')]
for key_ in ['avg_hold_len'] + cols_to_leave_list[:-2]:
    for quantile in ['full_df', 'q08', 'q09', 'q095', 'q0975']:
        if not (key_ == 'avg_hold_len' and quantile == 'full_df'):
            cols_order.append((key_, quantile))
quant_tbl = quant_tbl[cols_order]


In [76]:
int_cols =  [col for col in quant_tbl.columns if (col[0].find("_len") == -1) and (col[0].find(",%") == -1)]
quant_tbl = style_ints(quant_tbl, int_cols)

In [77]:
quant_tbl.columns = pd.MultiIndex.from_tuples(rename_cols_nested(quant_tbl, final_col_names))
quant_tbl

Unnamed: 0_level_0,Seq. len.,Holdout len.,Holdout len.,Holdout len.,Holdout len.,#Users,#Users,#Users,#Users,#Users,#Days,#Days,#Days,#Days,#Days
Unnamed: 0_level_1,Full data,q08,q09,q095,q0975,Full data,q08,q09,q095,q0975,Full data,q08,q09,q095,q0975
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Beauty,8.88,3.88,3.25,2.76,2.45,22 363,10 181,6 110,3 519,1 911,4 424,138,71,35,19
BeerAdvocate,100.81,42.53,28.83,18.7,12.01,14 635,6 938,5 118,3 944,3 071,5 620,354,183,94,48
Diginetica,7.93,7.68,7.66,7.38,6.55,61 279,12 660,6 346,3 293,1 856,152,20,9,4,2
Movielens-1m,165.5,112.13,82.68,61.48,45.6,6 040,1 783,1 209,813,548,1 039,818,790,617,400
Movielens-20m,144.3,125.93,107.66,92.79,86.89,138 493,31 738,18 562,10 769,5 750,7 386,1 994,1 100,569,201
Sports,8.32,3.52,2.89,2.61,2.6,35 598,16 728,10 203,5 630,2 791,4 521,163,88,43,22
YooChoose,8.33,8.49,8.55,8.57,8.79,335 203,65 803,32 657,16 289,7 942,182,34,17,10,5
Zvuk,419.78,149.62,95.92,61.57,42.78,19 267,10 811,8 432,6 568,4 726,92,16,8,4,2


In [78]:
# quant_tbl.to_csv('test_quant.csv', index=True)

### Table 5 Median time intervals for targets: Test

In [79]:
AGG_FN = 'median'

In [80]:
llo_delta = df_test[df_test['split_type'] == 'leave-one-out'][['dataset', f'last_{AGG_FN}_delta']]
llo_delta.rename(columns={f'last_{AGG_FN}_delta': f'LOO_{AGG_FN}_delta'}, inplace=True)

In [81]:
df_test_copy = df_test.query("validation_type=='val_by_time' & quantile=='q09'")
delta_cols = [f'{target}_{AGG_FN}_delta' for target in ['first', 'random', 'last', 'successive']]

In [82]:
merged_deltas = pd.merge(df_test_copy[['dataset']+delta_cols], llo_delta, on='dataset')
merged_deltas.columns = rename_cols_flat(merged_deltas, final_col_names)
merged_deltas = style_ints(merged_deltas, merged_deltas.columns[1:]).set_index('Dataset')
merged_deltas.T.sort_index()

Dataset,Beauty,BeerAdvocate,Diginetica,Movielens-1m,Sports,Zvuk,Movielens-20m,YooChoose
First,8 640 000,4 921 729,186,7 153 214,11 577 600,346 010,21 145 894,259
LOO,604 800,737 140,70,17,518 400,73,20,65
Last,1 382 400,508 452,70,67,1 296 000,91,29,68
Random,3 628 800,439 806,65,35,4 752 000,120,15,62
Successive,172 800,75 916,58,22,86 400,67,14,60
