In [1]:
import pandas as pd
import numpy as np


In [2]:
meta_dataset = pd.read_csv("./performance_meta_dataset.csv", index_col=0)
single_sample_algs = [
    "TopPop", 
    "GlobalEffects", 
    "Random",
    "SlopeOne",
]
min_samples = 30

keep_rows = (meta_dataset["num_samples"] >= min_samples) | meta_dataset["alg_name"].isin(single_sample_algs)
meta_dataset = meta_dataset.loc[keep_rows, :]

metafeats_fn = "../RecSys2019_DeepLearning_Evaluation/Metafeatures/Metafeatures.csv"
metafeats = pd.read_csv(metafeats_fn)
join_cols = ["dataset_name", "split_name"]
metafeats.columns = ["f__{}".format(col) if col not in join_cols else col for col in metafeats.columns]
del metafeats["split_name"]
metafeats = meta_dataset.merge(metafeats, on="dataset_name", how='left')
metafeats.head()

Unnamed: 0,dataset_name,alg_name,min_test_metric_ARHR_ALL_HITS_cut_1,max_test_metric_ARHR_ALL_HITS_cut_1,min_test_metric_ARHR_ALL_HITS_cut_10,max_test_metric_ARHR_ALL_HITS_cut_10,min_test_metric_ARHR_ALL_HITS_cut_15,max_test_metric_ARHR_ALL_HITS_cut_15,min_test_metric_ARHR_ALL_HITS_cut_2,max_test_metric_ARHR_ALL_HITS_cut_2,...,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_mean,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_median,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_min,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_mode,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_skewness,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_std,f__num_interactions,f__num_items,f__num_users,f__sparsity
0,AnimeReader,EASE_R_Recommender,0.0,0.034277,0.000442,0.059715,0.000563,0.062664,7.9e-05,0.043217,...,676.747371,337.0,-10225.0,10.0,3.895959,1021.199357,7669090,11200,69521,0.990151
1,AnimeReader,GlobalEffects,0.000187,0.000187,0.00456,0.00456,0.005064,0.005064,0.000245,0.000245,...,676.747371,337.0,-10225.0,10.0,3.895959,1021.199357,7669090,11200,69521,0.990151
2,AnimeReader,ItemKNN,0.0,0.043915,2e-06,0.070494,4e-06,0.073168,0.0,0.05484,...,676.747371,337.0,-10225.0,10.0,3.895959,1021.199357,7669090,11200,69521,0.990151
3,AnimeReader,MatrixFactorization_BPR_Cython,2.9e-05,0.013248,0.000192,0.032227,0.000222,0.034679,7.2e-05,0.017613,...,676.747371,337.0,-10225.0,10.0,3.895959,1021.199357,7669090,11200,69521,0.990151
4,AnimeReader,P3alphaRecommender,0.0,0.026841,0.0,0.047001,0.0,0.048477,0.0,0.034572,...,676.747371,337.0,-10225.0,10.0,3.895959,1021.199357,7669090,11200,69521,0.990151


In [3]:
list(meta_dataset.columns)

['dataset_name',
 'alg_name',
 'min_test_metric_ARHR_ALL_HITS_cut_1',
 'max_test_metric_ARHR_ALL_HITS_cut_1',
 'min_test_metric_ARHR_ALL_HITS_cut_10',
 'max_test_metric_ARHR_ALL_HITS_cut_10',
 'min_test_metric_ARHR_ALL_HITS_cut_15',
 'max_test_metric_ARHR_ALL_HITS_cut_15',
 'min_test_metric_ARHR_ALL_HITS_cut_2',
 'max_test_metric_ARHR_ALL_HITS_cut_2',
 'min_test_metric_ARHR_ALL_HITS_cut_20',
 'max_test_metric_ARHR_ALL_HITS_cut_20',
 'min_test_metric_ARHR_ALL_HITS_cut_3',
 'max_test_metric_ARHR_ALL_HITS_cut_3',
 'min_test_metric_ARHR_ALL_HITS_cut_30',
 'max_test_metric_ARHR_ALL_HITS_cut_30',
 'min_test_metric_ARHR_ALL_HITS_cut_4',
 'max_test_metric_ARHR_ALL_HITS_cut_4',
 'min_test_metric_ARHR_ALL_HITS_cut_40',
 'max_test_metric_ARHR_ALL_HITS_cut_40',
 'min_test_metric_ARHR_ALL_HITS_cut_5',
 'max_test_metric_ARHR_ALL_HITS_cut_5',
 'min_test_metric_ARHR_ALL_HITS_cut_50',
 'max_test_metric_ARHR_ALL_HITS_cut_50',
 'min_test_metric_ARHR_ALL_HITS_cut_6',
 'max_test_metric_ARHR_ALL_HITS_cut_6'

In [4]:
# Algorithm selection

def rank_algorithms(test_datasets, metric_name):
    """Compute algorithm ranks for each dataset"""
    # Sanity check to prevent leakage
    for test_dataset in test_datasets:
        assert test_dataset in meta_dataset['dataset_name'].values
    filtered_dataset = meta_dataset[~meta_dataset['dataset_name'].isin(test_datasets)]
    
    all_ranks = []
    for dataset_name, dataset_performance in filtered_dataset.groupby("dataset_name"):
        dataset_performance["rank"] = dataset_performance["max_test_metric_" + metric_name].rank(method='min', ascending=False)
        dataset_performance.set_index("alg_name", inplace=True)
        dataset_performance = dataset_performance[["rank"]]
        dataset_performance = dataset_performance.rename(columns={"rank": dataset_name})
        all_ranks.append(dataset_performance)
        
    ranked_algs = pd.concat(all_ranks, axis=1)
    return ranked_algs

def select_algs(test_datasets, metric_name, num_algs=10):
    """Select num_algs algorithm with best mean rank"""
    return list(ranked_algs.T.mean().sort_values().iloc[:num_algs].index)

In [5]:
# Metafeature selection

def compute_feature_corrs(test_datasets, metric_name, selected_algs):
    """Compute correlation between each metafeature and the desired metric for all selected algorithms.
    Dataframe result is num_features x num_algorithms."""
    all_features = [col for col in metafeats.columns if col.startswith("f__")]
    # Sanity check to prevent leakage
    for test_dataset in test_datasets:
        assert test_dataset in metafeats['dataset_name'].values
    filtered_metafeats = metafeats[~metafeats['dataset_name'].isin(test_datasets)]
    
    all_cors = []
    for alg in selected_algs:
        filtered_results = filtered_metafeats.loc[(filtered_metafeats["alg_name"] == alg)]
        alg_cors = filtered_results[all_features].corrwith(filtered_results["max_test_metric_" + metric_name],
                                                          method="spearman")
        alg_cors.name = alg
        all_cors.append(alg_cors)
    all_cors = pd.concat(all_cors, axis=1).abs()
    return all_cors

def select_features(test_datasets, metric_name, selected_algs, num_feats=10):
    """Select num_feats features. Greedy scheme. At each step, we compute the best correlations
    across all metafeatures for each algorithm so far. We add whichever metafeature can obtain the maximum
    improvement across any single one of the best correlations for the selected algorithms."""
    all_cors = compute_feature_corrs(test_datasets, metric_name, selected_algs)
    
    selected_feats = [all_cors.max(axis=1).idxmax()]
    
    while len(selected_feats) < num_feats:
        # Current best correlations
        current_best_cors = all_cors.loc[selected_feats].max(axis=0)
        # Pick whichever feature results in the highest maximum improvement on the current best correlations
        selected_feats.append((
            all_cors.loc[~all_cors.index.isin(selected_feats)] - current_best_cors)
            .max(axis=1)
            .idxmax())
    return selected_feats

In [6]:
# Assume some split
test_datasets = ["AnimeReader"]# , "CiaoDVDReader"]
# Assume some metric
metric_name = "PRECISION_cut_1"

In [7]:
ranked_algs = rank_algorithms(test_datasets, metric_name)
ranked_algs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_performance["rank"] = dataset_performance["max_test_metric_" + metric_name].rank(method='min', ascending=False)


Unnamed: 0,BookCrossingReader,CiaoDVDReader,DatingReader,EpinionsReader,FilmTrustReader,FrappeReader,GowallaReader,Jester2Reader,LastFMReader,MarketBiasAmazonReader,MarketBiasModClothReader,MovieTweetingsReader,Movielens100KReader,Movielens10MReader,Movielens1MReader,Movielens20MReader,MovielensHetrec2011Reader,NetflixPrizeReader,RecipesReader,WikilensReader
GlobalEffects,6.0,12.0,3.0,8.0,15.0,15.0,3.0,12.0,12.0,12.0,15.0,7.0,15.0,9.0,12.0,5.0,12.0,3.0,6.0,13.0
ItemKNN,1.0,2.0,1.0,1.0,1.0,1.0,,1.0,1.0,3.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
MatrixFactorization_BPR_Cython,4.0,10.0,,6.0,8.0,14.0,,5.0,5.0,11.0,13.0,5.0,12.0,6.0,8.0,,7.0,,4.0,9.0
PureSVDRecommender,2.0,6.0,,3.0,12.0,10.0,,7.0,10.0,7.0,11.0,4.0,9.0,7.0,9.0,3.0,10.0,,3.0,8.0
Random,7.0,14.0,5.0,9.0,15.0,15.0,3.0,13.0,12.0,16.0,16.0,9.0,17.0,11.0,13.0,7.0,13.0,4.0,7.0,15.0
TopPop,3.0,8.0,3.0,4.0,8.0,11.0,2.0,2.0,9.0,10.0,7.0,6.0,14.0,8.0,10.0,4.0,11.0,2.0,2.0,13.0
UserKNN,4.0,11.0,2.0,7.0,11.0,7.0,1.0,10.0,5.0,14.0,14.0,8.0,12.0,10.0,10.0,6.0,8.0,,5.0,6.0
CoClustering,,14.0,,,14.0,15.0,,,12.0,16.0,17.0,,15.0,,14.0,,13.0,,,15.0
MatrixFactorization_FunkSVD_Cython,,8.0,,,10.0,12.0,,,7.0,9.0,7.0,,9.0,,,,,,,10.0
NMFRecommender,,7.0,,,13.0,7.0,,,8.0,8.0,10.0,,8.0,,7.0,,8.0,,,10.0


In [8]:
selected_algs = select_algs(test_datasets, metric_name)
selected_algs

['ItemKNN',
 'P3alphaRecommender',
 'SLIM_BPR_Cython',
 'SLIMElasticNetRecommender',
 'EASE_R_Recommender',
 'RP3betaRecommender',
 'TopPop',
 'IALSRecommender',
 'PureSVDRecommender',
 'MatrixFactorization_BPR_Cython']

In [9]:
feature_corrs = compute_feature_corrs(test_datasets, metric_name, selected_algs)
feature_corrs

Unnamed: 0,ItemKNN,P3alphaRecommender,SLIM_BPR_Cython,SLIMElasticNetRecommender,EASE_R_Recommender,RP3betaRecommender,TopPop,IALSRecommender,PureSVDRecommender,MatrixFactorization_BPR_Cython
f__dist_feature__kind_item__pre_agg_func_count__agg_func_Gini,0.000000,0.171429,0.160839,0.090909,0.006061,0.318681,0.019549,,0.042918,0.141176
f__dist_feature__kind_item__pre_agg_func_count__agg_func_kurtosis:,0.482456,0.028571,0.041958,0.281818,0.163636,0.129670,0.587970,,0.467198,0.585294
f__dist_feature__kind_item__pre_agg_func_count__agg_func_max,0.010526,0.260714,0.377622,0.072727,0.054545,0.010989,0.239098,,0.131208,0.011765
f__dist_feature__kind_item__pre_agg_func_count__agg_func_mean,0.345614,0.103571,0.258741,0.181818,0.175758,0.129670,0.261654,,0.234212,0.479412
f__dist_feature__kind_item__pre_agg_func_count__agg_func_median,0.321480,0.124117,0.171156,0.200532,0.195126,0.146781,0.268639,,0.285885,0.472575
...,...,...,...,...,...,...,...,...,...,...
f__dist_feature__kind_user__pre_agg_func_sum__agg_func_std,0.159649,0.214286,0.181818,0.500000,0.406061,0.213187,0.428571,,0.616800,0.200000
f__num_interactions,0.254386,0.507143,0.034965,0.363636,0.321212,0.270330,0.505263,,0.432863,0.291176
f__num_items,0.624561,0.303571,0.412587,0.690909,0.321212,0.327473,0.873684,,0.727161,0.785294
f__num_users,0.361404,0.421429,0.006993,0.281818,0.212121,0.217582,0.461654,,0.283262,0.420588


In [10]:
selected_feats = select_features(test_datasets, metric_name, selected_algs)
selected_feats

['f__num_items',
 'f__dist_feature__kind_user__pre_agg_func_sum__agg_func_mode',
 'f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mode',
 'f__dist_feature__kind_item__pre_agg_func_mean__agg_func_max',
 'f__dist_feature__kind_item__pre_agg_func_mean__agg_func_skewness',
 'f__dist_feature__kind_user__pre_agg_func_count__agg_func_kurtosis:',
 'f__dist_feature__kind_user__pre_agg_func_sum__agg_func_max',
 'f__dist_feature__kind_user__pre_agg_func_mean__agg_func_mean',
 'f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mean',
 'f__dist_feature__kind_user__pre_agg_func_sum__agg_func_skewness']

In [11]:
ranked_algs.T.mean().sort_values().iloc[:10]

ItemKNN                           1.315789
P3alphaRecommender                3.666667
SLIM_BPR_Cython                   3.750000
SLIMElasticNetRecommender         3.818182
EASE_R_Recommender                3.900000
RP3betaRecommender                4.214286
TopPop                            6.850000
IALSRecommender                   7.000000
PureSVDRecommender                7.117647
MatrixFactorization_BPR_Cython    7.937500
dtype: float64

### Featurize data for training the RecZilla model

In [12]:
final_feat_columns = selected_feats
X_train = metafeats[metafeats['alg_name'].isin(selected_algs) & ~metafeats['dataset_name'].isin(test_datasets)]

In [13]:
metric_col_name = "max_test_metric_" + metric_name
X_train = X_train[[metric_col_name] + ["dataset_name", "alg_name"] + final_feat_columns]

In [14]:
X_train

Unnamed: 0,max_test_metric_PRECISION_cut_1,dataset_name,alg_name,f__num_items,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_mode,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mode,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_max,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_skewness,f__dist_feature__kind_user__pre_agg_func_count__agg_func_kurtosis:,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_max,f__dist_feature__kind_user__pre_agg_func_mean__agg_func_mean,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mean,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_skewness
13,0.008419,BookCrossingReader,ItemKNN,340556,8.0,8.0,10.0,-0.594459,6684.670330,62075.0,7.410599,7.553557,68.610767
14,0.001152,BookCrossingReader,MatrixFactorization_BPR_Cython,340556,8.0,8.0,10.0,-0.594459,6684.670330,62075.0,7.410599,7.553557,68.610767
15,0.002171,BookCrossingReader,PureSVDRecommender,340556,8.0,8.0,10.0,-0.594459,6684.670330,62075.0,7.410599,7.553557,68.610767
17,0.001950,BookCrossingReader,TopPop,340556,8.0,8.0,10.0,-0.594459,6684.670330,62075.0,7.410599,7.553557,68.610767
21,0.013704,CiaoDVDReader,ItemKNN,16121,5.0,4.0,5.0,-0.841069,314.094201,3270.0,4.195798,3.832154,13.294764
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.018182,WikilensReader,PureSVDRecommender,5111,5.0,4.0,5.0,-0.799112,25.056903,5749.0,3.558892,3.476142,4.232346
246,0.025455,WikilensReader,RP3betaRecommender,5111,5.0,4.0,5.0,-0.799112,25.056903,5749.0,3.558892,3.476142,4.232346
248,0.040000,WikilensReader,SLIMElasticNetRecommender,5111,5.0,4.0,5.0,-0.799112,25.056903,5749.0,3.558892,3.476142,4.232346
249,0.021818,WikilensReader,SLIM_BPR_Cython,5111,5.0,4.0,5.0,-0.799112,25.056903,5749.0,3.558892,3.476142,4.232346


In [15]:
transforms = {f: 'last' for f in final_feat_columns}
transforms.update({metric_col_name: list, 'alg_name': list})

X_train_grouped = X_train.groupby('dataset_name').agg(transforms)

In [16]:
X_train_grouped

Unnamed: 0_level_0,f__num_items,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_mode,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mode,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_max,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_skewness,f__dist_feature__kind_user__pre_agg_func_count__agg_func_kurtosis:,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_max,f__dist_feature__kind_user__pre_agg_func_mean__agg_func_mean,f__dist_feature__kind_item__pre_agg_func_mean__agg_func_mean,f__dist_feature__kind_user__pre_agg_func_sum__agg_func_skewness,max_test_metric_PRECISION_cut_1,alg_name
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BookCrossingReader,340556,8.0,8.0,10.0,-0.594459,6684.67033,62075.0,7.410599,7.553557,68.610767,"[0.008419000354484226, 0.001152073732718894, 0...","[ItemKNN, MatrixFactorization_BPR_Cython, Pure..."
CiaoDVDReader,16121,5.0,4.0,5.0,-0.841069,314.094201,3270.0,4.195798,3.832154,13.294764,"[0.013704406493780307, 0.005481762597512122, 0...","[ItemKNN, MatrixFactorization_BPR_Cython, P3al..."
DatingReader,168791,114.0,10.0,10.0,-0.075,880.517848,141851.0,6.010944,6.083559,23.291594,"[0.02497063364829823, 0.0026595941163867947]","[ItemKNN, TopPop]"
EpinionsReader,139738,5.0,5.0,5.0,-1.231557,107.260998,4475.0,4.042224,4.065894,8.135884,"[0.010390704531891741, 0.0020360164285463545, ...","[ItemKNN, MatrixFactorization_BPR_Cython, P3al..."
FilmTrustReader,2071,4.0,3.0,4.0,-0.82867,16.330765,653.0,3.086027,2.927186,2.499444,"[0.0845808383233533, 0.18263473053892207, 0.04...","[EASE_R_Recommender, ItemKNN, MatrixFactorizat..."
FrappeReader,4082,1.0,1.0,1.0,0.0,5.775338,188.0,1.0,1.0,2.185259,"[0.07078507078507079, 0.07335907335907338, 0.0...","[EASE_R_Recommender, ItemKNN, MatrixFactorizat..."
GowallaReader,1247095,1.0,1.0,1.0,0.0,121.24495,2038.0,1.0,1.0,9.152017,[0.0006750430067722058],[TopPop]
Jester2Reader,140,9.969,-2.738926,3.749777,-0.812001,2.260754,1251.477,0.867492,1.643203,1.864637,"[0.0667282054923402, 0.07613654518665791, 0.06...","[EASE_R_Recommender, ItemKNN, MatrixFactorizat..."
LastFMReader,17632,2.0,1.0,35323.0,14.511389,59.26306,457612.0,755.217578,426.248524,3.622031,"[0.031332979288369624, 0.013276686139139671, 0...","[ItemKNN, MatrixFactorization_BPR_Cython, P3al..."
MarketBiasAmazonReader,9560,5.0,5.0,5.0,-1.529576,117.367099,185.0,4.208464,4.156145,8.443656,"[0.019473813621834275, 0.01942463732480944, 0....","[EASE_R_Recommender, ItemKNN, MatrixFactorizat..."


In [17]:
def get_ordered_target(row):
    avg = np.mean(row[metric_col_name])
    algos_perfs = {alg: val for val, alg in zip(row[metric_col_name], row['alg_name'])}
    algos_perfs.update({alg: avg for alg in selected_algs if alg not in algos_perfs})
    ordered_target = [algos_perfs[key] for key in sorted(algos_perfs.keys(), reverse=True)]
    return ordered_target

X_train_grouped['target'] = X_train_grouped.apply(get_ordered_target, axis=1)

### Train RecZilla model

In [18]:
X_train = X_train_grouped[final_feat_columns].values
y_train = X_train_grouped['target'].to_list()

In [19]:
from sklearn.multioutput import RegressorChain
import xgboost as xgb

In [20]:
base_model = xgb.XGBRegressor(objective='reg:squarederror')
model = RegressorChain(base_model)

In [21]:
model = model.fit(X_train, y_train)



In [22]:
test_data = metafeats[metafeats['dataset_name'].isin(test_datasets) & metafeats['alg_name'].isin(selected_algs)]
test_data = test_data[[metric_col_name] + ["dataset_name", "alg_name"] + final_feat_columns]

In [23]:
X_test = test_data[final_feat_columns].iloc[0].values
y_test = test_data.groupby('dataset_name').agg(transforms).apply(get_ordered_target, axis=1)

In [24]:
model.predict([X_test])

array([[0.00619698, 0.01289466, 0.01524866, 0.01046404, 0.00625345,
        0.01275656, 0.00657287, 0.016408  , 0.01017338, 0.01005143]])

In [25]:
y_test.to_list()

[[0.013664935774801858,
  0.034536327152946585,
  0.023980164266912157,
  0.01111894247781246,
  0.014240301491635618,
  0.02684081069029502,
  0.01324779563009738,
  0.04391478833733692,
  0.023980164266912157,
  0.0342774125803714]]