In [2]:
import pickle
import json
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# --- Plot styling -----------------------------------------------------------
# NOTE(review): the second sns.set(...) call passes neither `style` nor
# `font_scale`, so it presumably re-applies seaborn's defaults for both and
# discards the values given in the first call — confirm which look is intended.
sns.set(style='ticks', font_scale = 1.5)
sns.set(rc={'figure.figsize': (18, 6)})
sns.set_style('whitegrid')

plt.rcParams.update({'font.size': 12})

# --- Experiment configuration -----------------------------------------------
# Clustering models iterated over by the analysis cells below.
models = ["dbscan", "hdbscan", "birch", "kmeans-minibatch"]
# Models whose results are looked up under the plain dataset name; for every
# other model the fANOVA cell appends "_normalized" to the dataset name.
norm_models = ["dbscan", "hdbscan"]
# Hydra override value for `suite=` and number of dataset indices to iterate.
suite = 'generic'
num_datasets = 56

# Trial results, indexed below as df_ref['random'][model] and
# df_ref['bayesian'][model].
# The file is opened in a `with` block so the handle is closed deterministically.
# SECURITY: pickle.load can execute arbitrary code; only load a trusted file.
with open('generic_dfs.pkl', 'rb') as pkl_file:
    df_ref = pickle.load(pkl_file)


In [3]:
from fanova import fANOVA

import ConfigSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \
    UniformIntegerHyperparameter, Constant
from hydra import compose, initialize
from omegaconf import OmegaConf

from clustering_hyperparameters.utils.type_utils import get_type_from_str
# Column names kept for reference; this cell does not use them.
# Presumably the non-hyperparameter columns of the trial frames — TODO confirm.
metadata_cols = ["adjusted_rand_score", 
                "trial_status",
                "generator_model",
                "trial_index",
                "arm_name",
                "compute_time"]

# Per-model table of fANOVA individual importances: one row per dataset, one
# column per range-type hyperparameter (plus the 'dataset' column).
res_df = { m : pd.DataFrame() for m in models }
for model in models:
    # Pool the random-search and Bayesian trials for this model, then group
    # them by dataset so each dataset's trials can be fetched by name.
    all_df_model = pd.concat([df_ref['random'][model], df_ref['bayesian'][model]])
    all_df_model_groups = all_df_model.groupby('dataset')

    # Rows are collected in a list and turned into a DataFrame once per model:
    # DataFrame.append was removed in pandas 2.0 and re-copies the frame on
    # every call.
    records = []

    with initialize(config_path="../src/clustering_hyperparameters/conf", job_name="app"):
        for dataset_index in range(num_datasets):
            cfg = compose(config_name="config", overrides=["model="+model, "suite="+suite, "dataset_index="+str(dataset_index)])
            cfg.dataset_index = str(cfg.dataset_index)
            cfg = OmegaConf.to_container(cfg, resolve=True)
            dataset = cfg["suite"]["datasets"][dataset_index]["name"]

            # Models outside `norm_models` are looked up under "<name>_normalized".
            dataset_with_norm = dataset
            if model not in norm_models:
                dataset_with_norm += "_normalized"

            try:
                dataset_df = all_df_model_groups.get_group(dataset_with_norm)
            except KeyError:
                # No trials recorded for this (model, dataset) pair: skip it.
                continue
            print("model:", model, "dataset_index:", dataset_index, "dataset", dataset)

            # Build a ConfigSpace from the model's range-type parameters only.
            params = [ x for x in cfg["model"]["params"] if x['type']=='range']
            param_names = [ x['name'] for x in params ]
            cs = ConfigSpace.ConfigurationSpace()
            for param in params:
                log_scale = param.get('log_scale', False)
                if param['value_type'] == "int":
                    cs.add_hyperparameter(UniformIntegerHyperparameter(param["name"], lower=param['bounds'][0], upper=param['bounds'][1], log=log_scale))
                elif param['value_type'] == "float":
                    cs.add_hyperparameter(UniformFloatHyperparameter(param["name"], lower=param['bounds'][0], upper=param['bounds'][1], log=log_scale))
                elif param['value_type'] == "str":
                    cs.add_hyperparameter(CategoricalHyperparameter(param["name"], choices=param['choices']))

            # Fit fANOVA on hyperparameter values (X) against the ARI score (Y).
            X , Y = dataset_df[param_names], dataset_df['adjusted_rand_score'].to_numpy()
            f = fANOVA(X = X, Y = Y, config_space=cs, seed=0)

            res_dict = {'dataset': dataset}
            for param in param_names:
                try:
                    res_dict[param] = f.quantify_importance((param,))[(param,)]['individual importance']
                except Exception as err:
                    # Best-effort: keep the 0.0 fallback, but report the failure
                    # so a zero in the table is not mistaken for a measurement.
                    print("  importance unavailable for", param, "on", dataset,
                          "->", repr(err), "(recorded as 0.0)")
                    res_dict[param] = 0.0
            records.append(res_dict)

    res_df[model] = pd.DataFrame(records)




model: dbscan dataset_index: 0 dataset kr-vs-kp
model: dbscan dataset_index: 1 dataset letter
model: dbscan dataset_index: 2 dataset balance-scale
model: dbscan dataset_index: 3 dataset mfeat-factors
model: dbscan dataset_index: 4 dataset mfeat-fourier
model: dbscan dataset_index: 5 dataset mfeat-karhunen
model: dbscan dataset_index: 6 dataset mfeat-morphological
model: dbscan dataset_index: 7 dataset mfeat-zernike
model: dbscan dataset_index: 8 dataset cmc
model: dbscan dataset_index: 9 dataset credit-g
model: dbscan dataset_index: 10 dataset diabetes
model: dbscan dataset_index: 11 dataset spambase
model: dbscan dataset_index: 12 dataset splice
model: dbscan dataset_index: 13 dataset tic-tac-toe
model: dbscan dataset_index: 14 dataset vehicle
model: dbscan dataset_index: 15 dataset electricity
model: dbscan dataset_index: 16 dataset satimage
model: dbscan dataset_index: 17 dataset vowel
model: dbscan dataset_index: 18 dataset isolet
model: dbscan dataset_index: 19 dataset analcatdata

In [5]:
from IPython.display import display

# Reshape each model's importance table so that rows are indexed by
# (model, hyperparameter) and every column is one dataset.
per_model_frames = []
for model in models:
    importances = (
        res_df[model]
        .set_index('dataset')                     # one row per dataset
        .T                                        # -> one row per hyperparameter
        .rename_axis(index='hyperparameters')
        .reset_index()
        .assign(model=model)
        .set_index(['model', 'hyperparameters'])
    )
    per_model_frames.append(importances)

# Stack all models and scale by 100 (the source values are multiplied as-is).
# Datasets missing for a model become NaN through column alignment.
all_dfs = pd.concat(per_model_frames) * 100
display(all_dfs)

# Export the same table as LaTeX with three decimals. The handle is named so
# it does not rebind `f`, which holds the fANOVA object from the previous cell.
with open('importances_generic.tex', 'w') as tex_file:
    tex_file.write(all_dfs.to_latex(float_format="%.3f"))

Unnamed: 0_level_0,dataset,kr-vs-kp,letter,balance-scale,mfeat-factors,mfeat-fourier,mfeat-karhunen,mfeat-morphological,mfeat-zernike,cmc,credit-g,...,steel-plates-fault,climate-model-simulation-crashes,wilt,car,segment,mfeat-pixel,jungle_chess_2pcs_raw_endgame_complete,numerai28.6,Internet-Advertisements,dna
model,hyperparameters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
dbscan,eps,41.22071,63.46753,43.121513,69.613667,77.576333,44.263242,87.021365,68.30789,34.017434,24.649256,...,69.551526,35.258499,61.216072,19.490204,83.641624,44.731198,75.899938,65.43731,33.046262,26.26167
dbscan,min_samples,6.056756,6.106907,15.53052,4.482135,1.776517,6.235089,3.64532,2.0651,10.792968,19.533144,...,4.153796,9.991612,1.556205,21.917812,3.727037,8.312577,5.594837,5.749844,16.961108,20.001601
hdbscan,cluster_selection_epsilon,0.847365,,7.681203,28.989496,10.59244,3.193991,76.879312,19.304382,8.001079,2.959263,...,29.51773,5.336312,0.665622,2.318259,59.449002,38.295536,,,1.654816,1.151808
hdbscan,min_cluster_size,19.361229,,10.553141,3.426232,7.273913,1.035273,0.7992,11.773107,1.712235,3.986944,...,1.633072,12.132141,4.077708,22.163033,2.226572,1.706278,,,7.569374,79.162233
hdbscan,min_samples,29.355308,,26.680713,8.78673,34.797058,70.762848,4.502101,31.800586,63.971244,41.713595,...,19.536199,39.210867,72.156392,38.679238,8.559348,10.859102,,,44.276848,1.297827
birch,branching_factor,10.670878,10.377799,17.86662,16.600725,7.124044,4.644191,10.663907,9.924263,5.853255,15.855833,...,17.514937,12.078069,16.913516,15.072652,11.546067,10.237963,10.720835,27.500132,12.429516,7.455596
birch,threshold,37.780271,45.022856,16.064394,38.854121,60.058822,72.86088,61.158802,42.997999,58.923584,22.3048,...,39.963893,28.355941,27.736918,35.479207,49.164562,49.317449,28.174959,54.285564,51.05957,49.818596
kmeans-minibatch,batch_size,0.357834,1.163492,1.889769,2.260817,1.51192,14.141208,0.249163,0.426068,1.852554,0.387432,...,1.047214,0.41652,5.028219,2.060026,0.613692,2.113219,1.444057,4.01623,1.450126,1.768699
kmeans-minibatch,init_size,27.138047,4.020237,8.233833,8.114177,11.050072,2.824963,31.720499,13.56323,10.377049,6.686933,...,7.209137,34.460194,13.276197,3.701417,39.13779,14.193093,3.209447,3.997026,7.819008,10.238438
kmeans-minibatch,max_iter,5.542125,3.842833,3.346068,2.454882,2.573653,2.47215,8.75311,3.925617,3.597922,1.578469,...,3.493906,1.66955,1.561814,3.392724,0.824372,1.298106,5.863471,5.936892,1.006388,3.264723
