In [1]:
import pickle
import json
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# --- Plot styling -----------------------------------------------------------
# NOTE(review): the second sns.set() call below does not pass font_scale/style,
# so it most likely re-applies seaborn's defaults for them and discards the
# font_scale=1.5 / 'ticks' settings from the first call — confirm which look
# was intended before merging these calls.
sns.set(style='ticks', font_scale = 1.5)
sns.set(rc={'figure.figsize': (18, 6)})
sns.set_style('whitegrid')

plt.rcParams.update({'font.size': 12})

# --- Experiment configuration -----------------------------------------------
# Clustering models whose hyperparameter importances are analysed.
models = ["dbscan", "hdbscan", "birch", "kmeans-minibatch"]
# Models whose results are looked up under the plain dataset name; every other
# model is looked up under "<dataset>_normalized".
norm_models = ["dbscan", "hdbscan"]
# Name of the dataset suite passed to the config as "suite=<name>".
suite = 'nlp'
# Number of dataset indices (0 .. num_datasets - 1) iterated over in the suite.
num_datasets = 9

# Trial results, indexed as df_ref[<search strategy>][<model>] with the
# strategies 'random' and 'bayesian' being read further down.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open('nlp_dfs.pkl', 'rb') as pkl_file:
    df_ref = pickle.load(pkl_file)


In [2]:
from fanova import fANOVA

import ConfigSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \
    UniformIntegerHyperparameter, Constant
from hydra import compose, initialize
from omegaconf import OmegaConf

from clustering_hyperparameters.utils.type_utils import get_type_from_str
# Presumably the non-hyperparameter columns of the trial frames (TODO confirm);
# not referenced anywhere in this cell.
metadata_cols = ["adjusted_rand_score",
                "trial_status",
                "generator_model",
                "trial_index",
                "arm_name",
                "compute_time"]

# res_df[model] ends up as one table per model: one row per dataset, a 'dataset'
# column plus one column per tunable ("range") hyperparameter holding its fANOVA
# individual importance. Every model gets an (initially empty) entry up front.
res_df = { m : pd.DataFrame() for m in models }
for model in models:
    # Pool the random-search and Bayesian-search trials of this model and split
    # them by dataset.
    all_df_model = pd.concat([df_ref['random'][model], df_ref['bayesian'][model]])
    all_df_model_groups = all_df_model.groupby('dataset')

    # Rows are accumulated in a plain list and converted to a DataFrame once per
    # model. DataFrame.append (used previously) was removed in pandas 2.0 and
    # copied the whole frame on every call.
    importance_rows = []

    with initialize(config_path="../src/clustering_hyperparameters/conf", job_name="app"):
        for dataset_index in range(num_datasets):
            cfg = compose(
                config_name="config",
                overrides=["model="+model, "suite="+suite, "dataset_index="+str(dataset_index)],
            )
            # NOTE(review): dataset_index is stored back as a string before the
            # config is resolved — presumably needed by an interpolation in the
            # config files; confirm.
            cfg.dataset_index = str(cfg.dataset_index)
            cfg = OmegaConf.to_container(cfg, resolve=True)
            dataset = cfg["suite"]["datasets"][dataset_index]["name"]

            # Models outside norm_models are stored under "<dataset>_normalized".
            dataset_with_norm = dataset
            if model not in norm_models:
                dataset_with_norm += "_normalized"
            try:
                dataset_df = all_df_model_groups.get_group(dataset_with_norm)
            except KeyError:
                # No trials recorded for this model/dataset pair: skip it.
                continue
            print("model:", model, "dataset_index:", dataset_index, "dataset", dataset)

            # Only "range" parameters are analysed; each one is mirrored into a
            # ConfigSpace hyperparameter according to its value_type.
            params = [ x for x in cfg["model"]["params"] if x['type']=='range']
            param_names = [ x['name'] for x in params ]
            cs = ConfigSpace.ConfigurationSpace()
            for param in params:
                log_scale = param.get('log_scale', False)
                if param['value_type'] == "int":
                    cs.add_hyperparameter(UniformIntegerHyperparameter(
                        param["name"], lower=param['bounds'][0], upper=param['bounds'][1], log=log_scale))
                elif param['value_type'] == "float":
                    cs.add_hyperparameter(UniformFloatHyperparameter(
                        param["name"], lower=param['bounds'][0], upper=param['bounds'][1], log=log_scale))
                elif param['value_type'] == "str":
                    cs.add_hyperparameter(CategoricalHyperparameter(param["name"], choices=param['choices']))
                # Any other value_type registers no hyperparameter.

            # Inputs: the hyperparameter columns; target: adjusted Rand score.
            X , Y = dataset_df[param_names], dataset_df['adjusted_rand_score'].to_numpy()
            f = fANOVA(X = X, Y = Y, config_space=cs, seed=0)
            res_dict = {'dataset': dataset}
            for param in param_names:
                try:
                    res_dict[param] = f.quantify_importance((param,))[(param,)]['individual importance']
                except Exception as err:
                    # Best-effort fallback: keep the table complete, but report
                    # the failure instead of silently recording zero importance.
                    print("warning: fANOVA importance failed for model", model,
                          "dataset", dataset, "param", param, "->", repr(err),
                          "; recording 0.0")
                    res_dict[param] = 0.0
            importance_rows.append(res_dict)

    # Columns follow insertion order: 'dataset' first, then the hyperparameters
    # in the order they appear in the model config.
    res_df[model] = pd.DataFrame(importance_rows)




model: dbscan dataset_index: 0 dataset AGNews-paraphrase-mpnet
model: dbscan dataset_index: 1 dataset AGNews-stsb-distilroberta
model: dbscan dataset_index: 2 dataset AGNews-glove
model: dbscan dataset_index: 3 dataset DBpedia-paraphrase-mpnet
model: dbscan dataset_index: 4 dataset DBpedia-stsb-distilroberta
model: dbscan dataset_index: 5 dataset DBpedia-glove
model: dbscan dataset_index: 6 dataset YahooAnswers-paraphrase-mpnet
model: dbscan dataset_index: 7 dataset YahooAnswers-stsb-distilroberta
model: dbscan dataset_index: 8 dataset YahooAnswers-glove
model: hdbscan dataset_index: 0 dataset AGNews-paraphrase-mpnet
model: hdbscan dataset_index: 1 dataset AGNews-stsb-distilroberta
model: hdbscan dataset_index: 2 dataset AGNews-glove
model: hdbscan dataset_index: 3 dataset DBpedia-paraphrase-mpnet
model: hdbscan dataset_index: 4 dataset DBpedia-stsb-distilroberta
model: hdbscan dataset_index: 5 dataset DBpedia-glove
model: hdbscan dataset_index: 6 dataset YahooAnswers-paraphrase-mpnet


In [3]:
from IPython.display import display
# Reshape each model's importance table to rows = (model, hyperparameter) and
# columns = (dataset, encoder), then stack all models into one frame.
importance_tables = []
for model in models:
    df = res_df[model].copy()

    # Dataset names have the form "<corpus>-<encoder>": everything before the
    # first dash is the corpus, everything after it is the encoder.
    full_names = df['dataset']
    df['encoder'] = full_names.map(lambda name: name.partition("-")[2])
    df['dataset'] = full_names.map(lambda name: name.partition("-")[0])

    # Transpose so that hyperparameters become rows.
    df = df.set_index(['dataset', 'encoder']).T.copy()
    df['model'] = [model] * len(df)
    df = (
        df.rename_axis(index='hyperparameters')
          .reset_index()
          .set_index(['model', 'hyperparameters'])
    )
    importance_tables.append(df)

# Express the importances as percentages.
all_dfs = pd.concat(importance_tables) * 100
display(all_dfs)

# Export the same table as LaTeX with three decimals.
latex_source = all_dfs.to_latex(float_format="%.3f")
with open('importances_nlp.tex', 'w') as f:
    f.write(latex_source)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Unnamed: 0_level_0,dataset,AGNews,AGNews,AGNews,DBpedia,DBpedia,DBpedia,YahooAnswers,YahooAnswers,YahooAnswers
Unnamed: 0_level_1,encoder,paraphrase-mpnet,stsb-distilroberta,glove,paraphrase-mpnet,stsb-distilroberta,glove,paraphrase-mpnet,stsb-distilroberta,glove
model,hyperparameters,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
dbscan,eps,37.568645,37.324431,50.99109,58.151817,78.663948,65.720691,53.200535,53.458572,63.15547
dbscan,min_samples,6.477639,8.604143,11.112048,10.376788,2.716656,9.357979,5.998249,16.36511,4.416404
hdbscan,cluster_selection_epsilon,0.262562,1.577297,9.125589,2.520172,50.7941,67.427803,3.462006,2.097629,0.482261
hdbscan,min_cluster_size,4.927874,12.54838,10.096574,18.541017,2.263034,2.135467,13.577446,26.155241,72.431315
hdbscan,min_samples,79.091124,51.02572,52.466761,32.549169,6.540315,4.479539,32.570445,23.911001,10.668257
birch,branching_factor,5.720532,12.066211,5.877264,14.575826,8.881099,22.611233,14.130936,15.005167,12.741153
birch,threshold,76.903115,53.877544,65.074782,43.376474,53.085998,47.235873,59.315362,57.778914,46.800241
kmeans-minibatch,batch_size,2.225524,4.174537,0.787536,1.831552,0.858519,0.729947,8.598359,3.82973,1.933805
kmeans-minibatch,init_size,4.647261,8.632586,3.9307,17.680704,26.400394,4.014052,2.515027,3.402712,4.934313
kmeans-minibatch,max_iter,4.326036,1.283839,8.644913,3.200539,3.617788,4.043148,3.677978,2.200715,3.201182
