In [1]:
import os

# The rest of the notebook uses paths relative to the directory that holds
# config.yaml (e.g. 'config.yaml', 'output/...'), so step up one level to reach
# it. The guard makes the cell idempotent: re-running it (or starting the
# kernel in the right directory already) no longer climbs one directory too far.
if not os.path.exists('config.yaml'):
    os.chdir('..')

In [2]:
import pandas as pd
from pathlib import Path
from omegaconf import OmegaConf
# Location of the configuration file, relative to the current working directory.
CONFIG_PATH = Path('config.yaml')

# Parsed configuration object, loaded with OmegaConf.
config = OmegaConf.load(CONFIG_PATH)

In [3]:
from utils import *
from collections import defaultdict

# Collect one flat record (dict) per training run found under output/.
#
# Each run is kept as its own dict and the DataFrame is built once at the end.
# Appending to independent per-column lists breaks as soon as one run has a
# missing or extra key: the lists end up with different lengths and
# pd.DataFrame() raises. With per-run records a missing key simply becomes NaN
# in that run's row.
run_records = []

# sorted() makes the row order independent of the filesystem's listing order.
for dir_name in sorted(Path('output').iterdir()):
    if not dir_name.is_dir():
        continue

    config_path = dir_name / 'config.yaml'
    assert config_path.exists(), f'{config_path} does not exist'
    # NOTE: this rebinds the notebook-level name `config`, exactly as the
    # original loop did; later cells may read it.
    config = load_conf(config_path)

    # Directories whose config has no training section are not model runs.
    if 'model_train' not in config:
        continue

    record = {}

    # hyperparameters: top-level training options first, then the nested
    # `params` entries flattened into the same record.
    for hyperparameter in config.model_train:
        if hyperparameter == 'params':
            continue
        record[hyperparameter] = config.model_train[hyperparameter]
    for hyperparameter in config.model_train.params:
        # Skipped so it is only ever taken from the top-level options above.
        if hyperparameter == 'early_stopping_rounds':
            continue
        record[hyperparameter] = config.model_train.params[hyperparameter]

    # feature engineering: every `run_*` switch; `run_artist_name` defaults to
    # False for configs that do not define it.
    if 'run_artist_name' not in config.feature_engineering:
        record['run_artist_name'] = False
    for feature in config.feature_engineering:
        if feature.startswith('run_'):
            record[feature] = config.feature_engineering[feature]

    # validation metric stored next to the run's config
    auroc = load_json(dir_name / 'val_auroc.json')
    record['val_auroc'] = auroc['val_auroc']

    run_records.append(record)

result_df = pd.DataFrame(run_records).drop_duplicates()
result_df.to_csv('output/result_analysis.csv', index=False)
result_df.head()

Unnamed: 0,early_stopping_rounds,num_boost_round,bagging_fraction,bagging_freq,feature_fraction,learning_rate,max_bin,max_depth,min_data_in_leaf,num_leaves,run_artist_name,run_composer,run_genre_id,run_lyricist,run_register_duration,val_auroc
0,10,200,0.8,5,0.8,0.1,255,31,20,1024,False,False,True,False,False,0.801187
1,20,200,0.8,5,0.8,0.1,255,15,20,1024,False,True,False,True,False,0.779579
2,10,200,0.8,5,0.8,0.1,255,31,20,1024,False,True,True,False,True,0.802331
3,10,200,0.8,5,0.8,0.1,255,63,20,1024,False,False,True,True,True,0.814616
4,10,200,0.8,5,0.8,0.1,255,31,20,1024,False,False,False,True,True,0.801424


In [4]:
# Split the columns of result_df into hyperparameter columns and
# feature-engineering switches.
#
# The lists are derived from result_df itself instead of from `config`: by this
# point `config` holds whatever the collection loop loaded last (directory
# listing order is arbitrary, and that last config may not even contain
# `model_train`), whereas the cells below index result_df rows with exactly
# these names. Column conventions: feature switches start with 'run_', the
# metric is 'val_auroc', everything else is a hyperparameter.
hyperparameters = [
    column
    for column in result_df.columns
    if not column.startswith('run_') and column != 'val_auroc'
]

feature_engineering = [
    column for column in result_df.columns if column.startswith('run_')
]

In [5]:
# Sanity check: number of hyperparameter names and of feature-engineering switches.
len(hyperparameters), len(feature_engineering)

(10, 5)

In [6]:
# Render each run's hyperparameter values as a dict string (one per row of
# result_df), then print every distinct combination once, in sorted order.
hyperparameter_values_str_list = [
    str({name: row[name] for name in hyperparameters})
    for _, row in result_df.iterrows()
]

for hy_str in sorted(set(hyperparameter_values_str_list)):
    print(hy_str)

{'early_stopping_rounds': 10, 'num_boost_round': 100, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': 15, 'min_data_in_leaf': 20, 'num_leaves': 1024}
{'early_stopping_rounds': 10, 'num_boost_round': 200, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'learning_rate': 0.05, 'max_bin': 255, 'max_depth': 63, 'min_data_in_leaf': 20, 'num_leaves': 1024}
{'early_stopping_rounds': 10, 'num_boost_round': 200, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': 15, 'min_data_in_leaf': 20, 'num_leaves': 1024}
{'early_stopping_rounds': 10, 'num_boost_round': 200, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.8, 'learning_rate': 0.1, 'max_bin': 255, 'max_depth': 31, 'min_data_in_leaf': 20, 'num_leaves': 1024}
{'early_stopping_rounds': 10, 'num_boost_round': 200, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_frac

In [7]:
# Hyperparameters used to group runs in the analysis below.
# NOTE(review): the combinations printed earlier in this notebook also show
# learning_rate taking more than one value (0.05 and 0.1); confirm that leaving
# it out of this list is intentional.
hyperparameters_that_matter = [
    'early_stopping_rounds',
    'num_boost_round',
    'max_depth',
]

In [8]:
# Group runs by their values of `hyperparameters_that_matter`; inside each
# group keep the feature-engineering switches plus the validation AUROC.
grouped_runs = defaultdict(list)

for _, row in result_df.iterrows():
    # The group key is the string form of the selected hyperparameter values.
    group_key = str({name: row[name] for name in hyperparameters_that_matter})

    run_entry = {feature: row[feature] for feature in feature_engineering}
    run_entry['val_auroc'] = row['val_auroc']

    grouped_runs[group_key].append(run_entry)

# Groups ordered by key; runs inside a group ordered from best to worst AUROC.
result_dict = {
    group_key: sorted(entries, key=lambda entry: entry['val_auroc'], reverse=True)
    for group_key, entries in sorted(grouped_runs.items())
}

save_json(result_dict, 'output/result_analysis.json')