# Hyper-parameter Optimization Results

To decide which hyper-parameters to use for our models, we evaluate the hyper-parameter optimization (HPO) results on the GoEmotions dataset.

In [1]:
import glob
import json
import os

import pandas as pd

def _load_json(json_path):
    """Return the decoded content of the JSON file at ``json_path``."""
    with open(json_path) as fp:
        return json.load(fp)


# Every file matching ../output/<dir>/<dir>/<name>.json is one experiment result.
# NOTE(review): `recursive=True` only affects patterns containing `**`, so it
# has no effect on this pattern.
paths = glob.glob('../output/*/*/*.json', recursive=True)

# Keep each path next to its payload; the path is parsed later to recover the
# dataset / model type / experiment identifiers.
results = [(path, _load_json(path)) for path in paths]

# One row per result file: where it was read from and its decoded JSON content.
results_df = pd.DataFrame(results, columns=['Path', 'Dict'])

In [2]:
# Split-level metrics read directly from each split's results (default for extract_macro_stats).
EXPERIMENT_METRICS = ['macro_f1', 'micro_f1']
# Names of the columns filled from the three components returned by parse_path.
EXPERIMENT_COLUMNS = ['Dataset', 'ModelType', 'Experiment']
# Per-label metrics that extract_category_stats averages across labels.
INTERNAL_METRICS = ['precision', 'recall']
# Per-label field used as the weight in that average.
WEIGHT_FIELD = 'support'


def parse_path(path):
    """Split a result file path into dataset, model type and experiment name.

    The last three components of ``path`` are interpreted as
    ``<dataset>/<model_type>/<experiment>.<ext>``.

    Args:
        path: Path to a result file, e.g.
            ``'../output/Vent/replica/abc123.json'``.

    Returns:
        A ``(dataset, model_type, experiment)`` tuple of strings, where
        ``experiment`` is the file name up to (not including) its first dot.

    Raises:
        ValueError: If ``path`` has fewer than three components.
    """
    # glob joins matched components with the platform separator, so paths may
    # contain backslashes on Windows; normalise before splitting on '/'.
    normalized = path.replace(os.sep, '/')
    dataset, model_type, experiment = normalized.split('/')[-3:]
    # Keep only the part before the first dot (drops the `.json` extension).
    experiment = experiment.split('.')[0]
    return dataset, model_type, experiment


def build_experiment_columns(dataframe):
    """Add the path-derived experiment columns to ``dataframe``.

    Every value of the ``Path`` column is run through ``parse_path`` and the
    resulting components are stored, in order, in the columns named by
    ``EXPERIMENT_COLUMNS``.

    Args:
        dataframe: Frame with a ``Path`` column; it is modified in place.

    Returns:
        The same ``dataframe`` object, with the new columns set.
    """
    parsed_paths = list(map(parse_path, dataframe.Path))
    for position, column_name in enumerate(EXPERIMENT_COLUMNS):
        column_values = []
        for components in parsed_paths:
            column_values.append(components[position])
        dataframe[column_name] = column_values
    return dataframe


def extract_macro_stats(dataframe, metrics=EXPERIMENT_METRICS):
    """Add one ``<split>_<metric>`` column per split and split-level metric.

    Each value is read from ``exp_dict['results'][split][metric]`` of the
    row's ``Dict`` entry. The split names are taken from the configuration of
    the first row only, so every row must provide results for those splits.

    Args:
        dataframe: Frame with a ``Dict`` column holding the decoded experiment
            results; it is modified in place.
        metrics: Names of the split-level metrics to extract.

    Returns:
        The same ``dataframe`` object, with the new columns set. A frame
        without rows is returned unchanged, since there is no configuration
        to read the split names from.
    """
    if len(dataframe) == 0:
        return dataframe

    # `.iloc[0]` is positional: a plain `[0]` is a label lookup and raises
    # KeyError on frames whose index does not contain the label 0.
    split_names = dataframe['Dict'].iloc[0]['config']['data_config']['split_names']
    for split in split_names:
        for metric in metrics:
            new_col_name = '{}_{}'.format(split, metric)
            dataframe[new_col_name] = [exp_dict['results'][split][metric] for exp_dict in dataframe['Dict']]
    return dataframe


def extract_category_stats(dataframe, metrics=INTERNAL_METRICS, weight_field=WEIGHT_FIELD):
    """Add one ``<split>_<metric>`` column with the weighted per-label average.

    For every experiment and split, each per-label value of ``metric`` found
    under ``exp_dict['results'][split]['labels']`` is weighted by that label's
    ``weight_field`` and the weighted values are averaged. The split names are
    taken from the configuration of the first row only.

    Args:
        dataframe: Frame with a ``Dict`` column holding the decoded experiment
            results; it is modified in place.
        metrics: Names of the per-label metrics to average.
        weight_field: Name of the per-label field used as the weight.

    Returns:
        The same ``dataframe`` object, with the new columns set. A frame
        without rows is returned unchanged. When the total weight of a split
        is zero (including a split without labels) the score is NaN, because
        the weighted average is undefined.
    """
    if len(dataframe) == 0:
        return dataframe

    # `.iloc[0]` is positional: a plain `[0]` is a label lookup and raises
    # KeyError on frames whose index does not contain the label 0.
    split_names = dataframe['Dict'].iloc[0]['config']['data_config']['split_names']
    for split in split_names:
        # The per-label results and their total weight depend only on the
        # experiment and the split, so compute them once for all metrics.
        weighted_labels = []
        for exp_dict in dataframe['Dict']:
            label_values = list(exp_dict['results'][split]['labels'].values())
            global_weight = sum(label_results[weight_field] for label_results in label_values)
            weighted_labels.append((label_values, global_weight))

        for metric in metrics:
            new_col_name = '{}_{}'.format(split, metric)
            metric_scores = []
            for label_values, global_weight in weighted_labels:
                if global_weight == 0:
                    # Avoid ZeroDivisionError; the average is undefined here.
                    metric_scores.append(float('nan'))
                    continue
                metric_score = sum(label_results[weight_field] * label_results[metric] / global_weight
                                   for label_results in label_values)
                metric_scores.append(metric_score)
            dataframe[new_col_name] = metric_scores
    return dataframe


def extract_model_details(dataframe):
    """Add the ``Extractor`` and ``Model`` columns read from each config.

    Args:
        dataframe: Frame with a ``Dict`` column holding the decoded experiment
            results; it is modified in place.

    Returns:
        The same ``dataframe`` object, with the two columns set.
    """
    # (target column, config section, key inside that section)
    field_sources = (
        ('Extractor', 'extractor_config', 'ex_type'),
        ('Model', 'model_config', 'model_name'),
    )
    for column_name, section, key in field_sources:
        dataframe[column_name] = [exp_dict['config'][section][key] for exp_dict in dataframe['Dict']]
    return dataframe


def parse_df(dataframe):
    """Run every column-extraction step on ``dataframe`` and return the result.

    The steps run in a fixed order: experiment identifiers, model details,
    split-level metrics, then weighted per-label metrics.

    Args:
        dataframe: Frame with the ``Path`` and ``Dict`` columns.

    Returns:
        The frame returned by the last step.
    """
    pipeline = (
        build_experiment_columns,
        extract_model_details,
        extract_macro_stats,
        extract_category_stats,
    )
    for step in pipeline:
        dataframe = step(dataframe)
    return dataframe


In [3]:
# Enrich the loaded results with identifiers, model details and metric columns.
parsed_df = parse_df(results_df)

# Show the runs ordered by validation macro-F1, lowest first.
parsed_df.sort_values(by='valid_macro_f1', ascending=True)

Unnamed: 0,Path,Dict,Dataset,ModelType,Experiment,Extractor,Model,train_macro_f1,train_micro_f1,valid_macro_f1,valid_micro_f1,test_macro_f1,test_micro_f1,train_precision,train_recall,valid_precision,valid_recall,test_precision,test_recall
0,../output/Vent/replica/882dc1bfe667f5eabe58b07...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica,882dc1bfe667f5eabe58b07ef2bde36a,fasttext,lstm,0.023937,0.037033,0.023898,0.038052,0.023806,0.038732,0.019507,0.854396,0.019895,0.871753,0.020267,0.873049
13,../output/Vent/replica/2702fa7a3c62efa83f4ef1e...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica,2702fa7a3c62efa83f4ef1e4fee64fb7,fasttext,lstm,0.043402,0.063942,0.042872,0.066260,0.042768,0.066710,0.041179,0.440259,0.044295,0.447868,0.043758,0.484318
255,../output/GoEmotions/classic/98f6bca9a4ab65148...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,classic,98f6bca9a4ab651482d7e1b22f78f86b,tfidf,sgd,0.086467,0.237511,0.086287,0.237473,0.087133,0.243009,0.203933,0.911316,0.207818,0.905956,0.234923,0.879444
108,../output/GoEmotions/classic/a93108cca090c2dfb...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,classic,a93108cca090c2dfbbbc383d76859d23,tfidf,sgd,0.087561,0.239064,0.087443,0.238925,0.088382,0.244441,0.239599,0.878285,0.225533,0.888558,0.221263,0.887818
220,../output/GoEmotions/classic/bc505ed7a45b78401...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,classic,bc505ed7a45b78401005670115de7804,tfidf,sgd,0.087547,0.219247,0.087490,0.218868,0.089887,0.224112,0.190787,0.911316,0.196505,0.904545,0.192659,0.917207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012,../output/GoEmotions/neural/812213d5b056aac532...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,neural,812213d5b056aac532335d8800486c5d,bert,dnnpool,0.593322,0.634571,0.515087,0.559888,0.499480,0.556102,0.609850,0.668141,0.510231,0.642633,0.519813,0.621899
685,../output/GoEmotions/neural/0ae5faa3b9dd41390d...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,neural,0ae5faa3b9dd41390d57d7e006254968,bert,dnnpool,0.593322,0.634571,0.515087,0.559888,0.499480,0.556102,0.609850,0.668141,0.510231,0.642633,0.519813,0.621899
1871,../output/GoEmotions/neural/7e372907c3f585085f...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,neural,7e372907c3f585085f6d037c3849ca63,bert,dnnpool,0.556554,0.597283,0.515675,0.558934,0.510514,0.555689,0.553376,0.662760,0.514209,0.646395,0.515199,0.634065
1045,../output/GoEmotions/neural/f7cacae6393f216913...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,neural,f7cacae6393f2169131c6f7c394e5403,bert,dnnpool,0.564260,0.612822,0.516185,0.562844,0.498440,0.559146,0.580436,0.661801,0.513887,0.652508,0.522387,0.622689


In [4]:
# Columns that identify one group of runs to pick a winner from.
group_keys = ['Dataset', 'ModelType', 'Extractor', 'Model']

# Best validation macro-F1 reached inside each group.
keyed_groups = parsed_df.groupby(group_keys)
best_valid_df = keyed_groups.valid_macro_f1.max().to_frame().reset_index()

# Keep only the runs that reach their group's best validation score. The
# result gets its own name so the `results_df` loaded in the first cell is
# not overwritten and the parsing cell can be re-run safely.
best_runs_df = parsed_df.merge(best_valid_df, on=group_keys + ['valid_macro_f1'], how='inner')

final_df = best_runs_df[['Dict', 'Dataset', 'Extractor', 'Model', 'test_macro_f1', 'test_micro_f1', 'test_precision', 'test_recall']].round(2)
# Tied winners collapse to the first one.
# NOTE(review): the groups are keyed on ModelType as well, but it is not part
# of the de-duplication key. If the same (Dataset, Extractor, Model) exists
# under several model types, the survivor is whichever comes first in the
# merge output, not necessarily the best one -- confirm this is intended.
final_df = final_df.drop_duplicates(['Dataset', 'Extractor', 'Model']).sort_values(['Dataset', 'Extractor', 'Model'])
final_df

Unnamed: 0,Dict,Dataset,Extractor,Model,test_macro_f1,test_micro_f1,test_precision,test_recall
9,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bert,dnnpool,0.5,0.56,0.52,0.63
10,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bert,lstm,0.49,0.57,0.53,0.63
6,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bow,naivebayes,0.35,0.46,0.43,0.53
8,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bow,sgd,0.46,0.53,0.49,0.61
12,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,fasttext,dnnpool,0.44,0.5,0.44,0.63
11,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,fasttext,lstm,0.47,0.54,0.52,0.59
4,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,tfidf,naivebayes,0.33,0.44,0.43,0.49
3,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,tfidf,sgd,0.47,0.53,0.49,0.6
2,{'config': {'data_config': {'raw_path': 'prepr...,Vent,bow,naivebayes,0.13,0.15,0.11,0.24
1,{'config': {'data_config': {'raw_path': 'prepr...,Vent,fasttext,lstm,0.1,0.12,0.13,0.14


In [5]:
import copy

print('Patching the FastText model because we only ran single directional experiments in HPO -- we will replicate with bidirectional, which is not totally fair but should give us an idea')

# Select the GoEmotions FastText + LSTM row explicitly instead of relying on
# the sort order of `final_df` to put it before the Vent one.
is_fasttext_lstm = (
    (final_df.Dataset == 'GoEmotions')
    & (final_df.Extractor == 'fasttext')
    & (final_df.Model == 'lstm')
)
fasttext_lstm_rows = final_df[is_fasttext_lstm].to_dict('records')

# Re-running this cell must not keep appending copies of the patched row.
already_patched = any(
    row['Dict']['config']['model_config']['model_conf'].get('bidirectional') is True
    for row in fasttext_lstm_rows
)

if not already_patched:
    row_to_patch = fasttext_lstm_rows[0]
    # `to_dict` hands back the very same config dict stored in the frame, so
    # deep-copy it before editing to leave the original row untouched.
    row_to_patch['Dict'] = copy.deepcopy(row_to_patch['Dict'])
    row_to_patch['Dict']['config']['model_config']['model_conf']['bidirectional'] = True
    # The metric columns of the appended row are those of the unpatched run.
    patched_row_df = pd.DataFrame([row_to_patch])
    final_df = pd.concat([final_df, patched_row_df], ignore_index=True)
final_df

Patching the FastText model because we only ran single directional experiments in HPO -- we will replicate with bidirectional, which is not totally fair but should give us an idea


Unnamed: 0,Dict,Dataset,Extractor,Model,test_macro_f1,test_micro_f1,test_precision,test_recall
0,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bert,dnnpool,0.5,0.56,0.52,0.63
1,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bert,lstm,0.49,0.57,0.53,0.63
2,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bow,naivebayes,0.35,0.46,0.43,0.53
3,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,bow,sgd,0.46,0.53,0.49,0.61
4,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,fasttext,dnnpool,0.44,0.5,0.44,0.63
5,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,fasttext,lstm,0.47,0.54,0.52,0.59
6,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,tfidf,naivebayes,0.33,0.44,0.43,0.49
7,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,tfidf,sgd,0.47,0.53,0.49,0.6
8,{'config': {'data_config': {'raw_path': 'prepr...,Vent,bow,naivebayes,0.13,0.15,0.11,0.24
9,{'config': {'data_config': {'raw_path': 'prepr...,Vent,fasttext,lstm,0.1,0.12,0.13,0.14


# Generate the final experiment configuration

Create the JSON configuration files for the final experiment: each selected configuration is written once per seed so that it can be run multiple times.

In [6]:
import sys
import copy
import json
sys.path.append('../src')
from config import ExperimentConfig

NUM_EXPERIMENTS = 5

# The run index doubles as the random seed, so every selected GoEmotions
# configuration is written once per seed 0..NUM_EXPERIMENTS-1.
for experiment in range(NUM_EXPERIMENTS):
    for cfg in final_df[final_df.Dataset == 'GoEmotions'].Dict:
        # Work on a deep copy so the dictionaries stored in `final_df` are
        # never mutated by the edits below.
        cfg = copy.deepcopy(cfg)
        cfg = cfg['config']
        cfg['seed'] = experiment
        # Second-to-last '/'-separated component of the original output path.
        # NOTE(review): assumes that path ends with a trailing '/', like the
        # replacement assigned on the next line -- confirm.
        model_family = cfg['output_path'].split('/')[-2]
        cfg['output_path'] = 'output/GoEmotions/replica/'

        # Build the experiment object
        exp_cfg = ExperimentConfig.from_dict(cfg)
        as_json = json.dumps(exp_cfg._as_flat_dict(), indent=2)
        exp_hash = exp_cfg.hash()

        # Save as json, creating the per-family directory on first use so the
        # write does not fail with FileNotFoundError.
        save_path = f'../configs/GoEmotions/replica/{model_family}/{exp_hash}.json'
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'w') as f:
            f.write(as_json)

# Generate the experiment configuration for Sampled Robust Vent

In [7]:
vent_robust = pd.read_parquet('../preprocessed/vent-robust.parquet')
emotions = pd.read_csv('../data/emotions_clean.csv')

# Map each emotion's position in the CSV to its name.
labels = dict(enumerate(emotions.name.tolist()))

# Names of the label indices that occur in the robust subset, in ascending
# index order.
present_indices = sorted(vent_robust.emotions_label.unique())
result_labels = [labels[index] for index in present_indices]

robust_label_list = ", ".join(result_labels)
print(f'The robust labels are: {robust_label_list}')

The robust labels are: Adoring, Adventurous, Affectionate, Afraid, Amazed, Amused, Angry, Annoyed, Anxious, Artistic, Ashamed, Astonished, Awkward, Bitter, Bored, Calm, Caring, Chill, Confident, Conflicted, Confused, Content, Creative, Cuddly, Curious, Determined, Disappointed, Disgusted, Done, Dreamy, Embarrassed, Empty, Excited, Exhausted, Frustrated, Furious, Guilty, Happy, Heartbroken, Hopeful, Hungry, Hurt, Hyped, Imaginative, Impatient, Infatuated, Insecure, Inspired, Interested, Irritated, Jealous, Lazy, Lonely, Lost, Loving, Meh, Miserable, Motivated, Musical, Needy, Nervous, Nostalgic, Numb, Optimistic, Overwhelmed, Passionate, Poetic, Proud, Relaxed, Relieved, Sad, Safe, Shocked, Shy, Sick, Sleepy, Sorry, Spacey, Stressed, Strong, Supportive, Surprised, Thankful, Thoughtful, Tired, Uncomfortable, Upset, Worried


In [8]:
NUM_EXPERIMENTS = 5

# Re-target every selected GoEmotions configuration at the sampled robust Vent
# data, once per seed 0..NUM_EXPERIMENTS-1 (the run index doubles as the seed).
for experiment in range(NUM_EXPERIMENTS):
    for cfg in final_df[final_df.Dataset == 'GoEmotions'].Dict:
        # Work on a deep copy so the dictionaries stored in `final_df` are
        # never mutated by the edits below.
        cfg = copy.deepcopy(cfg)
        cfg = cfg['config']
        cfg['seed'] = experiment
        # Second-to-last '/'-separated component of the original output path.
        # NOTE(review): assumes that path ends with a trailing '/', like the
        # replacement assigned on the next line -- confirm.
        model_family = cfg['output_path'].split('/')[-2]
        cfg['output_path'] = 'output/Vent/replica/'
        cfg['label_names'] = result_labels

        # Higher max length limit in Vent vs GoEmotions
        # slightly higher than 32 tokens max to accommodate for BERT's BPE
        extractor_config = cfg['extractor_config']
        if 'max_length' in extractor_config['ex_args']:
            extractor_config['ex_args']['max_length'] = 40

        # Bump the batch sizes for FT for faster training
        model_config = cfg['model_config']
        if extractor_config['ex_type'] == 'fasttext':
            model_config['batch_size'] = 256

        # Change data config
        data_config = cfg['data_config']
        data_config['raw_path'] = 'preprocessed/vent-robust-equivalent-sample.parquet'
        data_config['cache_path'] = 'preprocessed/Vent-split-cache/'

        # This stays because we do column splits, but it _has_ to be clear
        data_config['dataset_format'] = 'goemotions'
        # NOTE(review): the label listing cell reads `emotions_label` from
        # vent-robust.parquet; confirm the sampled file really names its
        # target column `emotion_index`.
        data_config['target_column'] = 'emotion_index'

        # Build the experiment object
        exp_cfg = ExperimentConfig.from_dict(cfg)
        as_json = json.dumps(exp_cfg._as_flat_dict(), indent=2)
        exp_hash = exp_cfg.hash()

        # Save as json, creating the per-family directory on first use so the
        # write does not fail with FileNotFoundError.
        save_path = f'../configs/Vent/replica/{model_family}/{exp_hash}.json'
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'w') as f:
            f.write(as_json)