In [25]:
import glob
import json
import pandas as pd

pd.options.display.max_rows = 1000

results = []
paths = glob.glob('../output/Vent/replica-fractions-with-test/*.json', recursive=True)
for path in paths:
    with open(path) as fp:
        result_dict = json.load(fp)
        result_tuple = (path, result_dict)
        results.append(result_tuple)

results_df = pd.DataFrame(results, columns=['Path', 'Dict'])

goemotions = []
goem_paths = glob.glob('../output/GoEmotions/replica/*.json')
for path in goem_paths:
    with open(path) as fp:
        result_dict = json.load(fp)
        result_tuple = (path, result_dict)
        goemotions.append(result_tuple)

goemotions_df = pd.DataFrame(goemotions, columns=['Path', 'Dict'])

In [26]:
EXPERIMENT_METRICS = ['macro_f1', 'micro_f1']
EXPERIMENT_COLUMNS = ['Dataset', 'ModelType', 'Experiment']
INTERNAL_METRICS = ['precision', 'recall']
WEIGHT_FIELD = 'support'


def parse_path(path):
    dataset, model_type, experiment = path.split('/')[-3:]
    experiment = experiment.split('.')[0]
    return dataset, model_type, experiment


def build_experiment_columns(dataframe):
    path_parts = [parse_path(path) for path in dataframe.Path]
    for i, col in enumerate(EXPERIMENT_COLUMNS):
        dataframe[col] = [parts[i] for parts in path_parts]
    return dataframe


def extract_macro_stats(dataframe, metrics=EXPERIMENT_METRICS):
    try:
        split_names = dataframe['Dict'][0]['config']['data_config']['split_names']
    except IndexError:
        split_names = ['train', 'valid', 'test']
    for split in split_names:
        for metric in metrics:
            new_col_name = '{}_{}'.format(split, metric)
            dataframe[new_col_name] = [exp_dict['results'][split][metric] for exp_dict in dataframe['Dict']]
    return dataframe


def extract_category_stats(dataframe, metrics=INTERNAL_METRICS, weight_field=WEIGHT_FIELD):
    try:
        split_names = dataframe['Dict'][0]['config']['data_config']['split_names']
    except IndexError:
        split_names = ['train', 'valid', 'test']
    for split in split_names:
        for metric in metrics:
            new_col_name = '{}_{}'.format(split, metric)
            metric_dict = []
            for exp_dict in dataframe['Dict']:
                label_values = list(exp_dict['results'][split]['labels'].values())
                global_weight = sum(label_results[weight_field] for label_results in label_values)
                metric_score = sum(label_results[weight_field] * label_results[metric] / global_weight
                                   for label_results in label_values)
                metric_dict.append(metric_score)
            dataframe[new_col_name] = metric_dict
    return dataframe


def extract_model_details(dataframe):
    dataframe['Extractor'] = [exp_dict['config']['extractor_config']['ex_type'] for exp_dict in dataframe['Dict']]
    dataframe['Model'] = [exp_dict['config']['model_config']['model_name'] for exp_dict in dataframe['Dict']]
    return dataframe


def parse_df(dataframe):
    dataframe = build_experiment_columns(dataframe)
    dataframe = extract_model_details(dataframe)
    dataframe = extract_macro_stats(dataframe)
    dataframe = extract_category_stats(dataframe)
    return dataframe

In [27]:
parsed_df = parse_df(results_df)
parsed_df['Seed'] = [val['config']['seed'] for val in parsed_df.Dict]
parsed_df['DataSource'] = [val['config']['data_config']['cache_path'].split('/')[-2] for val in parsed_df.Dict]
parsed_df['Fraction'] = [int(ds.split('-')[-4]) for ds in parsed_df.DataSource.tolist()]
parsed_df = parsed_df[(parsed_df.Extractor == 'bert') & (parsed_df.Seed == 0)]

parsed_df.sort_values(['Fraction', 'Model'])

Unnamed: 0,Path,Dict,Dataset,ModelType,Experiment,Extractor,Model,train_macro_f1,train_micro_f1,valid_macro_f1,...,test_micro_f1,train_precision,train_recall,valid_precision,valid_recall,test_precision,test_recall,Seed,DataSource,Fraction
133,../output/Vent/replica-fractions-with-test/8ba...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,8ba324ac67ab8141ab6ca395a5d26bb1,bert,dnnpool,0.129547,0.155259,0.129286,...,0.151973,0.128643,0.216959,0.128888,0.225915,0.12404,0.221282,0,vent-split-robust-cache-5-pct-with-test,5
342,../output/Vent/replica-fractions-with-test/058...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,0585fa2c337506ccabcc3654a2698bc9,bert,lstm,0.165875,0.187993,0.159112,...,0.176621,0.159047,0.248938,0.156686,0.241917,0.149336,0.243379,0,vent-split-robust-cache-5-pct-with-test,5
95,../output/Vent/replica-fractions-with-test/8fa...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,8fa8c37a972eefb75aa7c01d464d81ff,bert,dnnpool,0.152261,0.173246,0.150582,...,0.168968,0.147595,0.230402,0.145743,0.233483,0.142427,0.226006,0,vent-split-robust-cache-10-pct-with-test,10
362,../output/Vent/replica-fractions-with-test/790...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,790b902ed8baf1654efc58003c38b992,bert,lstm,0.182248,0.201763,0.173989,...,0.189163,0.176834,0.254837,0.171008,0.245652,0.163424,0.249999,0,vent-split-robust-cache-10-pct-with-test,10
101,../output/Vent/replica-fractions-with-test/d4d...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,d4dbf23858624caafe0f07b636157836,bert,dnnpool,0.162049,0.180862,0.160618,...,0.176827,0.155679,0.233526,0.154595,0.236965,0.15156,0.232908,0,vent-split-robust-cache-20-pct-with-test,20
134,../output/Vent/replica-fractions-with-test/a8c...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,a8c98925ac93a3af236ddbad18856999,bert,lstm,0.194005,0.212108,0.185367,...,0.199392,0.189679,0.256871,0.179534,0.258626,0.177071,0.248999,0,vent-split-robust-cache-20-pct-with-test,20
182,../output/Vent/replica-fractions-with-test/d4d...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,d4de8bdc2877e33f61a5558b5f7dca4d,bert,dnnpool,0.168622,0.186346,0.167097,...,0.182186,0.162024,0.235433,0.158341,0.239715,0.157076,0.238425,0,vent-split-robust-cache-40-pct-with-test,40
40,../output/Vent/replica-fractions-with-test/fe6...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,fe6464ebc27b388f9bd40a7dc489a44e,bert,lstm,0.199101,0.216475,0.191292,...,0.204909,0.198432,0.257342,0.182289,0.262415,0.181967,0.254177,0,vent-split-robust-cache-40-pct-with-test,40
79,../output/Vent/replica-fractions-with-test/144...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,1447b8ae0ef2915c4e5ea733031eb0f5,bert,dnnpool,0.171168,0.188564,0.168599,...,0.184385,0.165841,0.237105,0.163612,0.237134,0.160215,0.23861,0,vent-split-robust-cache-60-pct-with-test,60
71,../output/Vent/replica-fractions-with-test/54f...,{'config': {'data_config': {'raw_path': 'prepr...,Vent,replica-fractions-with-test,54f70325d9a3afccae46afc4a5c7e4b0,bert,lstm,0.201318,0.218326,0.193056,...,0.207825,0.198474,0.260298,0.186715,0.256288,0.183549,0.258382,0,vent-split-robust-cache-60-pct-with-test,60


In [56]:
goem_replica = parse_df(goemotions_df)
goem_replica['Seed'] = [val['config']['seed'] for val in goem_replica.Dict]
goem_replica['ModelPath'] = [val['config']['model_path'] for val in goem_replica.Dict]
goem_replica = goem_replica[(goem_replica.Extractor == 'bert') & (goem_replica.Seed == 0) & (goem_replica.ModelPath.isnull())]

# goem_replica
goem_replica

Unnamed: 0,Path,Dict,Dataset,ModelType,Experiment,Extractor,Model,train_macro_f1,train_micro_f1,valid_macro_f1,...,test_micro_f1,train_precision,train_recall,valid_precision,valid_recall,test_precision,test_recall,Seed,IsFrozen,ModelPath
29,../output/GoEmotions/replica/8a92e0e6f5f8356b3...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,replica,8a92e0e6f5f8356b3b04f4596e464308,bert,lstm,0.536159,0.604673,0.478436,...,0.567629,0.574528,0.647633,0.536888,0.627116,0.532006,0.629957,0,"{'bert_model': 'bert-base-uncased', 'freeze_ou...",
37,../output/GoEmotions/replica/f7d2e463dc53df248...,{'config': {'data_config': {'raw_path': 'prepr...,GoEmotions,replica,f7d2e463dc53df2481405688ce5ac5cd,bert,dnnpool,0.563597,0.613752,0.500268,...,0.552325,0.581402,0.660842,0.521334,0.626332,0.520427,0.607679,0,"{'bert_model': 'bert-base-uncased', 'freeze_ou...",


In [57]:
experiment_ids = parsed_df.Experiment.tolist()

In [68]:
import os
import sys
import copy
import json

sys.path.append('../src')
from config import ExperimentConfig

NUM_EXPERIMENTS = 5
BASE_MODEL_PATH = 'models/Vent/fractions-with-test'
CONFIG_PATH = '../configs/GoEmotions/transfer-vent'
OUTPUT_PATH = 'output/GoEmotions/transfer-vent/'

if not os.path.exists(CONFIG_PATH):
    os.mkdir(CONFIG_PATH)

for config in goem_replica.Dict.tolist():
    exp_config = config['config']
    for seed in range(NUM_EXPERIMENTS):
        for previous_experiment in experiment_ids:
            config_copy = copy.deepcopy(exp_config)
            config_copy['seed'] = seed
            config_copy['pretrained_model_path'] = f'{BASE_MODEL_PATH}/{previous_experiment}.pkl'
            config_copy['output_path'] = OUTPUT_PATH
            
            # Dump and prepare for the execution
            exp_cfg = ExperimentConfig.from_dict(config_copy)        
            as_json = json.dumps(exp_cfg._as_flat_dict(), indent=2)
            exp_hash = exp_cfg.hash()

            # GoEmotions
            save_path = f'{CONFIG_PATH}/{exp_hash}.json'
            with open(save_path, 'w') as f:
                f.write(as_json)