In [1]:
import os
from os.path import join

import numpy as np
import pandas as pd


In [2]:
from load_and_tidy_posteriors_lib import \
    VALID_METHODS, GetMetadataDataframe, GetMethodDataframe, \
    GetTraceDataframe, GetUnconstraintedParamsDataframe, \
    GetMCMCDiagnosticsDataframe

In [3]:
# Root of the dadvi-experiments checkout.  Set the DADVI_EXPERIMENTS_DIR
# environment variable to run this notebook against another checkout; when it
# is unset the original hard-coded location is used, so existing runs are
# unaffected.
base_folder = os.environ.get(
    'DADVI_EXPERIMENTS_DIR',
    '/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments')
input_folder = join(base_folder, 'comparison/blade_runs/')
# Outputs are written next to the inputs.
output_folder = input_folder

# (results folder, method label) pairs, iterated in this order by the loading
# cells below.  The folder strings are passed through verbatim to the loaders.
folder_method_list = (
    (join(input_folder, "nuts_results/"), 'NUTS'),
    (join(input_folder, "dadvi_results/"), 'DADVI'),
    (join(input_folder, "raabbvi_results/"), 'RAABBVI'),
    (join(input_folder, "sadvi_results/"), 'SADVI'),
    (join(input_folder, "sfullrank_advi_results/"), 'SADVI_FR'),
    (join(input_folder, "lrvb_Direct_results/"), 'LRVB'),
    (join(input_folder, 'lrvb_doubling_results'), 'LRVB_Doubling'),
    (join(input_folder, 'lrvb_cg_results'), 'LRVB_CG')
)


In [4]:
# Load one frame per (folder, method) pair, printing the method label before
# each load, then stack all of the frames row-wise into a single frame.
posterior_dfs = []
for folder, method in folder_method_list:
    print(f'Loading {method}')
    posterior_dfs += [GetMethodDataframe(folder, method)]
posterior_df = pd.concat(posterior_dfs, axis=0)

Loading NUTS
Loading DADVI
Loading RAABBVI
Loading SADVI
Loading SADVI_FR
Loading LRVB
Loading LRVB_Doubling
Loading LRVB_CG


# Explore the contents of the metadata.  

Maybe we want to save additional information.

In [5]:
# Per-method lookups keyed by method label.  Each call returns a pair: the
# raw metadata entries and the matching model names, stored under the label.
raw_metadata, model_names = {}, {}
for folder, method in folder_method_list:
    print(f'Loading {method}')
    method_metadata, method_model_names = GetMetadataDataframe(
        folder, method, return_raw_metadata=True)
    raw_metadata[method] = method_metadata
    model_names[method] = method_model_names


Loading NUTS
Loading DADVI




Loading RAABBVI
Loading SADVI
Loading SADVI_FR
/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/blade_runs/sfullrank_advi_results/info/occ_det.pkl not found.
Loading LRVB
Loading LRVB_Doubling
/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/blade_runs/lrvb_doubling_results/lrvb_info/tennis.pkl not found.
Loading LRVB_CG


# See what's missing

In [6]:
# Every model name seen under at least one method.
all_model_names = set.union(*map(set, model_names.values()))

# For each method, report the models that other methods have but it lacks.
for k, method_models in model_names.items():
    missing_models = all_model_names - set(method_models)
    print(f'{k} missing {missing_models}')

NUTS missing set()
DADVI missing set()
RAABBVI missing {'sesame_one_pred_b'}
SADVI missing set()
SADVI_FR missing {'occ_det', 'tennis', 'potus'}
LRVB missing {'potus'}
LRVB_Doubling missing {'tennis', 'potus'}
LRVB_CG missing {'ideo_reparam', 'radon_no_pool', 'wells_d100ars', 'election88_full', 'electric_1c', 'electric_one_pred', 'earnings_vary_si', 'radon_intercept_chr', 'radon_group_chr', 'logearn_logheight', 'nes2000_vote', 'mesquite', 'wells_dae', 'earnings1', 'mesquite_log', 'logearn_height_male', 'earnings_interactions', 'sesame_multi_preds_3a', 'kidiq_interaction', 'kidscore_momwork', 'wells_daae_c', 'radon_inter_vary', 'logearn_height', 'radon_vary_si', 'electric', 'kidscore_momhs', 'congress', 'radon_no_pool_chr', 'test', 'wells_dist100', 'election88', 'mesquite_volume', 'mesquite_vas', 'pilots', 'electric_1b', 'microcredit', 'radon_complete_pool', 'wells_predicted_log', 'hiv_inter', 'wells_interaction', 'wells_predicted', 'kidiq_multi_preds', 'kidscore_momiq', 'sesame_multi_p

# This doesn't make sense: LRVB should always take longer than DADVI.

In [7]:
def _runtime_frame(method):
    """Build a (model, runtime) frame for ``method`` from the loaded metadata.

    Reads the module-level ``model_names`` and ``raw_metadata`` dictionaries;
    the ``runtime`` column is taken from the ``'runtime'`` entry of each raw
    metadata record, in the same order as the model names.
    """
    runtimes = np.array([meta['runtime'] for meta in raw_metadata[method]])
    return pd.DataFrame({'model': model_names[method], 'runtime': runtimes})


dadvi_times = _runtime_frame('DADVI')
lrvb_times = _runtime_frame('LRVB')

# Keep only models present for both methods; the suffixes yield the columns
# runtime_DADVI and runtime_LRVB.
comp_df = dadvi_times.merge(
    lrvb_times, on='model', how='inner', suffixes=['_DADVI', '_LRVB'])
diffs = (comp_df['runtime_LRVB'] - comp_df['runtime_DADVI']).to_numpy()

# (min, max, mean) of the LRVB-minus-DADVI runtime difference.
diffs.min(), diffs.max(), diffs.mean()

(-0.20125889778137207, 3281.6659541130066, 75.18515008146113)

# Not all of the unconstrained parameters have posterior samples. As far as I know, that means I don't know their dimension.

# This may be fixed

In [8]:
def GetModelParams(model, method='DADVI'):
    """Return the parameter names recorded in ``posterior_df`` for one model.

    Selects the rows of the module-level ``posterior_df`` whose ``model``
    column equals ``model`` and whose ``method`` column equals ``method``.

    Parameters
    ----------
    model : str
        Value to match against ``posterior_df['model']``.
    method : str, optional
        Value to match against ``posterior_df['method']``.  Defaults to
        ``'DADVI'``, the method this lookup was originally hard-coded to.

    Returns
    -------
    numpy.ndarray
        The ``param`` column of the matching rows, in row order.  Empty when
        no row matches.
    """
    keep_rows = np.logical_and(
        posterior_df['method'] == method,
        posterior_df['model'] == model)
    # Single .loc selection instead of chained frame-then-column indexing.
    return posterior_df.loc[keep_rows, 'param'].to_numpy()

def GetUnconstrainedModelParams(model, method='DADVI'):
    """Return the unconstrained parameter names stored in a model's metadata.

    Finds the first position of ``model`` in the module-level
    ``model_names[method]`` and returns the ``'unconstrained_param_names'``
    entry of the raw metadata record at that same position.

    Parameters
    ----------
    model : str
        Model name to look up.
    method : str, optional
        Key into ``model_names`` / ``raw_metadata``.  Defaults to ``'DADVI'``,
        the method this lookup was originally hard-coded to.
        NOTE(review): only DADVI metadata is known to carry
        ``'unconstrained_param_names'``; confirm before using other methods.

    Returns
    -------
    The ``'unconstrained_param_names'`` value of the matching metadata record.

    Raises
    ------
    IndexError
        If ``model`` is not present in ``model_names[method]``.  The exception
        type is unchanged; the message now names the missing model.
    """
    for ind, name in enumerate(model_names[method]):
        if name == model:
            return raw_metadata[method][ind]['unconstrained_param_names']
    raise IndexError(
        f'model {model!r} not found in model_names[{method!r}]')

# Side-by-side check for one model: the unconstrained parameter names from the
# metadata versus the parameter names that appear in the posterior frame.
model = 'electric_multi_preds'
unconstrained_params = GetUnconstrainedModelParams(model)
posterior_params = GetModelParams(model)
unconstrained_params, posterior_params

(['Intercept', 'treatment', 'pre_test', 'post_test_sigma_log__'],
 array(['Intercept', 'treatment', 'pre_test', 'post_test_sigma_log__',
        'post_test_sigma'], dtype=object))

# What's in the LRVB CG metadata?

In [11]:
import load_and_tidy_posteriors_lib

# Look at a single LRVB_CG run: the metadata record at position 2.
method = 'LRVB_CG'
cg_metadata = raw_metadata[method][2]

# NOTE(review): only the last expression of a cell is echoed, so the metadata
# record itself is never shown here; wrap it in display(...) if it should be.
# GetModelHVPCount presumably reports a Hessian-vector-product count for the
# run; confirm against load_and_tidy_posteriors_lib.
load_and_tidy_posteriors_lib.GetModelHVPCount(method, cg_metadata)

845