Load the results from the `blade_runs` directory and save them in a tidy format for R.

In [1]:
from os.path import join
import pandas as pd
import numpy as np

In [10]:
from load_and_tidy_posteriors_lib import \
    VALID_METHODS, GetMetadataDataframe, GetMethodDataframe, \
    GetTraceDataframe, GetUnconstraintedParamsDataframe, \
    GetMCMCDiagnosticsDataframe

# Controls whether the tidy CSV files below are written to disk.
# Set to False for a dry run; doing so will not overwrite the csv files.
save_output = True

In [3]:
# Root of the local dadvi-experiments checkout.
# NOTE: this is a machine-specific absolute path; edit it when running elsewhere.
base_folder = '/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments'
input_folder = join(base_folder, 'comparison/blade_runs/')
# The tidy CSV files are written into the same directory the results are read from.
output_folder = input_folder

# (results subfolder, method label) pairs, in loading order.  Later cells
# index into folder_method_list by position, so the order matters.
_subfolder_method_pairs = (
    ('nuts_results/', 'NUTS'),
    ('dadvi_results/', 'DADVI'),
    ('raabbvi_results/', 'RAABBVI'),
    ('sadvi_results/', 'SADVI'),
    ('sfullrank_advi_results/', 'SADVI_FR'),
    ('lrvb_Direct_results/', 'LRVB'),
    ('lrvb_doubling_results', 'LRVB_Doubling'),
)

# Resolve every subfolder against input_folder, keeping the label alongside.
folder_method_list = tuple(
    (join(input_folder, subfolder), method_label)
    for subfolder, method_label in _subfolder_method_pairs
)


In [4]:
# Load the per-method posterior data frames and stack them into one tidy frame.
posterior_dfs = []
for folder, method in folder_method_list:
    print(f'Loading {method}')
    posterior_dfs.append(GetMethodDataframe(folder, method))
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.  By default
# pd.concat carries over each input's own row labels, which would typically
# repeat across methods and make label-based selection ambiguous.  The saved
# CSV is unaffected because it is written with index=False.
posterior_df = pd.concat(posterior_dfs, ignore_index=True)

Loading NUTS
Loading DADVI
Loading RAABBVI
Loading SADVI
Loading SADVI_FR
Loading LRVB
Loading LRVB_Doubling


In [5]:
# Load the per-method metadata frames and stack them into one tidy frame.
metadata_dfs = []
for folder, method in folder_method_list:
    print(f'Loading {method}')
    metadata_dfs.append(GetMetadataDataframe(folder, method))

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.  By default
# pd.concat carries over each input's own row labels, which would typically
# repeat across methods.  The saved CSV is unaffected (index=False).
metadata_df = pd.concat(metadata_dfs, ignore_index=True)


Loading NUTS
Loading DADVI




Loading RAABBVI
Loading SADVI
Loading SADVI_FR
Loading LRVB
Loading LRVB_Doubling
/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/blade_runs/lrvb_doubling_results/lrvb_info/tennis.pkl not found.


In [6]:
# Load the per-method trace frames and stack them into one tidy frame.
trace_dfs = []
for folder, method in folder_method_list:
    print(f'Loading {method}')
    trace_dfs.append(GetTraceDataframe(folder, method))

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.  By default
# pd.concat carries over each input's own row labels, which would typically
# repeat across methods.  The saved CSV is unaffected (index=False).
trace_df = pd.concat(trace_dfs, ignore_index=True)


Loading NUTS
Loading DADVI
Loading RAABBVI
Loading SADVI
Loading SADVI_FR
Loading LRVB
Loading LRVB_Doubling
/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/blade_runs/lrvb_doubling_results/lrvb_info/tennis.pkl not found.


In [7]:
# Write the three stacked frames to output_folder unless this is a dry run.
if save_output:
    # (file name, frame) pairs, written in this order without the row index.
    tidy_outputs = (
        ('posteriors_tidy.csv', posterior_df),
        ('metadata_tidy.csv', metadata_df),
        ('trace_tidy.csv', trace_df),
    )
    for csv_name, tidy_df in tidy_outputs:
        tidy_df.to_csv(join(output_folder, csv_name), index=False)

# Save the names of unconstrained parameters

In [8]:
# The DADVI entry is expected at position 1 of folder_method_list; the assert
# guards against the list being reordered.
folder, method = folder_method_list[1]
assert method == 'DADVI'
# Load unconditionally (as the MCMC diagnostics cell does) so that a dry run
# still exercises the loader and defines param_df; only the write is gated.
param_df = GetUnconstraintedParamsDataframe(folder, method)
if save_output:
    param_df.to_csv(join(output_folder, 'params_tidy.csv'), index=False)

# Save the full MCMC diagnostic information

In [12]:
# The NUTS entry is expected to come first in folder_method_list; fail fast
# if the list has been reordered.
folder, method = folder_method_list[0]
assert method == 'NUTS'

# The diagnostics are always loaded; only the write to disk is optional.
mcmc_df = GetMCMCDiagnosticsDataframe(folder, method)
if save_output:
    mcmc_csv_path = join(output_folder, 'mcmc_diagnostics_tidy.csv')
    mcmc_df.to_csv(mcmc_csv_path, index=False)

# Explore the contents of the metadata.  

Maybe we want to save additional information.

In [14]:
# Deliberate hard stop: raising here halts a "Run All", so the exploratory
# cells that follow only run when executed by hand.
raise ValueError

ValueError: 

In [15]:
# For every method, keep the raw metadata and the matching model names,
# both keyed by the method label.
raw_metadata, model_names = {}, {}
for folder, method in folder_method_list:
    print(f'Loading {method}')
    method_metadata, method_model_names = GetMetadataDataframe(
        folder, method, return_raw_metadata=True)
    raw_metadata[method] = method_metadata
    model_names[method] = method_model_names


Loading NUTS
Loading DADVI
Loading RAABBVI
Loading SADVI
Loading SADVI_FR
Loading LRVB
Loading LRVB_Doubling
/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/blade_runs/lrvb_doubling_results/lrvb_info/tennis.pkl not found.


In [16]:
# Union of the model names seen by any method.  Starting from an empty set
# (instead of calling the unbound set.union) also works when model_names is
# empty, where set.union(*[]) would raise a TypeError.
all_model_names = set().union(*model_names.values())

In [17]:
# Report, per method, which of the models seen anywhere have no results.
for k, method_models in model_names.items():
    missing_models = all_model_names - set(method_models)
    print(f'{k} missing {missing_models}')

NUTS missing set()
DADVI missing set()
RAABBVI missing {'mesquite_vash', 'tennis', 'potus'}
SADVI missing set()
SADVI_FR missing {'occ_det', 'tennis', 'potus'}
LRVB missing {'potus'}
LRVB_Doubling missing {'tennis', 'potus'}


# This doesn't make sense: LRVB should always take longer than DADVI.

In [18]:
def _runtime_frame(method_name):
    """Return a frame with one (model, runtime) row per raw metadata entry of a method."""
    runtimes = [entry['runtime'] for entry in raw_metadata[method_name]]
    return pd.DataFrame({
        'model': model_names[method_name],
        'runtime': np.array(runtimes)})


dadvi_times = _runtime_frame('DADVI')
lrvb_times = _runtime_frame('LRVB')

# Inner join: only models present for both DADVI and LRVB are compared.
comp_df = dadvi_times.merge(
    lrvb_times, on='model', how='inner', suffixes=['_DADVI', '_LRVB'])
# Positive entries mean the LRVB runtime exceeds the DADVI runtime.
diffs = (comp_df['runtime_LRVB'] - comp_df['runtime_DADVI']).to_numpy()

np.min(diffs), np.max(diffs), np.mean(diffs)

(-0.7414968013763428, 1871.78377866745, 47.663744030576765)

# Not all the unconstrained parameters have posterior samples.  As far as I know, that means I don't know their dimension.

In [58]:
def GetModelParams(model):
    """Return the `param` values of the DADVI rows of `posterior_df` for `model`.

    Reads the notebook-level `posterior_df` and returns a numpy array, which
    is empty when no row matches.
    """
    is_dadvi = posterior_df['method'] == 'DADVI'
    is_model = posterior_df['model'] == model
    return posterior_df.loc[is_dadvi & is_model, 'param'].to_numpy()

def GetUnconstrainedModelParams(model):
    """Return the `unconstrained_param_names` entry of the DADVI metadata for `model`.

    The model is located by its position in `model_names['DADVI']`, and the
    raw metadata entry at the same position is used.  Raises IndexError when
    the model is not listed.
    """
    dadvi_models = np.array(model_names['DADVI'])
    # First position whose name equals `model`.
    model_index = np.flatnonzero(dadvi_models == model)[0]
    dadvi_entry = raw_metadata['DADVI'][model_index]
    return dadvi_entry['unconstrained_param_names']

In [60]:
# For one example model, show side by side the unconstrained parameter names
# stored in the DADVI metadata and the parameter names found in the DADVI
# rows of posterior_df.
model = 'electric_multi_preds'
GetUnconstrainedModelParams(model), GetModelParams(model)

(['Intercept', 'treatment', 'pre_test', 'post_test_sigma_log__'],
 array(['Intercept', 'treatment', 'pre_test', 'post_test_sigma'],
       dtype=object))