In [1]:
from glob import glob
import os
from os.path import join, split, splitext, dirname
import pandas as pd
from scipy.stats import norm
import numpy as np
from functools import partial
from scipy import stats

import pickle

In [2]:
# Root of the dadvi-experiments checkout.  Set the DADVI_EXPERIMENTS_PATH
# environment variable to run the notebook from a different checkout; when it
# is unset (or empty) the original hard-coded location is used.
base_path = (
    os.environ.get('DADVI_EXPERIMENTS_PATH')
    or '/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/')

# The raw result pickles are read from, and the tidy CSVs are written to, the
# same analysis folder.  The two names are kept separate so they can diverge.
input_path = join(base_path, 'comparison/analysis/coverage_warm_starts_rerun')
output_path = join(base_path, 'comparison/analysis/coverage_warm_starts_rerun')

# Values of M to load.  Results for each value are read from the `M_<value>`
# subfolder of input_path and tagged with that value in the `num_draws` column.
M_vals = [ 8, 16, 32, 64 ]

In [3]:
# Exploratory cell: list the result pickles for a single value of M.  Both `M`
# and `coverage_filenames` are reassigned by the loading loop further down.
M = 8
coverage_filenames = glob(join(input_path, f'M_{M}', '*', '*.pkl'))
# Disabled scratch code for inspecting one raw results file.  It is not
# runnable as written: `filename` and `param_dim` are not defined at this point.
# raw_model_df = pd.read_pickle(filename)
# print(raw_model_df.keys())
# #raw_model_df['means'] - raw_model_df['reference_means'] # They are different
# reference_means = raw_model_df['reference_means'].iloc[0]
# reference_freq_sds = raw_model_df['reference_freq_sds'].iloc[0]
# assert(len(reference_means) == param_dim)
# reference_freq_sds

In [4]:
def RepList(x, n):
    """Return a list containing `x` repeated `n` times.

    Every entry is the same object `x` (no copies are made).  The result is
    empty when `n` is zero or negative.
    """
    repeated = []
    for _ in range(n):
        repeated.append(x)
    return repeated


def GetCoverageDataframe(filename, M):
    """Load one model's coverage results and reshape them to long format.

    The pickle at `filename` is read with `pd.read_pickle` and must provide
    the columns `seed`, `means` and `freq_sds`, with one row per run.  Each
    `means` / `freq_sds` entry is a sequence holding one value per parameter.

    Args:
        filename: Path to the pickled results.  The name of the directory
            that contains the file is used as the model name.
        M: Stored unchanged in the `num_draws` column of the result.

    Returns:
        A DataFrame with one row per (run, parameter) pair and the columns
        `seed`, `param` (0-based position within the run's vectors), `mean`,
        `freq_sd`, `model` and `num_draws`.

    Raises:
        AssertionError: If the file contains no runs, or if any run's `means`
            or `freq_sds` differs in length from the first run's `means`.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    raw_model_df = pd.read_pickle(filename)

    # The final directory is the model name
    model_name = split(dirname(filename))[-1]

    seeds = raw_model_df['seed'].to_numpy()
    means = raw_model_df['means'].to_numpy()
    freq_sds = raw_model_df['freq_sds'].to_numpy()

    num_runs = len(means)
    assert num_runs > 0, f'No runs found in {filename}'
    param_dim = len(means[0])

    # Every run must contribute exactly `param_dim` values.  Otherwise the
    # repeated seeds and tiled parameter indices below would not line up with
    # the stacked means (and could do so silently if the ragged lengths happen
    # to add up to num_runs * param_dim).
    for i in range(num_runs):
        assert len(means[i]) == param_dim, (
            f'Run {i} in {filename} has {len(means[i])} means; '
            f'expected {param_dim}')
        assert len(freq_sds[i]) == param_dim, (
            f'Run {i} in {filename} has {len(freq_sds[i])} freq_sds; '
            f'expected {param_dim}')

    # NOTE(review): an earlier revision also appended `reference_means` /
    # `reference_freq_sds` from the raw frame as an extra pseudo-run with
    # seed 'reference'.  That export is currently disabled.

    # Rows are ordered run by run: all parameters of run 0, then run 1, ...
    model_df = pd.DataFrame({
        'seed': np.repeat(seeds, param_dim),
        'param': np.tile(np.arange(param_dim), num_runs),
        'mean': np.hstack(list(means)),
        'freq_sd': np.hstack(list(freq_sds)),
    })
    model_df['model'] = model_name
    model_df['num_draws'] = M

    return model_df

In [5]:
# Build one long-format frame per value of M, then stack them into model_df.
model_dfs = []
for M in M_vals:
    print(f'Loading for {M} draws')
    # glob returns matches in arbitrary order; sort them so that the row order
    # of the combined frame (and of the CSV written from it) is reproducible.
    coverage_filenames = sorted(glob(join(input_path, f'M_{M}', '*', '*.pkl')))
    assert len(coverage_filenames) > 0, \
        f'No result files found for M = {M} under {input_path}'
    model_dfs.append(
        pd.concat([GetCoverageDataframe(filename, M) for filename in coverage_filenames]))

model_df = pd.concat(model_dfs)

Loading for 8 draws




Loading for 16 draws
Loading for 32 draws
Loading for 64 draws


In [6]:
# Sanity check: show the distinct draw counts present in the combined frame.
np.unique(model_df['num_draws'].to_numpy())

array([ 8, 16, 32, 64])

In [7]:
# Write the combined long-format coverage results to coverage_tidy.csv.
print(output_path)
# Create the target folder first so that pointing output_path at a folder that
# does not exist yet does not make to_csv fail.
os.makedirs(output_path, exist_ok=True)
model_df.to_csv(join(output_path, 'coverage_tidy.csv'), index=False)

/home/rgiordan/Documents/git_repos/DADVI/dadvi-experiments/comparison/analysis/coverage_warm_starts_rerun


# Load the CG results

At the moment, the CG results are computed separately in a different format.

In [20]:
# Exploratory cell: open the results file of a single model to look at its
# layout.
model = 'tennis'
pickle_file = join(input_path, f"{model}_with_names.pkl")
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(pickle_file, 'rb') as f:
    pkl_file = pickle.load(f)
# Display the `seed` entry of the loaded object.
pkl_file['seed']

0      1071
1      1043
2      1098
3      1004
4      1077
       ... 
399    1055
400    1093
401    1078
402    1067
403    1096
Name: seed, Length: 404, dtype: int64

In [21]:
def GetRunDF(ind, pkl_file, model_name=None):
    """Build the long-format frame for a single run.

    Args:
        ind: Position (0-based row number) of the run within `pkl_file`.
        pkl_file: Loaded results; indexing it with 'M', 'seed', 'means',
            'freq_sds', 'newton_step_norm', 'names' and 'indices' must each
            yield a pandas Series with one entry per run.
        model_name: Value for the `model` column.  When omitted, the
            notebook-level variable `model` is used, which is how existing
            two-argument calls behave.

    Returns:
        A DataFrame with the columns `num_draws`, `seed`, `mean`, `freq_sd`,
        `norm`, `param_name`, `param` and `model`.  The per-run scalars are
        broadcast along the run's per-parameter sequences.
    """
    if model_name is None:
        # Backward-compatible fallback for callers that rely on the global.
        model_name = model

    def run_value(column):
        # `ind` comes from range(len(...)), i.e. it is a position, so select
        # positionally.  Plain `series[ind]` is label-based and would pick the
        # wrong row (or fail) if the index is not the default 0..n-1.
        return pkl_file[column].iloc[ind]

    return pd.DataFrame({
        'num_draws': run_value('M'),
        'seed': run_value('seed'),
        'mean': run_value('means'),
        'freq_sd': run_value('freq_sds'),
        'norm': run_value('newton_step_norm'),
        'param_name': run_value('names'),
        'param': run_value('indices'),
        'model': model_name
    })


# Collect one long-format frame per run, across all models with CG results.
dfs = []
for model in ['tennis', 'potus', 'occu']:
    # GetRunDF is called without a model argument and picks the model name up
    # from the notebook-level variable `model`, so the loop variable must keep
    # this name.
    print(f'Loading {model}')

    # Each model's results live in a single pickle file.
    pickle_file = join(input_path, f"{model}_with_names.pkl")
    with open(pickle_file, 'rb') as f:
        pkl_file = pickle.load(f)

    # One frame per run position in the loaded results.
    dfs.extend([GetRunDF(ind, pkl_file) for ind in range(len(pkl_file))])


Loading tennis
Loading potus
Loading occu


In [22]:
# Stack the per-run frames of all models and write them to coverage_tidy_cg.csv.
cg_df = pd.concat(dfs)
# Create the target folder first so that pointing output_path at a folder that
# does not exist yet does not make to_csv fail.
os.makedirs(output_path, exist_ok=True)
cg_df.to_csv(join(output_path, 'coverage_tidy_cg.csv'), index=False)