In [1]:
import os
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Widen pandas' display limits so large result tables are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50

In [2]:
# Directory that holds the result files (machine-specific absolute path).
root_dir = '/Users/nshah/work/vcimpute/output'
# Keep only directory entries whose name begins with 'copula'.
files = [name for name in os.listdir(root_dir) if name.startswith('copula')]

def run(path):
    """Read one CSV located under ``root_dir`` and remove duplicated rows.

    Parameters
    ----------
    path : str
        File name, joined onto the module-level ``root_dir``.

    Returns
    -------
    pandas.DataFrame
        The file's contents with exact duplicate rows dropped.
    """
    return pd.read_csv(os.path.join(root_dir, path)).drop_duplicates()

# Load every file in parallel, leaving one core free. The worker count is
# clamped to at least 1: on a single-core machine cpu_count() - 1 is 0 and
# Pool rejects that with a ValueError.
with Pool(max(1, cpu_count() - 1)) as p:
    out = p.map(run, files)

# Stack the per-file frames into one table (each keeps its own row labels).
df = pd.concat(out)

In [3]:
# Enumerate every distinct (copula_type, vine_structure, pattern) combination
# in a stable, sorted order; itertuples(name=None) yields plain tuples.
slices = (df[['copula_type', 'vine_structure', 'pattern']]
          .drop_duplicates()
          .sort_values(by=['copula_type', 'vine_structure', 'pattern'])
          .itertuples(index=False, name=None))

# tag -> table of per-(d, mask_fraction) means for that combination.
out = {}
for copula_type, vine_structure, pattern in slices:
    pred = df['copula_type'] == copula_type
    pred &= df['pattern'] == pattern
    if isinstance(vine_structure, str):
        pred &= df['vine_structure'] == vine_structure
        tag = f'{copula_type}_{vine_structure}_{pattern}'
    else:
        # A non-string vine_structure is treated as missing: select the rows
        # where the column is null and label the slice 'corrmat'.
        pred &= df['vine_structure'].isnull()
        tag = f'{copula_type}_corrmat_{pattern}'
    # numeric_only=True keeps the string descriptor columns out of the mean;
    # without it pandas >= 2.0 raises TypeError instead of silently dropping
    # them. Columns that end up containing any NaN are removed afterwards.
    out[tag] = (df[pred]
                .groupby(['d', 'mask_fraction'])
                .mean(numeric_only=True)
                .dropna(axis=1))

In [4]:
# For each d, take the first n_cols value found among the 'monotone' rows
# and store it as an int in a plain dict keyed by d.
d_to_col_mis = (
    df.loc[df['pattern'].eq('monotone')]
    .groupby('d')['n_cols']
    .first()
    .astype(int)
    .to_dict()
)

In [5]:
# Distinct values of the 'd' column, in ascending order.
sorted(df['d'].unique())

[5, 8, 11, 14, 17, 20]

In [6]:
# Display the d -> n_cols lookup built from the 'monotone' rows.
d_to_col_mis

{5: 2, 8: 3, 11: 4, 14: 5, 17: 6, 20: 6}

In [9]:
# Draw and save one figure per slice: a 3 x 6 grid whose rows are the metrics
# (smae, bias, elapsed) and whose columns are the distinct values of d.
for key, sub_df in out.items():
    fig, axes = plt.subplots(3, 6, sharex=True, sharey='row', figsize=(14,7), facecolor='white')
    for i, metric in enumerate(['smae', 'bias', 'elapsed']):
        for j, d in enumerate(sub_df.index.get_level_values('d').unique()):
            # Every column whose name starts with the metric becomes one line.
            for col in filter(lambda x: x.startswith(metric), sub_df.columns):
                # The line colour is chosen from a substring of the column name.
                if 'gcimpute' in col:
                    color = 'k'
                elif 'copfit' in col:
                    color = "#7C0000" # red
                elif 'copreg' in col:
                    color = "#00FF00" # green
                else:
                    color = 'blue'
                # 'elapsed' is rescaled by 1e6 before plotting
                # (presumably microseconds -> seconds; TODO confirm the unit).
                divisor = 1_000_000 if metric == 'elapsed' else 1
                # Selecting level d leaves a series indexed by mask_fraction.
                axes[i][j].plot(sub_df.loc[pd.IndexSlice[d], col] / divisor, color=color)

    for i, label in zip(range(3), ['smae', 'bias', 'elapsed']):
        axes[i][0].set_ylabel(label)
    for j in range(6):
        # Index with j: the previous code used the stale i from the loop
        # above, which labelled axes[2][2] six times and left the rest bare.
        axes[2][j].set_xlabel('missingness (%)')
    fig.suptitle(key)
    # NOTE(review): tight_layout() runs after subplots_adjust() and recomputes
    # the subplot spacing, so the wspace/hspace values here may not survive.
    plt.subplots_adjust(top=0.92, wspace=0.0, hspace=0.0)
    plt.tight_layout()
    plt.savefig(f'{key}.png', dpi=300)
    # Close this figure explicitly so figures do not pile up across iterations.
    plt.close(fig)