In [1]:
#!/bin/env python3

from argparse import ArgumentParser
from ast import Str

import numpy as np
import os
from os.path import join

from pathlib import Path

import pandas as pd
import pylab as plt
import random
import time
import warnings

from matplotlib.ticker import NullFormatter
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D


# wrapper for pylab, designed to do statistical plots using pandas dataframe 

from scipy.stats import beta, gaussian_kde


import sys
sys.path.append('/home/kblum/local/pism/sources/pism-emulator/')

from pismemulator.utils import param_keys_dict as keys_dict
from pismemulator.utils import kl_divergence
from pismemulator.kyle_utils import plot_random_groups, plot_posteriors, kl_divergences

import seaborn as sns



In [2]:
# Work in progess, this notebook is meant to plot several different randomly selected ensembles, subsets of the 200 that are avaiable

In [3]:
emulator_dirs = ['/data/work/antarctica_calibration/speedemulator/legacy_dirs/2021_11_pseudo_plastic_kb']
samples_files = {}

rc_samples_file = "/data/work/antarctica_calibration/speed_calibration_regularized_coulomb_128.csv"
pp_samples_file = "/data/work/antarctica_calibration/speed_calibration_pseudo_plastic_128.csv"

for emulator_dir in emulator_dirs:
    # This is janky, should find a better way to do this
    if 'rc_' in emulator_dir:
        samples_files[emulator_dir] = rc_samples_file
        print('rc')
    else:
        samples_files[emulator_dir] = pp_samples_file
        print('pp')

frac         = 0.1

print("Loading prior samples\n")
rc_samples = pd.read_csv(rc_samples_file).drop(columns=["id"])
pp_samples = pd.read_csv(pp_samples_file).drop(columns=["id"])

X_keys = rc_samples.keys()

l_post = []


pp
Loading prior samples



In [4]:
posterior_dfs = []
df_models     = []
titles        = []
for emulator_dir in emulator_dirs:
    X_list = []
    p = Path(f"{emulator_dir}/posterior_samples/")
    print("Loading posterior samples\n")
    for m, m_file in enumerate(sorted(p.glob("X_posterior_model_*.csv.gz"))):
        #print(f"  -- {m_file}")
        df = pd.read_csv(m_file).sample(frac=frac)
        if "Unnamed: 0" in df.columns:
            df.drop(columns=["Unnamed: 0"], inplace=True)
        model = m_file.name.split("_")[-1].split(".")[0]
        df["Model"] = int(model)
        X_list.append(df)

    print(f"Merging posteriors into dataframe")
    posterior_df = pd.concat(X_list)
    posterior_dfs.append(posterior_df)
    #X_posterior = posterior_df.drop(columns=["Model"]).values
    models = posterior_df['Model'].unique()
    df_models.append(models)
    titles.append(emulator_dir)



Loading posterior samples

Merging posteriors into dataframe


In [11]:
import time
from joblib import Parallel, delayed

def kl_divergences_p(df=None, variables=None, models=None, per_group=10, num_groups=10):
    # df : data frame of all models for true avg
    # vars: list of variable (column) names
    # models: list of available models (0,1,2 ...)
    # num_groups: number of ensembles
    # per_group: number of emulators per ensemble

    divs = {}
    groups = []

    random.seed(8675309)
    for i in range(num_groups):
        groups.append(random.sample(sorted(models), per_group))
    bar = 0
    for m_var in variables:
        kl_average = 0
        p = np.histogram(df[m_var], bins=30,density=True)[0]
        for group in groups:
            temp = df[df['Model'].isin(group)]
            q = np.histogram(temp[m_var],bins=30,density=True)[0]
            kl_average += np.abs(kl_divergence(p,q))
        kl_average  = kl_average / num_groups
        divs[m_var] = kl_average
    return (per_group, divs)

n_jobs = 4
start_time = time.perf_counter()
result = Parallel(n_jobs=n_jobs)(delayed(kl_divergences_p)(df=posterior_dfs[0], variables=X_keys, 
                                                         per_group=i, 
                                                         models=models, num_groups=10) for i in range(1, 200))
finish_time = time.perf_counter()
print(f"Program finished in {finish_time-start_time} seconds")
kld_df = pd.concat([pd.DataFrame(data=result[i][1], index=[result[i][0]]) for i in range(len(result))])

  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  p * np.log(p / q),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  p * np.log(p / q),
  p * np.log(p / q),
  p * np.log(p / q),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  np.logical_and(np.logical_and(p != 0, q != 0), np.isfinite(p / q)),
  p * np.log(p / q),
  p * np.log(p / q),
  p * np.log(p / q),


Program finished in 304.5258009941317 seconds


In [12]:
kld_df

Unnamed: 0,sia_e,ssa_e,ppq,tefo,phi_min,z_min,z_max,pseudo_plastic_uthreshold
1,219.284268,63.643263,84.624794,1283.266020,2.059423,0.038909,0.031013,0.303332
2,210.820975,53.730696,78.089822,867.808450,2.039187,0.039874,0.022562,0.297089
3,190.529095,53.095395,52.922264,646.925961,1.382969,0.026467,0.018481,0.220973
4,191.898108,37.139291,25.621099,323.395461,0.755303,0.026698,0.010113,0.256163
5,177.114356,36.677830,24.878023,404.485193,0.510155,0.019809,0.007653,0.129113
...,...,...,...,...,...,...,...,...
195,5.870632,0.047806,0.013722,0.122930,0.000512,0.000010,0.000006,0.000066
196,0.001440,0.007671,0.016903,1.605412,0.000445,0.000007,0.000006,0.000379
197,0.000609,0.005318,0.009166,1.636975,0.001556,0.000077,0.000004,0.000051
198,0.000722,0.003081,0.003931,0.058389,0.000147,0.000003,0.000003,0.000014
