In [None]:
import pickle
from pathlib import Path
from pprint import pp

import os
import sys

import h5py
import holodeck as holo
from holodeck import plot
from holodeck.gps import gp_utils as gu
from holodeck.gps import plotting_utils as pu
from holodeck.gps import sam_utils as su

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import corner

from holodeck.gps.gp_utils import GaussProc

# fixes relative-pathing errors for some GPs
from holodeck.gps import gp_utils 
sys.modules['gp_utils'] = gp_utils

%load_ext autoreload
%autoreload 2

In [None]:
# ---- Plotting / visualization controls

# Intended switch for automatically saving the figures created below
# (using the filename convention built in each plotting cell).
save_figures = False
# Size of the %error box placed on figures; the default of 0.2 corresponds to 20%.
box_frac = 0.2

# Gaussian reference levels: for each n-sigma, the CDF evaluated at (-n, +n),
# and the probability mass enclosed between those two bounds.
sigmas = np.asarray([1, 2, 3])
sigma_intervals = sp.stats.norm.cdf(np.stack([-sigmas, sigmas], axis=1))
sigma_widths = sigma_intervals[:, 1] - sigma_intervals[:, 0]
print(sigmas, sigma_intervals, sigma_widths, sep="\n")

def get_gps(spectra_train, path_trained, pattern_mean, pattern_std, path_test):
    """Locate and load the pickled 'means' and 'stdevs' GPs from a directory.

    Every ``*.pkl`` file directly inside `path_trained` is inspected; the file whose
    name contains `pattern_mean` is used for the means and the one whose name
    contains `pattern_std` for the standard deviations.

    Parameters
    ----------
    spectra_train : object
        Passed straight through to `gu.set_up_predictions` for both GP sets.
    path_trained : str or Path
        Directory searched (non-recursively) for ``*.pkl`` files.
    pattern_mean, pattern_std : str
        Substrings identifying the means / stdevs pickle by filename.
    path_test : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    gp_george_means, gp_list_means, gp_george_stdevs, gp_list_stdevs
        The unpickled objects and the corresponding results of
        `gu.set_up_predictions`, for the means and stdevs respectively.

    Raises
    ------
    ValueError
        If either pattern matches more than one file, or no file at all.

    """
    # For backwards compatibility before change to camel-case.
    # NOTE(review): this local alias is not referenced again inside the function.
    gaussproc = GaussProc

    mean_file = None
    std_file = None
    for pkl_path in Path(path_trained).glob("*.pkl"):
        fname = pkl_path.name
        if pattern_mean in fname:
            if mean_file is not None:
                raise ValueError(f"Found multiple matches for mean pattern {pattern_mean}!")
            mean_file = pkl_path
        if pattern_std in fname:
            if std_file is not None:
                raise ValueError(f"Found multiple matches for std pattern {pattern_std}!")
            std_file = pkl_path

    if (mean_file is None) or (std_file is None):
        raise ValueError(f"Did not find a match for mean ({mean_file}) or std ({std_file})!")

    # Load the means first, then the stdevs; each is prepared for prediction right after loading.
    # NOTE: pickle.load executes arbitrary code -- only point this at trusted files.
    loaded = []
    for pkl_path in (mean_file, std_file):
        with open(pkl_path, "rb") as handle:
            gp_george = pickle.load(handle)
        loaded.append((gp_george, gu.set_up_predictions(spectra_train, gp_george)))

    (gp_george_means, gp_list_means), (gp_george_stdevs, gp_list_stdevs) = loaded
    return gp_george_means, gp_list_means, gp_george_stdevs, gp_list_stdevs
    

def get_mean_stdev(spectra,
                   gp_george,
                   gp_list,
                   gp_george_stdev,
                   gp_list_stdev,
                   xobs,
                   scale_factor=1.0,   #1.0 for hc^2, 0.5 for hc
                   include_gp_unc = False):
    """Evaluate the GPs at every row of `xobs` and collect the predictions.

    Parameters
    ----------
    spectra : object
        Unused by this function; kept for call-signature compatibility.
    gp_george, gp_list, gp_george_stdev, gp_list_stdev
        Forwarded unchanged to `gu.hc_from_gp`.
    xobs : array, indexed as ``xobs[row, :]``
        One parameter set per row.
    scale_factor : float
        Multiplies both outputs (1.0 for hc^2, 0.5 for hc).
    include_gp_unc : bool
        Forwarded to `gu.hc_from_gp`.

    Returns
    -------
    means, stdevs : ndarray, shape (xobs.shape[0], len(gp_george))
        Scaled second return value of `gu.hc_from_gp`, and the scaled column 1
        of its third return value, for each row of `xobs`.

    """
    out_shape = (xobs.shape[0], len(gp_george))
    means = np.zeros(out_shape)
    stdevs = np.zeros(out_shape)

    for row in range(out_shape[0]):
        # work on a copy so the GP call cannot modify the caller's array
        params = xobs[row, :].copy()
        _hc, rho, rho_pred = gu.hc_from_gp(
            gp_george, gp_list, gp_george_stdev, gp_list_stdev, params,
            include_gp_unc=include_gp_unc,
        )
        means[row] = rho * scale_factor
        stdevs[row] = rho_pred[:, 1] * scale_factor

    return means, stdevs


def fraction_within_error(array, error_bound=0.1):
    """Return the fraction of elements `x` of `array` satisfying ``|10**x - 1| < error_bound``.

    The input is flattened before the comparison, and the count is divided by
    ``array.size`` (so `array` must be an ndarray-like object with a `size`).
    """
    deviation = np.abs(10 ** np.ravel(array) - 1)
    num_within = np.count_nonzero(deviation < error_bound)
    return num_within / array.size

In [None]:
# spectra and GPs
# trained_gp_uniform-06_2023-05-04_n1000_r1000_f40_20230508_165227.pkl

# Root directory containing one sub-directory per spectra library (machine-specific path).
library_path = Path("/Users/lzkelley/Programs/nanograv/15yr_astro_libraries/uniform-06/")
assert library_path.exists()

# Library used for testing whenever an entry below does not name its own 'test' library.
def_spectra_name_test = "uniform-06_n2000_r1000_f40"

# One entry per trained library:
#   name   : the spectra library the GPs were trained on
#   means  : date/time code identifying the trained 'means' GP file
#   stdevs : date/time code identifying the trained 'stdevs' GP file
#   test   : (optional) spectra library to test against
libraries = [
    dict(name="uniform-06_n500_r100_f40", means="20230512_192912", stdevs="20230512_192758"),
    dict(name="uniform-06_n1000_r100_f40", means="20230512_220459", stdevs="20230512_211616"),
    dict(name="uniform-06_n1000_r1000_f40", means="20230512_211812", stdevs="20230512_212606"),
    dict(name="uniform-06b_n1000_r100_f40", means="20230513_042106", stdevs="20230513_042001"),
    dict(name="uniform-06c_n1000_r100_f40", means="20230513_040629", stdevs="20230513_042007"),
    dict(
        name="uniform-06_n2000_r1000_f40", means="20230513_112706", stdevs="20230513_112022",
        test="uniform-06_n1000_r1000_f40",
    ),
    dict(name="uniform-06_n2000_r100_f40", means="20230513_094131", stdevs="20230513_063755"),
]

# Add default test library to libraries that don't have one specified
for lib in libraries:
    lib.setdefault('test', def_spectra_name_test)
        

# Compare Libraries

In [None]:
def get_everything(library_spec_dict):
    """Load one library's GPs and return predicted and true values for train and test sets.

    Parameters
    ----------
    library_spec_dict : dict
        Must provide the keys 'name', 'test', 'means' and 'stdevs'
        (see the `libraries` list).

    Returns
    -------
    pred_train, pred_test, true_train, true_test : list
        Each is a two-element list ``[median, stdev]``.

    Notes
    -----
    Reads the module-level `library_path`.  The two HDF5 files are opened read-only
    and are not closed here.
    NOTE(review): whether they can safely be closed before returning depends on
    whether `gu.get_gwb` / `gu.set_up_predictions` keep references to them -- confirm.

    """
    name_train = library_spec_dict['name']
    name_test = library_spec_dict['test']
    ext_means = library_spec_dict['means']
    ext_stdevs = library_spec_dict['stdevs']

    path_trained = Path(library_path) / name_train
    spectra_train = h5py.File(library_path / name_train / "sam_lib.hdf5", "r")
    spectra_test = h5py.File(library_path / name_test / "sam_lib.hdf5", "r")
    datasets = (spectra_train, spectra_test)

    # ---- Load GPs
    gp_args = get_gps(spectra_train, path_trained, ext_means, ext_stdevs, None)
    gp_george_means = gp_args[0]
    num_gps = len(gp_george_means)

    # ---- Get truth (training data first, then testing data)
    truths = []
    xobs_all = []
    for spectra in datasets:
        _, xobs, true_std, true_med, true_med_mean = gu.get_gwb(spectra, num_gps)
        # add the separately-returned mean back onto the medians
        true_med += true_med_mean
        truths.append([true_med, true_std])
        xobs_all.append(xobs)

    # ---- Get predictions (same order: train, then test)
    preds = [
        list(get_mean_stdev(spectra, *gp_args, xobs))
        for spectra, xobs in zip(datasets, xobs_all)
    ]

    # ---- package and return
    true_train, true_test = truths
    pred_train, pred_test = preds
    return pred_train, pred_test, true_train, true_test

# Run every configured library; `all_data[i]` holds the
# (pred_train, pred_test, true_train, true_test) tuple for `all_names[i]`.
all_data, all_names = [], []
for spec in libraries:
    all_data.append(get_everything(spec))
    all_names.append(spec['name'])

In [None]:
import kalepy as kale

# Compare all libraries: for each one, draw violins of |predicted - true| for the
# training set (black) and the testing set (red).  Left panel: medians, right panel: stdevs.

scale = 'log'
pnts = kale.utils.spacing([1e-4, 0.4], scale='log', num=100)

# scale = 'linear'
# pnts = kale.utils.spacing([0.0, 0.2], scale=scale, num=100)

# NOTE: the title uses the first library's test set; entries in `libraries` may
# specify a different 'test' library.
spectra_name_test = libraries[0]['test']


width = 0.5
offset = 0.1

train_test_colors = ['black', 'red']
# markers for the 50th / 90th / 98th percentiles of the absolute error
markers = ['_', '^', 'v']

fig, axes = plt.subplots(figsize=[10, 5], ncols=2)
fig.suptitle(f"test: {spectra_name_test}")
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.2, top=0.9)

for ii, (name, vals) in enumerate(zip(all_names, all_data)):
    pred_train, pred_test, true_train, true_test = vals

    # train vs. test
    for jj, (pred, true) in enumerate(zip([pred_train, pred_test], [true_train, true_test])):
        cc = train_test_colors[jj]

        # med vs std
        for kk, (vals_pred, vals_true) in enumerate(zip(pred, true)):
            ax = axes[kk]
            # (names chosen so the imported `pprint.pp` is not shadowed)
            err = np.fabs(vals_pred.flatten() - vals_true.flatten())
            # train is drawn slightly right of the tick, test slightly left
            xx = 1.0 + ii + offset - 2*offset*jj

            percs = np.percentile(err, [50, 90, 98])
            for perc_val, mm in zip(percs, markers):
                ax.scatter(xx, perc_val, color=cc, marker=mm, s=20)

            if scale.startswith('lin'):
                yy, zz = kale.density(err, pnts)
            else:
                # estimate the density in log-space, then convert the grid back
                yy, zz = kale.density(np.log10(err), np.log10(pnts))
                yy = 10.0 ** yy
            plot.violin(ax, xx, yy, zz, width, color=cc, median=None)

xlabels = [
    nn.replace("_f40", "").replace("uniform-06_", "uniform-06a_").replace("uniform-06", "")
    for nn in all_names
]
xticks = 1 + np.arange(len(xlabels))
for ii, ax in enumerate(axes):
    ax.grid(True, alpha=0.25)
    lab = "median" if ii == 0 else "stdev"
    ax.set(yscale=scale, ylabel=lab)
    ax.set_xticks(ticks=xticks, labels=xlabels, rotation=45)

fname = f"gps_compare_violins_{scale}.png"
# honor the notebook-wide `save_figures` switch instead of always writing to disk
if save_figures:
    fig.savefig(fname)
    print(fname)

# Run on Single Library

In [None]:
# Example of setting the values by hand instead of taking them from `libraries`:
# spectra_name_train = "uniform-06_n1000_r1000_f40"
# gp_ext_means = "20230512_211812"
# gp_ext_stdevs = "20230512_212606"

# Pick the library to analyse (second-to-last entry) and unpack its settings.
lib = libraries[-2]
spectra_name_train, spectra_name_test = lib['name'], lib['test']
gp_ext_means, gp_ext_stdevs = lib['means'], lib['stdevs']

# Directory of the selected training library; `save_fig` writes figures into it.
output_path = library_path / spectra_name_train
rel_path = "/".join(output_path.parts[-3:]) + "/"
print(f"path = `{rel_path}`\n\texists={output_path.exists()}")

def save_fig(fig, fname, **kwargs):
    """Save `fig` into `output_path`, if the notebook-wide `save_figures` switch is on.

    Parameters
    ----------
    fig : matplotlib figure
        Figure to write.
    fname : str
        File name, joined onto the module-level `output_path`.
    **kwargs
        Forwarded to `fig.savefig`; `dpi` defaults to 600 when not given.

    Returns
    -------
    fname : Path
        Full output path (returned whether or not the file was written).

    """
    fname = output_path.joinpath(fname)
    # `save_figures` is documented as controlling all figure output; respect it here.
    if not save_figures:
        print(f"`save_figures` is False, not saving {fname}")
        return fname

    kwargs.setdefault('dpi', 600)
    fig.savefig(fname, **kwargs)
    print(f"Saved to {fname}, size {holo.utils.get_file_size(fname)}")
    return fname


## set up your directory and GPs of interest

In [None]:
# Open the training and testing spectra libraries read-only; the handles stay open
# because later cells read from them.
spectra_file_train = library_path / spectra_name_train / "sam_lib.hdf5"
spectra_file_test = library_path / spectra_name_test / "sam_lib.hdf5"
spectra_train = h5py.File(spectra_file_train, "r")
spectra_test = h5py.File(spectra_file_test, "r")

# Shortened library names used in figure titles and output file names.
print_name_train = spectra_name_train.replace("broad-uniform-", "bu")
print_name_test = spectra_name_test.replace("broad-uniform-", "bu")

In [None]:
# For backwards compatibility before change to camel-case
gaussproc = GaussProc

# Load the trained 'means' and 'stdevs' GPs that live alongside the training library.
path_trained = library_path / spectra_name_train
gp_george_means, gp_means, gp_george_stdevs, gp_stdevs = get_gps(
    spectra_train,
    path_trained,
    gp_ext_means,
    gp_ext_stdevs,
    None,
)

## recover means and stdevs

In [None]:
# First, get the GWB information ("truth") from each spectra library.
# The last value returned by `gu.get_gwb` is added back onto the medians.
# NOTE(review): presumably that value is a mean which get_gwb subtracted -- confirm.

# ---- training data
_, xobs_train, true_std_train, true_med_train, _true_med_mean_train = gu.get_gwb(
    spectra_train, len(gp_george_means)
)
true_med_train += _true_med_mean_train

# ---- testing data
_, xobs_test, true_std_test, true_med_test, _true_med_mean_test = gu.get_gwb(
    spectra_test, len(gp_george_means)
)
true_med_test += _true_med_mean_test

In [None]:
# Then, find the training- and test-set predictions for the medians and stdevs.
# This step can take many seconds, depending on your hardware and on the size of the
# spectrum library.  For very large libraries, consider pickling the recovered values
# so that they can simply be loaded later.
_gp_args = (gp_george_means, gp_means, gp_george_stdevs, gp_stdevs)
pred_med_train, pred_std_train = get_mean_stdev(spectra_train, *_gp_args, xobs_train)
pred_med_test, pred_std_test = get_mean_stdev(spectra_test, *_gp_args, xobs_test)

## look at test data response, compared with training data

In [None]:
# Diagnostic figure comparing GP predictions against the true values, for the training
# set ('trained') and the testing set ('not trained').
#   row 0 = medians, row 1 = stdevs
#   col 0 = predicted vs. true      col 1 = error vs. true
#   col 2 = histogram of errors     col 3 = 1 - CMF of |error|
ptsize = 1

med_x = [true_med_train.flatten(), true_med_test.flatten()]
med_y = [pred_med_train.flatten(), pred_med_test.flatten()]
std_x = [true_std_train.flatten(), true_std_test.flatten()]
std_y = [pred_std_train.flatten(), pred_std_test.flatten()]

labels = ['trained', 'not trained']
colors = ['black', 'red']
markers = ['v', '^']

# ---- Setup figure and axes

# row 0 = medians, row 1 = stdevs
fig, axes = plt.subplots(figsize=[16, 6], ncols=4, nrows=2)
fig.suptitle(f"{print_name_train}, tested on {print_name_test}")
plt.subplots_adjust(wspace=0.3, hspace=0.3)

for (ii, jj), ax in np.ndenumerate(axes):
    ax.grid(True, alpha=0.25)
    # in the (1 - CMF) column, mark the fractions outside of 1/2/3 sigma
    if jj == 3:
        for ss in range(sigmas.size):
            ax.axhline(1 - sigma_widths[ss], color='r', ls=':', alpha=0.5)

error_bins = np.linspace(-0.5, 0.5, 41)
hist_kwargs = dict(density=True, histtype='step', alpha=0.75)

# Target error region: +/- `box_frac` fractional error, expressed in log10 space.
# The label is built from `box_frac` so that it stays correct when the setting changes.
show_box = (box_frac > 0.0)
if show_box:
    box_vals = [np.log10(1-box_frac), np.log10(1+box_frac)]
    box_label = rf"$\pm{100*box_frac:.0f}\%$ error"

xvals_list = [med_x, std_x]
yvals_list = [med_y, std_y]
rows_labels = ['median', 'stdev']
for jj, (xvals, yvals) in enumerate(zip(xvals_list, yvals_list)):
    row_lab = rows_labels[jj]
    # ii == 0: training points,  ii == 1: testing points
    for ii, (xx, yy) in enumerate(zip(xvals, yvals)):
        axrow = axes[jj, :]

        # ---- 0: Plot predicted vs. true values
        ax = axrow[0]
        ax.set(xlabel=f"true {row_lab}", ylabel=f"predicted {row_lab}")
        lab = labels[ii]
        cc = colors[ii]
        mm = markers[ii]
        kwargs = dict(s=ptsize, marker=mm, c=cc, label=lab, alpha=0.5)
        ax.scatter(xx, yy, **kwargs)

        # ---- 1: plot error in prediction vs. true values
        ax = axrow[1]
        ax.set(xlabel=f"true {row_lab}", ylabel="error")
        err = yy - xx
        ax.scatter(xx, err, **kwargs)
        # show box for target error region
        if (ii == 1) and show_box:
            ax.axhspan(*box_vals, color='gray', alpha=0.25, label=box_label)
        ax.legend()

        # ---- 2: plot histogram of errors in prediction
        ax = axrow[2]
        ax.set(xlabel="error", yscale='log')
        hist, *_ = ax.hist(err, bins=error_bins, color=cc, label=lab, **hist_kwargs)
        # if this is test points, calculate fraction of points within target error region
        if (ii == 1) and show_box:
            ax.axvspan(*box_vals, color='gray', alpha=0.25, label=box_label)
            sel = (box_vals[0] < err) & (err < box_vals[1])
            frac = np.count_nonzero(sel)/sel.size
            # add label with calculated fraction
            ax.text(box_vals[1], hist.max(), f"{frac:.3f}", fontsize=8, ha='left', va='top', color=cc)

        # ---- 3: plot cumulative mass function of errors
        ax = axrow[3]
        ax.set(xscale='linear', xlabel="|error|", yscale='log', ylabel='1 - CMF')
        err = np.fabs(err)
        err = np.sort(err)
        # plot 1 - CMF
        cmf = 1.0 - np.arange(err.size) / (err.size - 1)
        ax.plot(err, cmf, color=cc, alpha=0.75)
        # if this is test points, calculate errors for 1/2/3 sigma fractions of points
        if ii == 1:
            # iterate over 1/2/3 sigma
            for ss in range(sigmas.size):
                # 1 - widths, to correspond to 1 - CMF
                ww = 1 - sigma_widths[ss]
                # interpolate to find corresponding errors at this sigma value
                # (arrays are reversed because np.interp needs increasing x-values)
                xval = np.interp(ww, cmf[::-1], err[::-1])
                # draw line, and annotate value
                ax.axvline(xval, color=cc, ls=':', alpha=0.25)
                ax.text(xval, ww, f"{1-ww:.3f}: {xval:.3f}", fontsize=8, ha='left', va='bottom', color=cc)

fname = f"gp-errors_{print_name_train}-vs-{print_name_test}_{gp_ext_means}+{gp_ext_stdevs}.png"
save_fig(fig, fname)

In [None]:
# Print the fraction of test-set ("untrained") predictions that fall within `box_frac`
# of the true values.  The previous version referenced names that are never defined in
# this notebook (`m_test`, `yobs_test`, `s_test`, `yerr_test`); the equivalent
# predicted/true arrays computed above are used instead.
# `fraction_within_error` raises 10 to the power of its input, so the
# (predicted - true) differences are treated as log10 ratios.
err_med_test = pred_med_test - true_med_test
err_std_test = pred_std_test - true_std_test
print(f"Untrained means, fraction within {box_frac:.1%}: {fraction_within_error(err_med_test, box_frac):.3f}")
print(f"Untrained stdevs, fraction within {box_frac:.1%}: {fraction_within_error(err_std_test, box_frac):.3f}")

## frequency-by-frequency

In [None]:
import kalepy as kale

# Frequency-by-frequency version of the violin comparison: one pair of violins
# (train = black, test = red) of |predicted - true| per frequency bin.
# Left panel: medians, right panel: stdevs.

scale = 'log'
pnts = kale.utils.spacing([1e-4, 0.4], scale='log', num=100)
nfreq = np.shape(true_med_train)[1]
print(f"{nfreq=}")

# scale = 'linear'
# pnts = kale.utils.spacing([0.0, 0.2], scale=scale, num=100)

width = 0.5
offset = 0.1
PERCS = [50, 90, 98]
# one marker per entry of PERCS
markers = ['_', '^', 'v']

train_test_colors = ['black', 'red']
train_test_labels = ['train', 'test']
fig, axes = plt.subplots(figsize=[10, 5], ncols=2)
fig.suptitle(f"train: {print_name_train} -- test: {spectra_name_test}")
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.2, top=0.9)

leg_handles = []
leg_labels = []
for ii in range(nfreq):

    pred_train = [pred_med_train[:, ii], pred_std_train[:, ii]]
    pred_test = [pred_med_test[:, ii], pred_std_test[:, ii]]
    true_train = [true_med_train[:, ii], true_std_train[:, ii]]
    true_test = [true_med_test[:, ii], true_std_test[:, ii]]

    # train vs. test
    for jj, (pred, true) in enumerate(zip([pred_train, pred_test], [true_train, true_test])):
        cc = train_test_colors[jj]

        # med vs std
        for kk, (vals_pred, vals_true) in enumerate(zip(pred, true)):
            ax = axes[kk]
            # (names chosen so the imported `pprint.pp` is not shadowed)
            err = np.fabs(vals_pred.flatten() - vals_true.flatten())
            # train is drawn slightly right of the bin position, test slightly left
            xx = 1.0 + ii + offset - 2*offset*jj

            percs = np.percentile(err, PERCS)
            for ll, (perc_val, mm) in enumerate(zip(percs, markers)):
                # label the percentile markers only once, so that the right-hand
                # panel's legend gets a single entry per percentile
                if kk == 1 and jj == 1 and ii == 0:
                    # raw f-string: `\%` is not a valid escape in a normal string literal
                    kw = dict(label=rf"${PERCS[ll]:.0f}\%$")
                else:
                    kw = dict()
                ax.scatter(xx, perc_val, color=cc, marker=mm, s=20, **kw)

            if scale.startswith('lin'):
                yy, zz = kale.density(err, pnts)
            else:
                # estimate the density in log-space, then convert the grid back
                yy, zz = kale.density(np.log10(err), np.log10(pnts))
                yy = 10.0 ** yy
            hh = plot.violin(ax, xx, yy, zz, width, color=cc, median=None)
            # collect one train and one test violin handle for the left-hand legend
            if ii == 0 and kk == 0:
                leg_handles.append(hh)
                leg_labels.append(train_test_labels[jj])

for ii, ax in enumerate(axes):
    ax.grid(True, alpha=0.25)
    lab = "median" if ii == 0 else "stdev"
    ax.set(yscale=scale, ylabel=lab, xlabel='Frequency bin')
    # ax.set_xticks(ticks=xticks, labels=xlabels, rotation=45)
    # left panel: explicit train/test handles; right panel: the labeled percentile markers
    if ii == 0:
        kw = [leg_handles, leg_labels]
    else:
        kw = []
    ax.legend(*kw, loc='upper right')

fname = f"gp-errors-freqs_{print_name_train}-vs-{print_name_test}_{gp_ext_means}+{gp_ext_stdevs}.png"
save_fig(fig, fname)

# fname = f"gps_compare_violins_{scale}.png"
# plt.savefig(fname)
# print(fname)

### What's up with the GPs? (corner plot of the first means-GP's emcee chain)

In [None]:
# Corner plot of the flattened emcee chain stored on the first 'means' GP.
# Axis labels: 'a', then the keys of that GP's `par_dict`, then 'alpha'.
first_gp = gp_george_means[0]
corner_labels = ['a', *first_gp.par_dict.keys(), 'alpha']
fig = corner.corner(first_gp.emcee_flatchain, labels=corner_labels);
#plt.savefig(f"{parent_name}_means-corner_{ext_m}.png")

In [None]:
# corner.corner(gp_george_stdevs[0].emcee_flatchain, labels=[key for key in gp_george_stdevs[0].par_dict.keys()]);
#plt.savefig(f"{parent_name}_stds-corner_{ext_v}.png")