In [3]:
import numpy as np
import sklearn
np.random.seed(0)

import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from matplotlib.ticker import NullFormatter
%matplotlib inline
import seaborn as sns
sns.set(palette="bright",style="ticks",font="Arial")
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import PathPatch
from matplotlib.ticker import FormatStrFormatter

from sklearn import linear_model
from sklearn.linear_model import HuberRegressor
from sklearn import manifold, datasets
from sklearn.decomposition import PCA
from sklearn import preprocessing

from functools import partial
import cvxpy as cp
import pandas as pd
import copy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
import os.path
import pdb
import scipy as sp
import hashlib
import joblib
import pickle
import pdb
import scipy


%load_ext autoreload
%autoreload 2

%env PYTHONHASHSEED=0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: PYTHONHASHSEED=0


In [4]:
# Report the version of every third-party package this notebook relies on,
# so that saved outputs record the environment they were produced with.
for pkg_label, pkg_module in (
    ("numpy", np),
    ("matplotlib", matplotlib),
    ("seaborn", sns),
    ("sklearn", sklearn),
    ("cvxpy", cp),
    ("pandas", pd),
    ("scipy", scipy),
    ("joblib", joblib),
):
    print(f"{pkg_label}: {pkg_module.__version__}")

# Standard-library modules do not carry a version of their own.
for stdlib_label in ("hashlib", "pickle"):
    print(f"{stdlib_label} (standard library, version not applicable)")

numpy: 1.23.5
matplotlib: 3.7.0
seaborn: 0.12.2
sklearn: 1.2.1
cvxpy: 1.4.0
pandas: 1.5.3
scipy: 1.10.0
joblib: 1.1.1
hashlib (standard library, version not applicable)
pickle (standard library, version not applicable)


# Settings

In [None]:
# NOTE: review every setting in this cell before running the notebook.

# Directory the workspaces / CSV files are read from, and directory the PDFs are written to.
# Both are used by plain string concatenation, so keep the trailing slash.
ff = "../data/jasa_10_07_2023_data/"
out_ff = "../data/jasa_10_07_2023_data/"

# Data sets to process.
# NOTE: this rebinds the name `datasets` that the imports cell took from sklearn.
datasets = ["bos"]
# datasets = ["kin", "cpu", "bos", "puma", "delta", "ca", "abalone", "airfoil"]

# Size of the trial axis of the per-shift coverage arrays.
num_trials = 20  # 50

# Plot styling.
fontsize = 16
linewidth = 2

# Shift strengths: a symmetric grid (each positive value and its negation), ascending.
_positive_shift_betas = np.array([0.02, 0.04, 0.08, 0.16, 0.32, 0.64])
shift_betas = np.sort(np.concatenate([_positive_shift_betas, -_positive_shift_betas]))
num_shift_betas = len(shift_betas)

# Tau values used by the "I-D-<tau>" and "I-E-<dim>-<tau>" shifts.
shift_taus = [0.5, 0.6, 0.7]

# Read test weights in

In [None]:
ext_jlib = "jlib"

# One "test_idxes_weights" entry per processed data set, in visiting order.
# The "gaus" data set is skipped and therefore contributes no entry.
test_idxes_weights_dict_list = []
for dataset_idx, dataset in enumerate(datasets):
    if dataset != "gaus":
        print("Reading in weights on the test set, for data set %s ..." % dataset)
        test_idxes_weights_wksp_fp = "".join([ff, dataset, "_test_idxes_weights_wksp.", ext_jlib])
        dataset_test_idxes_weights_wksp = joblib.load(test_idxes_weights_wksp_fp)
        print("Done.")

        test_idxes_weights_dict_list.append(dataset_test_idxes_weights_wksp["test_idxes_weights"])

# Make plots

In [None]:
def parse_I_E_str(shift_fn):
    """Split an ``"I-E-<dim>-<tau>"`` shift name into its components.

    Parameters
    ----------
    shift_fn : str
        Shift name such as ``"I-E-3-0.5"``.

    Returns
    -------
    tuple
        ``(shift_tau_str, dim_idx, shift_tau)`` where ``shift_tau_str`` is the
        title fragment ``", tau=<tau>"``, ``dim_idx`` is the integer before the
        first dash and ``shift_tau`` is the float after it.

    Raises
    ------
    ValueError
        If no dash follows the ``"I-E-"`` prefix, or a component is not numeric.
    """
    remainder = shift_fn.replace("I-E-", "")

    # Everything before the first dash is the dimension index.
    dim_part = remainder[:remainder.index("-")]

    # What is left once "<dim>-" is removed is the tau value.
    tau_value = float(remainder.replace(dim_part + "-", ""))

    return ", tau=" + str(tau_value), int(dim_part), tau_value

In [None]:
def get_title(shift_fn, shift_beta, dataset, rev=False):
    """Compose the title of a K-L (or reverse K-L) plot.

    Parameters
    ----------
    shift_fn : str
        Shift name, e.g. ``"I-A"``, ``"I-B"``, ``"I-C-<dim>"``, ``"I-D-<tau>"``
        or ``"I-E-<dim>-<tau>"``.
    shift_beta : object
        Shift strength; rendered with ``str`` unless the shift is ``"I-A"``.
    dataset : str
        Data set name, appended verbatim at the end of the title.
    rev : bool, optional
        When true the title starts with ``"Rev. K-L for "`` instead of
        ``"K-L for "``.

    Returns
    -------
    str
        The assembled title.
    """
    tau_suffix = ""
    shift_label = shift_fn

    if "I-C" in shift_fn:
        # The name carries a zero-based dimension index; the title shows it one-based.
        shift_label = "I-C, dim. " + str(int(shift_fn.replace("I-C-", "")) + 1)
    elif "I-D" in shift_fn:
        tau_suffix = ", tau=" + str(float(shift_fn.replace("I-D-", "")))
        shift_label = "I-D"
    elif "I-E" in shift_fn:
        # The full "I-E-<dim>-<tau>" name is kept as the label; only the tau text is parsed out.
        tau_suffix, _, _ = parse_I_E_str(shift_fn)

    pieces = ["Rev. K-L for " if rev else "K-L for ", shift_label]

    # "I-A" titles carry neither a beta nor a tau.
    if "I-A" not in shift_label:
        pieces.append(", beta=" + str(shift_beta))
        pieces.append(tau_suffix)

    pieces.append(", " + dataset)  # + " data"

    return "".join(pieces)

In [None]:
def get_out_fp(shift_fn, shift_beta, dataset, rev=False):
    """Build the output PDF path for a K-L box plot.

    The path is ``out_ff + dataset + "_boxplot_kl_"``, followed by ``"rev_"``
    when ``rev`` is true, the shift name, ``"_beta_<shift_beta>"`` unless the
    shift name contains ``"I-A"``, and finally ``".pdf"``.

    Parameters
    ----------
    shift_fn : str
        Shift name, inserted verbatim into the file name.
    shift_beta : object
        Shift strength, rendered with ``str``.
    dataset : str
        Data set name used as the file-name prefix.
    rev : bool, optional
        Whether the plot is for the reverse K-L.

    Returns
    -------
    str
        The output file path (relies on the module-level ``out_ff``).
    """
    rev_tag = "rev_" if rev else ""
    beta_tag = "" if "I-A" in shift_fn else "_beta_" + str(shift_beta)

    return out_ff + dataset + "_boxplot_kl_" + rev_tag + shift_fn + beta_tag + ".pdf"

In [None]:
def make_plot(kls, shift_fn, shift_beta, dataset, rev=False):
    """Draw a box plot of ``kls`` and save it as a PDF.

    The title comes from ``get_title`` and the destination from ``get_out_fp``;
    the figure is written with a tight bounding box and the path is printed.

    Parameters
    ----------
    kls : array-like
        Values handed straight to ``Axes.boxplot``.
    shift_fn, shift_beta, dataset, rev
        Forwarded unchanged to ``get_title`` and ``get_out_fp``.
    """
    fig, ax = plt.subplots()
    ax.boxplot(kls, showmeans=True, showfliers=True)

    plot_title = get_title(shift_fn, shift_beta, dataset, rev)
    ax.set_title(plot_title, fontsize=fontsize + 2)

    # No axis labels and no x ticks: the title carries all the context.
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='both', labelsize=fontsize)
    ax.set_xticks([])

    out_fp = get_out_fp(shift_fn, shift_beta, dataset, rev)
    fig.savefig(out_fp, bbox_inches="tight")
    print("Saved plot to %s." % out_fp)

In [None]:
# Short data set key -> human-readable name used in plot titles.
_DATASET_FRIENDLY_NAMES = {
    "gaus": "Gaussian",
    "airfoil": "Airfoil",
    "abalone": "Abalone",
    "ca": "California housing",
    "delta": "Delta ailerons",
    "ailerons": "Ailerons",
    "bank": "Banking",
    "bos": "Boston housing",
    "cpu": "CPU",
    "kin": "Kinematics",
    "puma": "Puma",
}


def convert_dataset_to_friendly_dataset_name(dataset, append_word_data=True):
    """Translate a short data set key into a human-readable name.

    Parameters
    ----------
    dataset : str
        Short key such as ``"bos"`` or ``"ca"``.
    append_word_data : bool, optional
        When true (the default), ``" data"`` is appended to the name.

    Returns
    -------
    str
        The friendly name, e.g. ``"Boston housing data"``.

    Raises
    ------
    ValueError
        If ``dataset`` is not a known key. Previously this surfaced as an
        ``UnboundLocalError`` on an unassigned local, which hid the real cause.
    """
    try:
        dataset_friendly_name = _DATASET_FRIENDLY_NAMES[dataset]
    except (KeyError, TypeError):
        # TypeError covers unhashable input; either way the key is unusable.
        raise ValueError(
            "Unknown data set %r; expected one of %s."
            % (dataset, sorted(_DATASET_FRIENDLY_NAMES))
        ) from None

    if append_word_data:
        dataset_friendly_name += " data"

    return dataset_friendly_name

In [None]:
def get_shift_beta_idx(shift_beta):
    """Return the position(s) of ``shift_beta`` in the module-level ``shift_betas``.

    Matching uses ``np.isclose`` instead of exact ``==``. A beta that went
    through arithmetic or serialisation may differ from the grid value in the
    last bits; an exact comparison then yields an empty index, and an indexed
    assignment driven by it silently writes nothing.

    Parameters
    ----------
    shift_beta : float
        Shift strength to look up.

    Returns
    -------
    numpy.ndarray
        1-D integer array of matching indices. It is empty when nothing
        matches, including when ``shift_beta`` is not numeric (e.g. ``None``).
    """
    try:
        is_match = np.isclose(shift_betas, shift_beta)
    except TypeError:
        # A non-numeric beta cannot equal any grid entry: report "no match",
        # which is what the exact comparison produced for such values.
        return np.empty(0, dtype=np.intp)

    return np.where(is_match)[0]

In [None]:
ext_pkl = "pkl"

# One entry per data set, in the same order as `datasets`, so both lists can be
# indexed with the position of the data set.
coverages_dict_list = []
raw_coverages_dict_list = []
for dataset_idx, dataset in enumerate(datasets):
    print("Reading in coverages for data set %s ..." % dataset)
    dataset_wksp_fp = ff + dataset + "_wksp." + ext_pkl

    # SECURITY: unpickling can execute arbitrary code. Only load workspace
    # files that were produced by a trusted run of this project.
    # The `with` block closes the file handle even if unpickling fails.
    with open(dataset_wksp_fp, "rb") as dataset_wksp_file:
        dataset_wksp = pickle.load(dataset_wksp_file)
    print("Done.")

    coverages_dict_list.append(dataset_wksp["coverages_dict"])
    raw_coverages_dict_list.append(dataset_wksp["raw_coverages_dict"])

In [None]:
# Data sets whose "<name>_X.csv" is read with pandas' default index handling.
# "airfoil" is handled separately: its first CSV column is used as the index.
_datasets_without_index_col = (
    "abalone", "ca", "delta", "ailerons", "bank",
    "bos", "cpu", "kin", "puma", "gaus",
)

# Number of feature columns of each data set, in the same order as `datasets`.
num_dims_list = []
for dataset_idx, dataset in enumerate(datasets):
    X_fp = ff + dataset + "_X.csv"

    if dataset == "airfoil":
        X = pd.read_csv(X_fp, index_col=0).to_numpy()
    elif dataset in _datasets_without_index_col:
        X = pd.read_csv(X_fp).to_numpy()
    else:
        # Without this branch an unrecognised name would either fail on an
        # undefined `X` or silently reuse the previous data set's `X` and
        # record the wrong number of dimensions.
        raise ValueError("Unknown data set %r: cannot read its feature matrix." % (dataset,))

    num_dims_list.append(X.shape[1])

In [None]:
# Algorithm names; each name's position in this list is its index in `alg_idxes`.
alg_names_ordered = ["Standard", "K-L", "Chi-squared"]
alg_idxes = {alg_name: position for position, alg_name in enumerate(alg_names_ordered)}

# NOTE: choose here which algorithm's coverages get plotted.
alg_idx = alg_idxes["Standard"]
# alg_idx = alg_idxes["Chi-squared"]

In [None]:
# For every data set (except "gaus"): gather the selected algorithm's coverages
# into one array per shift family, then save one coverage-vs-beta PDF per family.
for dataset_idx, dataset in enumerate(datasets):
    if dataset == "gaus":
        continue
    
    num_dims = num_dims_list[dataset_idx]
        
    # One array per shift family, shaped (num_shift_betas, <1 or num_dims>, num_trials)
    # and pre-filled with +inf. The middle axis has length 1 for "I-B" / "I-D-<tau>"
    # and length num_dims for "I-C" / "I-E-<tau>".
    # NOTE(review): any entry that is never overwritten below stays +inf and will
    # propagate into the mean / median / quantiles computed for the plot.
    shift2shift_beta_idxes2dim_idxes2cvgs = {}
    shift2shift_beta_idxes2dim_idxes2cvgs["I-B"] = np.inf*np.ones((num_shift_betas, 1, num_trials))
    shift2shift_beta_idxes2dim_idxes2cvgs["I-C"] = np.inf*np.ones((num_shift_betas, num_dims, num_trials))
    for shift_tau in shift_taus:
        shift2shift_beta_idxes2dim_idxes2cvgs["I-D-" + str(shift_tau)] = np.inf*np.ones((num_shift_betas, 1, num_trials))
        shift2shift_beta_idxes2dim_idxes2cvgs["I-E-" + str(shift_tau)] = np.inf*np.ones((num_shift_betas, num_dims, num_trials))
    
    # Keys of the coverages dict are indexed as (shift name, shift beta).
    shifts = coverages_dict_list[dataset_idx].keys()
    for shift in shifts:
        shift_fn = shift[0]
        shift_beta = shift[1]
        # Array of matching positions in `shift_betas`; it is looked up for every
        # key, including those skipped by the `else: continue` branch below.
        shift_beta_idx = get_shift_beta_idx(shift_beta)   
        
        # Map the full shift name to (family key, index along the middle axis).
        if "I-B" in shift_fn:
            shift_fn_trim = shift_fn
            dim_idx = 0
        
        elif "I-C" in shift_fn:
            # "I-C-<dim>": all dimensions share the single "I-C" array.
            shift_fn_trim = "I-C"
            dim_idx = int(shift_fn.replace("I-C-", ""))
            
        elif "I-D" in shift_fn:
            # "I-D-<tau>": the full name is already the family key.
            shift_fn_trim = shift_fn
            dim_idx = 0
            
        elif "I-E" in shift_fn:
            # "I-E-<dim>-<tau>": rebuild the per-tau family key "I-E-<tau>".
            shift_fn_trim = "I-E"
            _, dim_idx, shift_tau = parse_I_E_str(shift_fn)
            shift_fn_trim += "-" + str(shift_tau)
            
        else:
            # Any other shift (e.g. one without a matching family above) is not plotted.
            continue

        # Copy the coverages of the selected algorithm (last axis = alg_idx) for all
        # entries of the third axis into the (beta, dim) row of the family array.
        # NOTE(review): the meaning of the axes fixed at 0 is not visible here — confirm
        # against the notebook that wrote `coverages_dict`.
        shift2shift_beta_idxes2dim_idxes2cvgs[shift_fn_trim][shift_beta_idx, dim_idx, :] = \
        coverages_dict_list[dataset_idx][shift_fn, shift_beta][0,0,:,0,alg_idx]
    
    # Families to plot, in a fixed order: I-B, I-C, then I-D / I-E for each tau.
    shift_fn_trims = ["I-B", "I-C"]
    for shift_tau in shift_taus:
        shift_fn_trims += ["I-D-" + str(shift_tau)]
        shift_fn_trims += ["I-E-" + str(shift_tau)]
    for shift_fn_trim in shift_fn_trims:
        
        f,a = plt.subplots()

        # All statistics below pool the middle (dimension) and last (trial) axes,
        # leaving one value per shift beta.
        my_means = np.mean(shift2shift_beta_idxes2dim_idxes2cvgs[shift_fn_trim][:, :, :], axis=(1,2))
        a.plot(shift_betas, my_means, color="lime", linestyle="-", linewidth=linewidth)
        
        my_medians = np.median(shift2shift_beta_idxes2dim_idxes2cvgs[shift_fn_trim][:, :, :], axis=(1,2))
        a.plot(shift_betas, my_medians, color="black", linestyle="-", linewidth=linewidth)
        
        # Quantiles at (nominally) 0.1, 0.2, ..., 0.9.
        my_qs = []
        for q in np.arange(0.1, 1, 0.1):
            my_qs += [np.quantile(shift2shift_beta_idxes2dim_idxes2cvgs[shift_fn_trim][:, :, :], q, axis=(1,2))]
        
        # Shade nested bands by pairing the q_idx-th quantile with its mirror from the
        # end of the list (0.1 with 0.9, 0.2 with 0.8, ...); the translucent fills
        # overlap, so the inner bands are drawn on top of the outer ones.
        for q_idx, q in enumerate(np.arange(0.1, 0.5, 0.1)):
            a.fill_between(shift_betas, my_qs[q_idx], my_qs[len(my_qs)-q_idx-1], color="cornflowerblue", alpha=0.2)

        # Red horizontal reference line at 0.95 (presumably the target coverage level — confirm).
        a.axhline(0.95, c='r', linestyle="-", linewidth=linewidth)
        a.tick_params(axis='both', labelsize=fontsize)

        # Keep the automatic lower y limit, cap the upper limit at 1, and span the full beta grid in x.
        a.set_ylim([a.get_ylim()[0],1])
        a.set_xlim([min(shift_betas), max(shift_betas)])
        a.set_xlabel("a", fontsize=fontsize)
        a.set_title(convert_dataset_to_friendly_dataset_name(dataset), fontsize=fontsize+2)
        
        # File name: <dataset>_coverage_<algorithm>_<family>.pdf, where "D-" / "E-" in the
        # family key become "D_" / "E_" (e.g. "I-D-0.5" -> "I-D_0.5").
        out_fp = out_ff + dataset + "_coverage_" + alg_names_ordered[alg_idx] + "_" + shift_fn_trim.replace("D-", "D_").replace("E-", "E_") + ".pdf"
        f.savefig(out_fp, bbox_inches="tight")

print("All done.")        