# A notebook for running kernel thinning and standard thinning experiments


In [1]:
import numpy as np
import numpy.random as npr
import numpy.linalg as npl
# from scipy.spatial.distance import pdist

from argparse import ArgumentParser
import pickle as pkl
import pathlib
import os
import os.path

# import kernel thinning
from goodpoints import kt # kt.thin is the main thinning function; kt.split and kt.swap are other important functions
from goodpoints.util import isnotebook # Check whether this file is being executed as a script or as a notebook
from goodpoints.util import fprint  # for printing while flushing buffer
from goodpoints.tictoc import tic, toc # for timing blocks of code


# utils for generating samples, evaluating kernels, and mmds
from util_sample import sample, compute_mcmc_params_p, compute_diag_mog_params, sample_string
from util_k_mmd import kernel_eval, squared_mmd, get_combined_mmd_filename

# for partial functions, to use kernel_eval for kernel
from functools import partial

# Notebook-only setup: the IPython magics below are guarded by isnotebook(),
# so they are skipped when this file is executed as a script
if isnotebook():
    # Autoreload packages that are modified
    %load_ext autoreload
    %autoreload 2
    # Render matplotlib figures inline in the notebook
    %matplotlib inline
    # Load the line_profiler IPython extension (used for line-by-line profiling)
    %load_ext line_profiler
    # https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html

In [2]:
# If notebook run as a script, parse command-line arguments
def _str2bool(value):
    """Parses a command-line boolean such as "True", "false", "1" or "no".

    Used as the argparse `type` for boolean options. Passing `type=bool`
    instead would call bool() on the raw string, so every non-empty value
    (including "False" and "0") would be parsed as True.

    Args:
      value: Raw option value (a string, or an already-parsed bool)

    Returns:
      The parsed boolean; an empty string is parsed as False

    Raises:
      argparse.ArgumentTypeError: If value is not a recognized boolean spelling
    """
    # Local import: the notebook's import cell only brings in ArgumentParser
    from argparse import ArgumentTypeError
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


if not isnotebook():
    parser = ArgumentParser()
    parser.add_argument('--rep0', '-r0', type=int, default=0,
                        help="starting experiment id")
    parser.add_argument('--repn', '-rn', type=int, default=1,
                        help="number of experiment replication")
    parser.add_argument('--store_K', '-sk', type=_str2bool, default=False,
                        help="whether to save K matrix, 2-3x faster runtime, but larger memory O(n^2)")
    parser.add_argument('--m', '-m', type=int, default=6,
                        help="number of thinning rounds")
    parser.add_argument('--d', '-d', type=int, default=1,
                        help="dimensions")
    parser.add_argument('--M', '-M', type=int, default=None,
                        help="number of mixture for diag mog in d=2")
    parser.add_argument('--filename', '-f', type=str, default=None,
                       help="name for saved (MCMC) samples")
    parser.add_argument('--combine_mmd', '-cm', type=_str2bool, default=False,
                        help="whether to save combined_mmd results; should be set to True once all experiments are done running")
    # parse_known_args tolerates unrecognized options; they are collected in opt
    args, opt = parser.parse_known_args()
else:
    # In a notebook there is no command line; later cells fall back to defaults
    args = None

## Define kernel thinning experiment

In [3]:
def run_kernel_thinning_experiment(m, params_p, params_k_split, params_k_swap, rep_ids,
                     delta=None, store_K=False,
                      sample_seed=1234567, thin_seed=9876543,
                      compute_mmds = True,
                      rerun=False,
                      verbose=False,
                      results_dir="results_new"):
    """Runs kernel thinning experiment using samples from params_p for repetitions over rep_ids,
    saves coresets to disk and, if requested, saves to disk and returns their mmd evaluations
    
    Args:
      m: Number of halving rounds (number of sample points n = 2^{2m})
      params_p: Dictionary of distribution parameters recognized by sample()
      params_k_split: Dictionary of kernel parameters recognized by kernel_eval()
      params_k_swap: Dictionary of kernel parameters recognized by kernel_eval()
      rep_ids: Which replication numbers of experiment to run; the replication
        number determines the seeds set for reproducibility
      delta: delta/(4^m) is the failure probability for
        adaptive threshold sequence;
      store_K: If False, runs O(nd) space version which does not store kernel
        matrix; if True, stores n x n kernel matrix
      sample_seed: (Optional) random seed is set to sample_seed + rep
        prior to generating input sample for replication rep
      thin_seed: (Optional) random seed is set to thin_seed + rep
        prior to running thinning for replication rep
      compute_mmds: (Optional) Whether to compute mmds of coresets (using params_k_swap)
      rerun: (Optional) If False and results have been previously saved to
        disk, load results from disk instead of rerunning experiment
      verbose: (Optional) If True print time taken in each kt round 
      results_dir: (Optional) Directory in which results should be saved
        
    Returns:
      If compute_mmds is True, a tuple (mmds_p, mmds_sin) of arrays of shape
      (m+1, len(rep_ids)). Entry [j, r_i] is the MMD, under the params_k_swap
      kernel, of the points X[coresets[:2^j]] for the r_i-th replication,
      measured against the params_p target distribution (mmds_p) and against
      the input sample X (mmds_sin). If compute_mmds is False, returns None.
    """
    # Create results directory if necessary
    pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

    split_kernel = partial(kernel_eval, params_k=params_k_split)
    swap_kernel = partial(kernel_eval, params_k=params_k_swap)
    
    # Target and both kernels must agree on the dimension
    d = params_p["d"]
    assert(d==params_k_split["d"])
    assert(d==params_k_swap["d"])
    
    # Construct results filename templates with placeholder for rep value
    sample_str = sample_string(params_p, sample_seed)
    split_kernel_str = "{}_var{:.3f}_seed{}".format(params_k_split["name"], params_k_split["var"], thin_seed)
    swap_kernel_str =  "{}_var{:.3f}".format(params_k_swap["name"], params_k_swap["var"])
    thresh_str = f"delta{delta}"
    file_template = os.path.join(results_dir, f"kt-coresets-{sample_str}-split{split_kernel_str}-swap{swap_kernel_str}-d{d}-m{m}-{thresh_str}-rep{{}}.pkl")
    
    # Create arrays to store MMD evaluations from P, and Sin.
    # The mmd templates are only built (and only formatted inside the loop)
    # when compute_mmds is True, so compute_mmds=False never touches them.
    if compute_mmds:
        mmds_p = np.zeros((m+1, len(rep_ids)))
        mmds_sin = np.zeros((m+1, len(rep_ids)))
        mmd_p_file_template = os.path.join(results_dir, 
                                         f"kt-mmd-{sample_str}-split{split_kernel_str}-swap{swap_kernel_str}-d{d}-m{m}-{thresh_str}-rep{{}}.pkl")
        mmd_sin_file_template = os.path.join(results_dir, 
                                         f"kt-mmd-sin-{sample_str}-split{split_kernel_str}-swap{swap_kernel_str}-d{d}-m{m}-{thresh_str}-rep{{}}.pkl")

    # Number of sample points
    n = int(2**(2*m))
    fprint(f"Running kernel thinning experiment with template {file_template}.....")
    tic()
    for r_i, rep in enumerate(rep_ids):
        # Include replication number in filename
        filename = file_template.format(rep)
        
        # Generate matrix of input sample points
        X = sample(n, params_p, seed=sample_seed+rep)

        if not rerun and os.path.exists(filename):
            # Reuse previously saved coresets.
            # NOTE: pickle.load executes arbitrary code; only point results_dir
            # at files written by this experiment.
            with open(filename, 'rb') as file:
                coresets = pkl.load(file)
        else:
            # Obtain sequence of thinned coresets
            print(f"Kernel Thinning rep {rep}...", flush=True)
            coresets = kt.thin(X, m, split_kernel, swap_kernel, delta=delta, seed=thin_seed+rep, store_K=store_K, verbose=verbose)

            # Save coresets to disk
            with open(filename, 'wb') as file:
                pkl.dump(coresets, file, protocol=pkl.HIGHEST_PROTOCOL)
            
        if not compute_mmds:
            continue

        mmd_p_filename = mmd_p_file_template.format(rep)
        mmd_sin_filename = mmd_sin_file_template.format(rep)

        # Evaluate coreset MMD to the target P
        if not rerun and os.path.exists(mmd_p_filename):
            # Reuse previously saved results
            with open(mmd_p_filename, 'rb') as file:
                mmds_p[:, r_i] = pkl.load(file)
        else:
            for j in range(m+1):
                nj = int(2**j)
                mmds_p[j, r_i] = np.sqrt(
                    squared_mmd(params_k_swap, params_p, X[coresets[:nj]]))
            # Save MMD results to disk
            with open(mmd_p_filename, 'wb') as file:
                pkl.dump(mmds_p[:, r_i], file, protocol=pkl.HIGHEST_PROTOCOL)
                
        # Evaluate coreset MMD to the input sample Sin
        if not rerun and os.path.exists(mmd_sin_filename):
            # Reuse previously saved results
            with open(mmd_sin_filename, 'rb') as file:
                mmds_sin[:, r_i] = pkl.load(file)
        else:
            # redefining target p as distribution on Sin
            params_p_sin = dict()
            params_p_sin["name"] =  params_p["name"]+ "_sin"
            params_p_sin["Pnmax"] = X
            params_p_sin["d"] = d
            for j in range(m+1):
                nj = int(2**j)
                mmds_sin[j, r_i] = np.sqrt(squared_mmd(params_k_swap, params_p_sin, X[coresets[:nj]]))
            # Save MMD results to disk
            with open(mmd_sin_filename, 'wb') as file:
                pkl.dump(mmds_sin[:, r_i], file, protocol=pkl.HIGHEST_PROTOCOL)
    toc()
    if compute_mmds:
        return(mmds_p, mmds_sin)
    return None

## Define standard thinning experiment

In [4]:
def _standard_thinning_mmds(X, m, params_k_mmd, params_target, min_mmd):
    """Evaluates the MMD to params_target of standard-thinned prefixes of X
    
    For each j = 0, ..., m the first 4^j points of X are thinned to 2^j points
    by keeping every 2^j-th point.
    
    Args:
      X: Input sample points, indexed along the first axis
      m: Largest coreset exponent; coreset sizes 2^0, ..., 2^m are evaluated
      params_k_mmd: Dictionary of kernel parameters for MMD evaluation
      params_target: Dictionary describing the target passed to squared_mmd()
      min_mmd: If True, take the minimum MMD over all 2^j starting offsets;
        if False, use only the sequence starting at index 2^j - 1
    
    Returns:
      Array of length m+1 whose j-th entry is the MMD for coreset size 2^j
    """
    mmds = np.zeros(m+1)
    for j in range(m+1):
        # Target coreset size
        coreset_size = int(2**j)
        input_size = int(coreset_size**2)
        step_size = coreset_size
        # Consider each coreset obtained by choosing every nj-th point
        # of the first nj^2 points of X and select the one with smallest MMD
        # There are nj^2/nj = nj such coresets indexed by their starting point
        num_starts = coreset_size if min_mmd else 1
        mmds[j] = np.inf
        for start in range(num_starts):
            thinned = X[(step_size-1-start):input_size:step_size]
            mmds[j] = min(mmds[j], np.sqrt(squared_mmd(params_k_mmd, params_target, thinned)))
    return mmds


def run_standard_thinning_experiment(m, params_p, params_k_mmd, rep_ids, sample_seed=1234567, 
                      rerun=False, results_dir="results_new", compute_mmds=True,
                      min_mmd=False):
    """Evaluates MMD of iid Monte Carlo draws, and saves it to disk 
    
    Args:
      m: Number of halving rounds (defines number of sample points via n = 2^{2m})
      params_p: Dictionary of distribution parameters recognized by sample()
      params_k_mmd: Dictionary of kernel parameters for MMD evaluation
      rep_ids: Which replication numbers of experiment to run; the replication
        number determines the seeds set for reproducibility
      sample_seed: (Optional) random seed is set to sample_seed + rep
        prior to generating input sample for replication rep
      rerun: (Optional) If False and results have been previously saved to
        disk, load results from disk instead of rerunning experiment
      results_dir: (Optional) Directory in which results should be saved
      compute_mmds: (Optional) Accepted for parity with
        run_kernel_thinning_experiment; NOTE: currently not read by this
        function, mmds are always computed (or loaded) and returned
      min_mmd: (Optional) if True, returns the minimum MMD over all sqrt(n) thinned 
        sequences of n points with step size sqrt(n); if False, returns the MMD
        of the first such thinned sequence
    
    Returns:
      Tuple (mmds_p, mmds_sin) of arrays of shape (m+1, len(rep_ids)); entry
      [j, r_i] is the MMD of the size-2^j standard-thinned coreset for the
      r_i-th replication, measured against the params_p target (mmds_p) and
      against the full input sample X (mmds_sin)
    """
    
    # Create results directory if necessary
    pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

    # Create array to store MMD evaluations
    mmds_p = np.zeros((m+1, len(rep_ids)))
    mmds_sin = np.zeros((m+1, len(rep_ids)))

    # Construct results filename template with placeholder for rep value
    d = params_p["d"]
    assert(d == params_k_mmd["d"])
    sample_str = sample_string(params_p, sample_seed)
    kernel_str = "{}_var{:.3f}".format(params_k_mmd["name"], params_k_mmd["var"])
    min_str = "min_" if min_mmd else ""
    mmd_p_file_template = os.path.join(results_dir, f"{min_str}mc-mmd-{sample_str}-{kernel_str}-d{d}-m{m}-rep{{}}.pkl")
    mmd_sin_file_template = os.path.join(results_dir, f"{min_str}mc-sin-mmd-{sample_str}-{kernel_str}-d{d}-m{m}-rep{{}}.pkl")
    
    # Number of sample points
    n = int(2**(2*m))
    
    fprint(f"Running standard thinning experiment for m={m}")
    tic()
    for r_i, rep in enumerate(rep_ids):
        # Include replication number in filename
        mmd_p_filename = mmd_p_file_template.format(rep)
        mmd_sin_filename = mmd_sin_file_template.format(rep)
        
        # Input sample for this replication; drawn lazily and at most once,
        # since both evaluations below use the same seed (assumes sample() is
        # deterministic given its seed)
        X = None
        
        if not rerun and os.path.exists(mmd_p_filename):
            # Reuse previously saved results.
            # NOTE: pickle.load executes arbitrary code; only point results_dir
            # at files written by this experiment.
            with open(mmd_p_filename, 'rb') as file:
                mmds_p[:, r_i] = pkl.load(file)
        else:
            X = sample(n, params_p, seed=sample_seed+rep)
            mmds_p[:, r_i] = _standard_thinning_mmds(X, m, params_k_mmd, params_p, min_mmd)
            # Save MMD results to disk
            with open(mmd_p_filename, 'wb') as file:
                pkl.dump(mmds_p[:, r_i], file, protocol=pkl.HIGHEST_PROTOCOL)
            
        if not rerun and os.path.exists(mmd_sin_filename):
            # Reuse previously saved results
            with open(mmd_sin_filename, 'rb') as file:
                mmds_sin[:, r_i] = pkl.load(file)
        else:
            if X is None:
                X = sample(n, params_p, seed=sample_seed+rep)
            # redefining target p as distribution on Sin
            params_p_sin = dict()
            params_p_sin["name"] =  params_p["name"]+"_sin"
            params_p_sin["Pnmax"] = X
            params_p_sin["d"] = d
            mmds_sin[:, r_i] = _standard_thinning_mmds(X, m, params_k_mmd, params_p_sin, min_mmd)
            # Save MMD results to disk
            with open(mmd_sin_filename, 'wb') as file:
                pkl.dump(mmds_sin[:, r_i], file, protocol=pkl.HIGHEST_PROTOCOL)
    toc()
    return(mmds_p, mmds_sin)

# Deploy thinning experiments

In [5]:
#
# Choose sample and kernel parameters
#
var = 1.  # Variance

if args is None:
    # Notebook defaults
    d = 2
    filename = None  # name of the saved MCMC sample file, if any
    M = None         # number of mixture components for the MOG setting
else:
    # Script mode: take the settings from the command line
    d = args.d
    filename = args.filename
    M = args.M

# Default target: Gaussian in d dimensions
params_p = {"name": "gauss", "var": var, "d": int(d), "saved_samples": False}

if isnotebook():
    # As written this re-assigns the value chosen above; candidate MCMC filenames are
    # ['Goodwin_RW', 'Goodwin_ADA-RW', 'Goodwin_MALA', 'Goodwin_PRECOND-MALA', 'Lotka_RW', 'Lotka_ADA-RW', 'Lotka_MALA', 'Lotka_PRECOND-MALA']
    filename = None if args is None else args.filename

if filename is not None:
    # A filename was specified: the target is given by saved MCMC samples
    d = 4
    params_p = compute_mcmc_params_p(filename, nmax=int(2**15), include_last=True)
    # whether to use median_distance for kernel bandwidth for MCMC settings
    use_median_distance = True

    if use_median_distance:
        var = params_p["med_dist"] ** 2

if M is not None:
    # A number of mixture components was specified: the target is a diag MOG
    d = 2
    params_p = compute_diag_mog_params(M)

# The split kernel uses half the variance of the swap kernel
params_k_swap = {"name": "gauss", "var": var, "d": int(d)}
params_k_split = {"name": "gauss_rt", "var": var/2., "d": int(d)}

In [6]:
#
# Choose experiment parameters
#

if args is None:
    # Notebook defaults
    rep_ids = range(2)    # replicate ID numbers
    ms = range(6)         # halving round numbers m to evaluate
    store_K = False
else:
    # Script mode: replicate IDs rep0, ..., rep0+repn-1 and rounds 0, ..., m-1
    rep_ids = np.arange(args.rep0, args.rep0 + args.repn)
    ms = range(args.m)
    store_K = args.store_K

# store_K: storing the kernel matrix during thinning saves computation but
# requires O(n^2) memory (an issue with larger n); if False, requires O(nd) memory

# Failure probability
delta = 0.5

# Which methods to run?
run_standard_thinning = False
run_kernel_thinning = True

rerun = True
verbose = False  # time updates only for m>=7

In [7]:
# Result arrays: one row per halving round m (indexed by m itself) and one
# column per replication; only allocated for the methods that are enabled
if run_standard_thinning:
    mmds_st = np.zeros((max(ms)+1, len(rep_ids)))  # mmds from P
    mmds_st_sin = np.zeros_like(mmds_st)           # mmds from Sin
if run_kernel_thinning:
    mmds_kt = np.zeros((max(ms)+1, len(rep_ids)))  # mmds from P
    mmds_kt_sin = np.zeros_like(mmds_kt)           # mmds from Sin

print("Exp setting:", params_p["name"], ms)
for m in ms:
    # Run each enabled experiment with m halving rounds and keep only row m
    # of its output, i.e. the quality of the 2^m thinned coreset
    if run_standard_thinning:
        mmd_st, mmd_st_sin = run_standard_thinning_experiment(
            m, params_p, params_k_swap, rep_ids, rerun=rerun)
        mmds_st[m] = mmd_st[m]
        mmds_st_sin[m] = mmd_st_sin[m]

    if run_kernel_thinning:
        mmd_kt, mmd_kt_sin = run_kernel_thinning_experiment(
            m, params_p, params_k_split, params_k_swap, rep_ids,
            delta=delta, store_K=store_K, rerun=rerun, verbose=verbose)
        mmds_kt[m] = mmd_kt[m]
        mmds_kt_sin[m] = mmd_kt_sin[m]


Exp setting: gauss range(0, 6)
Running kernel thinning experiment with template results_new/kt-coresets-gauss_var1.0_seed1234567-splitgauss_rt_var0.500_seed9876543-swapgauss_var1.000-d2-m0-delta0.5-rep{}.pkl.....
Kernel Thinning rep 0...
Kernel Thinning rep 1...
-elapsed time: 0.00602 (s)
Running kernel thinning experiment with template results_new/kt-coresets-gauss_var1.0_seed1234567-splitgauss_rt_var0.500_seed9876543-swapgauss_var1.000-d2-m1-delta0.5-rep{}.pkl.....
Kernel Thinning rep 0...
Kernel Thinning rep 1...
-elapsed time: 0.00888 (s)
Running kernel thinning experiment with template results_new/kt-coresets-gauss_var1.0_seed1234567-splitgauss_rt_var0.500_seed9876543-swapgauss_var1.000-d2-m2-delta0.5-rep{}.pkl.....
Kernel Thinning rep 0...
Kernel Thinning rep 1...
-elapsed time: 0.0229 (s)
Running kernel thinning experiment with template results_new/kt-coresets-gauss_var1.0_seed1234567-splitgauss_rt_var0.500_seed9876543-swapgauss_var1.000-d2-m3-delta0.5-rep{}.pkl.....
Kernel Thin

# Save MMD Results

In [8]:
#
# Save all mmd settings
#
save_combined_mmd = False if args is None else args.combine_mmd

if save_combined_mmd:
    # Each entry: (filename prefix, label used in the log message, array to save).
    # The list is built per method because the mmds_* arrays only exist for
    # the methods that were run.
    combined_results = []
    if run_standard_thinning:
        combined_results.append(("mc", "mc mmd", mmds_st))
        combined_results.append(("mc-sin", "mc mmd_sin", mmds_st_sin))
    if run_kernel_thinning:
        combined_results.append(("kt", "kt mmd", mmds_kt))
        combined_results.append(("kt-sin", "kt mmd_sin", mmds_kt_sin))

    for prefix, label, combined_mmds in combined_results:
        filename = get_combined_mmd_filename(prefix, ms, params_p, params_k_split, params_k_swap, rep_ids, delta)
        with open(filename, 'wb') as file:
            print(f"Saving combined {label} to {filename}")
            pkl.dump(combined_mmds, file, protocol=pkl.HIGHEST_PROTOCOL)