# Notebook that runs thinning experiments as separate jobs, either locally or on a Slurm cluster

## 1. Specify whether jobs will be run on a Slurm cluster or locally

In [None]:
# True: submit every experiment as a Slurm job (slurmpy is imported below only in this case);
# False: launch every experiment as a local subprocess
deploy_slurm = True

import subprocess # to use subprocess 
import pathlib
import os
import os.path
import pickle as pkl
from datetime import datetime
if deploy_slurm:
    from slurmpy import Slurm
import numpy as np
from goodpoints.util import isnotebook # Check whether this file is being executed as a script or as a notebook
if isnotebook():
    %load_ext autoreload
    %autoreload 2

## 2. Functions for generating python commands, and deploying jobs

In [None]:
def get_python_command(d, m, sz, rep0, repn, rerun, compute_mmd, recompute_mmd, log_folder,
                       symm1=1, rh2 = 0,
                       alg = "kt", compress_alg=None, n_compress=None, g=None,
                       setting = "gauss", M=None, filename=None,
                       results_folder = None, mcmc_folder = None,
                       timing_experiment=False,
                       ):
    '''
    Build the command line for one experiment together with its log file names.

    Side effect: creates results_folder (and missing parents) if it does not exist.

    Args:
        d: (int) dimension of the problem (only passed on when setting is not "mcmc")
        m: (int) thinning factor in log_2 base (useful typically only for kt.thin)
        sz: (int) the size of input data in log_4 base
        rep0: (int) starting rep id
        repn: (int) number of reps
        rerun: (int) whether to rerun experiments when set to anything but 0, else 0
        compute_mmd: (int) whether to compute mmd when set to anything but 0, else 0
        recompute_mmd: (int) whether to recompute mmd (i.e., ignore pkl file if mmd already exists)
            when set to anything but 0, else 0
        log_folder: (str) folder for saving out/err files (useful only when deploying jobs via
            terminal, and not for slurm)
        symm1: (int) whether to use symmetrize in stage 1 of compress++ when set to anything but 0, else 0
        rh2: currently unused; kept for backward compatibility
        alg: (str) name of the main thinning algorithm; selects the script construct_{alg}_coresets.py
        compress_alg: (str) name of the thinning algorithm used inside compress; required when
            alg == "compresspp"
        n_compress: currently unused; kept for backward compatibility
        g: (int) the oversampling factor "g" for compress++ experiments; required when
            alg == "compresspp"
        setting: (str) target P; take values in gauss/mog/mcmc
        M: (int) number of mog components; required when setting == "mog"
        filename: (str) name of mcmc file; required when setting == "mcmc"
        results_folder: (str) location for saving results; required
        mcmc_folder : (str) location from where to load the mcmc data; required when setting == "mcmc"
        timing_experiment: (bool) whether this experiment is about computing runtime only, in which
            case run_time.py is called instead of construct_{alg}_coresets.py

    Returns:
        (exp_name, out_file, err_file): the command as a list of strings, and the paths (inside
        log_folder) of the files meant to receive std_out and std_err

    NOTE(review): for timing experiments alg enters the command only through the
    compresspp-specific flags; confirm that run_time.py does not need the algorithm name.
    '''
    assert results_folder is not None, "results_folder must be provided"
    pathlib.Path(results_folder).mkdir(parents=True, exist_ok=True)

    script = 'run_time.py' if timing_experiment else f'construct_{alg}_coresets.py'
    exp_name = ['python3', script]

    exp_name.extend(['--m', f'{m}', '--size', f'{sz}', '--setting', f'{setting}', '--rep0', f'{rep0}',
                     '--repn', f'{repn}', '--rerun', str(rerun), '--computemmd', str(compute_mmd),
                     '--recomputemmd', str(recompute_mmd), '--symm1', str(symm1),
                     '--resultsfolder', f'{results_folder}'
                    ])

    if alg == "compresspp":
        assert compress_alg is not None, "compress_alg is required for compresspp"
        exp_name.extend(['--compressalg', f'{compress_alg}'])
        assert g is not None, "g is required for compresspp"
        exp_name.extend(['--g', f'{g}'])

    if setting == "mog":
        # without this check the literal string "None" would be passed as --M
        assert M is not None, "M is required for the mog setting"
        exp_name.extend(['--M', f'{M}'])

    if setting != "mcmc":
        exp_name.extend(['--d', f'{d}'])  # add dimension for non mcmc settings
    else:
        assert filename is not None, "filename is required for the mcmc setting"
        exp_name.extend(['--filename', f'{filename}'])
        assert mcmc_folder is not None, "mcmc_folder is required for the mcmc setting"
        exp_name.extend(['--mcmcfolder', f'{mcmc_folder}'])

    # the log file name is the concatenation of the script name and all its arguments
    log_file = ''.join(exp_name[1:])
    # removing redundant characters to save space
    log_file = log_file.replace("--", "-")
    for s in ["construct_", "_coresets", ".py"]:
        log_file = log_file.replace(s, "")
    # folder arguments (results_folder, mcmc_folder) may contain path separators; left in place
    # they would make the log files point into non-existent sub-directories of log_folder
    for sep in filter(None, (os.sep, os.altsep)):
        log_file = log_file.replace(sep, "_")
    # adding time stamp (hour_minute)
    suffix = datetime.now().strftime('%H_%M')
    out_file = os.path.join(log_folder, log_file + suffix + ".out")
    err_file = os.path.join(log_folder, log_file + suffix + ".err")

    return(exp_name, out_file, err_file)

def deploy_terminal_run(exp_name, out, err):
    '''
    launch a job as a background python subprocess, redirecting its output to files

    The subprocess is started but not waited for; nothing is returned.

    Args:
        exp_name: (list of str) command for the job, as accepted by subprocess.Popen
        out: (str) filename to save std_out (overwritten if it exists)
        err: (str) filename to save std_err (overwritten if it exists)
    '''
    with open(out, "wb") as stdout_file, open(err, "wb") as stderr_file:
        # the child keeps its own copies of the file handles, so closing ours
        # right after the launch does not cut off its output
        subprocess.Popen(exp_name, stdout=stdout_file, stderr=stderr_file)
    
def deploy_slurm_run(exp_name, partition, prefix):
    '''
    deploy a python job as a slurm job

    Args:
        exp_name: (list of str) command for the job; the arguments are shell-quoted and joined
            into a single command line
        partition: (str) partition name on the cluster
        prefix: (str) prefix for the slurm job that is displayed in squeue

    Returns:
        the Slurm object the job was submitted through
    '''
    # local import so that this function does not rely on a file-level import
    import shlex

    slurm_options = {"partition": partition}
    if partition != "jsteinhardt":
        # every partition except "jsteinhardt" additionally gets the option c=1
        # NOTE(review): presumably one CPU per task; the reason "jsteinhardt" is exempt is not visible here
        slurm_options["c"] = 1
    s = Slurm(prefix, slurm_options)

    # quote each argument: the command runs through a shell, so an unquoted path
    # containing spaces or shell metacharacters would be split or interpreted
    command = " ".join(shlex.quote(str(arg)) for arg in exp_name)
    s.run('module load python; ' + command)
    return(s)

def deploy_experiment(deploy_slurm, ids, exp_name, out, err, partition, prefix, debug=False):
    '''
    deploy one experiment (via slurm or terminal) and record it in ids

    What gets appended to ids depends on the mode:
        debug       -> the command joined into a single string (nothing is deployed)
        slurm       -> the object returned by deploy_slurm_run
        terminal    -> the command list exp_name

    Args:
        deploy_slurm: (bool) if True deploy a slurm job, else a terminal job
        ids: (list) list of deployed experiments; modified in place
        exp_name: (list of str) command for the job
        out: (str) filename to save std_out when deploying a terminal job
        err: (str) filename to save std_err when deploying a terminal job
        partition: (str) partition name on the cluster when deploying a slurm job
        prefix: (str) prefix for the slurm job that is displayed in squeue when deploying a slurm job
        debug: (bool) do not deploy when true, else deploy
    '''
    if debug:
        ids.append(" ".join(exp_name))
    elif deploy_slurm:
        ids.append(deploy_slurm_run(exp_name=exp_name, partition=partition, prefix=prefix))
    else:
        deploy_terminal_run(exp_name, out, err)
        ids.append(exp_name)

In [None]:
def print_exp_setting(ds, total_reps, reps_per_job, alg, target, Ms=None, filenames=None, gs=None, ms=None):
    '''
    print a one-line summary of the experiment sweep that is about to be deployed

    The summary mentions the mcmc settings when filenames is given, otherwise the
    mog components when Ms is given, otherwise only the dimensions.

    Args:
        ds: (list) the dimensionality of experiments
        total_reps: (int) the number of reps
        reps_per_job: (int) number of reps for each job
        alg: (str) the algorithm
        target: (str) target P
        Ms: (list) range of M for Mog target
        filenames: (list) list of mcmc settings
        gs: (list) list of oversampling factors for Compress++
        ms : (list) sizes (useful for MCMC experiments where these differ file by file)
    '''
    # only the middle part of the message depends on the kind of target
    if filenames is not None:
        scope = f'settings = {filenames} in d = {ds}'
    elif Ms is not None:
        scope = f'M = {Ms} in d = {ds}'
    else:
        scope = f'd = {ds}'

    print(f'Running {alg} experiments, gs={gs}, for {target} target for {scope}, '
          f'm = {ms}, total_reps = {total_reps}, with {reps_per_job} reps per python call')


In [None]:
partitions = ["high", "yugroup", "jsteinhardt", "low"] # Slurm partition names; the cells below pick one by index
results_folder = "coresets_folder" # passed to the experiment scripts as --resultsfolder; created by get_python_command if missing

## 4. Run Gauss Experiments

In [None]:
# coresets to generate (one sweep of jobs is deployed per flag set to True)
st_coresets = False
kt_coresets = False
herding_coresets = False

# Compress++ variants, named after the thinning algorithm passed as --compressalg
cpp_kt_coresets = True
cpp_herding_coresets = False

### ASSUMING SIZE = 4^m; and final output = sqrt(SIZE) = 2^m #####
ms = range(9, 10+1) # range of m (the "+1" makes the upper end inclusive)
gs = [0, 4] # oversampling factors; Compress++ runs with g > m are skipped

# define repetition
total_reps = 10 # set this to the max number of repetitions
reps_per_job = 1 # number of reps per python call

symm1 = 1 # whether to use symmetrized halving in compress (0 = no, anything else = yes)

compute_mmd = 1 # whether to compute mmd (0 = no, anything else = yes)

### whether to regenerate coresets / recompute MMD -- IMPORTANT: these are two DIFFERENT flags ####
### to recompute mmds (for some reason), the recompute_mmd flag has to be set explicitly ##
rerun = 0 # regenerate coresets?
recompute_mmd = 0 # recompute mmd? works ONLY if compute_mmd is true in the first place

### All experiments are run with Gauss(sigma) as k and Gauss(sigma/sqrt(2)) as krt ###
run_gauss_experiments = True # run experiments with Gauss P
ds = [2, 4, 10, 100] # dimensions to sweep over

In [None]:
debug = False # False: actually deploy the experiments; True: only collect the python commands that would be deployed
partition = partitions[2] # Slurm partition, chosen by index from "high", "yugroup", "jsteinhardt", "low"

In [None]:
# Deploy the Gauss-target sweep: one job per (d, m, batch of reps), and for
# Compress++ additionally one per oversampling factor g.
gauss_ids = []  # filled by deploy_experiment with one entry per deployed job
if run_gauss_experiments:
    # logs are grouped by calendar day
    log_folder = f"logs/{datetime.now().strftime('%b_%d_%Y')}"
    pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True)

    target = "gauss"

    # run st/kt/herding experiments
    for flag, alg in ((st_coresets, "st"), (kt_coresets, "kt"), (herding_coresets, "herding")):
        if not flag:
            continue
        # pass ms as well, so that the summary reports the swept sizes instead of "m = None"
        print_exp_setting(ds, total_reps, reps_per_job, alg=alg, target=target, ms=ms)
        for d in ds:
            for m in ms:
                for i in range(0, total_reps, reps_per_job):
                    # experiment command, and filenames
                    exp_name, out, err = get_python_command(
                        d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                        rerun=rerun, compute_mmd=compute_mmd,
                        recompute_mmd=recompute_mmd, log_folder=log_folder,
                        alg=alg, compress_alg=None, n_compress=None, g=None,
                        setting=target, symm1=symm1, results_folder=results_folder)
                    # short job name shown in squeue
                    prefix = f"{alg[:1]}{target[:1]}d{d}n{m}r{i}"
                    deploy_experiment(deploy_slurm, gauss_ids, exp_name, out, err,
                                      partition, prefix=prefix, debug=debug)

    # compress++ experiments
    for flag, compress_alg in ((cpp_kt_coresets, "kt"), (cpp_herding_coresets, "herding")):
        if not flag:
            continue
        alg = "compresspp"
        print_exp_setting(ds, total_reps, reps_per_job, alg=alg+f"({compress_alg})",
                          target=target, gs=gs, ms=ms)
        for d in ds:
            for m in ms:
                for g in gs:
                    if g > m:
                        continue  # oversampling factor too large for this size; skip
                    for i in range(0, total_reps, reps_per_job):
                        # experiment command, and filenames
                        exp_name, out, err = get_python_command(
                            d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                            rerun=rerun, compute_mmd=compute_mmd,
                            recompute_mmd=recompute_mmd, log_folder=log_folder,
                            alg=alg, compress_alg=compress_alg, n_compress=None, g=g,
                            setting=target, symm1=symm1, results_folder=results_folder)
                        prefix = f"{alg[:1]}{g}{compress_alg[:1]}{target[:1]}d{d}n{m}r{i}"
                        deploy_experiment(deploy_slurm, gauss_ids, exp_name, out, err,
                                          partition, prefix=prefix, debug=debug)

    print(f'{partition} partition; Number of processes/jobs:{len(gauss_ids)} slurm:{deploy_slurm}, terminal: {not deploy_slurm}')

## 5. Run MOG experiments

In [None]:
### All experiments are run with Gauss(sigma) as k and Gauss(sigma/sqrt(2)) as krt ###
run_mog_experiments = True # run experiments with MoG P
Ms = [4, 6, 8] # numbers of MoG components to sweep over (passed as --M); supported values: 3, 4, 6, 8

In [None]:
# coresets to generate (one sweep of jobs is deployed per flag set to True)
st_coresets = False
kt_coresets = False
herding_coresets = False

# Compress++ variants, named after the thinning algorithm passed as --compressalg
cpp_kt_coresets = True
cpp_herding_coresets = False

### ASSUMING SIZE = 4^m; and final output = sqrt(SIZE) = 2^m #####
ms = range(9, 10+1) # range of m (the "+1" makes the upper end inclusive)
gs = [0, 4] # oversampling factors; Compress++ runs with g > m are skipped

# define repetition
total_reps = 10 # set this to the max number of repetitions
reps_per_job = 1 # number of reps per python call

symm1 = 1 # whether to use symmetrized halving in compress (0 = no, anything else = yes)

compute_mmd = 1 # whether to compute mmd (0 = no, anything else = yes)

### whether to regenerate coresets / recompute MMD -- IMPORTANT: these are two DIFFERENT flags ####
### to recompute mmds (for some reason), the recompute_mmd flag has to be set explicitly ##
rerun = 0 # regenerate coresets?
recompute_mmd = 0 # recompute mmd? works ONLY if compute_mmd is true in the first place

In [None]:
debug = False # False: actually deploy the experiments; True: only collect the python commands
partition = partitions[1]  # Slurm partition, chosen by index from "high", "yugroup", "jsteinhardt", "low"
mog_ids = [] # filled by deploy_experiment with one entry per deployed MoG job

In [None]:
# Deploy the MoG-target sweep: one job per (M, m, batch of reps), and for
# Compress++ additionally one per oversampling factor g.
if run_mog_experiments:
    # logs are grouped by calendar day
    log_folder = f"logs/{datetime.now().strftime('%b_%d_%Y')}"
    pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True)

    d = 2  # all MoG experiments use a single fixed dimension
    target = "mog"

    # NOTE(review): the job-name prefixes below use target[1] ("o"), whereas the Gauss
    # cells use target[:1]; confirm whether "m" was intended.

    # run st/kt/herding experiments
    for flag, alg in ((st_coresets, "st"), (kt_coresets, "kt"), (herding_coresets, "herding")):
        if not flag:
            continue
        print_exp_setting(d, total_reps, reps_per_job, alg=alg, target=target, Ms=Ms, ms=ms)
        for M in Ms:
            for m in ms:
                for i in range(0, total_reps, reps_per_job):
                    # experiment command, and filenames
                    exp_name, out, err = get_python_command(
                        d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                        rerun=rerun, compute_mmd=compute_mmd,
                        recompute_mmd=recompute_mmd, log_folder=log_folder,
                        alg=alg, compress_alg=None, n_compress=None, g=None,
                        setting=target, M=M, symm1=symm1, results_folder=results_folder)
                    # short job name shown in squeue
                    prefix = f"{alg[:1]}{target[1]}M{M}n{m}r{i}"
                    deploy_experiment(deploy_slurm, mog_ids, exp_name, out, err,
                                      partition, prefix=prefix, debug=debug)

    # compress++ experiments
    for flag, compress_alg in ((cpp_kt_coresets, "kt"), (cpp_herding_coresets, "herding")):
        if not flag:
            continue
        alg = "compresspp"
        print_exp_setting(d, total_reps, reps_per_job, alg=alg+f"({compress_alg})",
                          target=target, Ms=Ms, gs=gs, ms=ms)
        for M in Ms:
            for m in ms:
                for g in gs:
                    if g > m:
                        continue  # oversampling factor too large for this size; skip
                    for i in range(0, total_reps, reps_per_job):
                        # experiment command, and filenames
                        exp_name, out, err = get_python_command(
                            d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                            rerun=rerun, compute_mmd=compute_mmd,
                            recompute_mmd=recompute_mmd, log_folder=log_folder,
                            alg=alg, compress_alg=compress_alg, n_compress=None,
                            g=g, setting=target, M=M, symm1=symm1, results_folder=results_folder)
                        prefix = f"{alg[:1]}{g}{compress_alg[:1]}{target[1]}M{M}n{m}r{i}"
                        deploy_experiment(deploy_slurm, mog_ids, exp_name, out, err,
                                          partition, prefix=prefix, debug=debug)

    print(f'{partition} partition; Number of processes/jobs:{len(mog_ids)} slurm:{deploy_slurm}, terminal: {not deploy_slurm}')

## 6. Run MCMC experiments (we set parameters again)

In [None]:
# Which coresets to generate for the MCMC targets
# (one sweep of jobs is deployed per flag set to True)
st_coresets = False
kt_coresets = True
herding_coresets = False

# Compress++ variants, named after the thinning algorithm passed as --compressalg
cpp_kt_coresets = False
cpp_herding_coresets = False

### ASSUMING SIZE = 4^m; and final output = sqrt(SIZE) = 2^m #####
# NOTE: the MCMC deployment cell iterates over the per-file ms_ranges rather than over ms
ms = range(4, 8 + 1)  # range of m
gs = [0, 4]  # oversampling factors; Compress++ runs with g > m are skipped

# repetitions
total_reps = 10  # set this to the max number of repetitions
reps_per_job = 1  # number of reps per python call

symm1 = 1  # whether to use symmetrized halving in compress

compute_mmd = 1  # whether to compute mmd

### regenerating coresets and recomputing MMD are controlled by two DIFFERENT flags ###
rerun = 0  # regenerate coresets?
recompute_mmd = 0  # recompute mmd? works ONLY if compute_mmd is true in the first place

run_mcmc_experiments = True  # run experiments with MCMC P

# all available MCMC settings, grouped by family
all_mcmc_filenames = [
    # Goodwin
    'Goodwin_RW',
    'Goodwin_ADA-RW',
    'Goodwin_MALA',
    'Goodwin_PRECOND-MALA',
    # Lotka
    'Lotka_RW',
    'Lotka_ADA-RW',
    'Lotka_MALA',
    'Lotka_PRECOND-MALA',
    # Hinch
    'Hinch_P_seed_1_temp_1_scaled_nosplit',
    'Hinch_P_seed_2_temp_1_scaled_nosplit',
    'Hinch_TP_seed_1_temp_8_scaled_nosplit',
    'Hinch_TP_seed_2_temp_8_scaled_nosplit',
]

In [None]:
# folder holding the MCMC data, passed to the experiment scripts as --mcmcfolder
# (absolute, machine-specific path: adapt it before running elsewhere)
mcmc_folder = "/accounts/projects/binyu/raaz.rsk/kernel_thinning/kernel_thinning_plus/data"

In [None]:
# subset of all_mcmc_filenames to run; slice(0, 12) selects all twelve settings
mcmc_file_range = slice(0, 12)
mcmc_filenames = all_mcmc_filenames[mcmc_file_range]
print(mcmc_filenames)

In [None]:
### ASSUMING SIZE = 4^m; and final output = sqrt(SIZE) = 2^m #####
# allowed input sizes are 4^4 to 4^8 for all Hinch experiments and Lotka_ADA-RW,
# and 4^4 to 4^9 for all other experiments

min_m = 4  # lower cap on m, applied on top of the allowed range
max_m = 8  # upper cap on m, applied on top of the allowed range
# each range below runs from max(4, min_m) to min(largest allowed m, max_m), both inclusive

ms_ranges = {}
for filename in all_mcmc_filenames:
    ms_ranges[filename] = range(
        max(4, min_m),
        min(8 if ('Hinch' in filename or filename == 'Lotka_ADA-RW') else 9, max_m) + 1,
    )

In [None]:
debug = False # False: actually deploy the experiments; True: only collect the python commands
partition = partitions[2] # Slurm partition, chosen by index from "high", "yugroup", "jsteinhardt", "low"

In [None]:
# Deploy the MCMC-target sweep: one job per (mcmc file, m, batch of reps), and for
# Compress++ additionally one per oversampling factor g. The sizes m come from the
# per-file ms_ranges.
mcmc_ids = []  # filled by deploy_experiment with one entry per deployed job
if run_mcmc_experiments:
    # logs are grouped by calendar day
    log_folder = f"logs/{datetime.now().strftime('%b_%d_%Y')}"
    pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True)

    d = 0  # placeholder: get_python_command does not pass --d for the mcmc setting
    target = "mcmc"

    # run st/kt/herding experiments
    for flag, alg in ((st_coresets, "st"), (kt_coresets, "kt"), (herding_coresets, "herding")):
        if not flag:
            continue
        print_exp_setting(d, total_reps, reps_per_job, alg=alg, target=target,
                          Ms=None, filenames=mcmc_filenames, ms=ms_ranges)
        for filename in mcmc_filenames:
            for m in ms_ranges[filename]:
                for i in range(0, total_reps, reps_per_job):
                    # experiment command, and filenames
                    exp_name, out, err = get_python_command(
                        d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                        rerun=rerun, compute_mmd=compute_mmd,
                        recompute_mmd=recompute_mmd, log_folder=log_folder,
                        alg=alg, compress_alg=None, n_compress=None, g=None,
                        setting=target, filename=filename, symm1=symm1,
                        results_folder=results_folder, mcmc_folder=mcmc_folder)
                    # short job name shown in squeue
                    prefix = f"{alg[:1]}{filename[:2]}d{d}n{m}r{i}"
                    deploy_experiment(deploy_slurm, mcmc_ids, exp_name, out, err,
                                      partition, prefix=prefix, debug=debug)

    # compress++ experiments
    for flag, compress_alg in ((cpp_kt_coresets, "kt"), (cpp_herding_coresets, "herding")):
        if not flag:
            continue
        alg = "compresspp"
        print_exp_setting(d, total_reps, reps_per_job, alg=alg+f"({compress_alg})",
                          target=target, Ms=None, filenames=mcmc_filenames, gs=gs, ms=ms_ranges)
        for filename in mcmc_filenames:
            for m in ms_ranges[filename]:
                for g in gs:
                    if g > m:
                        continue  # oversampling factor too large for this size; skip
                    for i in range(0, total_reps, reps_per_job):
                        # experiment command, and filenames
                        exp_name, out, err = get_python_command(
                            d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                            rerun=rerun, compute_mmd=compute_mmd,
                            recompute_mmd=recompute_mmd, log_folder=log_folder,
                            alg=alg, compress_alg=compress_alg, n_compress=None,
                            g=g, setting=target, filename=filename, symm1=symm1,
                            results_folder=results_folder, mcmc_folder=mcmc_folder)
                        # NOTE(review): unlike the other Compress++ cells, this job name
                        # does not contain g, so jobs differing only in g share a name
                        prefix = f"{alg[:1]}{compress_alg[:1]}{filename[:1]}{filename[5:6]}n{m}r{i}"
                        deploy_experiment(deploy_slurm, mcmc_ids, exp_name, out, err,
                                          partition, prefix=prefix, debug=debug)

    print(f'{partition} partition; Number of processes/jobs:{len(mcmc_ids)} slurm:{deploy_slurm}, terminal: {not deploy_slurm}')

## 7. Runtime experiments

In [None]:
# runtime results go to their own folder; this overrides the results_folder set earlier in the notebook
results_folder = "results/run_time"

In [None]:
# coresets to generate (one sweep of timing jobs is deployed per flag set to True)

st_coresets = False
kt_coresets = True
herding_coresets = False

# Compress++ variants, named after the thinning algorithm passed as --compressalg
cpp_kt_coresets = False
cpp_herding_coresets = False

In [None]:
### ASSUMING SIZE = 4^m; and final output = sqrt(SIZE) = 2^m #####
# NOTE: the runtime deployment cell also reads compute_mmd, recompute_mmd, debug and partition,
# none of which is set in this section; they keep whatever value an earlier cell assigned.
ms = range(4, 5+1) # range of m (the "+1" makes the upper end inclusive)
gs = [0, 4] # oversampling factors; Compress++ runs with g > m are skipped

# define repetition
total_reps = 3 # set this to the max number of repetitions
reps_per_job = 1 # number of reps per python call

symm1 = 1 # whether to use symmetrized compress in stage 1 (0 = no, anything else = yes)

rerun = 0 # regenerate coresets?

### All experiments are run with Gauss(sigma) as k and Gauss(sigma/sqrt(2)) as krt ###
runtime_gauss_experiments = True # run experiments with Gauss P
ds = [2] #, 4, 10, 100]



In [None]:
# Deploy the runtime (timing) sweep for the Gauss target: one run_time.py job per
# (d, m, batch of reps), and for Compress++ additionally one per oversampling factor g.
# compute_mmd, recompute_mmd, partition and debug are taken from earlier cells.
gauss_ids = []  # filled by deploy_experiment with one entry per deployed job
if runtime_gauss_experiments:
    # logs are grouped by calendar day
    log_folder = f"logs/{datetime.now().strftime('%b_%d_%Y')}"
    pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True)

    target = "gauss"

    # run st/kt/herding experiments
    for flag, alg in ((st_coresets, "st"), (kt_coresets, "kt"), (herding_coresets, "herding")):
        if not flag:
            continue
        # pass ms as well, so that the summary reports the swept sizes instead of "m = None"
        print_exp_setting(ds, total_reps, reps_per_job, alg=alg, target=target, ms=ms)
        for d in ds:
            for m in ms:
                for i in range(0, total_reps, reps_per_job):
                    # experiment command, and filenames
                    exp_name, out, err = get_python_command(
                        d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                        rerun=rerun, compute_mmd=compute_mmd,
                        recompute_mmd=recompute_mmd, log_folder=log_folder,
                        alg=alg, compress_alg=None, n_compress=None, g=None,
                        setting=target,
                        symm1=symm1, timing_experiment=True, results_folder=results_folder)
                    # short job name shown in squeue; "rt" marks runtime jobs
                    prefix = f"rt{alg[:1]}{target[:1]}d{d}n{m}r{i}"
                    deploy_experiment(deploy_slurm, gauss_ids, exp_name, out, err,
                                      partition, prefix=prefix, debug=debug)

    # compress++ experiments
    for flag, compress_alg in ((cpp_kt_coresets, "kt"), (cpp_herding_coresets, "herding")):
        if not flag:
            continue
        alg = "compresspp"
        # ms added here too, for the same reason as above
        print_exp_setting(ds, total_reps, reps_per_job, alg=alg+f"({compress_alg})",
                          target=target, gs=gs, ms=ms)
        for d in ds:
            for m in ms:
                for g in gs:
                    if g > m:
                        continue  # oversampling factor too large for this size; skip
                    for i in range(0, total_reps, reps_per_job):
                        # experiment command, and filenames
                        exp_name, out, err = get_python_command(
                            d=d, m=m, sz=m, rep0=i, repn=reps_per_job,
                            rerun=rerun, compute_mmd=compute_mmd,
                            recompute_mmd=recompute_mmd, log_folder=log_folder,
                            alg=alg, compress_alg=compress_alg, n_compress=None, g=g,
                            setting=target,
                            symm1=symm1, timing_experiment=True, results_folder=results_folder)
                        prefix = f"rt{alg[:1]}{g}{compress_alg[:1]}{target[:1]}d{d}n{m}r{i}"
                        deploy_experiment(deploy_slurm, gauss_ids, exp_name, out, err,
                                          partition, prefix=prefix, debug=debug)

    print(f'{partition} partition; Number of processes/jobs:{len(gauss_ids)} slurm:{deploy_slurm}, terminal: {not deploy_slurm}')