In [1]:
import numpy as np
import pandas as pd

from itertools import product

In [2]:
np.random.seed(42)

### Global hyperparameters

In [3]:
alpha = 0.05

### Base DGP/learner settings

These are common to all methods, because we need to run each of these settings against every method

In [4]:
base_dgp_learner_settings = {"dgp": "NullDGPs",
                             "alpha": alpha,
                             "SPECIAL_TOKEN_NULLDGPS": True,
                             "n": None,
                             "d": None,
                             "binary": None,
                             "cov_type": None,
                             "err_type": None,
                             "SPECIAL_TOKEN_NULLDGPS_TRANSFORM_TYPE": None,
                             "test_thresh": None,
                             "learner": None}

In [5]:
dgp_learner_settings_df = []
n_space = np.linspace(100,2000,10).astype(int)
# Each tuple consists of options for
# (d, binary, cov_type, err_type, transform_type, test_thresh, learner)
dgp_settings_space = [(50, True, "corrnorm", "none", "none", 0.5, "ridgecv_learner"),
                      (100, True, "binary", "none", "none", 0.5, "mlp_regressor_learner"),
                      (150, True, "expo", "none", "none", 0.5, "random_forest_learner"),
                      (50, False, "corrnorm", "expo", "ipw", 0, "ridgecv_learner"),
                      (100, False, "binary", "t5", "ipw", 0, "mlp_regressor_learner"),
                      (150, False, "expo", "hetnorm", "ipw", 0, "random_forest_learner"),
                      (50, False, "corrnorm", "expo", "aipw", 0, "ridgecv_learner"),
                      (100, False, "binary", "t5", "aipw", 0, "mlp_regressor_learner"),
                      (150, False, "expo", "hetnorm", "aipw", 0, "random_forest_learner")]

for n, (d, binary, cov_type, err_type, transform_type, test_thresh, learner) in product(n_space, dgp_settings_space):
    task_settings = dict(base_dgp_learner_settings)
    task_settings["n"] = n
    task_settings["d"] = d
    task_settings["binary"] = binary
    task_settings["cov_type"] = cov_type
    task_settings["err_type"] = err_type
    task_settings["SPECIAL_TOKEN_NULLDGPS_TRANSFORM_TYPE"] = transform_type
    task_settings["test_thresh"] = test_thresh
    task_settings["learner"] = learner
    dgp_learner_settings_df.append(task_settings)
dgp_learner_settings_df = pd.DataFrame(dgp_learner_settings_df)

### Method settings

In [6]:
# Chiseling
n_burn_in = 0.1
margin_width = 1
reveal_batch_prop = 0.05
refit_batch_prop = 0.05
n_min = 30
shrink_to_boundary = False

In [7]:
base_method_settings = {"strategy": "Chiseling",
                        "n_burn_in": n_burn_in,
                        "margin_width": margin_width,
                        "reveal_batch_prop": reveal_batch_prop,
                        "refit_batch_prop": refit_batch_prop,
                        "n_min": n_min,
                        "shrink_to_boundary": shrink_to_boundary}

method_settings_df = [base_method_settings]
method_settings_df = pd.DataFrame(method_settings_df)

### Combine and format

In [8]:
chiseling_n_sims_per_task = 25
chiseling_n_tasks_per_setting = 100

Join the dataframes

In [9]:
base_task_df = dgp_learner_settings_df.merge(method_settings_df, how='cross')

In [10]:
task_df = []
for i, row in base_task_df.iterrows():
    row["n_sims"] = chiseling_n_sims_per_task
    for _ in range(chiseling_n_tasks_per_setting):
        task_df.append(row)
task_df = pd.DataFrame(task_df)

In [11]:
# Add task ids and random seeds
task_df["task_id"] = np.arange(task_df.shape[0])
task_df["random_seed"] = np.random.randint(0, 2**32 - 1, size=task_df.shape[0])

In [12]:
# Inspect
task_df.head(10)

Unnamed: 0,dgp,alpha,SPECIAL_TOKEN_NULLDGPS,n,d,binary,cov_type,err_type,SPECIAL_TOKEN_NULLDGPS_TRANSFORM_TYPE,test_thresh,...,strategy,n_burn_in,margin_width,reveal_batch_prop,refit_batch_prop,n_min,shrink_to_boundary,n_sims,task_id,random_seed
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,0,1608637542
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,1,3421126067
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,2,4083286876
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,3,787846414
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,4,3143890026
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,5,3348747335
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,6,2571218620
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,7,2563451924
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,8,670094950
0,NullDGPs,0.05,True,100,50,True,corrnorm,none,none,0.5,...,Chiseling,0.1,1,0.05,0.05,30,False,25,9,1914837113


In [13]:
# # DEVELOPER ONLY: SET N_SIMS TO 1 AND DOWNSAMPLE
# task_df["n_sims"] = 1
# task_df = task_df.sample(n=300, replace=False, random_state=42)

In [14]:
print(task_df.shape)

(9000, 21)


In [15]:
task_df.to_csv("../../task_arrays/null_dgps.tasks.tsv", sep="\t", index=False)