In [1]:
import numpy as np
import pandas as pd

from itertools import product

In [2]:
# Seed NumPy's global RNG so that the per-task `random_seed` values drawn
# further down with np.random.randint are reproducible across runs.
np.random.seed(42)

### Global hyperparameters

In [3]:
# Values shared by every task; both are copied into the DGP/learner settings,
# and alpha also determines the Chiseling `alpha_init` grid (alpha / 2).
alpha = 0.05     # NOTE(review): presumably the test's significance level — confirm
test_thresh = 0  # NOTE(review): presumably the effect threshold being tested — confirm

### Base DGP/learner settings

These settings are shared by all methods, because every DGP/learner setting is run against every method.

In [4]:
# DGP hyperparameters, passed through unchanged as the "d" and "s" settings.
# NOTE(review): presumably covariate dimension and sparsity — confirm against the DGP.
d = s = 2

In [5]:
# Template for a single DGP/learner configuration. Entries set to None are
# placeholders that are overwritten for each point of the settings grid.
base_dgp_learner_settings = {
    "dgp": "NonNegRCT",
    "n": 1000,
    "d": d,
    "s": s,
    # Filled in per grid point.
    "q": None,
    "tau": None,
    "tau0": None,
    # Shared global hyperparameters.
    "alpha": alpha,
    "test_thresh": test_thresh,
    "learner": "nonneg_rct_causal_random_forest_learner",
    # Filled in per scenario.
    "task_label": None,
}

In [6]:
# Build the DGP/learner grid: sweep q over a common grid for two scenarios.
# tau and tau0 are scaled by 1/q, so tau * q and tau0 * q are held fixed
# along each sweep.
q_space = np.linspace(0.1, 1, 20)
tauq = 0.1

# (task_label, tau0 * q) pairs, in the order their rows appear in the table:
#   "nonneg"   - nonnegative setup
#   "smallneg" - slightly negative setup
scenarios = [("nonneg", 0), ("smallneg", -0.01)]

dgp_learner_settings_df = pd.DataFrame([
    {
        **base_dgp_learner_settings,
        "q": q,
        "tau": tauq / q,
        "tau0": tau0q / q,
        "task_label": task_label,
    }
    for task_label, tau0q in scenarios
    for q in q_space
])

### Method settings

In [7]:
# Chiseling-only hyperparameter grids. Each list currently holds a single
# value, so their Cartesian product yields exactly one Chiseling configuration.
alpha_init_space = [alpha / 2]
margin_width_settings = [1]
n_burn_in_space = [0.1]

In [8]:
# Template for a single method configuration. "strategy" is always filled in;
# the other None entries are only set for the Chiseling strategy and stay
# None for every other strategy.
base_method_settings = {
    "strategy": None,
    "n_burn_in": None,
    "margin_width": None,
    "alpha_init": None,
    # Defaults applied to every method row.
    "n_min": 30,
    "reveal_batch_prop": 0.01,
    "refit_batch_prop": 0.05,
}

In [9]:
# Chiseling: one configuration per combination of its hyperparameter grids.
method_settings_df = [
    {
        **base_method_settings,
        "strategy": "Chiseling",
        "n_burn_in": n_burn_in,
        "margin_width": margin_width,
        "alpha_init": alpha_init,
    }
    for n_burn_in, margin_width, alpha_init
    in product(n_burn_in_space, margin_width_settings, alpha_init_space)
]

# Baselines (t-test, oracle, simultaneous oracle): only the strategy name is
# set; every other field keeps the template's value.
for strategy in ("TTestStrategy", "OracleStrategy", "OracleSimulStrategy"):
    method_settings_df.append({**base_method_settings, "strategy": strategy})

In [10]:
# Turn the list of per-method setting dicts into a DataFrame (one row per
# method configuration). Note that this rebinds the name from a list to a frame.
method_settings_df = pd.DataFrame(method_settings_df)

### Combine and format

In [11]:
# Simulation budget for each setting, split across tasks:
#   sims per setting = n_tasks_per_setting * n_sims_per_task
# Chiseling rows use 100 tasks x 25 sims; rows for every other strategy use
# 10 tasks x 250 sims. Both come to 2500 simulations per setting.
chiseling_n_tasks_per_setting = 100
chiseling_n_sims_per_task = 25
datasplit_n_tasks_per_setting = 10
datasplit_n_sims_per_task = 250

Cross-join the two dataframes so that every DGP/learner setting is paired with every method setting.

In [12]:
# Cartesian product of the two tables: one row per
# (DGP/learner setting, method setting) pair.
base_task_df = pd.merge(
    dgp_learner_settings_df,
    method_settings_df,
    how="cross",
)

In [13]:
# Expand every (DGP/learner, method) setting into its individual tasks.
#
# Each setting gets an "n_sims" column (simulations per task) and is repeated
# once per task. Chiseling rows use the chiseling_* budget; rows for every
# other strategy use the datasplit_* budget.
#
# This is done with vectorised operations rather than DataFrame.iterrows():
# iterrows() converts each row to an object-dtype Series, so column dtypes are
# lost and have to be re-inferred, and it required mutating the yielded row.
uses_chiseling = (base_task_df["strategy"] == "Chiseling").to_numpy()
n_sims_per_row = np.where(
    uses_chiseling, chiseling_n_sims_per_task, datasplit_n_sims_per_task
)
n_tasks_per_row = np.where(
    uses_chiseling, chiseling_n_tasks_per_setting, datasplit_n_tasks_per_setting
)

# Positional row numbers, each repeated n_tasks times. Row order is preserved,
# and each repeated row keeps the index label of the setting it came from.
repeated_positions = np.repeat(np.arange(len(base_task_df)), n_tasks_per_row)

# .copy() gives an independent frame, so adding columns to task_df later does
# not raise SettingWithCopyWarning.
task_df = (
    base_task_df
    .assign(n_sims=n_sims_per_row)
    .iloc[repeated_positions]
    .copy()
)

In [14]:
# Add task ids and random seeds.
# task_id is the row's position in the task table (0 .. n_tasks - 1).
task_df["task_id"] = np.arange(task_df.shape[0])

# One seed per task, drawn from NumPy's global RNG (seeded at the top of the
# notebook). `high` is exclusive, so seeds lie in [0, 2**32 - 2].
# dtype is pinned to int64 because the platform default integer can be 32-bit
# (e.g. Windows with NumPy < 2), in which case this upper bound raises
# "high is out of bounds". Where the default is already 64-bit, the drawn
# values are unchanged.
task_df["random_seed"] = np.random.randint(
    0, 2**32 - 1, size=task_df.shape[0], dtype=np.int64
)

In [15]:
# Inspect the first ten tasks. These are all replicates of the first setting,
# so they differ only in task_id and random_seed.
task_df.head(10)

Unnamed: 0,dgp,n,d,s,q,tau,tau0,alpha,test_thresh,learner,...,strategy,n_burn_in,margin_width,alpha_init,n_min,reveal_batch_prop,refit_batch_prop,n_sims,task_id,random_seed
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,0,1608637542
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,1,3421126067
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,2,4083286876
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,3,787846414
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,4,3143890026
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,5,3348747335
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,6,2571218620
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,7,2563451924
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,8,670094950
0,NonNegRCT,1000,2,2,0.1,1.0,0.0,0.05,0,nonneg_rct_causal_random_forest_learner,...,Chiseling,0.1,1.0,0.025,30,0.01,0.05,25,9,1914837113


In [16]:
# DEVELOPER ONLY: uncomment the two lines below to set n_sims to 1 and
# downsample to 300 tasks. Keep them commented out for the full task array.
# task_df["n_sims"] = 1
# task_df = task_df.sample(n=300, replace=False, random_state=42)

In [17]:
# Sanity check: (number of tasks, number of columns) about to be written out.
print(task_df.shape)

(5200, 21)


In [18]:
# Write the task array as a tab-separated file, one row per task. The
# DataFrame index is not written; task_id is the row identifier in the file.
# The path is relative to the notebook's working directory.
task_df.to_csv("../../task_arrays/nonneg_rct.tasks.tsv", sep="\t", index=False)