In [1]:
import numpy as np
import pandas as pd

from itertools import product

In [2]:
# Seed NumPy's global RNG so the per-task random seeds drawn later in this
# notebook with np.random.randint are the same on every fresh run.
np.random.seed(42)

### Global hyperparameters

In [3]:
# Copied into every task through the "alpha" entry of the base DGP/learner
# settings (presumably the significance level -- confirm against the runner).
alpha = 0.05

### Base DGP/learner settings

These settings are common to all methods: every DGP/learner configuration defined here is run against every method.

In [4]:
# Dataset location written into every task as "bart_path" (a gzipped TSV,
# judging by the file name).
# NOTE(review): this is an absolute path inside one user's home directory, so
# the generated task file only works on a machine where that path exists.
BART_PATH = "/n/home04/ncheng/research/chiseling/chiseling/notebooks/bart_analysis/bart_dataset_processed.tsv.gz"

In [5]:
# Template for the DGP/learner half of a task. The entries left as None
# ("n", "test_thresh", "n_min") are overwritten for each configuration.
base_dgp_learner_settings = dict(
    dgp="BARTDataset",
    bart_path=BART_PATH,
    n=None,
    alpha=alpha,
    test_thresh=None,
    n_min=None,
    learner="causal_random_forest_classifier_learner",
)

In [6]:
# Each (test_thresh, n) pair defines one DGP/learner configuration.
test_thresh_n_space = [
    (0.3, 500),
    (0.35, 1500),
    (0.4, 6000),
    (0.45, 10000),
]

# Overlay the per-configuration values on a copy of the shared template.
# n_min is 1% of n (truncated to an int) with a floor of 30.
dgp_learner_settings_df = pd.DataFrame(
    [
        {
            **base_dgp_learner_settings,
            "n": n,
            "test_thresh": test_thresh,
            "n_min": max(30, int(0.01 * n)),
        }
        for test_thresh, n in test_thresh_n_space
    ]
)

### Method settings

In [7]:
# Chiseling: grid of burn-in values 0.1, 0.2, ..., 0.9 (rounded to 4 decimals)
# and the two margin-width settings.
n_burn_in_space = np.linspace(0.1, 0.9, num=9).round(4)
margin_width_settings = [0, 1]

# Data splitting: grid of train ratios on the same 0.1, ..., 0.9 spacing.
train_ratio_space = np.linspace(0.1, 0.9, num=9).round(4)

In [8]:
# Template for the method half of a task. Strategy-specific entries start as
# None and are filled in only by the strategies that set them; the two batch
# proportions are the same for every method.
base_method_settings = dict(
    strategy=None,
    bonf_strategy=None,
    n_burn_in=None,
    margin_width=None,
    train_ratio=None,
    reveal_batch_prop=0.01,
    refit_batch_prop=0.05,
)

In [9]:
# Collect one settings dict per method configuration. Every dict starts as a
# copy of base_method_settings with only the relevant entries overwritten.
method_settings_df = []

# Chiseling: one entry per (n_burn_in, margin_width) combination
method_settings_df.extend(
    {
        **base_method_settings,
        "strategy": "Chiseling",
        "n_burn_in": n_burn_in,
        "margin_width": margin_width,
    }
    for n_burn_in, margin_width in product(n_burn_in_space, margin_width_settings)
)

# Data splitting, then simultaneous data splitting: one entry per train ratio
for strategy in ("DataSplittingStrategy", "SimulDataSplittingStrategy"):
    method_settings_df.extend(
        {**base_method_settings, "strategy": strategy, "train_ratio": train_ratio}
        for train_ratio in train_ratio_space
    )

# Oracle, oracle simultaneous and t-test: a single entry each, with only the
# strategy name set
for strategy in ("OracleStrategy", "OracleSimulStrategy", "TTestStrategy"):
    method_settings_df.append({**base_method_settings, "strategy": strategy})

In [10]:
# Bonferroni combiner entries, appended to the existing list of method settings.

# Bonferroni chiseling: one entry per margin width
for margin_width in margin_width_settings:
    method_settings_df.append(
        {
            **base_method_settings,
            "strategy": "BonferroniCombiner",
            "bonf_strategy": "Chiseling",
            "margin_width": margin_width,
        }
    )

# Bonferroni data splitting, then Bonferroni simultaneous data splitting:
# a single entry each
for bonf_strategy in ("DataSplittingStrategy", "SimulDataSplittingStrategy"):
    method_settings_df.append(
        {
            **base_method_settings,
            "strategy": "BonferroniCombiner",
            "bonf_strategy": bonf_strategy,
        }
    )

In [11]:
# Turn the accumulated list of settings dicts into a DataFrame with one row
# per method configuration. The name is rebound from a list to a DataFrame
# here, so this cell is not meant to be re-run on its own.
method_settings_df = pd.DataFrame(method_settings_df)

### Combine and format

In [12]:
# Replication budget per setting: a setting is split into `n_tasks_per_setting`
# tasks, each recording `n_sims_per_task` in its "n_sims" column.
# Both families give the same product: 25 * 100 == 125 * 20 == 2500.
chiseling_n_sims_per_task, chiseling_n_tasks_per_setting = 25, 100
datasplit_n_sims_per_task, datasplit_n_tasks_per_setting = 125, 20

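Cross-join the DGP/learner settings with the method settings, so that every DGP/learner configuration is paired with every method configuration.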

In [13]:
# Cartesian product: one row per (DGP/learner configuration, method configuration) pair.
base_task_df = pd.merge(
    dgp_learner_settings_df,
    method_settings_df,
    how="cross",
)

In [14]:
# Expand every (DGP/learner, method) setting into its replicate tasks.
#
# A setting counts as chiseling-based when it is plain chiseling or a
# Bonferroni combiner wrapping chiseling; those use the chiseling_* budget,
# everything else uses the datasplit_* budget. Missing strategy entries
# compare unequal to "Chiseling", so they land in the data-splitting group.
uses_chiseling = (
    base_task_df["strategy"].eq("Chiseling")
    | base_task_df["bonf_strategy"].eq("Chiseling")
).to_numpy()

n_sims_by_setting = np.where(
    uses_chiseling, chiseling_n_sims_per_task, datasplit_n_sims_per_task
)
n_tasks_by_setting = np.where(
    uses_chiseling, chiseling_n_tasks_per_setting, datasplit_n_tasks_per_setting
)

# Repeat each setting's row consecutively, once per task. Selection is
# positional, so it does not depend on base_task_df's index labels, and the
# result gets a fresh 0..N-1 index instead of the duplicated labels that
# appending the same row object many times would produce.
row_positions = np.repeat(np.arange(len(base_task_df)), n_tasks_by_setting)
task_df = (
    base_task_df
    .assign(n_sims=n_sims_by_setting)  # appended as the last column
    .iloc[row_positions]
    .reset_index(drop=True)
)

In [15]:
# Saving subgroup membership is currently switched off for every task.
# The commented-out expression below would enable it for chiseling-based
# strategies only (plain chiseling or a Bonferroni combiner over chiseling):
# task_df["save_subgroup_membership"] = ((task_df.strategy == "Chiseling") | 
#                                        (task_df.bonf_strategy == "Chiseling"))
task_df["save_subgroup_membership"] = False

In [16]:
# Add task ids and random seeds
n_tasks = task_df.shape[0]

# Task ids are the 0-based position of each task in the final array.
task_df["task_id"] = np.arange(n_tasks)

# One seed per task, drawn from the global NumPy RNG seeded at the top of the
# notebook. The upper bound (exclusive) does not fit in int32, and randint's
# default integer dtype is platform dependent on NumPy < 2 (32-bit on Windows,
# where this call would raise), so int64 is requested explicitly. Where the
# default is already int64 the drawn values are unchanged.
task_df["random_seed"] = np.random.randint(
    0, 2**32 - 1, size=n_tasks, dtype=np.int64
)

In [17]:
# Inspect the first ten tasks; consecutive rows are replicates of the same
# setting and differ only in task_id and random_seed.
task_df.head(10)

Unnamed: 0,dgp,bart_path,n,alpha,test_thresh,n_min,learner,strategy,bonf_strategy,n_burn_in,margin_width,train_ratio,reveal_batch_prop,refit_batch_prop,n_sims,save_subgroup_membership,task_id,random_seed
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,0,1608637542
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,1,3421126067
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,2,4083286876
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,3,787846414
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,4,3143890026
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,5,3348747335
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,6,2571218620
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,7,2563451924
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,8,670094950
0,BARTDataset,/n/home04/ncheng/research/chiseling/chiseling/...,500,0.05,0.3,30,causal_random_forest_classifier_learner,Chiseling,,0.1,0.0,,0.01,0.05,25,False,9,1914837113


In [18]:
## DEVELOPER ONLY: uncomment both lines to set n_sims to 1 and downsample to
## 300 randomly chosen tasks. Leave them commented out for the full array.
# task_df["n_sims"] = 1
# task_df = task_df.sample(n=300, replace=False, random_state=42)

In [19]:
# Sanity check before writing: (number of tasks, number of columns).
print(task_df.shape)

(9840, 18)


In [20]:
# Write the task array as a tab-separated file without the index column.
# The relative path is resolved against the kernel's working directory, and
# the target directory must already exist.
task_df.to_csv("../../task_arrays/bart_analysis.tasks.tsv", sep="\t", index=False)