In [1]:
import numpy as np
import pandas as pd

from itertools import product

In [2]:
# Seed NumPy's global RNG so that values drawn later through np.random
# (the per-task random seeds) are reproducible on a fresh top-to-bottom run.
np.random.seed(42)

### Global hyperparameters

In [3]:
# Values copied into every DGP/learner settings record below.
# NOTE(review): alpha is presumably the significance level and test_thresh the
# threshold being tested against -- confirm with the simulation code.
test_thresh, alpha = 0, 0.05

### Base DGP/learner settings

These settings are common to all methods: each of them is run against every method.

In [4]:
# DGP hyperparameters shared by every setting.
# NOTE(review): presumably d = number of covariates, s = number of active
# covariates, rho = covariate correlation -- confirm against the DGP code.
d, s, rho = 100, 5, 0.2

In [5]:
# Template record for one DGP/learner setting. Fields left as None are filled
# in per setting when the settings table is built; the remaining fields take
# the shared values defined above.
base_dgp_learner_settings = dict(
    dgp="HeterogeneousLinearRCT",
    n=None,
    d=d,
    s=s,
    rho=rho,
    theta=None,
    tau=None,
    alpha=alpha,
    test_thresh=test_thresh,
    learner=None,
    subgroup_size=None,
)

In [6]:
# Each tuple is (subgroup_size, n, theta, tau) for one DGP/learner setting.
size_n_theta_tau_space = [(0.5, 1000, 0.4, 0),
                          (0.25, 2000, 0.4, -0.320),
                          (0.1, 5000, 0.3, -0.450),
                          (0.05, 10000, 0.4, -0.772),
                          (0.01, 25000, 0.5, -1.375)]

# One row per tuple: start from the shared template and overwrite the
# per-setting fields (key order of the template is preserved).
dgp_learner_settings_df = pd.DataFrame(
    [
        {
            **base_dgp_learner_settings,
            "n": n,
            "theta": theta,
            "tau": tau,
            "learner": "ridgecv_learner",
            "subgroup_size": size,
        }
        for size, n, theta, tau in size_n_theta_tau_space
    ]
)

### Method settings

In [7]:
# Chiseling: candidate n_burn_in values (0.1, 0.2, ..., 0.9) and margin widths
n_burn_in_space = np.linspace(0.1, 0.9, num=9).round(decimals=4)
margin_width_settings = [0, 0.5, 1]

# Data splitting: candidate train ratios (0.1, 0.2, ..., 0.9)
train_ratio_space = np.linspace(0.1, 0.9, num=9).round(decimals=4)

# General: n_min value written into every method configuration
n_min = 30

In [8]:
# Template record for one method configuration: every field starts as None
# and each strategy fills in only the fields it uses.
base_method_settings = dict.fromkeys(
    ["strategy", "bonf_strategy", "n_burn_in", "margin_width", "train_ratio", "n_min"]
)

In [9]:
# Accumulate one settings dict per method configuration. Every dict starts
# from base_method_settings (so all records share the same keys and key order)
# and overrides only the fields relevant to its strategy.
method_settings_df = []

# Chiseling: full n_burn_in x margin_width grid (n_burn_in varies slowest)
strategy = "Chiseling"
method_settings_df.extend(
    {
        **base_method_settings,
        "strategy": strategy,
        "n_burn_in": n_burn_in,
        "margin_width": margin_width,
        "n_min": n_min,
    }
    for n_burn_in, margin_width in product(n_burn_in_space, margin_width_settings)
)

# Data splitting, then simultaneous data splitting: one entry per train ratio
for strategy in ("DataSplittingStrategy", "SimulDataSplittingStrategy"):
    method_settings_df.extend(
        {
            **base_method_settings,
            "strategy": strategy,
            "train_ratio": train_ratio,
            "n_min": n_min,
        }
        for train_ratio in train_ratio_space
    )

# Oracle: a single entry with no tunable fields besides n_min
strategy = "OracleStrategy"
method_settings_df.append({**base_method_settings, "strategy": strategy, "n_min": n_min})

In [10]:
# All entries in this cell use the BonferroniCombiner strategy; bonf_strategy
# names the underlying strategy. They are appended to the method settings
# list built in the previous cell.
strategy = "BonferroniCombiner"

# Bonferroni chiseling: one entry per margin width
method_settings_df.extend(
    {
        **base_method_settings,
        "strategy": strategy,
        "bonf_strategy": "Chiseling",
        "margin_width": margin_width,
        "n_min": n_min,
    }
    for margin_width in margin_width_settings
)

# Bonferroni data splitting, then Bonferroni simultaneous data splitting
for bonf_strategy in ("DataSplittingStrategy", "SimulDataSplittingStrategy"):
    method_settings_df.append(
        {
            **base_method_settings,
            "strategy": strategy,
            "bonf_strategy": bonf_strategy,
            "n_min": n_min,
        }
    )

In [11]:
# Convert the accumulated list of method-settings dicts into a DataFrame
# (one row per dict).
# NOTE(review): this rebinds the name from a list to a DataFrame, while the
# earlier cells call .append on the list form -- to rebuild the settings,
# re-run from the cell that creates the list.
method_settings_df = pd.DataFrame(method_settings_df)

### Combine and format

In [12]:
# Simulations per task and number of tasks per setting, by method family.
# Both families total 2500 simulations per setting (100 * 25 and 50 * 50).
chiseling_n_sims_per_task, chiseling_n_tasks_per_setting = 100, 25
datasplit_n_sims_per_task, datasplit_n_tasks_per_setting = 50, 50

Join the dataframes

In [13]:
# Cartesian product: pair every DGP/learner setting with every method setting.
base_task_df = pd.merge(dgp_learner_settings_df, method_settings_df, how="cross")

In [14]:
# Expand every (DGP/learner, method) setting into its repeated tasks.
#
# Settings that use chiseling (directly, or as the strategy wrapped by the
# Bonferroni combiner) get the chiseling counts; all other settings get the
# data-splitting counts. Each setting row gains an "n_sims" column and is then
# repeated once per task.
#
# This is done with vectorised operations instead of mutating and re-appending
# the Series yielded by DataFrame.iterrows(); row order, column order and the
# (repeated) index labels of base_task_df are kept as before.
uses_chiseling = (
    base_task_df["strategy"].eq("Chiseling")
    | base_task_df["bonf_strategy"].eq("Chiseling")  # missing values compare as False
)
n_sims = np.where(uses_chiseling, chiseling_n_sims_per_task, datasplit_n_sims_per_task)
n_repeats = np.where(uses_chiseling, chiseling_n_tasks_per_setting, datasplit_n_tasks_per_setting)

# Positional row numbers, each repeated n_repeats times and kept contiguous.
row_positions = np.repeat(np.arange(len(base_task_df)), n_repeats)

# .copy() yields an independent frame so that later cells can add columns to
# task_df without triggering pandas' SettingWithCopyWarning.
task_df = base_task_df.assign(n_sims=n_sims).iloc[row_positions].copy()

In [15]:
# Add task ids and random seeds
# task_id is the 0-based position of each row in the task table.
task_df["task_id"] = np.arange(task_df.shape[0])
# One seed per task, drawn from NumPy's global RNG (the upper bound is
# exclusive). dtype is pinned to int64 because the bound 2**32 - 1 does not fit
# a 32-bit default integer (e.g. Windows with NumPy < 2), where the call would
# otherwise raise ValueError; on platforms whose default integer is already
# 64-bit this is the same dtype as before.
# NOTE: the draw advances the global RNG, so re-running this cell without
# re-running the seeding cell produces different seeds.
task_df["random_seed"] = np.random.randint(0, 2**32 - 1, size=task_df.shape[0], dtype=np.int64)

In [16]:
# Inspect the first 10 rows of the task table
task_df.head(10)

Unnamed: 0,dgp,n,d,s,rho,theta,tau,alpha,test_thresh,learner,subgroup_size,strategy,bonf_strategy,n_burn_in,margin_width,train_ratio,n_min,n_sims,task_id,random_seed
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,0,1608637542
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,1,3421126067
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,2,4083286876
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,3,787846414
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,4,3143890026
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,5,3348747335
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,6,2571218620
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,7,2563451924
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,8,670094950
0,HeterogeneousLinearRCT,1000,100,5,0.2,0.4,0.0,0.05,0,ridgecv_learner,0.5,Chiseling,,0.1,0.0,,30,100,9,1914837113


In [17]:
# DEVELOPER ONLY (disabled): for a quick smoke test, uncomment the two lines
# below to set n_sims to 1 and downsample to 300 randomly chosen tasks.
# task_df["n_sims"] = 1
# task_df = task_df.sample(n=300, replace=False, random_state=42)

In [18]:
# Report the final (number of tasks, number of columns) of the task table
print(task_df.shape)

(9000, 20)


In [19]:
# Write the task table as a tab-separated file, without the index column.
output_path = "../../task_arrays/heterogeneous_linear_rct_ridgecv.tasks.tsv"
task_df.to_csv(output_path, sep="\t", index=False)