In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import scipy.stats

import sys
sys.path.append("../../../")

from chiseling.dgps.basic_linear_rct import BasicLinearRCT
from chiseling.source.learners.baselearners_general import linreg_learner
from chiseling.source.strategies.simul_data_splitting_strategy import SimulDataSplittingStrategy

In [3]:
# Hyperparameters
# d .. tau are passed positionally, in this order, to BasicLinearRCT below.
n = 1000  # number of draws requested from sampler.sample(n)
d = 20  # presumably the covariate dimension — TODO confirm against BasicLinearRCT
s_prog = 20  # presumably the number of active prognostic coefficients — TODO confirm
s_effect = 20  # presumably the number of active effect coefficients — TODO confirm
theta_prog = 0  # NOTE(review): looks like the prognostic signal strength — confirm
theta_effect = 0  # NOTE(review): looks like the effect-heterogeneity strength — confirm
tau = 0  # NOTE(review): looks like the baseline treatment effect — confirm

# Level passed as `alpha` to SimulDataSplittingStrategy
alpha = 0.05

# Seed passed to both the sampler and the data-splitting strategy
random_seed = 42

In [4]:
# Build the data-generating process from the hyperparameters above and draw
# a single dataset (X, Y) via sample(n).
# NOTE(review): ipw_transform=True presumably makes Y an IPW-transformed
# outcome — confirm against BasicLinearRCT.
sampler = BasicLinearRCT(
    d,
    s_prog,
    s_effect,
    theta_prog,
    theta_effect,
    tau,
    ipw_transform=True,
    random_seed=random_seed,
)
X, Y = sampler.sample(n)

In [5]:
# Single run: configure the data-splitting strategy on (X, Y), then execute it.
# train_ratio=0.2 is presumably the fraction of rows used for training the
# learner — TODO confirm against SimulDataSplittingStrategy.
simul_datasplit = SimulDataSplittingStrategy(
    X=X,
    Y=Y,
    train_ratio=0.2,
    learner=linreg_learner,
    alpha=alpha,
    test_thresh=0,
    random_seed=random_seed,
)

# Results (rejected, region, test_cutoffs, lcb) are read off the object in the cells below
simul_datasplit.run_strategy()

In [6]:
# Did the single data-splitting run reject? (read after run_strategy())
simul_datasplit.rejected

False

In [7]:
# Region metrics for the region selected by the data-splitting strategy,
# estimated by the sampler (the strategy object here is simul_datasplit, not a chiseling run)
sampler.estimate_region_metrics(simul_datasplit.region)

(0, 0, nan, 0, 0, nan)

In [8]:
# Inspect the test cutoffs stored on the data-splitting strategy
simul_datasplit.test_cutoffs

array([0.        , 0.08365663, 0.1694563 , 0.2342705 , 0.31184793,
       0.42406887, 0.52500027, 0.60388579, 0.75644881, 0.97838346])

In [9]:
# Inspect the strategy's lcb array
# (presumably lower confidence bounds, one per test cutoff — TODO confirm)
simul_datasplit.lcb

array([-0.17415008, -0.18434898, -0.11085581, -0.08880834, -0.05217202,
       -0.14311253, -0.24716928, -0.35845606, -0.50111139, -0.79545064])

### Check coverage

In [10]:
# Number of simulation replications used below to estimate the rejection rate
n_sims = 2000

In [11]:
# Repeated-sampling check: for each replication, draw a fresh dataset from the
# sampler, rerun the data-splitting strategy with the same settings as the
# single run above, and record whether it rejected.
#
# Loop-local names (X_sim, Y_sim, sim_datasplit) are used on purpose so the
# single-run objects X, Y and simul_datasplit from the earlier cells are not
# overwritten; the inspection cells above then still refer to that single run
# if they are re-executed after this loop.
#
# NOTE(review): every replication passes the same random_seed to the strategy,
# so variation across replications comes from the freshly sampled data only —
# confirm this is intended.
all_rejections = []
for _ in range(n_sims):
    X_sim, Y_sim = sampler.sample(n)
    # Run data splitting on this replication's data
    sim_datasplit = SimulDataSplittingStrategy(
        X=X_sim,
        Y=Y_sim,
        train_ratio=0.2,
        learner=linreg_learner,
        alpha=alpha,
        test_thresh=0,
        random_seed=random_seed,
    )
    sim_datasplit.run_strategy()
    # Record the rejection decision for this replication
    all_rejections.append(sim_datasplit.rejected)
all_rejections = np.array(all_rejections)

In [12]:
# Empirical rejection rate across replications and its standard error
# (population standard deviation divided by sqrt(n_sims))
np.mean(all_rejections), np.std(all_rejections) / np.sqrt(n_sims)

(np.float64(0.057), np.float64(0.00518415856238985))

In [13]:
# Rejection rate minus two standard errors: an approximate lower bound on the
# rejection rate, to be compared against alpha
np.mean(all_rejections) - 2 * (np.std(all_rejections) / np.sqrt(n_sims))

np.float64(0.0466316828752203)