In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt

import sys
sys.path.append("../../../")

from chiseling.dgps.basic_binary_regression import BasicBinaryRegression, search_for_target_specification_binary_regression
from chiseling.source.learners.baselearners_binary import make_logregcv_learner
from chiseling.source.protocol.IRST import UnitRegistrar, IRSTBinary
from chiseling.source.strategies.alpha_spending_strategy import AlphaSpendingStrategy
from chiseling.source.strategies.data_splitting_strategy import DataSplittingStrategy
from chiseling.source.strategies.simul_data_splitting_strategy import SimulDataSplittingStrategy

In [3]:
# Hyperparameters
# n is the sample size passed to sampler.sample(); d, s, rho and theta are the
# data-generating parameters handed positionally to BasicBinaryRegression and
# to the tau search.
n, d, s = 3000, 100, 5
rho, theta = 0.25, 2

# Testing configuration: mu is used as the test threshold (test_thresh) and
# alpha as the level passed to the protocol and the data-splitting baselines.
mu, alpha = 0.9, 0.05

# Single seed shared by the sampler, unit registrar, learner and strategies.
random_seed = 42

In [4]:
# Search for the value of tau that is subsequently passed to BasicBinaryRegression.
# NOTE(review): the positional 0.3 is an unnamed magic number. The optimal-region
# metrics reported further down show ~0.304 as their second entry, so it looks
# like a target region mass — confirm against the function signature and
# consider passing it by keyword or naming it next to the other hyperparameters.
tau = search_for_target_specification_binary_regression(d, s, rho, theta, mu, 0.3, random_seed=random_seed)
print(tau)

0.9563642393294414


In [5]:
# Sample data
# Build the seeded sampler with the tau found above, then draw n (X, Y) pairs.
sampler = BasicBinaryRegression(d, s, rho, theta, tau, random_seed=random_seed)
X, Y = sampler.sample(n)
# Sampler-computed probabilities for the drawn X; only consumed by
# SimulDataSplittingStrategy (pY1X=...) later on.
# NOTE(review): presumably P(Y = 1 | X) under the data-generating process — confirm.
pY1X = sampler.calculate_probs(X)

In [6]:
# Metrics of the sampler's optimal region at threshold mu; serves as the
# reference for the estimate_region_metrics() results of each method below.
# NOTE(review): the meaning of the six returned values is not documented in
# this notebook — confirm against the sampler's docstring.
sampler.get_optimal_region_metrics(mu)

(np.float64(0.0190926477941471),
 np.float64(0.30411),
 np.float64(0.9627820452933055),
 np.float64(0.00010421588924076496),
 np.float64(0.001454740897548426),
 np.float64(0.00016505379462710193))

In [7]:
# Initialize protocol
# Units are first registered through a seeded UnitRegistrar; the protocol is
# given the registered covariates (regX) rather than the raw X, together with
# the outcomes Y, the test threshold mu and the level alpha.
unit_reg = UnitRegistrar(random_seed)
regX = unit_reg.register_units(X)
protocol = IRSTBinary(regX, Y, test_thresh=mu, alpha=alpha)

In [8]:
# Make the cross-validated logistic regression learner (not a random forest);
# the same learner object is passed to every strategy below.
logregcv_learner = make_logregcv_learner(random_seed=random_seed)

In [9]:
# Run chiseling
# The strategy operates on `protocol`, so the outcome is read back from the
# protocol afterwards (protocol.testing_history, protocol.rejected_region).
# - n_burn_in=0.2: despite the `n_` prefix this appears to be a fraction of the
#   sample (the reported metrics account for 1645 + 755 = 2400 = 0.8 * n units)
#   — NOTE(review): confirm.
# - batch_size=int(0.05 * n): 5% of n per batch, i.e. 150 units here.
# - alpha_spending_fn='instantaneous' with alpha_min=0: in the recorded testing
#   history the full alpha = 0.05 is allocated at a single stage.
strategy = AlphaSpendingStrategy(protocol=protocol,
                                 test_thresh=mu,
                                 learner=logregcv_learner,
                                 n_burn_in=0.2,
                                 batch_size=int(0.05 * n),
                                 n_min=2,
                                 alpha_min=0,
                                 alpha_spending_fn='instantaneous',
                                 tiebreak=False,
                                 use_learner_weights=True,
                                 skip_const_predictor=False,
                                 random_seed=random_seed)

# verbose=True prints the METRICS / SUMMARY lines shown in the cell output.
strategy.run_strategy(verbose=True)

METRICS = {'curr_sample_efficiency': 1.0, 'spent_alpha': np.float64(0.0), 'remaining_alpha': np.float64(0.05), 'n_shrink_revealed': 1645, 'n_left_in_region': 755, 'region_mass_estimate': np.float64(0.3145833333333333)}
SUMMARY: rejected = True, spent_alpha = 0.050000000000000044, n_left_in_region = 755, region_mass-estimate = 0.3145833333333333


In [10]:
# Per-stage record of the tests performed by the protocol during chiseling
# (test statistic, sample size, allocated/remaining alpha, critical value, rejection).
protocol.testing_history

Unnamed: 0,stage_number,test_stat,sample_size,alloc_alpha,remaining_alpha,crit_val,rejection
0,-1,1895,3000,0.0,0.05,inf,False
1,0,1895,3000,0.0,0.05,inf,False
2,13,733,755,0.05,0.0,692.0,True


In [11]:
# Run data splitting
# Baseline on the same X, Y, learner, threshold and level as chiseling.
# train_ratio=0.2 mirrors the n_burn_in=0.2 used for the chiseling run.
# NOTE(review): n_min is 1 here but 2 in the chiseling run — confirm this
# difference is intentional.
datasplit = DataSplittingStrategy(X=X,
                                  Y=Y,
                                  train_ratio=0.2,
                                  learner=logregcv_learner,
                                  alpha=alpha,
                                  test_thresh=mu,
                                  binary=True,
                                  n_min=1,
                                  random_seed=random_seed)
datasplit.run_strategy()

In [12]:
# Data split rejection: whether the single-split baseline rejected at level alpha
datasplit.rejected

np.True_

In [13]:
# Chiseling region metrics: evaluate the region rejected by the protocol at
# threshold mu (compare with get_optimal_region_metrics(mu) above).
sampler.estimate_region_metrics(mu, protocol.rejected_region)

(np.float64(0.018631077207550906),
 np.float64(0.29968),
 np.float64(0.9621699052574443),
 np.float64(0.00010550513145491585),
 np.float64(0.00144869561192129),
 np.float64(0.00018336460904315872))

In [14]:
# Data splitting region metrics: same evaluation for the region selected by the
# single-split DataSplittingStrategy baseline.
sampler.estimate_region_metrics(mu, datasplit.region)

(np.float64(0.01105204541986031),
 np.float64(0.12259),
 np.float64(0.9901545429468988),
 np.float64(9.379430026336653e-05),
 np.float64(0.0010371195297553702),
 np.float64(6.045568293229058e-05))

In [15]:
# Run simultaneous data splitting
# Same data, learner, threshold, level and train_ratio as the single-split
# baseline; additionally receives pY1X, the probabilities computed by
# sampler.calculate_probs(X).
# NOTE(review): this rebinds `datasplit`, discarding the DataSplittingStrategy
# result from the earlier cell. Every `datasplit.*` access from here on refers
# to the simultaneous variant, so re-running earlier cells out of order will
# report the wrong method. Consider a distinct name (e.g. `simul_datasplit`).
datasplit = SimulDataSplittingStrategy(X=X,
                                       Y=Y,
                                       pY1X=pY1X,
                                       train_ratio=0.2,
                                       learner=logregcv_learner,
                                       alpha=alpha,
                                       test_thresh=mu,
                                       n_min=1,
                                       random_seed=random_seed)
datasplit.run_strategy()

In [16]:
# NOTE(review): `lcb` presumably holds the lower confidence bounds computed by
# the simultaneous data-splitting strategy (an array of 10 values in this run)
# — confirm what each entry corresponds to.
datasplit.lcb

array([0.98734177, 0.99029706, 0.98997144, 0.98949353, 0.98881728,
       0.99416261, 0.99422767, 0.99432462, 0.99448631, 0.9947453 ])

In [17]:
# Simul split rejection: whether the simultaneous data-splitting strategy rejected
# (`datasplit` is the SimulDataSplittingStrategy instance at this point)
datasplit.rejected

True

In [18]:
# Simul split region metrics: same evaluation for the region selected by the
# simultaneous data-splitting strategy.
sampler.estimate_region_metrics(mu, datasplit.region)

(np.float64(0.011201043879296692),
 np.float64(0.12434),
 np.float64(0.990083994525468),
 np.float64(9.428875580425302e-05),
 np.float64(0.0010434537095626235),
 np.float64(5.94574521599615e-05))