In [1]:
# Reload imported project modules automatically so edits to them are picked up live
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import scipy.stats

import sys
sys.path.append("../../../")

from chiseling.dgps.basic_binary_regression import BasicBinaryRegression
from chiseling.source.learners.baselearners_binary import logreg_learner, make_logregcv_learner
from chiseling.source.strategies.simul_data_splitting_strategy import SimulDataSplittingStrategy
from chiseling.source.strategies.data_splitting_strategy import DataSplittingStrategy

In [3]:
# Hyperparameters

# Data-generating process: `n` rows are drawn per dataset; the remaining values are
# passed positionally to BasicBinaryRegression in exactly this order.
n = 500
d, s, rho, theta, tau = 10, 5, 0.2, 0, 0

# Threshold and level handed to the splitting strategies
test_thresh, alpha = 0.5, 0.05

# Seed given to the sampler and to every strategy instance
random_seed = 42

In [4]:
# Learner handed to every strategy in this notebook. The variable keeps its "cv" name but is
# currently bound to the plain `logreg_learner`; to use the factory-built learner instead,
# swap in: logregcv_learner = make_logregcv_learner()
logregcv_learner = logreg_learner

In [5]:
# Sample data
# The sampler is built once here and reused by the simulation loops further down.
sampler = BasicBinaryRegression(d, s, rho, theta, tau, random_seed=random_seed)
X, Y = sampler.sample(n)
# pY1X: presumably the true P(Y = 1 | X) under the DGP — TODO confirm against calculate_probs
pY1X = sampler.calculate_probs(X)

In [6]:
# Run simultaneous data splitting once on the dataset sampled above
simul_datasplit = SimulDataSplittingStrategy(
    X=X,
    Y=Y,
    pY1X=pY1X,
    train_ratio=0.2,  # presumably the share of rows used for training — TODO confirm
    learner=logregcv_learner,
    alpha=alpha,
    test_thresh=test_thresh,
    random_seed=random_seed,
)
simul_datasplit.run_strategy()

In [7]:
# Data split rejection: the strategy's `rejected` flag for this single dataset
simul_datasplit.rejected

False

In [8]:
# Region metrics for the region selected by the data-splitting strategy
# (this cell evaluates simul_datasplit.region, not a chiseling run)
sampler.estimate_region_metrics(test_thresh, simul_datasplit.region)

(0, 0, nan, 0, 0, nan)

In [9]:
# Inspect the test cutoffs stored on the strategy after run_strategy()
simul_datasplit.test_cutoffs

array([0.5       , 0.51496306, 0.52680911, 0.54220859, 0.55467855,
       0.57790781, 0.58900664])

In [10]:
# Inspect LCB (presumably one lower confidence bound per test cutoff — TODO confirm)
simul_datasplit.lcb

array([0.44908989, 0.47172674, 0.47629735, 0.47266549, 0.43695448,
       0.44022503, 0.46243744])

### Check coverage of simultaneous data splitting

In [11]:
# Number of simulated datasets drawn in the loop below
n_sims = 2000

In [12]:
# Repeat the single-run experiment on n_sims fresh datasets and record each rejection flag.
# NOTE(review): every replicate passes the same random_seed to the strategy, so any
# seed-driven randomness inside it (e.g. the train/test split) may repeat across
# replicates — confirm this is intended.
all_rejections = []
for _ in range(n_sims):
    # Fresh dataset from the shared sampler
    X, Y = sampler.sample(n)
    pY1X = sampler.calculate_probs(X)

    # Simultaneous data splitting, configured exactly as in the single run
    simul_datasplit = SimulDataSplittingStrategy(
        X=X,
        Y=Y,
        pY1X=pY1X,
        train_ratio=0.2,
        learner=logregcv_learner,
        alpha=alpha,
        test_thresh=test_thresh,
        random_seed=random_seed,
    )
    simul_datasplit.run_strategy()

    # Keep only the rejection flag of this replicate
    all_rejections.append(simul_datasplit.rejected)

# Convert to an array so the cells below can call .mean() / .std()
all_rejections = np.asarray(all_rejections)

In [13]:
# Rejection rate across replicates and its standard error (std / sqrt(n_sims))
all_rejections.mean(), all_rejections.std() / np.sqrt(n_sims)

(np.float64(0.0535), np.float64(0.005031786462082824))

In [14]:
# Lower end of the band: rejection rate minus 2 standard errors
all_rejections.mean() - 2 * (all_rejections.std() / np.sqrt(n_sims))

np.float64(0.04343642707583435)

### Check coverage of data splitting (binary, randomized)

In [15]:
# Number of simulated datasets for this check (re-set here; same value as the previous check)
n_sims = 2000

In [16]:
# Repeat plain data splitting (binary=True, randomize=True) on n_sims fresh datasets and
# record each rejection flag. The variable name `simul_datasplit` is reused from the
# earlier cells even though it now holds a DataSplittingStrategy.
# NOTE(review): every replicate passes the same random_seed to the strategy, so any
# seed-driven randomness inside it (the split, and possibly the randomization enabled
# by randomize=True) may repeat across replicates — confirm this is intended.
all_rejections = []
for _ in range(n_sims):
    # Fresh dataset from the shared sampler
    X, Y = sampler.sample(n)

    # Data splitting with the binary, randomized options switched on
    simul_datasplit = DataSplittingStrategy(
        X=X,
        Y=Y,
        train_ratio=0.2,
        learner=logregcv_learner,
        alpha=alpha,
        test_thresh=test_thresh,
        binary=True,
        randomize=True,
        random_seed=random_seed,
    )
    simul_datasplit.run_strategy()

    # Keep only the rejection flag of this replicate
    all_rejections.append(simul_datasplit.rejected)

# Convert to an array so the cells below can call .mean() / .std()
all_rejections = np.asarray(all_rejections)

In [17]:
# Rejection rate across replicates and its standard error (std / sqrt(n_sims))
all_rejections.mean(), all_rejections.std() / np.sqrt(n_sims)

(np.float64(0.042), np.float64(0.004485309353879618))

In [18]:
# Lower end of the band: rejection rate minus 2 standard errors
all_rejections.mean() - 2 * (all_rejections.std() / np.sqrt(n_sims))

np.float64(0.03302938129224077)

In [19]:
# Upper end of the band: rejection rate plus 2 standard errors
all_rejections.mean() + 2 * (all_rejections.std() / np.sqrt(n_sims))

np.float64(0.050970618707759235)