# Standard errors

This is a notebook demonstrating how to obtain standard errors for your generated impact estimates.

In [1]:
%load_ext autoreload
%autoreload 2
import os, sys
import warnings
warnings.filterwarnings('ignore') # suppress sklearn deprecation warnings for now..

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Fallback for running dowhy, causaltune, and FLAML from source: for every
# package that cannot be imported, the matching directory under `root_path`
# (two levels above the current working directory) is appended to `sys.path`
# so that later imports can find it.
root_path = os.path.realpath('../..')
try:
    import causaltune
except ModuleNotFoundError:
    sys.path.append(os.path.join(root_path, "causaltune"))

try:
    import dowhy
except ModuleNotFoundError:
    sys.path.append(os.path.join(root_path, "dowhy"))

try:
    import flaml
except ModuleNotFoundError:
    # note the directory name is upper-case while the module name is lower-case
    sys.path.append(os.path.join(root_path, "FLAML"))


In [2]:
# this makes the notebook expand to full width of the browser window
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
# NOTE(review): the `.container` selector looks specific to one notebook
# frontend's page layout; confirm it has an effect in the frontend you use.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
%%javascript

// turn off scrollable windows for large output
// The patch is applied only when the frontend exposes a global
// `IPython.OutputArea`; otherwise the cell is a no-op instead of failing
// with a ReferenceError.
if (typeof IPython !== 'undefined' && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        // never ask for a scrollable output area, whatever the line count
        return false;
    };
}

<IPython.core.display.Javascript object>

In [4]:
from causaltune import CausalTune
from causaltune.datasets import synth_ihdp

## Loading data

In [5]:
# load the toy dataset (`synth_ihdp` from `causaltune.datasets`)
cd = synth_ihdp()
# apply standard pre-processing; the return value is discarded, so this
# presumably updates `cd` in place (TODO confirm)
cd.preprocess_dataset()

In [6]:
# inspect the preprocessed dataset: show the first rows of `cd.data`
display(cd.data.head())

Unnamed: 0,treatment,y_factual,random,x1,x2,x3,x4,x5,x6,x7,...,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25
0,1,5.599916,1.0,-0.528603,-0.343455,1.128554,0.161703,-0.316603,1.295216,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,6.875856,1.0,-1.736945,-1.802002,0.383828,2.244319,-0.629189,1.295216,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2.996273,1.0,-0.807451,-0.202946,-0.360898,-0.879606,0.808706,-0.526556,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1.366206,0.0,0.390083,0.596582,-1.85035,-0.879606,-0.004017,-0.857787,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1.963538,1.0,-1.045228,-0.60271,0.011465,0.161703,0.683672,-0.36094,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model training and standard errors

In [7]:
# --- training configs ---

# evaluation metric
metric = "energy_distance"

# training set size
train_size = 0.7

# Time limits, in seconds. It's best to specify only one of `time_budget`
# and `components_time_budget` and let the other one be inferred.
components_time_budget = 10
time_budget = None


Note that in the example below, we are passing `'cheap_inference'` to `estimator_list`. This configuration will restrict the selection of estimators to the ones that have analytical standard errors.

In [8]:
# set up CausalTune with the training configs defined earlier;
# the estimator search is limited to the 'cheap_inference' group
ct = CausalTune(
    metric=metric,
    estimator_list='cheap_inference',
    train_size=train_size,
    time_budget=time_budget,
    components_time_budget=components_time_budget,
    verbose=0,
    components_verbose=0,
)

# run causaltune, using the first entry of `cd.outcomes` as the outcome
ct.fit(data=cd, outcome=cd.outcomes[0])

# report the result of the search
print('---------------------')
print(f"Best estimator: {ct.best_estimator}")  # best estimator found
print(f"Best config: {ct.best_config}")  # config of the best estimator
print(f"Best score: {ct.best_score}")  # score of the best estimator

Fitting a Propensity-Weighted scoring estimator to be used in scoring tasks
Initial configs: [{'estimator': {'estimator_name': 'backdoor.econml.dr.ForestDRLearner', 'min_propensity': 1e-06, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto', 'min_impurity_decrease': 0.0, 'max_samples': 0.45, 'min_balancedness_tol': 0.45, 'honest': True, 'subforest_size': 4}}, {'estimator': {'estimator_name': 'backdoor.econml.dr.LinearDRLearner', 'fit_cate_intercept': True, 'min_propensity': 1e-06}}, {'estimator': {'estimator_name': 'backdoor.econml.dr.SparseLinearDRLearner', 'fit_cate_intercept': True, 'n_alphas': 100, 'n_alphas_cov': 10, 'min_propensity': 1e-06, 'tol': 0.0001, 'max_iter': 10000, 'mc_agg': 'mean'}}, {'estimator': {'estimator_name': 'backdoor.econml.dml.LinearDML', 'fit_cate_intercept': True, 'mc_agg': 'mean'}}, {'estimator': {'estimator_name': 'backdoor.econml.dml.SparseLinearDML', 'fit_cate_intercept': True, 'n_a

In [9]:
# effect estimates for the rows in `ct.test_df`
test_df = ct.test_df
cates = ct.effect(test_df)

# show the first five estimates
display(cates[:5])

array([[3.08417039],
       [4.10807041],
       [4.32885751],
       [4.53901377],
       [4.19668172]])

Below we show how to generate standard errors using `CausalTune.effect_stderr()`. By default, this will use the `best_estimator` identified during training.

If this estimator does not have analytical standard errors, it will instead be refitted `n_bootstrap_samples` times on the training data.

In [10]:
# generating standard errors for the effect estimates on `ct.test_df`
# NOTE(review): the earlier wording "by refitting train_df" looks inaccurate here,
# since the estimator list was restricted to 'cheap_inference' (estimators
# described as having analytical standard errors) - confirm no refit happens
se = ct.effect_stderr(ct.test_df)
# show the first five standard errors
display(se[:5,])

array([[0.28758771],
       [0.2267228 ],
       [0.29267037],
       [0.22686985],
       [0.28054057]])

Beyond standard errors, `CausalTune.effect_inference()` provides further inference results for the effect. For each sample `X[i]`, its summary frame reports the point estimate, standard error, z-statistic, p-value, and confidence-interval bounds.

In [11]:
# take the first element returned by `effect_inference` for the test rows ...
inference = ct.effect_inference(test_df)[0]
# ... and show the head of its summary table
inference.summary_frame(alpha=0.1, value=0, decimals=3).head()

Unnamed: 0_level_0,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.084,0.288,10.724,0.0,2.611,3.557
1,4.108,0.227,18.119,0.0,3.735,4.481
2,4.329,0.293,14.791,0.0,3.847,4.81
3,4.539,0.227,20.007,0.0,4.166,4.912
4,4.197,0.281,14.959,0.0,3.735,4.658
