In [None]:
import sys
from pathlib import Path

# Add project root to sys.path
sys.path.append(str(Path.cwd().parent))

from methods.ipsw import estimate_sampling_scores, estimate_ate_ipsw
from methods.ipsw import IPSWEstimator
import pandas as pd 
import numpy as np 
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [18]:
def simulate_data(n=100000, seed=123):
    rng = np.random.default_rng(seed)

    # Covariates
    X1 = rng.normal(0, 1, size=n)
    X2 = rng.binomial(1, 0.5, size=n)

    # Sampling mechanism
    # Trial oversamples high X1 and high X2 individuals
    logits_S = -0.3 + 1.0 * X1 + 1.0 * X2
    pS = 1 / (1 + np.exp(-logits_S))
    S = rng.binomial(1, pS)

    # Randomized treatment in trial only
    T = np.zeros(n)
    mask_trial = S == 1
    T[mask_trial] = rng.binomial(1, 0.5, mask_trial.sum())

    # Heterogeneous treatment effect:
    # tau_i = 2 + 3 * X1
    tau_i = 2 + 3 * X1

    # Outcome model
    beta1, beta2 = 1.0, -1.0
    eps = rng.normal(0, 1, size=n)

    Y = tau_i * T + beta1 * X1 + beta2 * X2 + eps

    df = pd.DataFrame({
        "X1": X1,
        "X2": X2,
        "S": S,
        "T": T,
        "Y": Y,
        "tau_true": tau_i,
        "pS_true": pS
    })

    return df

# IPSW

In [19]:
df = simulate_data(n=8000, seed=42)
print(df.head())

# 1. Design matrices / vectors
X = df[["X1", "X2"]]          # covariates used in sampling model
s = df["S"].to_numpy()        # sample indicator (1 = trial, 0 = target)
a = df["T"].to_numpy()        # treatment
y = df["Y"].to_numpy()        # outcome

# True ATE in the target population (S = 0)
ATE_target_true = df.loc[df["S"] == 0, "tau_true"].mean()



         X1  X2  S    T         Y  tau_true   pS_true
0  0.304717   1  1  1.0  0.854248  2.914151  0.731985
1 -1.039984   1  0  0.0 -1.686244 -1.119952  0.415813
2  0.750451   0  1  1.0  5.555570  4.251354  0.610747
3  0.940565   0  1  1.0  5.099770  4.821694  0.654881
4 -1.951035   0  0  0.0 -0.290278 -3.853106  0.095260


In [34]:
# methods/bootstrap_utils.py

# 2. Estimate P(S = 1 | X)
ps_hat, model = estimate_sampling_scores(X, s)

# 3. Use IPSW to estimate ATE in the *target* population
# For generalizing to S=0, inverse-odds weights are standard:
result = estimate_ate_ipsw(
    y=y,
    a=a,
    s=s,
    ps=ps_hat,
    weight_type = "inverse_odds",   # key for generalizability to S=0
    stabilized=True,
    clip=(0.01, 50),              # optional, to tame extreme weights
)

AIPSW (logit, XGB) ATE: 0.656005755717767
SE: 0.19902598040204234
95% CI: (0.2456615817648171, 1.0459322244102058)


In [21]:
est_xgb = IPSWEstimator(
    model=XGBClassifier(
        max_depth=3,
        n_estimators=300,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
    ),
    weight_type="inverse_odds",   # generalizability to S = 0
    stabilized=True,
    clip=(0.01, 50),
)

est_xgb.fit(X, s)
res_xgb = est_xgb.estimate(y, a)

In [22]:
est_rf = IPSWEstimator(
    model=RandomForestClassifier(
        n_estimators=500,
        max_depth=6,
    ),
    weight_type="inverse_odds",  # generalizability to S = 0
    stabilized=True,
)

est_rf.fit(X, s)
res_rf = est_rf.estimate(y, a)


In [23]:
print("TRUE IPSW ATE (target):", ATE_target_true)
print("XGB IPSW ATE (target):", res_xgb.ate)
print("RF IPSW ATE (target):", res_rf.ate)
print("IPSW ATE estimate LR (target):", result.ate)

TRUE IPSW ATE (target): 0.6804878754636353
XGB IPSW ATE (target): 0.6301638712287438
RF IPSW ATE (target): 0.9923600517584062
IPSW ATE estimate LR (target): 0.656005755717767


# Augmented IPSW

In [24]:
ps, _ = estimate_sampling_scores(X, s)

result = estimate_ate_aipsw(
    y=y,
    a=a,
    s=s,
    X=X,
    ps=ps,                    # or let it estimate inside
    weight_type="inverse_odds",
    stabilized=True,
    clip=(0.1, 10.0),
    target="nontrial",
)

print("AIPSW ATE:", result.ate)

from sklearn.ensemble import RandomForestRegressor
from methods.aipsw import estimate_ate_aipsw

rf_outcome = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42,
)

result = estimate_ate_aipsw(
    y=y,
    a=a,
    s=s,
    X=X,
    sampling_model=None,              # maybe still logistic for sampling
    sampling_model_kwargs={},
    outcome_model=rf_outcome,         # <--- your flexible regression model
    outcome_model_kwargs={},          # ignored because model is provided
    weight_type="inverse_odds",
    stabilized=True,
    clip=(0.1, 10.0),
    target="nontrial",
)

print("AIPSW ATE:", result.ate)

from xgboost import XGBRegressor
from methods.aipsw import estimate_ate_aipsw

xgb_outcome = XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

result = estimate_ate_aipsw(
    y=y,
    a=a,
    s=s,
    X=X,
    sampling_model=None,          # still logistic for sampling by default
    sampling_model_kwargs={},
    outcome_model=xgb_outcome,    # <-- XGBoost outcome model
    outcome_model_kwargs={},      # ignored because you passed a model
    weight_type="inverse_odds",
    stabilized=True,
    clip=(0.1, 10.0),
    target="nontrial",
)

print("AIPSW ATE (XGB outcome):", result.ate)


AIPSW ATE: 0.6572131063034092
AIPSW ATE: 0.6363389715675143
AIPSW ATE (XGB outcome): 0.6413754454949849


# Stratification

In [25]:
from methods.stratification import estimate_ate_stratified

result = estimate_ate_stratified(
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
    X=df[["X1", "X2"]],
    n_strata=5,
    target="nontrial",
)

print("Stratified ATE:", result.ate)
print("Stratum-specific ATEs:", result.stratum_ates)


Stratified ATE: 0.9602307004504487
Stratum-specific ATEs: [-1.53538298  0.75035567  2.22220991  3.40606334  5.79762477]


In [26]:
from xgboost import XGBClassifier
from methods.stratification import StratifiedGeneralizabilityEstimator

xgb_sampler = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    use_label_encoder=False,
)

est = StratifiedGeneralizabilityEstimator(
    n_strata=5,
    target="nontrial",
    sampling_model=xgb_sampler,
)

# Note: capital S in the argument name
est.fit(X=df[["X1", "X2"]], S=df["S"].values)

# Use Y, T, S and capitalized argument names
result = est.estimate_ate(
    Y=df["Y"].values,
    T=df["T"].values,   # or df["A"].values if your column is still named A
    S=df["S"].values,
)

print("Stratified ATE:", result.ate)
print("Stratum weights (target pop):", result.stratum_weights)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Stratified ATE: 0.8299342430824019
Stratum weights (target pop): [0.35310269 0.24766612 0.19824272 0.1394838  0.06150467]


# G-formulation

In [27]:
from methods.outcomeM import (
    OutcomeModelGeneralizabilityEstimator,
    estimate_ate_outcome,
)

# One-shot function
result = estimate_ate_outcome(
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
    X=df[["X1", "X2"]],
    target="nontrial",
)

print("Outcome-model ATE:", result.ate)
print("μ1:", result.ate_treated, "μ0:", result.ate_control)


Outcome-model ATE: 0.6467929949439102
μ1: -0.16718033678216815 μ0: -0.8139733317260783


In [28]:
from xgboost import XGBRegressor
from methods.outcomeM import OutcomeModelGeneralizabilityEstimator

xgb_outcome = XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

est = OutcomeModelGeneralizabilityEstimator(
    target="nontrial",
    outcome_model=xgb_outcome,
)

est.fit(
    X=df[["X1", "X2"]],
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
)

result = est.estimate_ate()

print("Outcome-model ATE (XGB):", result.ate)
print("μ1:", result.ate_treated, "μ0:", result.ate_control)


Outcome-model ATE (XGB): 0.6484324038028717
μ1: -0.14614489674568176 μ0: -0.7945773005485535


# Calbriation Weighing

In [29]:
from methods.calibration import estimate_ate_calibration

result = estimate_ate_calibration(
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
    X=df[["X1", "X2"]],
    target="nontrial",
)

print("Calibration-weighted ATE:", result.ate)

Calibration-weighted ATE: 0.6628224815065443


# Augmented Calbriation Weighing

In [30]:
from methods.acalibration import estimate_ate_calibration_augmented

result = estimate_ate_calibration_augmented(
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
    X=df[["X1", "X2"]],
    target="nontrial",
)

print("Augmented Calibration ATE:", result.ate)


Augmented Calibration ATE: 0.6686458435544305


In [31]:
from xgboost import XGBRegressor
from methods.acalibration import AugmentedCalibrationGeneralizabilityEstimator

xgb_outcome = XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

est = AugmentedCalibrationGeneralizabilityEstimator(
    target="nontrial",
    outcome_model=xgb_outcome,
)

est.fit(
    X=df[["X1", "X2"]],
    Y=df["Y"].values,
    T=df["T"].values,
    S=df["S"].values,
)

result = est.estimate_ate()

print("Augmented Calibration ATE (XGB):", result.ate)


Augmented Calibration ATE (XGB): 0.6330386800608776
