In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [2]:
def create_data(N):
    np.random.seed(123)
    ps = 0.5
    d = pd.DataFrame({
      'T': np.random.binomial(1, ps, size = N),
      'X1': np.random.normal(size = N),
      'X2': np.random.normal(size = N),
      'X3': np.random.normal(size = N),
      'X4': np.random.normal(size = N)})
    d['Y'] = 0.36*d['T'] + 0.1*d['X1'] + 0.2*d['X2'] + 0.3*d['X3']+0.4*d['X4'] + np.random.normal(size = N)
    d["ps"] = ps
    return d

df = create_data(1000)

df.head()

Unnamed: 0,T,X1,X2,X3,X4,Y,ps
0,1,-0.010033,1.323274,1.432953,1.13801,2.039036,0.5
1,0,-0.845644,-0.003155,0.792809,2.540514,1.351514,0.5
2,0,0.911461,-0.43066,-2.030271,-0.082706,-1.043401,0.5
3,1,-1.374497,-0.146416,-1.880218,0.444621,0.906816,0.5
4,1,-0.547066,1.160176,-1.46947,1.896404,2.519107,0.5


In [3]:
smf.ols('Y ~ T', data = df).fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0182,0.051,0.359,0.719,-0.081,0.118
T,0.4097,0.072,5.663,0.000,0.268,0.552


# Variance of the OLS Estimator

In [4]:
smf.ols('Y ~ T + X1 + X2 + X3 + X4', data = df).fit().summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0345,0.044,-0.781,0.435,-0.121,0.052
T,0.4266,0.063,6.784,0.000,0.303,0.550
X1,0.1243,0.033,3.797,0.000,0.060,0.189
X2,0.1471,0.032,4.557,0.000,0.084,0.210
X3,0.2872,0.032,8.929,0.000,0.224,0.350
X4,0.4663,0.031,14.813,0.000,0.405,0.528


# Variance of the IPTW Estimator

In [5]:

def iptw_estimator(df):
    w = df['T']*(1/df['ps']) + (1-df['T'])*(1/(1-df['ps']))
    return  np.mean(w * df["Y"])

iptw_estimator(df)

0.4396176216632461

In [6]:
bs_runs = [iptw_estimator(df.sample(frac=1, replace=True)) for _ in range(1000)]

In [7]:
print(np.mean(bs_runs))
print(np.std(bs_runs))
print(np.quantile(bs_runs, q=[0.025, 0.975]))

0.43612833914650967
0.07391360132262122
[0.28754911 0.58549411]


# Variance of the Estimated IPTW Estimator

In [8]:
from sklearn.linear_model import LogisticRegression

def est_iptw_estimator(df):
    est_ps = LogisticRegression(C=1e6, max_iter=1000).fit(df[["X1", "X2", "X3", "X4"]], df["T"]).predict_proba(df[["X1", "X2", "X3", "X4"]])[:, 1]
    
    w = df['T']*(1/est_ps) + (1-df['T'])*(1/(1-est_ps))
    return  np.mean(w * df["Y"])

est_iptw_estimator(df)

0.45000165228866035

In [9]:
bs_runs = [est_iptw_estimator(df.sample(frac=1, replace=True)) for _ in range(1000)]

In [10]:
print(np.mean(bs_runs))
print(np.std(bs_runs))
print(np.quantile(bs_runs, q=[0.025, 0.975]))

0.4484444353779022
0.06989721458317427
[0.3097113  0.57723598]
