In [1]:
# python version of https://migariane.github.io/TMLE.nb.html
import numpy as np
import pandas as pd
import statsmodels.api as sm

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def logit(p):
    return np.log(p) - np.log(1 - p)

def generate_data(n):
    np.random.seed(2)
    w1 = np.random.binomial(1, 0.5, n)        
    w2 = np.random.binomial(1, 0.65, n)    
    w3 = np.round(np.random.uniform(0, 4, n), 3)
    w4 = np.round(np.random.uniform(0, 5, n), 3)
    p_A = sigmoid(-0.4 + 0.2*w2 + 0.15*w3 + 0.2*w4 +0.15*w2*w4)
    
    A = np.random.binomial(1, p_A, n)
    
    p_y1 = sigmoid(-1 + 1 -0.1*w1 + 0.3*w2 + 0.25*w3+ 0.2*w4 + 0.15*w2*w4)
    p_y0 = sigmoid(-1 + 0 -0.1*w1 + 0.3*w2 + 0.25*w3 + 0.2*w4 + 0.15*w2*w4)
    Y1 = np.random.binomial(1, p_y1, n)
    Y0 = np.random.binomial(1, p_y0, n)
    
    Y = Y1 * A + Y0*(1-A)
    
    cols = ['w1', 'w2', 'w3', 'w4', 'A','Y', 'Y1', 'Y0']
    df = pd.DataFrame([w1, w2, w3, w4, A, Y, Y1, Y0]).T
    df.columns = cols
    return df
    

In [2]:
df = generate_data(30000)

true_psi = (df.Y1 - df.Y0).mean()
print('True Psi:', true_psi)
X_cols = ['w1', 'w2', 'w3', 'w4', 'A']
y_cols = ['Y']
print(df.head())

True Psi: 0.20123333333333332
    w1   w2     w3     w4    A    Y   Y1   Y0
0  0.0  0.0  3.154  3.421  1.0  1.0  1.0  0.0
1  0.0  0.0  3.020  4.957  1.0  1.0  1.0  0.0
2  1.0  1.0  1.738  0.657  1.0  1.0  1.0  0.0
3  0.0  0.0  2.264  2.357  0.0  0.0  0.0  0.0
4  0.0  0.0  2.763  0.263  0.0  0.0  0.0  0.0


In [3]:
mod = sm.OLS(df[y_cols], df[X_cols])
res = mod.fit()
res.summary()
biased_psi = res.params.A
print('Biased estimate of Psi:', biased_psi)

print('Amount of naive bias:', np.abs(biased_psi - true_psi))

naive_relative_bias = ((biased_psi - true_psi) / true_psi)*100
print('Relative naive bias:', naive_relative_bias, '%')

Biased estimate of Psi: 0.2606985126971462
Amount of naive bias: 0.0594651793638129
Relative naive bias: 29.550362446817747 %


In [4]:
# Intervene on A and add new columns to dataset
df['A0'] = 0
df['A1'] = 1

# Find Q0 by predicting the outcome Y using all covariates and treatment vars
log_reg = sm.GLM(df[y_cols], df[X_cols], family=sm.families.Binomial()).fit()
QAW = logit(log_reg.predict( df[X_cols]))
df['QAW'] = QAW
# Find QA1 and QA2
X_cols = ['w1', 'w2', 'w3', 'w4', 'A1']
Q1W = logit(log_reg.predict( df[X_cols]))
X_cols = ['w1', 'w2', 'w3', 'w4', 'A0']
Q0W = logit(log_reg.predict( df[X_cols]))

df['Q0W'] = Q0W
df['Q1W'] = Q1W

df['pY1'] = sigmoid(Q1W)
df['pY0'] = sigmoid(Q0W)

psi = (df['pY1'] - df['pY0'])
df['psi'] = psi
print('new psi estiamte:', psi.mean())
df.head()

new psi estiamte: 0.16504501359096158


Unnamed: 0,w1,w2,w3,w4,A,Y,Y1,Y0,A0,A1,QAW,Q0W,Q1W,pY1,pY0,psi
0,0.0,0.0,3.154,3.421,1.0,1.0,1.0,0.0,0,1,1.535674,0.70799,1.535674,0.822835,0.669957,0.152878
1,0.0,0.0,3.02,4.957,1.0,1.0,1.0,0.0,0,1,1.757634,0.92995,1.757634,0.852913,0.717065,0.135848
2,1.0,1.0,1.738,0.657,1.0,1.0,1.0,0.0,0,1,0.981649,0.153964,0.981649,0.727435,0.538415,0.18902
3,0.0,0.0,2.264,2.357,0.0,0.0,0.0,0.0,0,1,0.493419,0.493419,1.321104,0.789365,0.620912,0.168454
4,0.0,0.0,2.763,0.263,0.0,0.0,0.0,0.0,0,1,0.210399,0.210399,1.038083,0.73848,0.552406,0.186074


In [5]:
# Find g by predicting the treatment A using covariates 
X_cols = ['w1', 'w2', 'w3', 'w4']
y_cols = ['A']
log_reg = sm.GLM(df[y_cols], df[X_cols], family=sm.families.Binomial()).fit()
g = logit(log_reg.predict( df[X_cols]))
df['g'] = g
df['pA1'] = sigmoid(g)

In [6]:
# Find clever covariates
df['Hgaw'] = df['A']/df['pA1'] - (1-df['A'])/(1-df['pA1'])
df['Hg1w'] = (1/df['pA1'])
df['Hg0w'] = -1/(1-df['pA1'])


In [7]:
# Find epsilon
y_cols = ['Y']
X_cols = ['Hgaw']
log_reg = sm.GLM(df[y_cols], df[X_cols], family=sm.families.Binomial(), offset=df['QAW']).fit()
epsilon = log_reg.params.Hgaw
df['epsilon'] = epsilon
df.head()

Unnamed: 0,w1,w2,w3,w4,A,Y,Y1,Y0,A0,A1,...,Q1W,pY1,pY0,psi,g,pA1,Hgaw,Hg1w,Hg0w,epsilon
0,0.0,0.0,3.154,3.421,1.0,1.0,1.0,0.0,0,1,...,1.535674,0.822835,0.669957,0.152878,0.888269,0.708533,1.411367,1.411367,-3.430918,0.050837
1,0.0,0.0,3.02,4.957,1.0,1.0,1.0,0.0,0,1,...,1.757634,0.852913,0.717065,0.135848,1.213697,0.770952,1.297097,1.297097,-4.365905,0.050837
2,1.0,1.0,1.738,0.657,1.0,1.0,1.0,0.0,0,1,...,0.981649,0.727435,0.538415,0.18902,0.482803,0.61841,1.617051,1.617051,-2.62061,0.050837
3,0.0,0.0,2.264,2.357,0.0,0.0,0.0,0.0,0,1,...,1.321104,0.789365,0.620912,0.168454,0.616306,0.649378,-2.852074,1.539935,-2.852074,0.050837
4,0.0,0.0,2.763,0.263,0.0,0.0,0.0,0.0,0,1,...,1.038083,0.73848,0.552406,0.186074,0.187634,0.546771,-2.206392,1.828918,-2.206392,0.050837


In [8]:
# Update original estimator
Qstar1 = sigmoid(df['Q1W'] + df['epsilon']*df['Hg1w'])
Qstar0 = sigmoid(df['Q0W'] + df['epsilon']*df['Hg0w'])
df['Qstar1'] = Qstar1
df['Qstar2'] = Qstar0
df['debiased_psi'] = Qstar1 - Qstar0
TMLE_Psi = df['debiased_psi'].mean()
print('Debiased Psi:', TMLE_Psi)
print('TMLE bias:', np.abs(true_psi - TMLE_Psi))
naive_relative_TMLE_bias = ((TMLE_Psi - true_psi) / true_psi)*100
print('Relative TMLE bias:', naive_relative_TMLE_bias, '%')
print('This is a reduction in bias of :', naive_relative_bias-naive_relative_TMLE_bias, '%')

Debiased Psi: 0.21733312733626242
TMLE bias: 0.016099794002929102
Relative TMLE bias: 8.000560213481416 %
This is a reduction in bias of : 21.54980223333633 %
