In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:


def gen_late_med(N, rng=None):
    """Simulate a linear model in which C is a late mediator between M and Y.

    Structural equations (X and every U term are independent standard normals):

        M = 0.5 * X + UM
        C = 0.5 * M + UC
        Y = 0.5 * M + 0.5 * C + UY

    The total effect of X on Y is 0.5 * ((0.5 * 0.5) + 0.5) = 0.375: the
    X -> M coefficient times the sum of the M -> C -> Y and M -> Y paths.

    Parameters
    ----------
    N : int
        Number of samples to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so existing callers (and ``np.random.seed``) behave as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, M, C, Y)``, each of shape ``(N, 1)``.
    """
    # ``np.random.standard_normal`` is what ``np.random.randn`` wraps, so the
    # default path consumes the global stream exactly like the old code did.
    source = np.random if rng is None else rng
    shape = (N, 1)

    # Draw order (X, UM, UY, UC) is part of the seeded behaviour; keep it.
    X = source.standard_normal(shape)
    UM = source.standard_normal(shape)
    UY = source.standard_normal(shape)
    UC = source.standard_normal(shape)

    M = 0.5 * X + UM
    C = 0.5 * M + UC
    Y = 0.5 * M + 0.5 * C + UY

    return X, M, C, Y

def gen_late_confounder(N, rng=None):
    """Simulate a linear model in which C confounds the M -> Y relationship.

    Structural equations (X, C and every U term are independent standard
    normals):

        M = 0.5 * X + 0.5 * C + UM
        Y = 0.5 * M + 0.5 * C + UY

    The total effect of X on Y is 0.5 ** 2 = 0.25 (the X -> M coefficient
    times the M -> Y coefficient); C is drawn independently of X.

    Parameters
    ----------
    N : int
        Number of samples to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so existing callers (and ``np.random.seed``) behave as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, M, C, Y)``, each of shape ``(N, 1)``.
    """
    # ``np.random.standard_normal`` is what ``np.random.randn`` wraps, so the
    # default path consumes the global stream exactly like the old code did.
    source = np.random if rng is None else rng
    shape = (N, 1)

    # Draw order (X, UM, UY, C) is part of the seeded behaviour; keep it.
    X = source.standard_normal(shape)
    UM = source.standard_normal(shape)
    UY = source.standard_normal(shape)
    C = source.standard_normal(shape)

    M = 0.5 * X + 0.5 * C + UM
    Y = 0.5 * M + 0.5 * C + UY

    return X, M, C, Y

In [3]:

# LATE MEDIATION
# Monte Carlo experiment: in every replication draw a fresh sample from
# gen_late_med, regress Y on X alone and on [X, C], and keep the fitted
# coefficient on X from each regression.
np.random.seed(0)  # fixed seed so a fresh-kernel re-run reproduces the numbers

N_REPLICATIONS = 5000  # number of simulated datasets
N = 200000             # samples per dataset (loop-invariant, so set once)

results_late_med_no_c = []
results_late_med_c = []

for _ in tqdm(range(N_REPLICATIONS)):

    X, M, C, Y = gen_late_med(N)

    preds_nocontrol = X
    preds_control_C = np.concatenate([X, C], 1)

    # Y is 2-D, so coef_ has shape (n_targets, n_features) = (1, 1) here.
    model = LinearRegression()
    model.fit(preds_nocontrol, Y)
    results_late_med_no_c.append(model.coef_)

    # With [X, C] as predictors coef_ has shape (1, 2); keep only the X
    # column, as a (1, 1) array so both result lists stack the same way.
    model = LinearRegression()
    model.fit(preds_control_C, Y)
    results_late_med_c.append(model.coef_[:, :1])

# Stack the per-replication (1, 1) coefficients into (N_REPLICATIONS, 1).
results_late_med_no_c = np.concatenate(results_late_med_no_c)
results_late_med_c = np.concatenate(results_late_med_c)

100%|███████████████████████████████████████| 5000/5000 [02:14<00:00, 37.12it/s]


In [4]:
# Average the per-replication X coefficients before reporting them.
late_med_mean_without_c = results_late_med_no_c.mean()
late_med_mean_with_c = results_late_med_c.mean()

print('RESULTS FOR LATE MEDIATION MODEL.... \n')

# gen_late_med documents the true total effect of X on Y as 0.375.
print('Not including variable C: ', late_med_mean_without_c)
print('Including variable C: ', late_med_mean_with_c)

RESULTS FOR LATE MEDIATION MODEL.... 

Not including variable C:  0.3750047125453725
Including variable C:  0.19998935775081927


In [None]:

# LATE BACKDOOR
# Monte Carlo experiment: in every replication draw a fresh sample from
# gen_late_confounder, regress Y on X alone and on [X, C], and keep the
# fitted coefficient on X from each regression.
np.random.seed(1)  # fixed seed so a fresh-kernel re-run reproduces the numbers

N_REPLICATIONS = 5000  # number of simulated datasets
N = 200000             # samples per dataset (loop-invariant, so set once)

results_late_conf_no_c = []
results_late_conf_c = []

for _ in tqdm(range(N_REPLICATIONS)):

    X, M, C, Y = gen_late_confounder(N)

    preds_nocontrol = X
    preds_control_C = np.concatenate([X, C], 1)

    # Y is 2-D, so coef_ has shape (n_targets, n_features) = (1, 1) here.
    model = LinearRegression()
    model.fit(preds_nocontrol, Y)
    results_late_conf_no_c.append(model.coef_)

    # With [X, C] as predictors coef_ has shape (1, 2); keep only the X
    # column, as a (1, 1) array so both result lists stack the same way.
    model = LinearRegression()
    model.fit(preds_control_C, Y)
    results_late_conf_c.append(model.coef_[:, :1])

# Stack the per-replication (1, 1) coefficients into (N_REPLICATIONS, 1).
results_late_conf_no_c = np.concatenate(results_late_conf_no_c)
results_late_conf_c = np.concatenate(results_late_conf_c)

 48%|██████████████████▊                    | 2408/5000 [01:03<01:08, 38.04it/s]

In [None]:
# Average the per-replication X coefficients before reporting them.
late_conf_mean_without_c = results_late_conf_no_c.mean()
late_conf_mean_with_c = results_late_conf_c.mean()

print('RESULTS FOR LATE CONFOUNDER MODEL.... \n')

# gen_late_confounder documents the true total effect of X on Y as 0.25.
print('Not including variable C: ', late_conf_mean_without_c)
print('Including variable C: ', late_conf_mean_with_c)