In [5]:
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd

In [6]:
N = 100000
np.random.seed(140125)

# Define the coefficients of the causal graph
x_ab = 1
x_bc = 2
x_cf = 1
x_dc = -1
x_ea = 3
x_ec = 2

# Generate additive noise
n_a = np.random.randn(N)
n_b = np.random.randn(N)
n_c = np.random.randn(N)
n_d = np.random.randn(N)
n_e = np.random.randn(N)
n_f = np.random.randn(N)

# Create the variables
D = n_d
E = n_e
A = x_ea * E + n_a
B = x_ab * A + n_b
C = x_bc * B + x_dc * D + x_ec * E + n_c
F = x_cf * C + n_f

# Collect in a dataframe
data = pd.DataFrame({'A': A, 'B': B, 'C': C, 'D': D, 'E': E, 'F': F})

In [7]:
# c: estimate the causal effect of A on C through linear regression C ~ A + S, with the set S you found in b.
results = smf.ols('C ~ A + E - 1', data=data).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      C   R-squared (uncentered):                   0.920
Model:                            OLS   Adj. R-squared (uncentered):              0.920
Method:                 Least Squares   F-statistic:                          5.711e+05
Date:                Tue, 14 Jan 2025   Prob (F-statistic):                        0.00
Time:                        14:49:50   Log-Likelihood:                     -2.3162e+05
No. Observations:              100000   AIC:                                  4.632e+05
Df Residuals:                   99998   BIC:                                  4.633e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
# e: if possible, estimate the causal effect of A on C using the frontdoor criterion with a suitable set M.
# ...

results1 = smf.ols('C ~ A + E - 1', data=data).fit()
print(results1.summary())

                                 OLS Regression Results                                
Dep. Variable:                      C   R-squared (uncentered):                   0.920
Model:                            OLS   Adj. R-squared (uncentered):              0.920
Method:                 Least Squares   F-statistic:                          5.711e+05
Date:                Tue, 14 Jan 2025   Prob (F-statistic):                        0.00
Time:                        14:56:30   Log-Likelihood:                     -2.3162e+05
No. Observations:              100000   AIC:                                  4.632e+05
Df Residuals:                   99998   BIC:                                  4.633e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------