# Frisch–Waugh–Lovell Theorem

Causal inference via residual-on-residual regression (partialling out), the idea underlying the Double ML (double/debiased machine learning) approach.

In [69]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Load the Boston housing dataset through scikit-learn's loader.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn release.
data = load_boston()

# Target values exactly as returned by the loader; features wrapped in a
# DataFrame so that columns can be selected by name in later cells.
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [70]:
# Notation used in this cell:
#   y = dependent variable (outcome)
#   d = treatment variables
#   X = confounding variables
# A confounder (also confounding variable, confounding factor, or lurking
# variable) is a variable that influences both the dependent variable and the
# independent variable, causing a spurious association.

# Split off the treatment variables (the ones whose causal effect we want to
# test); every column left in X is treated as a confounder.
# Keep the number of treatment variables small.
d1, d2 = X['RM'], X['LSTAT']
X = X.drop(columns=['RM', 'LSTAT'])

# Outcome residuals ("outcome surprise"): the part of y that the confounders
# do not predict. LinearRegression is only an example; this should be the
# best predictive model available.
# NOTE(review): predictions here are in-sample; with a more flexible learner
# consider out-of-fold predictions so the residuals are not overfit.
outcome_model = LinearRegression()
outcome_model.fit(X, y)
y_hat = outcome_model.predict(X)
y_residual = y - y_hat

# Treatment residuals ("treatment surprise"): the part of each treatment that
# the confounders do not predict. Same modelling caveat as for the outcome.
rm_model = LinearRegression()
rm_model.fit(X, d1)
d1_hat = rm_model.predict(X)
d1_residual = d1 - d1_hat

lstat_model = LinearRegression()
lstat_model.fit(X, d2)
d2_hat = lstat_model.predict(X)
d2_residual = d2 - d2_hat

# Estimate the causal effect by regressing the outcome residuals on the
# treatment residuals; the fitted linear weights are the causal estimates.
# NOTE: this OLS only sees the treatment columns, so the degrees of freedom
# (and therefore the standard errors) it reports do not account for the
# confounders that were partialled out above.
d_residual = pd.concat([d1_residual, d2_residual], axis='columns')
causal_model = sm.OLS(y_residual, d_residual).fit()

print(causal_model.summary())
print()

# Causal estimate +/- standard error, one line per treatment.
for feature_name, coef in causal_model.params.items():
    sem = causal_model.bse[feature_name]
    print(f'{feature_name}:\t{coef:0.4f} +/- {sem:0.4f}')
# RM has a higher causal effect than LSTAT
# Note: always report the error alongside the causal estimate (regression coefficient)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.453
Model:                            OLS   Adj. R-squared (uncentered):              0.451
Method:                 Least Squares   F-statistic:                              208.9
Date:                Mon, 29 Jul 2019   Prob (F-statistic):                    8.61e-67
Time:                        22:58:37   Log-Likelihood:                         -1498.8
No. Observations:                 506   AIC:                                      3002.
Df Residuals:                     504   BIC:                                      3010.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------