# DoubleML, Estimating Causal Effects

## Bonus data

In [1]:
import pandas as pd
import numpy as np
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR

# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(3141)

# Bonus data as returned by doubleml's fetch_bonus, requested as a DataFrame.
Xy = fetch_bonus(return_type='DataFrame')

# Column roles handed to DoubleMLData below: outcome, treatment, covariates.
y_col, d_cols = 'inuidur1', 'tg'
x_cols = [
    'female',
    'black',
    'othrace',
    'dep1',
    'dep2',
    'q2',
    'q3',
    'q4',
    'q5',
    'q6',
    'agelt35',
    'agegt54',
    'durable',
    'lusd',
    'husd',
]

dml_data = DoubleMLData(Xy, y_col=y_col, d_cols=d_cols, x_cols=x_cols)

# One random-forest template; ml_l and ml_m each receive their own unfitted
# clone so the two learners share settings but no state.
learner = RandomForestRegressor(
    n_estimators=500,
    max_features='sqrt',
    max_depth=5,
)
ml_l, ml_m = clone(learner), clone(learner)

dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
# Last expression of the cell: the notebook renders the summary table.
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
tg,-0.076684,0.035411,-2.165549,0.030346,-0.146087,-0.00728


## Net financial assets data

In [2]:
from doubleml.datasets import fetch_401K
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(3141)

# 401(k) data as returned by doubleml's fetch_401K, requested as a DataFrame.
Xy = fetch_401K(return_type='DataFrame')

# Column roles handed to DoubleMLData below: outcome, treatment, covariates.
y_col, d_cols = 'net_tfa', 'e401'
x_cols = [
    'age',
    'inc',
    'educ',
    'fsize',
    'marr',
    'twoearn',
    'db',
    'pira',
    'hown',
]

dml_data = DoubleMLData(Xy, y_col=y_col, d_cols=d_cols, x_cols=x_cols)

# ml_l is a regressor while ml_m is a classifier, each with its own
# forest settings.
ml_l = RandomForestRegressor(
    n_estimators=500,
    max_depth=7,
    max_features=3,
    min_samples_leaf=3,
)
ml_m = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    max_features=4,
    min_samples_leaf=7,
)

dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
# Last expression of the cell: the notebook renders the summary table.
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
e401,9059.917779,1306.284784,6.935638,4.043932e-12,6499.646648,11620.188909


## Simulated data

In [3]:
# Seed NumPy's global RNG; the three draws below must stay in this order to
# reproduce the same data.
np.random.seed(3141)
n_obs, n_vars, theta = 500, 100, 3

X = np.random.normal(size=(n_obs, n_vars))
# Only the first three columns of X enter d and y, each with weight 5;
# y additionally contains theta * d.
d = X[:, :3].dot(np.array([5, 5, 5])) + np.random.standard_normal(size=(n_obs,))
y = (
    theta * d
    + X[:, :3].dot(np.array([5, 5, 5]))
    + np.random.standard_normal(size=(n_obs,))
)

dml_data = DoubleMLData.from_arrays(X, y, d)

# Both learners are independent, unfitted clones of one LassoCV template.
learner = LassoCV()
ml_l, ml_m = clone(learner), clone(learner)

dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
# Last expression of the cell: the notebook renders the summary table.
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
d,3.051458,0.045425,67.175124,0.0,2.962426,3.14049


## Admissions data

In [4]:
# Read the admissions CSV from GitHub and discard two columns that are not
# used afterwards. The trailing space in 'Chance of Admit ' is part of the
# label passed to drop().
url = 'https://raw.githubusercontent.com/selva86/datasets/master/Admission.csv'
Xy = pd.read_csv(url).drop(columns=['Chance of Admit ', 'Serial No.'])

In [5]:
from sklearn.base import BaseEstimator

class MyEstimator(BaseEstimator):
    """Random-forest classifier exposed through a regressor-style interface.

    ``predict`` returns column 1 of the wrapped classifier's
    ``predict_proba`` output instead of class labels, and the instance
    labels itself as a regressor through ``_estimator_type``.
    """

    def __init__(self):
        # The wrapped model; its hyper-parameters are fixed here, so the
        # wrapper itself takes no constructor arguments.
        self.m = RandomForestClassifier(n_estimators=50, random_state=37, n_jobs=-1)
        # NOTE(review): newer scikit-learn releases may derive the estimator
        # type from ``__sklearn_tags__`` rather than this attribute -- confirm
        # against the installed version.
        self._estimator_type = 'regressor'

    def fit(self, X, y):
        """Fit the wrapped classifier on ``(X, y)`` and return ``self``."""
        self.m.fit(X, y)
        # scikit-learn's estimator convention is for fit() to return the
        # estimator itself so that calls can be chained.
        return self

    def predict(self, X):
        """Return column 1 of the wrapped classifier's ``predict_proba(X)``.

        Presumably the probability of the positive class for a 0/1 target;
        verify against ``self.m.classes_``.
        """
        return self.m.predict_proba(X)[:, 1]

    def score(self, X, y):
        """Return the wrapped classifier's own ``score(X, y)``."""
        return self.m.score(X, y)

    @property
    def feature_importances_(self):
        """Feature importances of the wrapped, fitted classifier."""
        return self.m.feature_importances_
    
def get_model(d_col):
    """Fit and return a ``DoubleMLPLR`` model with ``d_col`` as treatment.

    Uses the module-level ``Xy`` frame with ``'Research'`` as the outcome;
    every column other than ``'Research'`` and ``d_col`` is passed as a
    covariate. NumPy's global RNG is re-seeded on every call.
    """
    np.random.seed(3141)

    covariates = list(Xy.drop(columns=['Research', d_col]).columns)
    data = DoubleMLData(Xy, y_col='Research', d_cols=d_col, x_cols=covariates)

    # ml_l is the probability-returning wrapper, ml_m a plain forest regressor.
    plr = DoubleMLPLR(
        data,
        MyEstimator(),
        RandomForestRegressor(n_estimators=50, random_state=37, n_jobs=-1),
    )
    plr.fit(store_models=True)
    return plr

def get_summary(d_col):
    """Return the ``summary`` of the model that ``get_model(d_col)`` fits."""
    return get_model(d_col).summary

In [6]:
# Treat every column except 'Research' as the treatment in turn and stack
# the resulting summaries into one table.
pd.concat(list(map(get_summary, Xy.drop(columns=['Research']).columns)))

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
GRE Score,0.0191,0.003413,5.596341,2.189232e-08,0.012411,0.025789
TOEFL Score,-0.00594,0.007867,-0.755013,0.4502409,-0.021358,0.009479
University Rating,-0.006588,0.026446,-0.24911,0.8032754,-0.058421,0.045245
SOP,-0.005823,0.034981,-0.16645,0.8678026,-0.074383,0.062738
LOR,0.032732,0.032516,1.006638,0.314109,-0.030998,0.096462
CGPA,0.086724,0.082471,1.051576,0.2929943,-0.074916,0.248364
