# DoubleML, Estimating Causal Effects

## Bonus data

In [1]:
import pandas as pd
import numpy as np
from doubleml.datasets import fetch_bonus
from doubleml import DoubleMLData
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from doubleml import DoubleMLPLR

# Seed NumPy's global generator before any estimator or data split is created.
np.random.seed(3141)

# Bonus data set as a pandas DataFrame.
Xy = fetch_bonus(return_type='DataFrame')

# Columns passed to DoubleML as covariates.
bonus_covariates = [
    'female', 'black', 'othrace', 'dep1', 'dep2',
    'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54',
    'durable', 'lusd', 'husd',
]
dml_data = DoubleMLData(Xy, y_col='inuidur1', d_cols='tg', x_cols=bonus_covariates)

# One forest template, cloned so both learners start from identical,
# independent and unfitted estimators.
learner = RandomForestRegressor(n_estimators=500, max_features='sqrt', max_depth=5)
ml_l, ml_m = clone(learner), clone(learner)

# Fit the model, keeping the fitted learners, and display the summary table.
dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
tg,-0.076684,0.035411,-2.165549,0.030346,-0.146087,-0.00728


## Net financial assets data

In [26]:
from doubleml.datasets import fetch_401K
from sklearn.ensemble import RandomForestClassifier

# Re-seed NumPy's global generator so this cell does not depend on earlier cells.
np.random.seed(3141)

# 401(k) data set as a pandas DataFrame.
Xy = fetch_401K(return_type='DataFrame')

# Outcome is 'net_tfa', treatment is 'e401'; everything below is a covariate.
covariates_401k = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
dml_data = DoubleMLData(Xy, y_col='net_tfa', d_cols='e401', x_cols=covariates_401k)

# A regressor is used for ml_l and a classifier for ml_m.
ml_l = RandomForestRegressor(
    n_estimators=500,
    max_depth=7,
    max_features=3,
    min_samples_leaf=3,
)
ml_m = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    max_features=4,
    min_samples_leaf=7,
)

# Fit the model, keeping the fitted learners, and display the summary table.
dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
e401,9059.917779,1306.284784,6.935638,4.043932e-12,6499.646648,11620.188909


In [29]:
# Split the 401(k) frame into covariates, treatment and outcome by hand.
feature_cols = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown']
X = Xy[feature_cols]
d, y = Xy['e401'], Xy['net_tfa']

# Fresh, unfitted forests with the same hyper-parameters as the DoubleML run.
ml_l = RandomForestRegressor(
    n_estimators=500,
    max_depth=7,
    max_features=3,
    min_samples_leaf=3,
)
ml_m = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    max_features=4,
    min_samples_leaf=7,
)

In [30]:
# Fit the classifier on the covariates X with the `e401` column d as the target.
ml_m.fit(X, d)

In [35]:
# Build a two-column design: the treatment `d` and ml_m's in-sample prediction.
# NOTE(review): ml_m is a RandomForestClassifier, so `predict` yields class
# labels rather than probabilities — confirm that this is intended.
stage_two_inputs = pd.DataFrame({'d': d, 'm': ml_m.predict(X)})
ml_l.fit(stage_two_inputs, y)

## Simulated data

In [3]:
# Seed first: every draw below comes from NumPy's global generator, so the
# order of the three random calls must stay X -> noise of d -> noise of y.
np.random.seed(3141)
n_obs, n_vars = 500, 100
theta = 3

# Weights applied to the first three covariates.
beta = np.array([5, 5, 5])

X = np.random.normal(size=(n_obs, n_vars))
# The same linear term in X[:, :3] enters both the treatment and the outcome.
shared_signal = np.dot(X[:, :3], beta)
d = shared_signal + np.random.standard_normal(size=(n_obs,))
y = theta * d + shared_signal + np.random.standard_normal(size=(n_obs,))

# Wrap the simulated arrays (covariates, outcome, treatment) for DoubleML.
dml_data = DoubleMLData.from_arrays(X, y, d)

# One LassoCV template, cloned so both learners are independent and unfitted.
learner = LassoCV()
ml_l, ml_m = clone(learner), clone(learner)

# Fit the model, keeping the fitted learners, and display the summary table.
dml_model = DoubleMLPLR(dml_data, ml_l, ml_m)
dml_model.fit(store_models=True)
dml_model.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
d,3.051458,0.045425,67.175124,0.0,2.962426,3.14049


## Admissions data

In [4]:
# Download the admissions CSV and discard the two columns not used below.
# The trailing space in 'Chance of Admit ' is part of the column name.
url = 'https://raw.githubusercontent.com/selva86/datasets/master/Admission.csv'
Xy = (
    pd.read_csv(url)
    .drop(columns=['Chance of Admit ', 'Serial No.'])
)

In [5]:
from sklearn.base import BaseEstimator

class MyEstimator(BaseEstimator):
    """Random-forest classifier exposed through a regressor-style interface.

    The wrapped classifier is stored in ``self.m``. ``predict`` returns the
    second column of ``predict_proba`` instead of class labels, and the
    instance tags itself with ``_estimator_type = 'regressor'``.
    """

    def __init__(self):
        # Fixed hyper-parameters; the constructor takes no arguments.
        self.m = RandomForestClassifier(n_estimators=50, random_state=37, n_jobs=-1)
        # Advertise the wrapper as a regressor even though it wraps a classifier.
        self._estimator_type = 'regressor'

    def fit(self, X, y):
        """Fit the wrapped classifier on ``X`` and ``y``.

        Returns:
            This estimator, following the scikit-learn convention that
            ``fit`` returns ``self`` so calls can be chained.
        """
        self.m.fit(X, y)
        return self

    def predict(self, X):
        """Return column 1 of the wrapped classifier's ``predict_proba(X)``."""
        # Assumes the classifier saw at least two classes during fit;
        # otherwise column 1 does not exist.
        return self.m.predict_proba(X)[:, 1]

    def score(self, X, y):
        """Delegate scoring to the wrapped classifier's ``score``."""
        return self.m.score(X, y)

    @property
    def feature_importances_(self):
        """Feature importances of the wrapped classifier."""
        return self.m.feature_importances_

In [6]:
def get_model(d_col):
    """Fit and return a DoubleMLPLR model with ``d_col`` as the treatment.

    The outcome column is 'Research'; every remaining column of the
    module-level ``Xy`` frame is used as a covariate. NumPy's global seed is
    reset on every call.

    Args:
        d_col: Name of the column in ``Xy`` to use as the treatment.

    Returns:
        The fitted DoubleMLPLR object, with fitted learners stored.
    """
    np.random.seed(3141)

    covariates = list(Xy.drop(columns=['Research', d_col]).columns)
    data = DoubleMLData(Xy, y_col='Research', d_cols=d_col, x_cols=covariates)

    # ml_l is the probability-returning wrapper; ml_m is a plain regressor.
    outcome_learner = MyEstimator()
    treatment_learner = RandomForestRegressor(n_estimators=50, random_state=37, n_jobs=-1)

    plr = DoubleMLPLR(data, outcome_learner, treatment_learner)
    plr.fit(store_models=True)
    return plr

def get_summary(d_col):
    """Return the ``summary`` of the model fitted with ``d_col`` as treatment."""
    return get_model(d_col).summary

### Admissions, ATE

In [23]:
# Treat each non-outcome column as the treatment in turn and stack the summaries.
treatment_cols = Xy.drop(columns=['Research']).columns
summaries = [get_summary(col) for col in treatment_cols]
pd.concat(summaries)

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
GRE Score,0.0191,0.003413,5.596341,2.189232e-08,0.012411,0.025789
TOEFL Score,-0.00594,0.007867,-0.755013,0.4502409,-0.021358,0.009479
University Rating,-0.006588,0.026446,-0.24911,0.8032754,-0.058421,0.045245
SOP,-0.005823,0.034981,-0.16645,0.8678026,-0.074383,0.062738
LOR,0.032732,0.032516,1.006638,0.314109,-0.030998,0.096462
CGPA,0.086724,0.082471,1.051576,0.2929943,-0.074916,0.248364


### Admissions, feature importances

In [12]:
def get_feature_importances(m, d_col):
    """Label a fitted model's feature importances with covariate names.

    Args:
        m: Object exposing a ``feature_importances_`` attribute.
        d_col: Treatment column; it and 'Research' are excluded from the labels.

    Returns:
        A pandas Series of importances indexed by the remaining ``Xy`` columns.
    """
    covariate_names = list(Xy.drop(columns=['Research', d_col]).columns)
    return pd.Series(data=m.feature_importances_, index=covariate_names)

def get_all_feature_importances(m, d_col, k, i_rep=0):
    """Collect the feature importances of every stored model for one learner.

    Args:
        m: Fitted DoubleML model whose ``models`` attribute was populated
            (i.e. fitted with ``store_models=True``).
        d_col: Treatment column name used to index ``m.models[k]``.
        k: Learner key, e.g. ``'ml_m'`` or ``'ml_l'``.
        i_rep: Position within ``m.models[k][d_col]`` to read. Defaults to 0,
            which was previously hard-coded. In DoubleML this is presumably
            the sample-splitting repetition — confirm against the DoubleML docs.

    Returns:
        A DataFrame with one row per stored model (row labels '0', '1', ...)
        and one column per covariate.
    """
    stored_models = m.models[k][d_col][i_rep]
    # Iterate the models directly instead of indexing through range(len(...)).
    per_model = {
        f'{i}': get_feature_importances(model, d_col)
        for i, model in enumerate(stored_models)
    }
    return pd.DataFrame(per_model).T

def get_avg_feature_importances(d_col):
    """Average feature importances over the stored models of both learners.

    Args:
        d_col: Column of ``Xy`` to use as the treatment.

    Returns:
        A DataFrame indexed by covariate name with columns 'ml_m' and 'ml_l',
        each holding the mean importance across that learner's stored models.
    """
    fitted = get_model(d_col)
    averaged = {}
    for learner_key in ('ml_m', 'ml_l'):
        importances = get_all_feature_importances(fitted, d_col, learner_key)
        averaged[learner_key] = importances.mean()
    return pd.DataFrame(averaged)

In [13]:
# Mean feature importances of the stored ml_m / ml_l models with 'GRE Score' as treatment.
get_avg_feature_importances('GRE Score')

Unnamed: 0,ml_m,ml_l
TOEFL Score,0.416697,0.295821
University Rating,0.028318,0.086886
SOP,0.033369,0.117475
LOR,0.032381,0.122812
CGPA,0.489235,0.377006


In [14]:
# Mean feature importances of the stored ml_m / ml_l models with 'TOEFL Score' as treatment.
get_avg_feature_importances('TOEFL Score')

Unnamed: 0,ml_m,ml_l
GRE Score,0.615354,0.412481
University Rating,0.026497,0.068826
SOP,0.040517,0.103956
LOR,0.033903,0.101162
CGPA,0.28373,0.313575


In [15]:
# Mean feature importances of the stored ml_m / ml_l models with 'University Rating' as treatment.
get_avg_feature_importances('University Rating')

Unnamed: 0,ml_m,ml_l
GRE Score,0.178181,0.386554
TOEFL Score,0.095144,0.16207
SOP,0.216887,0.097775
LOR,0.067901,0.091976
CGPA,0.441887,0.261624


In [16]:
# Mean feature importances of the stored ml_m / ml_l models with 'SOP' as treatment.
get_avg_feature_importances('SOP')

Unnamed: 0,ml_m,ml_l
GRE Score,0.084909,0.393891
TOEFL Score,0.107692,0.166184
University Rating,0.243269,0.068985
LOR,0.239526,0.093901
CGPA,0.324604,0.277039


In [17]:
# Mean feature importances of the stored ml_m / ml_l models with 'CGPA' as treatment.
get_avg_feature_importances('CGPA')

Unnamed: 0,ml_m,ml_l
GRE Score,0.445866,0.450488
TOEFL Score,0.386038,0.21672
University Rating,0.045223,0.092789
SOP,0.057816,0.118835
LOR,0.065057,0.121167
