In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier, XGBRegressor

In [3]:
# Mount Google Drive so files under it can be read by path from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Make DoubleML importable, installing it only when it is missing.
import importlib
import os
import subprocess
import sys

try:
    import doubleml as dml
except ImportError:
    # Install into the interpreter that runs this kernel (`sys.executable -m pip`)
    # rather than whichever `pip` happens to be first on PATH. check_call raises
    # CalledProcessError when the install fails, whereas os.system only returned
    # an exit status that was being ignored.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "doubleml"])
    # Let the import system notice the package that was installed after startup.
    importlib.invalidate_caches()
    import doubleml as dml
from doubleml.datasets import fetch_401K

  import pandas.util.testing as tm


In [4]:
# data reading and binarization
DATA_PATH = '/content/drive/MyDrive/stat274/IVdata-1.csv'
dataset = pd.read_csv(DATA_PATH)

# Integer-coded model columns. `astype(int)` is the vectorized cast; it replaces the
# per-element `.apply(int)` and bracket indexing avoids attribute-style column lookup.
dataset['instrument'] = dataset['xunshi'].astype(int)
# Treatment indicator: 1 when tenure is at least 3, otherwise 0.
# NOTE(review): a missing tenure compares as False and is therefore coded 0 —
# confirm that is the intended handling of missing values.
dataset['treatment'] = (dataset['tenure'] >= 3).astype(int)
dataset['outcome'] = dataset['xc_lockdown'].astype(int)

In [5]:
# Basic model: wrap the frame for DoubleML, naming the outcome, treatment,
# instrument and the three control columns.
base_covariates = ['sub_prov_ct', 'gdp_per_10k', 'primary_emp_share_total']
data_dml_base_iv = dml.DoubleMLData(
    dataset,
    y_col='outcome',
    d_cols='treatment',
    z_cols='instrument',
    x_cols=base_covariates,
)

In [6]:
# Random Forest: one regressor and one classifier template with 500 trees each
randomForest = RandomForestRegressor(n_estimators=500)
randomForest_class = RandomForestClassifier(n_estimators=500)

# Seed NumPy's global generator before the model object is built.
np.random.seed(123)
forest_iivm_kwargs = dict(
    ml_g=randomForest,
    ml_m=randomForest_class,
    ml_r=randomForest_class,
    subgroups={'always_takers': False, 'never_takers': True},
    trimming_threshold=0.01,
    n_folds=3,
)
dml_iivm_forest = dml.DoubleMLIIVM(data_dml_base_iv, **forest_iivm_kwargs)

# Nuisance-part specific hyperparameters, keyed by learner name
forest_nuisance_params = {
    'ml_g0': {'max_depth': 6, 'max_features': 3, 'min_samples_leaf': 7},
    'ml_g1': {'max_depth': 6, 'max_features': 3, 'min_samples_leaf': 5},
    'ml_r1': {'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 6},
}
for learner_name, learner_params in forest_nuisance_params.items():
    dml_iivm_forest.set_ml_nuisance_params(learner_name, 'treatment', learner_params)

dml_iivm_forest.fit(store_predictions=True)
forest_summary = dml_iivm_forest.summary

# Last expression of the cell: render the coefficient table
forest_summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
treatment,-0.090977,0.519723,-0.175049,0.861042,-1.109615,0.927661


In [7]:
# Trees: single decision-tree templates capped at depth 30
trees = DecisionTreeRegressor(max_depth=30)
trees_class = DecisionTreeClassifier(max_depth=30)

# Seed NumPy's global generator before the model object is built.
np.random.seed(123)
tree_iivm_kwargs = dict(
    ml_g=trees,
    ml_m=trees_class,
    ml_r=trees_class,
    subgroups={'always_takers': False, 'never_takers': True},
    trimming_threshold=0.01,
    n_folds=3,
)
dml_iivm_tree = dml.DoubleMLIIVM(data_dml_base_iv, **tree_iivm_kwargs)

# Nuisance-part specific hyperparameters, keyed by learner name
tree_nuisance_params = {
    'ml_g0': {'ccp_alpha': 0.0016, 'min_samples_split': 74, 'min_samples_leaf': 24},
    'ml_g1': {'ccp_alpha': 0.0018, 'min_samples_split': 70, 'min_samples_leaf': 23},
    'ml_m': {'ccp_alpha': 0.0028, 'min_samples_split': 167, 'min_samples_leaf': 55},
    'ml_r1': {'ccp_alpha': 0.0576, 'min_samples_split': 55, 'min_samples_leaf': 18},
}
for learner_name, learner_params in tree_nuisance_params.items():
    dml_iivm_tree.set_ml_nuisance_params(learner_name, 'treatment', learner_params)

dml_iivm_tree.fit(store_predictions=True)
tree_summary = dml_iivm_tree.summary

# Last expression of the cell: render the coefficient table
tree_summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
treatment,0.87162,0.433403,2.011106,0.044314,0.022165,1.721075
