In [1]:
from importlib import reload

In [2]:
import statsmodels.api as sm

In [3]:
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from IPython.display import Image, display

from dowhy import CausalModel

In [4]:
# Turn off pandas' chained-assignment warning for the whole kernel session.
# NOTE(review): this silences the warning everywhere, including in the util.* helpers imported below.
pd.set_option("mode.chained_assignment", None)

In [5]:
import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

In [6]:
# Load the raw inputs through the project's util.load helpers.
# NOTE(review): presumably World Development Indicators data plus its series metadata — confirm in util.load.
wdi_df, series_df = load_util.load_wdi()
project_df = load_util.load_projects()
# wb_df is derived from project_df rather than loaded separately.
wb_df = load_util.extract_wb_projects(project_df)

## First approach: binary treatment, replicating papers on causal structure

In [7]:
# load in panel assembled by DG, and country code cross-matches
# for each country-year, calculate mean growth in education indicators at year + lag

In [117]:
# Build the analysis frame (df) and a second frame (panel_source) for the 'education' sector.
# The helper itself prints the "Value counts" lines shown in this cell's output.
# NOTE(review): panel_source is presumably the underlying panel df was assembled from — confirm in util.experiment.
df, panel_source = experiment.assemble_replication_panel('education')

1. Value counts:  False    8754
True      710
Name: project_completed_year, dtype: int64
2. Value counts:  False    8754
True      710
Name: project_completed_year, dtype: int64


In [118]:
# Distribution of the treatment flag across the rows of df.
df['project_completed_year'].value_counts()

False    8754
True      710
Name: project_completed_year, dtype: int64

In [119]:
# Identifier / metadata columns of the source panel (everything else there is treated as data).
panel_non_data = ['year', 'countrycode', 'regionname', 'fcv_ind', 'lendingtype', 'incomelevel']

# Non-data columns of df: its own identifiers and the treatment flag, followed by the panel's.
non_data_cols = [
    'year', 'country', 'ppd_countrycode', 'wdi_countryname', 'project_completed_year',
    *panel_non_data,
]

# Remaining columns, in their original order, for each frame.
data_cols = [name for name in df.columns if name not in non_data_cols]
ddf_data_cols = [name for name in panel_source.columns if name not in panel_non_data]

In [120]:
def non_zero_culps(culpc):
    """Return the entries of ``culpc`` whose count is positive, largest count first.

    Parameters
    ----------
    culpc : dict
        Mapping of column name -> count.

    Returns
    -------
    dict
        The same mapping restricted to counts > 0, ordered by descending count
        (ties keep their original relative order, since ``sorted`` is stable).
    """
    # Use the argument itself: the previous lambda ignored ``culpc`` and always read
    # the global ``culprit_counts``, so every call reported the same dictionary.
    ranked = sorted(culpc.items(), key=lambda item: item[1], reverse=True)
    return {key: value for key, value in ranked if value > 0}
# Per-column "culprit" counts for df's data columns; the helper also returns a second object (null_df).
# NOTE(review): presumably counts how often each column's missing values make a row unusable — confirm in util.experiment.
culprit_counts, null_df = experiment.extract_culprit_counts(df, data_cols)
print(non_zero_culps(culprit_counts))

{'cash_surplus_deficit': 103, 'hiv_prevalence': 46, 'education_lag_9_growth': 45, 'health_share_gov_exp': 38, 'trade_share_gdp': 18, 'edu_share_gov_exp': 17, 'inflation': 7, 'education_lag_-4_growth': 6, 'edu_completion': 5, 'edu_pupil_teacher': 3}


In [121]:
# Repeat the culprit extraction on the source panel, using the panel's own data columns.
ddf_culprits, ddf_nulls = experiment.extract_culprit_counts(panel_source, ddf_data_cols)
print(non_zero_culps(ddf_culprits))

{'cash_surplus_deficit': 103, 'hiv_prevalence': 46, 'education_lag_9_growth': 45, 'health_share_gov_exp': 38, 'trade_share_gdp': 18, 'edu_share_gov_exp': 17, 'inflation': 7, 'education_lag_-4_growth': 6, 'edu_completion': 5, 'edu_pupil_teacher': 3}


*Note*: The standard WDI has no cash surplus/deficit series, so we use net borrowing in the year instead (which is available).

In [None]:
# Adding categorical variables for country, for replication purposes, although this means a dimensionality explosion (for unclear gain)

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount]
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [122]:
# Give the lag -4 growth column a plainer name; later cells refer to it as 'prior_4year_growth'.
df = df.rename({'education_lag_-4_growth': 'prior_4year_growth'}, axis='columns')

In [123]:
# Outcome column for the replication regressions.
target_col = 'education_lag_4_growth'

# Right-hand-side columns handed to the regression helpers, one per line for easier diffing.
cols_for_replication = [
    'country',
    'project_completed_year',
    'pc_commit_education',
    'edu_pupil_teacher',
    'young_population',
    'gdp_pc_ppp',
    'cash_surplus_deficit',
    'inflation',
    'trade_share_gdp',
    'freedom_house',
    'prior_4year_growth',
]

In [124]:
# Baseline OLS of target_col on the replication columns, with the helper's country
# fixed-effects flag switched on. The target column is appended to the column list passed in.
# NOTE(review): presumably the helper subsets df to these columns before fitting — confirm in util.experiment.
est = experiment.plain_vanilla_ols(df, target_col, cols_for_replication + [target_col], add_country_feffects=True)

In [125]:
# Candidate outcome columns: education growth at lags 3 through 9 (inclusive).
potential_cols = list(map('education_lag_{}_growth'.format, range(3, 10)))
potential_cols

['education_lag_3_growth',
 'education_lag_4_growth',
 'education_lag_5_growth',
 'education_lag_6_growth',
 'education_lag_7_growth',
 'education_lag_8_growth',
 'education_lag_9_growth']

In [126]:
# Report the fitted coefficient and the p-value (rounded to 3 places) on the treatment flag.
treatment_term = 'project_completed_year'
print(
    'Coefficient on primary: ', est.params[treatment_term],
    ' and P value: ', round(est.pvalues[treatment_term], 3),
)

Coefficient on primary:  0.009051655999692976  and P value:  0.119


In [127]:
# Partialling-out variant of the regression above: the helper returns three fitted results,
# unpacked here as dml_est, target_est and treatment_est.
# NOTE(review): the two trailing positional True flags are opaque from this cell — check
# partial_out_ols' signature in util.experiment (passing them by keyword would be clearer).
dml_est, target_est, treatment_est = experiment.partial_out_ols(df, target_col, 'project_completed_year',
                                                               cols_for_replication + [target_col], True, True)

In [128]:
# Summary of dml_est, the first of the three results returned by experiment.partial_out_ols.
dml_est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,2.723
Date:,"Tue, 08 Jun 2021",Prob (F-statistic):,0.0992
Time:,16:28:03,Log-Likelihood:,1249.5
No. Observations:,938,AIC:,-2495.0
Df Residuals:,936,BIC:,-2485.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.565e-13,0.002,7.5e-11,1.000,-0.004,0.004
0,0.0091,0.005,1.650,0.099,-0.002,0.020

0,1,2,3
Omnibus:,216.234,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4260.021
Skew:,-0.51,Prob(JB):,0.0
Kurtosis:,13.39,Cond. No.,2.63


In [129]:
# NOTE: `est` here is the OLS result from experiment.plain_vanilla_ols. The same name is later
# rebound to a LinearDML estimator, so re-running this cell after that point prints a different summary.
print(est.summary())

                              OLS Regression Results                              
Dep. Variable:     education_lag_4_growth   R-squared:                       0.357
Model:                                OLS   Adj. R-squared:                  0.282
Method:                     Least Squares   F-statistic:                     4.759
Date:                    Tue, 08 Jun 2021   Prob (F-statistic):           1.21e-36
Time:                            16:28:22   Log-Likelihood:                 1249.5
No. Observations:                     938   AIC:                            -2301.
Df Residuals:                         839   BIC:                            -1822.
Df Model:                              98                                         
Covariance Type:                nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------

In [130]:
# Summary of target_est, the second of the three results returned by experiment.partial_out_ols.
print(target_est.summary())

                              OLS Regression Results                              
Dep. Variable:     education_lag_4_growth   R-squared:                       0.355
Model:                                OLS   Adj. R-squared:                  0.281
Method:                     Least Squares   F-statistic:                     4.775
Date:                    Tue, 08 Jun 2021   Prob (F-statistic):           1.45e-36
Time:                            16:28:27   Log-Likelihood:                 1248.2
No. Observations:                     938   AIC:                            -2300.
Df Residuals:                         840   BIC:                            -1826.
Df Model:                              97                                         
Covariance Type:                nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

## Now use EconML

In [154]:
from econml.dml import LinearDML
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from econml.inference import BootstrapInference

In [135]:
# Treatment column for the EconML estimation below; its values are boolean (see the value counts above).
treatment_col = 'project_completed_year'

In [136]:
# Covariates passed to the EconML tuple assembly, listed one per line.
feature_cols = [
    'pc_commit_education',
    'edu_pupil_teacher',
    'young_population',
    'gdp_pc_ppp',
    'cash_surplus_deficit',
    'inflation',
    'trade_share_gdp',
    'freedom_house',
    'prior_4year_growth',
]

In [165]:
# Split df into the four inputs handed to the EconML estimator's fit(): Y, T, X and W.
# NOTE(review): presumably Y = outcome, T = treatment, X = feature_cols; how W is built is not
# visible from this cell — confirm in util.experiment.
Y, T, X, W = experiment.assemble_econml_tuples(df, target_col, treatment_col, feature_cols)

In [166]:
# Double ML with a linear final stage. The treatment is declared discrete, so the treatment
# model is a classifier (cross-validated logistic regression); the outcome model is left at
# the library default.
# random_state pins the estimator's internal randomness (e.g. cross-fitting splits) so the
# printed estimates are reproducible under Restart & Run All.
# NOTE: this rebinds `est`, which earlier held the OLS result from experiment.plain_vanilla_ols.
est = LinearDML(model_t=LogisticRegressionCV(max_iter=500), discrete_treatment=True, random_state=0)
# est = LinearDML(model_t=RandomForestClassifier(), discrete_treatment=True)
est.fit(Y, T, X=X, W=W) # W -> high-dimensional confounders, X -> features

In [168]:
# `est` is the fitted LinearDML estimator at this point (not the earlier OLS result of the same name).
# NOTE(review): score_ is presumably EconML's final-stage fit score — confirm its definition in the EconML docs.
print(est.score_)
print(est.summary())

0.006123389834152523
                           Coefficient Results                            
                     point_estimate stderr zstat  pvalue ci_lower ci_upper
--------------------------------------------------------------------------
pc_commit_education            -0.0    0.0 -2.072  0.038   -0.001     -0.0
edu_pupil_teacher            -0.001  0.001 -0.514  0.607   -0.003    0.002
young_population                0.0  0.001  0.162  0.872   -0.002    0.002
gdp_pc_ppp                     -0.0    0.0  -1.62  0.105     -0.0      0.0
cash_surplus_deficit            0.0  0.001  0.149  0.882   -0.002    0.003
inflation                       0.0  0.001  0.379  0.705   -0.001    0.002
trade_share_gdp                -0.0    0.0 -1.504  0.132   -0.001      0.0
freedom_house                -0.007  0.005 -1.386  0.166   -0.014    0.001
prior_4year_growth            0.138  0.216   0.64  0.522   -0.217    0.492
                       CATE Intercept Results                       
          

In [151]:
# point = est.const_marginal_effect(X)
# print(point)
# est.effect(X, T0=False, T1=True)

In [169]:
# initiating some crawls, to find anything