In [1]:
%config Completer.use_jedi = False
from importlib import reload

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

pd.set_option("mode.chained_assignment", None)

import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

from econml.dml import LinearDML, SparseLinearDML, NonParamDML
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor

### Load in data frames

1. Load in panel assembled by DG, and country code cross-matches
2. For each country-year, calculate mean growth in education indicators at year + lag

In [2]:
project_df = load_util.load_projects() # loads in aid data projects
edu_treatment_df = load_util.assemble_sector_ratings(project_df, 'Education').fillna(0) # loads in 

In [3]:
recreate_df = False

if recreate_df:
    panel_df, panel_source = experiment.assemble_replication_panel('education', reload=False)
    edu_df = panel_df.merge(edu_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    treatment_cols = [col for col in edu_treatment_df.columns if col not in ["end_year", "country_code"]]
    edu_df[treatment_cols] = edu_df[treatment_cols].fillna(0)
    
    sectors = ['total', 'education', 'health', 'wash']
    suffixes = ['wb', 'ppd']

    for sector in sectors:
        for suffix in suffixes:
            edu_df[f"pc_commit_{sector}_{suffix}"] = edu_df[f"commit_{sector}_{suffix}"] / edu_df["population"]
else:
    edu_df = pd.read_csv('../data/transformed_data/education_df.csv', index_col=0)
    # deal with some legacy (quicker than regenerating)
    edu_df = edu_df.drop(columns=[f"education_lag_{i}_growth" for i in range(1, 10)], errors="ignore")
    edu_df = edu_df.drop(columns=[f"education_lag_{i}_count" for i in range(1, 10)], errors="ignore")
    edu_df = edu_df.drop(columns=["education_lag_-4_count", "mean_pc_last_5", "future_edu_ner", "lagged_edu_ner", "ner_growth"], errors="ignore")

### Construct lagged indicators, fille in needed columns

In [4]:
edu_df.project_completed_year.value_counts()

False    8754
True      710
Name: project_completed_year, dtype: int64

In [5]:
edu_df = experiment.add_project_and_aid_cols(edu_df, rated_too=False)

if 'edu_ner_lag5' not in edu_df:
    print('Generating past net enrollment rates')
    edu_df = explore_util.lag_variable_simple(edu_df, 'edu_ner', 5)

if 'future_edu_ner' not in edu_df:
    print('Generating future net enrollment rates')
    edu_df = explore_util.lag_variable_simple(edu_df, 'edu_ner', -5)
    edu_df = edu_df.rename(columns = { 'edu_ner_lag-5': 'future_edu_ner'})
    
edu_df['period'] = round((edu_df.year - 1900) / 5) - 10
edu_df['prior_ner_growth'] = edu_df['edu_ner'] / edu_df['edu_ner_lag5']

Generating future net enrollment rates


In [6]:
# edu_df[edu_df.edu_ner_lag5.notna()][['year', 'country', 'edu_ner', 'edu_ner_lag5', 'pc_commit_education', 'education_mean_pc_rolling_5', 'education_mean_pc_rolling_5_lag1', 'education_wb_mean_pc_rolling_5_lag1']].head()

In [7]:
store_df = False

if store_df:
    edu_df.to_csv('../data/transformed_data/education_df.csv')

## Education: replicate original, then probe the specification

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount=
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [8]:
treatment_cols = [
    'mean_pc_last_5',
    'mean_pc_last_5_ppd',
    'mean_pc_last_5_wb'
]

rating_cols = ['education_max_proj_5yr', 'education_satisfactory_proj']

data_cols = ['edu_ner', 'edu_share_gov_exp', 'edu_pupil_teacher', 'young_population', 'gdp_pc_ppp', 
             'cash_surplus_deficit', 'inflation', 'trade_share_gdp', 'freedom_house', 'prior_ner_growth'] + rating_cols
    
edu_treatment_col = 'mean_pc_last_5'

initial_drop = ['prior_ner_growth', 'edu_share_gov_exp', 'prior_4year_growth']
r_est = experiment.evaluate_treatment(edu_df, 'edu_ner', edu_treatment_col, data_cols,
                          remove_feature_cols=initial_drop,
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
                                     add_period_feffects=False)

experiment.extract_treatment_results('Replication', r_est, 'edu_ner', edu_treatment_col, data_cols, None)

{'Label': 'Replication',
 'Target': 'edu_ner',
 'Regression P': 1.5460182563223376e-136,
 'Treatment column': 'mean_pc_last_5',
 'Treatment significance': 0.0004336594879360134,
 'Treatment coefficient': 0.12312822239126353,
 'Sig feature coefficient': {'edu_pupil_teacher': -0.0249,
  'young_population': -0.0877,
  'gdp_pc_ppp': 0.0001},
 'All p-values': {'edu_pupil_teacher': 0.0255,
  'young_population': 0.0,
  'gdp_pc_ppp': 0.0234,
  'cash_surplus_deficit': 0.7152,
  'inflation': 0.2844,
  'trade_share_gdp': 0.0859,
  'freedom_house': 0.2645,
  'education_max_proj_5yr': 0.401,
  'education_satisfactory_proj': 0.7879,
  'mean_pc_last_5': 0.0004,
  'AGO': 0.0,
  'ALB': 0.0,
  'ARG': 0.1334,
  'ARM': 0.0,
  'ATG': 0.0015,
  'AZE': 0.0,
  'BDI': 0.0,
  'BEN': 0.0,
  'BFA': 0.0,
  'BGD': 0.0,
  'BGR': 0.0,
  'BIH': 0.0966,
  'BLR': 0.0,
  'BLZ': 0.1002,
  'BOL': 0.0,
  'BRA': 0.0,
  'BRB': 0.0994,
  'BTN': 0.0,
  'BWA': 0.0,
  'CAF': 0.0,
  'CHL': 0.0005,
  'CHN': 0.0946,
  'CIV': 0.0,
  

In [9]:
print(r_est.summary())

                            OLS Regression Results                            
Dep. Variable:                edu_ner   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     13.46
Date:                Wed, 08 Sep 2021   Prob (F-statistic):          1.55e-136
Time:                        12:05:24   Log-Likelihood:                -1868.0
No. Observations:                1238   AIC:                             3946.
Df Residuals:                    1133   BIC:                             4484.
Df Model:                         104                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
edu_pupil_teacher 

In [104]:
search_grid = {
    'Straight replication, all aid, no ratings': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'w_avg_rating', 'satisfactory_proj', 'mean_pc_last_5_ppd']
        )
    },
    'Include rating information': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5_ppd',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'mean_pc_last_5']
        )
    },
    'Include period fixed effects': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=True, log_target=True, log_treatment=True, add_period_feffects=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'w_avg_rating', 'satisfactory_proj']
        )        
    },
    'Include prior growth across education outcomes': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'satisfactory_proj', 'w_avg_rating']
        )
    },
    'Properly include govt share spend': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj']
        )
    }
}

In [105]:
treatment_search_result = []
estimators = {}
for label, args in search_grid.items():
    est = experiment.evaluate_treatment(edu_df, args['target_col'], args['treatment_col'], data_cols, **args['est_kw_args'])
    results = experiment.extract_treatment_results(label, est, args['target_col'], args['treatment_col'], data_cols, args['est_kw_args'])
    treatment_search_result.append(results)
    estimators[label] = est

gsearch_results = pd.DataFrame(treatment_search_result)
gsearch_results

Unnamed: 0,Label,Target,Regression P,Treatment column,Treatment significance,Treatment coefficient,Sig feature coefficient,All p-values,Number significant FE,Mean coefficient on FE,Keyword args
0,"Straight replication, all aid, no ratings",edu_ner,1.55e-136,mean_pc_last_5,0.000434,0.12,"{'edu_pupil_teacher': -0.0249, 'young_populati...","{'edu_pupil_teacher': 0.0255, 'young_populatio...",121,9.78,"{'add_country_feffects': True, 'add_constant':..."
1,Include rating information,edu_ner,3.57e-135,mean_pc_last_5_ppd,0.0179,0.05,"{'edu_pupil_teacher': -0.0259, 'young_populati...","{'edu_pupil_teacher': 0.0205, 'young_populatio...",125,11.3,"{'add_country_feffects': True, 'add_constant':..."
2,Include period fixed effects,edu_ner,7.18e-126,mean_pc_last_5,0.119,-0.1,"{'edu_pupil_teacher': -0.03, 'gdp_pc_ppp': 0.0...","{'const': 0.0271, 'edu_pupil_teacher': 0.0077,...",65,2.01,"{'add_country_feffects': True, 'add_constant':..."
3,Include prior growth across education outcomes,edu_ner,1.75e-119,mean_pc_last_5,0.409,-0.05,"{'edu_pupil_teacher': -0.0323, 'gdp_pc_ppp': 0...","{'edu_pupil_teacher': 0.0045, 'young_populatio...",124,8.36,"{'add_country_feffects': True, 'add_constant':..."
4,Properly include govt share spend,edu_ner,2.72e-96,mean_pc_last_5,0.306,0.07,"{'edu_pupil_teacher': -0.0335, 'young_populati...","{'edu_share_gov_exp': 0.0839, 'edu_pupil_teach...",101,10.3,"{'add_country_feffects': True, 'add_constant':..."


In [108]:
print(estimators["Straight replication, all aid, no ratings"].summary())

                            OLS Regression Results                            
Dep. Variable:                edu_ner   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     13.46
Date:                Wed, 08 Sep 2021   Prob (F-statistic):          1.55e-136
Time:                        13:08:35   Log-Likelihood:                -1868.0
No. Observations:                1238   AIC:                             3946.
Df Residuals:                    1133   BIC:                             4484.
Df Model:                         104                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
edu_pupil_teacher 

In [106]:
print(estimators["Include rating information"].summary())

                            OLS Regression Results                            
Dep. Variable:                edu_ner   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.509
Method:                 Least Squares   F-statistic:                     13.31
Date:                Wed, 08 Sep 2021   Prob (F-statistic):          3.57e-135
Time:                        13:08:00   Log-Likelihood:                -1871.7
No. Observations:                1238   AIC:                             3953.
Df Residuals:                    1133   BIC:                             4491.
Df Model:                         104                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
edu_pupil_teacher 

In [12]:
write_results = False

if write_results:
    with open("./growth_not_abs_ner_target.txt", "w") as file:
        file.write(estimators['Use growth in NER as target'].summary().as_text())

    with open("./base_replication_full.txt", "w") as file:
        file.write(estimators['Straight replication'].summary().as_text())

    gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

In [13]:
display_repl_summary = False

if display_repl_summary:
    print("*** Standard Replication: ")
    print(r_est.summary())
    
    print("*** Replication with average rating: ")
    print(estimators['Include rating information'].summary())

### Partialling out and EconML

In [14]:
feature_cols = ['edu_share_gov_exp', 'edu_pupil_teacher', 'young_population', 'gdp_pc_ppp', 
             'cash_surplus_deficit', 'inflation', 'trade_share_gdp', 'freedom_house']

edu_df['rolling_mean_edu_ner'] = explore_util.rolling_country_agg(edu_df, 'edu_ner', 5, 'mean')

edu_df = explore_util.lag_variable_simple(edu_df, 'mean_pc_last_5_ppd', 1)
edu_df = explore_util.lag_variable_simple(edu_df, 'w_avg_rating', 5)

In [16]:
# do this manually first, as EconML results are proving volatile and highly counter-intuitive
dlm_df = edu_df.copy()

# do some scaling
cols_to_scale = ['rolling_mean_edu_ner', 'w_avg_rating_lag5', 'mean_pc_last_5_ppd_lag1', "education_max_proj_5yr"] + feature_cols
for col in cols_to_scale:
    dlm_df[col] = (dlm_df[col] - dlm_df[col].mean()) / dlm_df[col].std()

In [66]:
def partial_out_crawl(specific_rating_col, outcome_col, feature_cols, sector_data_sets):
    dml_results = []
    for label in sector_data_sets:
        dml_est, est_target, est_treatment, result_dict = experiment.perform_dml_on_df(
            sector_data_sets[label], label, outcome_col, specific_rating_col, feature_cols)
        dml_results.append(result_dict)
        
    return pd.DataFrame(dml_results)

In [21]:
rating_col = 'w_avg_rating_lag5'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'rolling_mean_edu_ner'

edu_data_sets = { 
    "all_years": dlm_df, 
    "only_from_raters": dlm_df[dlm_df[magnitude_col] > 0], 
    "only_rated": dlm_df[dlm_df[rating_col] > 0]
}

# only_rated_df = dlm_df[dlm_df[magnitude_col] > 0]

max_proj_df = partial_out_crawl("education_max_proj_5yr", target_col, cols_to_scale, edu_data_sets)
last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, edu_data_sets)

full_edu_df = pd.concat((max_proj_df, last_proj_df)).round(2)
full_edu_df

Partialling out, target: rolling_mean_edu_ner, treatment: education_max_proj_5yr
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  731
Partialling out, target: rolling_mean_edu_ner, treatment: education_max_proj_5yr
Number of observations before dropping NA:  428
Number of observations after dropping NA:  151
Partialling out, target: rolling_mean_edu_ner, treatment: education_max_proj_5yr
Number of observations before dropping NA:  245
Number of observations after dropping NA:  79
Partialling out, target: rolling_mean_edu_ner, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  731
Partialling out, target: rolling_mean_edu_ner, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  428
Number of observations after dropping NA:  151
Partialling out, target: rolling_mean_edu_ner, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  245
Numbe

Unnamed: 0,label,treatment,resid_rsq,nums,resid_pval,treatment_pval,treatment_coeff,x_rsq,x_pval,x_maxsigcoeff
0,all_years,education_max_proj_5yr,0.0,731.0,0.78,0.78,0.0,0.48,0.0,0.07
1,only_from_raters,education_max_proj_5yr,0.02,151.0,0.06,0.06,-0.06,0.35,0.0,0.14
2,only_rated,education_max_proj_5yr,0.01,79.0,0.53,0.53,0.04,0.56,0.0,-0.49
0,all_years,w_avg_rating_lag5,0.0,731.0,0.56,0.56,-0.01,0.48,0.0,0.07
1,only_from_raters,w_avg_rating_lag5,0.0,151.0,0.89,0.89,0.0,0.36,0.0,0.14
2,only_rated,w_avg_rating_lag5,0.02,79.0,0.22,0.22,0.07,0.55,0.0,-0.52


In [23]:
# feature_cols = [col for col in data_cols if col != treatment_col and col != target_col]
treatment_col = magnitude_col

Y, T, X, W = experiment.assemble_econml_tuples(edu_df[edu_df.mean_pc_last_5_ppd > 0], target_col='edu_ner', treatment_col=treatment_col, 
                                               feature_cols=feature_cols + [rating_col])

est = LinearDML(model_y=LinearRegression(), model_t=LinearRegression())
est.fit(Y, T, X=X, W=W)
print(est.summary())

                           Coefficient Results                            
                     point_estimate stderr zstat  pvalue ci_lower ci_upper
--------------------------------------------------------------------------
edu_share_gov_exp             0.048  0.084  0.571  0.568    -0.09    0.186
edu_pupil_teacher             0.158  0.152  1.037    0.3   -0.093    0.408
young_population              0.039  0.089  0.443  0.657   -0.107    0.185
gdp_pc_ppp                      0.0    0.0  1.876  0.061      0.0      0.0
cash_surplus_deficit          0.043  0.061  0.703  0.482   -0.058    0.144
inflation                    -0.019  0.018 -1.016   0.31   -0.049    0.012
trade_share_gdp               0.008  0.016  0.524    0.6   -0.017    0.034
freedom_house                -0.004  0.263 -0.015  0.988   -0.437    0.429
w_avg_rating_lag5            -0.038  0.338 -0.111  0.912   -0.594    0.519
                       CATE Intercept Results                       
               point_estimate s

In [24]:
rating_col = 'w_avg_rating'
magnitude_col = 'mean_pc_last_5_ppd'
target_col = 'edu_ner'

# feature_cols = [col for col in data_cols if col not in [rating_col, magnitude_col, target_col]]
Y, T, X, W = experiment.assemble_econml_tuples(edu_df[edu_df.mean_pc_last_5_ppd > 0], target_col=target_col, treatment_col=rating_col, 
                                               feature_cols=feature_cols + [magnitude_col])

est = LinearDML(model_y=LinearRegression(), model_t=LinearRegression())
est.fit(Y, T, X=X, W=W)
print(est.summary())

                           Coefficient Results                            
                     point_estimate stderr zstat  pvalue ci_lower ci_upper
--------------------------------------------------------------------------
edu_share_gov_exp             0.129  0.105  1.223  0.221   -0.044    0.302
edu_pupil_teacher             0.054  0.133  0.405  0.686   -0.164    0.272
young_population               0.04  0.181   0.22  0.825   -0.258    0.337
gdp_pc_ppp                      0.0    0.0  0.912  0.362     -0.0    0.001
cash_surplus_deficit         -0.219  0.109 -2.008  0.045   -0.399    -0.04
inflation                    -0.507  0.202 -2.505  0.012    -0.84   -0.174
trade_share_gdp              -0.006  0.041 -0.141  0.888   -0.073    0.061
freedom_house                 0.388  0.391  0.993  0.321   -0.255    1.031
mean_pc_last_5_ppd           -0.769  0.433 -1.776  0.076   -1.481   -0.057
                       CATE Intercept Results                       
               point_estimate s

## Health

Process:

1. Repeat outcome variable formation, using lagged construction
2. Construct sectoral aid per capita using utilities
3. Construct specification, using Diana's original notebook

In [25]:
health_treatment_df = load_util.assemble_sector_ratings(project_df, 'Health').fillna(0)

In [26]:
recreate_health_df = False

if recreate_health_df:
    hp_df, hp_source = experiment.assemble_replication_panel('health', reload=False)
    health_df = hp_df.merge(health_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
    health_df[treatment_cols] = edu_df[treatment_cols].fillna(0)
    
    sectors = ['total', 'education', 'health', 'wash']
    suffixes = ['wb', 'ppd']

    for sector in sectors:
        for suffix in suffixes:
            health_df[f"pc_commit_{sector}_{suffix}"] = health_df[f"commit_{sector}_{suffix}"] / health_df["population"]
else:
    health_df = pd.read_csv('../data/transformed_data/health_df.csv', index_col=0)

In [27]:
health_df = experiment.add_project_and_aid_cols(health_df, sector='health', rated_too=True)

treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
print("Treatment counts: ", health_df.project_completed_year.value_counts())

Treatment counts:  False    8717
True      747
Name: project_completed_year, dtype: int64


In [28]:
store_health_df = False

if store_health_df:
    health_df.to_csv('../data/transformed_data/health_df.csv')

In [29]:
# take rolling five year averages of everything, following paper
# note: not doing this for macro variables as justification does not make sense for those (measurement/volatility)
measured_cols = ['mortality_under5', 'fertility', 'hiv_prevalence']

for m_col in measured_cols:
    health_df[f"{m_col}_pavg"] = explore_util.rolling_country_agg(health_df, m_col, 5, "mean")
    health_df = explore_util.lag_variable_simple(health_df, f"{m_col}_pavg", 1)
    
macro_cols = ["gdp_pc_ppp", "population"]
for m_col in macro_cols:
    health_df = explore_util.lag_variable_simple(health_df, m_col, 1)

In [30]:
health_df = explore_util.lag_variable_simple(health_df, "mortality_under5_pavg", 5)
health_df["lag_log_mort"] = np.log(health_df["mortality_under5_pavg_lag5"])
health_df['prior_mort_decline'] = health_df['mortality_under5_pavg'] / health_df['mortality_under5_pavg_lag5']

* Specification 1:

```qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            hiv_prevalence conflict i.period, r```

In [31]:
# health_df.columns

In [48]:
target_cols = ["mortality_under5_pavg"]
treatment_cols = ["mean_pc_last_5_ppd"]

momentum_cols = ["lag_log_mort"]
control_cols = ["hiv_prevalence_pavg_lag1", "fertility_pavg_lag1", "gdp_pc_ppp_lag1", "population_lag1", "conflict"]
rating_cols = ['health_max_proj_5yr', 'w_avg_rating_lag5']


health_data_cols = ["country"] + target_cols + treatment_cols + momentum_cols + control_cols + rating_cols

In [49]:
# first go for the paper
# replication_exclude = ['prior_mort_decline', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth',
#                       'inflation', 'trade_share_gdp', 'freedom_house', 'mean_pc_last_5']

health_est = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5_ppd', health_data_cols,
                            remove_feature_cols=["prior_mort_decline"], add_country_feffects=False, 
                            add_constant=False, log_target=True, log_treatment=True)

experiment.extract_treatment_results('Health Replication', health_est, 'mortality_under5_pavg', 'mean_pc_last_5_ppd', health_data_cols, None)

{'Label': 'Health Replication',
 'Target': 'mortality_under5_pavg',
 'Regression P': 0.0,
 'Treatment column': 'mean_pc_last_5_ppd',
 'Treatment significance': 1.1624845092514475e-08,
 'Treatment coefficient': -0.008542495409015971,
 'Sig feature coefficient': {'lag_log_mort': 0.9321,
  'hiv_prevalence_pavg_lag1': 0.0083,
  'fertility_pavg_lag1': 0.0346,
  'gdp_pc_ppp_lag1': -0.0,
  'conflict': 0.0263,
  'health_max_proj_5yr': -0.0134},
 'All p-values': {'mean_pc_last_5_ppd': 0.0,
  'lag_log_mort': 0.0,
  'hiv_prevalence_pavg_lag1': 0.0,
  'fertility_pavg_lag1': 0.0,
  'gdp_pc_ppp_lag1': 0.0,
  'population_lag1': 0.875,
  'conflict': 0.0399,
  'health_max_proj_5yr': 0.0,
  'w_avg_rating_lag5': 0.1198},
 'Number significant FE': 0,
 'Mean coefficient on FE': 0,
 'Keyword args': None}

In [50]:
print(health_est.summary())

                                  OLS Regression Results                                  
Dep. Variable:     mortality_under5_pavg   R-squared (uncentered):                   0.999
Model:                               OLS   Adj. R-squared (uncentered):              0.999
Method:                    Least Squares   F-statistic:                          2.535e+05
Date:                   Wed, 08 Sep 2021   Prob (F-statistic):                        0.00
Time:                           12:14:41   Log-Likelihood:                          1287.0
No. Observations:                   1671   AIC:                                     -2556.
Df Residuals:                       1662   BIC:                                     -2507.
Df Model:                              9                                                  
Covariance Type:               nonrobust                                                  
                               coef    std err          t      P>|t|      [0.025      0.97

In [35]:
# now add in controls for macro conditions
cols_rep2 = health_data_cols + ['inflation', 'cash_surplus_deficit', 'trade_share_gdp']
health_est2 = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5', cols_rep2,
                          add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

experiment.extract_treatment_results('Health Replication with Macro', health_est2, 'mortality_under5_pavg', 'mean_pc_last_5', cols_rep2, None)

{'Label': 'Health Replication with Macro',
 'Target': 'mortality_under5_pavg',
 'Regression P': 0.0,
 'Treatment column': 'mean_pc_last_5',
 'Treatment significance': 1.2205091297635886e-13,
 'Treatment coefficient': -0.033189583627926775,
 'Sig feature coefficient': {'mean_pc_last_5_ppd': -0.0045,
  'lag_log_mort': 0.9406,
  'hiv_prevalence_pavg_lag1': 0.0092,
  'fertility_pavg_lag1': 0.0377,
  'gdp_pc_ppp_lag1': -0.0,
  'population_lag1': -0.0,
  'trade_share_gdp': -0.0004},
 'All p-values': {'mean_pc_last_5_ppd': 0.0412,
  'lag_log_mort': 0.0,
  'hiv_prevalence_pavg_lag1': 0.0,
  'fertility_pavg_lag1': 0.0,
  'gdp_pc_ppp_lag1': 0.0002,
  'population_lag1': 0.0002,
  'conflict': 0.2275,
  'health_satisfactory_proj': 0.9777,
  'w_avg_rating': 0.3606,
  'inflation': 0.0713,
  'cash_surplus_deficit': 0.4351,
  'trade_share_gdp': 0.0009,
  'mean_pc_last_5': 0.0},
 'Number significant FE': 0,
 'Mean coefficient on FE': 0,
 'Keyword args': None}

* Specification 6

```qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            lag_physicians_rate  lag_female_adult_literacy ///
            lag_access_water lag_access_sanitation ///
            hiv_prevalence conflict i.period i.nregionname, r```

In [36]:
'female_adult_literacy' in health_df

True

In [37]:
# 'female_adult_literacy',
cols_rep3 = health_data_cols + ['access_water', 'access_sanitation', 'physicians_rate']

health_est3 = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5', 
                          cols_rep3, add_country_feffects=True, add_period_feffects=True,
                            add_constant=False, log_target=True, log_treatment=True)

experiment.extract_treatment_results('Health Replication 6', health_est3, 'mortality_under5_pavg', 'mean_pc_last_5', health_data_cols, None)

{'Label': 'Health Replication 6',
 'Target': 'mortality_under5_pavg',
 'Regression P': 0.0,
 'Treatment column': 'mean_pc_last_5',
 'Treatment significance': 2.5119747118836807e-18,
 'Treatment coefficient': -0.028825367294643724,
 'Sig feature coefficient': {'mean_pc_last_5_ppd': -0.0031,
  'lag_log_mort': 0.7437,
  'hiv_prevalence_pavg_lag1': 0.0154,
  'fertility_pavg_lag1': 0.0883,
  'gdp_pc_ppp_lag1': -0.0},
 'All p-values': {'mean_pc_last_5_ppd': 0.0054,
  'lag_log_mort': 0.0,
  'hiv_prevalence_pavg_lag1': 0.0,
  'fertility_pavg_lag1': 0.0,
  'gdp_pc_ppp_lag1': 0.0,
  'population_lag1': 0.0967,
  'conflict': 0.082,
  'health_satisfactory_proj': 0.4496,
  'w_avg_rating': 0.1304,
  'access_water': 0.0,
  'access_sanitation': 0.0,
  'physicians_rate': 0.1226,
  'mean_pc_last_5': 0.0,
  'AGO': 0.0,
  'ALB': 0.1184,
  'ARG': 0.0015,
  'ARM': 0.0807,
  'ATG': 0.0,
  'AZE': 0.0007,
  'BDI': 0.0142,
  'BEN': 0.0,
  'BFA': 0.0,
  'BGD': 0.0038,
  'BGR': 0.0158,
  'BIH': 0.0,
  'BLR': 0.059

In [38]:
print(health_est3.summary())

                              OLS Regression Results                             
Dep. Variable:     mortality_under5_pavg   R-squared:                       0.995
Model:                               OLS   Adj. R-squared:                  0.995
Method:                    Least Squares   F-statistic:                     2437.
Date:                   Wed, 08 Sep 2021   Prob (F-statistic):               0.00
Time:                           12:12:30   Log-Likelihood:                 2010.3
No. Observations:                   1460   AIC:                            -3793.
Df Residuals:                       1346   BIC:                            -3190.
Df Model:                            113                                         
Covariance Type:               nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [39]:
# now just with positive rating
cols_rep4 = health_data_cols + ['access_water', 'access_sanitation', 'physicians_rate']

health_est4 = experiment.evaluate_treatment(health_df[health_df.w_avg_rating > 0], 'mortality_under5_pavg', 'mean_pc_last_5', 
                          cols_rep3, add_country_feffects=False, add_period_feffects=False,
                            add_constant=False, log_target=True, log_treatment=True)

experiment.extract_treatment_results('Health Replication Only Rating', health_est3, 'mortality_under5_pavg', 'mean_pc_last_5', health_data_cols, None)

{'Label': 'Health Replication Only Rating',
 'Target': 'mortality_under5_pavg',
 'Regression P': 0.0,
 'Treatment column': 'mean_pc_last_5',
 'Treatment significance': 2.5119747118836807e-18,
 'Treatment coefficient': -0.028825367294643724,
 'Sig feature coefficient': {'mean_pc_last_5_ppd': -0.0031,
  'lag_log_mort': 0.7437,
  'hiv_prevalence_pavg_lag1': 0.0154,
  'fertility_pavg_lag1': 0.0883,
  'gdp_pc_ppp_lag1': -0.0},
 'All p-values': {'mean_pc_last_5_ppd': 0.0054,
  'lag_log_mort': 0.0,
  'hiv_prevalence_pavg_lag1': 0.0,
  'fertility_pavg_lag1': 0.0,
  'gdp_pc_ppp_lag1': 0.0,
  'population_lag1': 0.0967,
  'conflict': 0.082,
  'health_satisfactory_proj': 0.4496,
  'w_avg_rating': 0.1304,
  'access_water': 0.0,
  'access_sanitation': 0.0,
  'physicians_rate': 0.1226,
  'mean_pc_last_5': 0.0,
  'AGO': 0.0,
  'ALB': 0.1184,
  'ARG': 0.0015,
  'ARM': 0.0807,
  'ATG': 0.0,
  'AZE': 0.0007,
  'BDI': 0.0142,
  'BEN': 0.0,
  'BFA': 0.0,
  'BGD': 0.0038,
  'BGR': 0.0158,
  'BIH': 0.0,
  'B

In [40]:
print(health_est4.summary())

                                  OLS Regression Results                                  
Dep. Variable:     mortality_under5_pavg   R-squared (uncentered):                   1.000
Model:                               OLS   Adj. R-squared (uncentered):              1.000
Method:                    Least Squares   F-statistic:                          2.993e+04
Date:                   Wed, 08 Sep 2021   Prob (F-statistic):                   1.57e-232
Time:                           12:12:34   Log-Likelihood:                          173.50
No. Observations:                    152   AIC:                                     -321.0
Df Residuals:                        139   BIC:                                     -281.7
Df Model:                             13                                                  
Covariance Type:               nonrobust                                                  
                               coef    std err          t      P>|t|      [0.025      0.97

### Partialling out and EconML (on health)

In [41]:
health_df["log_mort"] = np.log(health_df["mortality_under5_pavg"])
health_df["log_mean_pc_last_5"] = np.log(health_df["mean_pc_last_5"])
health_df["log_proj_rating"] = np.log(health_df["w_avg_rating"].replace(0, np.nan)).fillna(0).fillna(0)

In [42]:
health_df.health_max_proj_5yr.describe()

count    9464.000000
mean        0.412854
std         1.346078
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.000000
Name: health_max_proj_5yr, dtype: float64

In [43]:
pd.set_option('precision', 2)

In [47]:
health_feature_cols = ['lag_log_mort', 'hiv_prevalence_pavg_lag1', 'fertility_pavg_lag1', 'gdp_pc_ppp_lag1', 
             'population_lag1']

health_df = explore_util.lag_variable_simple(health_df, 'mean_pc_last_5_ppd', 1)
health_df = explore_util.lag_variable_simple(health_df, 'w_avg_rating', 5)

# do this manually first, as EconML results are proving volatile and highly counter-intuitive
hdlm_df = health_df.copy()
# do some scaling
cols_to_scale = ['mortality_under5_pavg', 'w_avg_rating_lag5', 'health_max_proj_5yr', 'mean_pc_last_5_ppd_lag1'] + health_feature_cols
for col in cols_to_scale:
    hdlm_df[col] = (hdlm_df[col] - hdlm_df[col].mean()) / hdlm_df[col].std()

rating_col = 'health_max_proj_5yr'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'mortality_under5_pavg'

health_data_sets = { 
    "all_years": hdlm_df, 
    "only_from_raters": hdlm_df[hdlm_df[magnitude_col] > 0], 
    "only_rated": hdlm_df[hdlm_df[rating_col] > 0]
}

max_proj_df = partial_out_crawl("health_max_proj_5yr", target_col, cols_to_scale, health_data_sets)
last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, health_data_sets)

pd.concat((max_proj_df, last_proj_df)).round(2)

Partialling out, target: mortality_under5_pavg, treatment: health_max_proj_5yr
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  1396
Partialling out, target: mortality_under5_pavg, treatment: health_max_proj_5yr
Number of observations before dropping NA:  473
Number of observations after dropping NA:  368
Partialling out, target: mortality_under5_pavg, treatment: health_max_proj_5yr
Number of observations before dropping NA:  855
Number of observations after dropping NA:  349
Partialling out, target: mortality_under5_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  1396
Partialling out, target: mortality_under5_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  473
Number of observations after dropping NA:  368
Partialling out, target: mortality_under5_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  855
Numbe

Unnamed: 0,label,treatment,resid_rsq,nums,resid_pval,treatment_pval,treatment_coeff,x_rsq,x_pval,x_maxsigcoeff
0,all_years,health_max_proj_5yr,0.03,1396.0,0.0,0.0,-0.04,0.89,0.0,0.6
1,only_from_raters,health_max_proj_5yr,0.02,368.0,0.01,0.01,-0.03,0.86,0.0,0.69
2,only_rated,health_max_proj_5yr,0.01,349.0,0.18,0.18,-0.02,0.93,0.0,0.54
0,all_years,w_avg_rating_lag5,0.0,1396.0,0.31,0.31,-0.01,0.89,0.0,0.6
1,only_from_raters,w_avg_rating_lag5,0.0,368.0,0.19,0.19,-0.01,0.86,0.0,0.68
2,only_rated,w_avg_rating_lag5,0.0,349.0,0.43,0.43,-0.0,0.93,0.0,0.53


### Prior EconML: skipping use for now

In [53]:
health_est_ldml = LinearDML(model_y=LassoCV(), model_t=LassoCV())

if "access_water" not in control_cols:
    control_cols += ["physicians_rate", "health_share_gov_exp"]

Y, T, X, W = experiment.assemble_econml_tuples(health_df, "log_mort", "log_mean_pc_last_5",
                                              control_cols + ["log_proj_rating"])
health_est_ldml.fit(Y, T, X=X, W=W)
health_est_ldml.summary()

health_df.w_avg_rating.describe()

health_df['w_avg_rating_std'] = (health_df['w_avg_rating_lag5'] - health_df['w_avg_rating_lag5'].mean()) / health_df['w_avg_rating_lag5'].std()
health_df['log_scale_avg_rating'] = experiment.safe_log(health_df, 'w_avg_rating_std')

Y, T, X, W = experiment.assemble_econml_tuples(health_df[health_df.w_avg_rating > 0], "log_mort", "log_scale_avg_rating",
                                              control_cols + ["log_mean_pc_last_5"])

health_est_ldml.fit(Y, T, X=X, W=W)
health_est_ldml.summary()

Co-variance matrix is undertermined. Inference will be invalid!
invalid value encountered in log


Coefficient Results:  Level values must be unique: ['hiv_prevalence_pavg_lag1', 'fertility_pavg_lag1', 'gdp_pc_ppp_lag1', 'population_lag1', 'conflict', 'physicians_rate', 'physicians_rate', 'health_share_gov_exp', 'health_share_gov_exp', 'physicians_rate', 'physicians_rate', 'health_share_gov_exp', 'health_share_gov_exp', 'log_proj_rating'] on level 0
Coefficient Results:  Level values must be unique: ['hiv_prevalence_pavg_lag1', 'fertility_pavg_lag1', 'gdp_pc_ppp_lag1', 'population_lag1', 'conflict', 'physicians_rate', 'physicians_rate', 'health_share_gov_exp', 'health_share_gov_exp', 'physicians_rate', 'physicians_rate', 'health_share_gov_exp', 'health_share_gov_exp', 'log_mean_pc_last_5'] on level 0


Co-variance matrix is undertermined. Inference will be invalid!


0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,-0.275,0.0,-973.422,0.0,-0.275,-0.275


In [52]:
health_df['w_avg_rating_std'] = (health_df['w_avg_rating'] - health_df['w_avg_rating'].mean()) / health_df['w_avg_rating'].std()
health_df['log_scale_avg_rating'] = experiment.safe_log(health_df, 'w_avg_rating_std')

Y, T, X, W = experiment.assemble_econml_tuples(health_df[health_df.w_avg_rating > 0], "log_mort", "log_scale_avg_rating",
                                              control_cols + ["log_mean_pc_last_5"])

health_est_ldml.fit(Y, T, X=X, W=W)
health_est_ldml.summary()

invalid value encountered in log


0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
hiv_prevalence_pavg_lag1,0.151,0.014,10.992,0.0,0.128,0.174
fertility_pavg_lag1,0.092,0.022,4.213,0.0,0.056,0.128
gdp_pc_ppp_lag1,0.0,0.0,0.541,0.589,-0.0,0.0
population_lag1,0.0,0.0,4.311,0.0,0.0,0.0
conflict,-1.782,0.0,-6851.556,0.0,-1.783,-1.782
physicians_rate,0.751,0.0,8725.127,0.0,0.75,0.751
health_share_gov_exp,-0.169,0.032,-5.275,0.0,-0.222,-0.117
log_mean_pc_last_5,0.874,0.009,101.655,0.0,0.86,0.888

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,-1.593,0.004,-396.828,0.0,-1.6,-1.587


In [None]:
2 ** (0.083) - 1

## WASH replications

In [None]:
# Access = a0 + a1 * Aid + a2 * Aid^2 + beta * controls + country FE + error term

* Access (country, year) = access_water or access_sanitation  (each of them is separately used as the dependent variable, for Table 4 and Table 5, while the other is included as a lagged term in the controls). They also split urban and rural, but I think we can ignore this for now
* Aid (country, year) = Aid targeted to the water and sanitation sector as a percentage of GDP. So Aid = 100 * commit_wash / (gdp_pc * population)
* Controls (country, year): 
   - adult_literacy; log(gdp_pc), lagged(access_water or access_sanitation) and 3 others that are not exactly in the dataset but have reasonably close proxies:
   - Government spending on health (% of GDP)  is not in the dataset, but a reasonably close one is health_share_gov_exp = Government health expenditure (% of general government expenditure)
   - Age dependency ratio is not in the dataset, but a reasonably close one is young_population
   - Government stability from ICRG is not in the dataset, but reasonably close ones are conflict and freedom_house
* Other details:
   - Period = 1990-2010
   - Sample restricted to SSA countries only

In [82]:
wash_df = health_df.copy()

wash_df = experiment.add_project_and_aid_cols(wash_df, "wash")

wash_df['wash_aid'] = wash_df['mean_pc_last_5'] * 100 / (wash_df['gdp_pc_ppp'])
wash_df['wash_aid_sq'] = wash_df['wash_aid'] ** 2

wash_df['log_gdp_pc'] = experiment.safe_log(wash_df, 'gdp_pc_ppp')
wash_df = experiment.take_avg_and_lag(wash_df, "health_share_gov_exp")
wash_df = experiment.take_avg_and_lag(wash_df, "adult_literacy")

wash_df['access_water_pavg'] = explore_util.rolling_country_agg(wash_df, "access_water", 5, "mean")
wash_df['access_san_pavg'] = explore_util.rolling_country_agg(wash_df, "access_sanitation", 5, "mean")

wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 1)
wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 1)
wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 5)
wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 5)

Generating mean per capita commitments over prior years
Marking whether a satisfactory project concluded in that year
Taking maximum of weighted rating of concluded projects in prior period


In [84]:
wash_controls_constant = ['adult_literacy_pavg', 'log_gdp_pc', 'health_share_gov_exp_pavg_lag1',
                'young_population', 'conflict', 'freedom_house']

wash_access_cols = ['access_water_pavg', 'access_san_pavg', 'access_water_pavg_lag5', 'access_san_pavg_lag5']
wash_proj_cols = ['wash_satisfactory_proj', 'wash_max_proj_5yr']

wash_df['log_wash_aid_sq'] = experiment.safe_log(wash_df, 'wash_aid_sq')

In [85]:
remove_for_water = ["access_san_pavg", "access_water_pavg_lag5"]
# remove_for_water = []

water_est = experiment.evaluate_treatment(wash_df[wash_df.gdp_pc_ppp < 10000], "access_water_pavg", "wash_aid",
                                          wash_controls_constant + wash_access_cols + wash_proj_cols, # + ['log_wash_aid_sq'],
                                          remove_feature_cols=remove_for_water,
                                          log_target=True, log_treatment=True, add_constant=True, 
                                          add_country_feffects=True)

experiment.extract_treatment_results('Access to Water Estimate', water_est, 'access_water_pavg', 'wash_aid', wash_controls_constant, None)

{'Label': 'Access to Water Estimate',
 'Target': 'access_water_pavg',
 'Regression P': 5.202179391877371e-193,
 'Treatment column': 'wash_aid',
 'Treatment significance': 1.0969022573104455e-07,
 'Treatment coefficient': 0.06950637010040998,
 'Sig feature coefficient': {'young_population': 0.0174},
 'All p-values': {'const': 0.0001,
  'adult_literacy_pavg': 0.1116,
  'log_gdp_pc': 0.3549,
  'health_share_gov_exp_pavg_lag1': 0.6811,
  'young_population': 0.0128,
  'conflict': 0.7165,
  'freedom_house': 0.4009,
  'access_san_pavg_lag5': 0.0,
  'wash_satisfactory_proj': 0.8137,
  'wash_max_proj_5yr': 0.9828,
  'wash_aid': 0.0,
  'AGO': 0.9532,
  'ALB': 0.9992,
  'ARM': 0.7673,
  'AZE': 0.5125,
  'BDI': 0.9217,
  'BEN': 0.0,
  'BFA': 0.0,
  'BGD': 0.003,
  'BGR': 0.0795,
  'BIH': 0.7657,
  'BLR': 0.4099,
  'BLZ': 0.2144,
  'BOL': 0.0187,
  'BTN': 0.0,
  'BWA': 0.5441,
  'CAF': 0.0055,
  'CHL': 0.0299,
  'CHN': 0.0099,
  'CIV': 0.0,
  'CMR': 0.634,
  'COD': 0.9711,
  'COG': 0.0006,
  'COL':

In [86]:
# effect of doubling:
print("Doubling effect: ", 2 ** (water_est.params['wash_aid']) - 1)
print(water_est.summary())

Doubling effect:  0.049357575919604546
                            OLS Regression Results                            
Dep. Variable:      access_water_pavg   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.756
Method:                 Least Squares   F-statistic:                     27.53
Date:                Wed, 08 Sep 2021   Prob (F-statistic):          5.20e-193
Time:                        13:00:28   Log-Likelihood:                 499.34
No. Observations:                 857   AIC:                            -796.7
Df Residuals:                     756   BIC:                            -316.6
Df Model:                         100                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [87]:
remove_for_san = ["access_water_pavg", "access_san_pavg_lag5"]
# remove_for_water = []

san_est = experiment.evaluate_treatment(wash_df[wash_df.gdp_pc_ppp < 10000], "access_san_pavg", "wash_aid",
                                          wash_controls_constant + wash_access_cols + ['w_avg_rating'], # + wash_proj_cols + ['log_wash_aid_sq'],
                                          remove_feature_cols=remove_for_san,
                                          log_target=True, log_treatment=True, add_constant=True, 
                                          add_country_feffects=True)

print(san_est.summary())

                            OLS Regression Results                            
Dep. Variable:        access_san_pavg   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                     177.0
Date:                Wed, 08 Sep 2021   Prob (F-statistic):               0.00
Time:                        13:00:28   Log-Likelihood:                 448.97
No. Observations:                 859   AIC:                            -695.9
Df Residuals:                     758   BIC:                            -215.6
Df Model:                         100                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [88]:
# effect of doubling here
print("Effect of doubling: ", 2 ** (san_est.params["wash_aid"]) - 1)

Effect of doubling:  0.03759160967554398


In [89]:
write_wash_results = False

if write_wash_results:
    with open("../results/intial_run/water_initial_fe_linear.txt", "w") as file:
        file.write(water_est.summary().as_text())

    with open("../results/intial_run/sanitation_initial_fe_linear.txt", "w") as file:
        file.write(san_est.summary().as_text())

#     gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

### Partialling out and EconML

In [90]:
wash_feature_cols = ['adult_literacy_pavg', 'log_gdp_pc', 'health_share_gov_exp_pavg_lag1',  
             'young_population', 'conflict', 'freedom_house', 'access_water_pavg_lag5']

wash_df = explore_util.lag_variable_simple(wash_df, 'mean_pc_last_5_ppd', 1)
wash_df = explore_util.lag_variable_simple(wash_df, 'w_avg_rating', 5)

# do this manually first, as EconML results are proving volatile and highly counter-intuitive
wdlm_df = wash_df.copy()
# do some scaling
cols_to_scale = ['access_san_pavg', 'w_avg_rating_lag5', 'wash_max_proj_5yr', 'mean_pc_last_5_ppd_lag1'] + wash_feature_cols
for col in cols_to_scale:
    wdlm_df[col] = (wdlm_df[col] - wdlm_df[col].mean()) / wdlm_df[col].std()

rating_col = 'wash_max_proj_5yr'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'access_san_pavg'

wash_data_sets = { 
    "all_years": wdlm_df, 
    "only_from_raters": wdlm_df[wdlm_df[magnitude_col] > 0], 
    "only_rated": wdlm_df[wdlm_df[rating_col] > 0]
}

wash_max_proj_df = partial_out_crawl("wash_max_proj_5yr", target_col, cols_to_scale, wash_data_sets)
wash_last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, wash_data_sets)

# print(est_target.summary())

Partialling out, target: access_san_pavg, treatment: wash_max_proj_5yr
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  1201
Partialling out, target: access_san_pavg, treatment: wash_max_proj_5yr
Number of observations before dropping NA:  473
Number of observations after dropping NA:  369
Partialling out, target: access_san_pavg, treatment: wash_max_proj_5yr
Number of observations before dropping NA:  855
Number of observations after dropping NA:  364
Partialling out, target: access_san_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  9464
Number of observations after dropping NA:  1201
Partialling out, target: access_san_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  473
Number of observations after dropping NA:  369
Partialling out, target: access_san_pavg, treatment: w_avg_rating_lag5
Number of observations before dropping NA:  855
Number of observations after dropping NA:  364


In [91]:
wash_crawl_df = pd.concat((wash_max_proj_df, wash_last_proj_df)).round(2)

In [92]:
wash_crawl_df

Unnamed: 0,label,treatment,resid_rsq,nums,resid_pval,treatment_pval,treatment_coeff,x_rsq,x_pval,x_maxsigcoeff
0,all_years,wash_max_proj_5yr,0.01,1201.0,0.0,0.0,0.03,0.79,0.0,0.43
1,only_from_raters,wash_max_proj_5yr,0.01,369.0,0.08,0.08,0.03,0.78,0.0,0.48
2,only_rated,wash_max_proj_5yr,0.0,364.0,0.8,0.8,-0.01,0.81,0.0,0.56
0,all_years,w_avg_rating_lag5,0.0,1201.0,0.2,0.2,-0.01,0.79,0.0,0.43
1,only_from_raters,w_avg_rating_lag5,0.0,369.0,0.35,0.35,-0.01,0.78,0.0,0.5
2,only_rated,w_avg_rating_lag5,0.02,364.0,0.0,0.0,-0.02,0.81,0.0,0.56


In [93]:
feature_cols = wash_feature_cols

In [94]:
wash_df['log_san_access'] = experiment.safe_log(wash_df, 'access_san_pavg')
wash_df['log_wash_aid'] = experiment.safe_log(wash_df, 'wash_aid')
wash_df['log_avg_rating'] = experiment.safe_log(wash_df, 'w_avg_rating')

In [95]:
rating_col = 'log_avg_rating'
magnitude_col = 'log_wash_aid'
target_col = 'log_san_access'

# feature_cols = [col for col in data_cols if col != treatment_col and col != target_col]
Y, T, X, W = experiment.assemble_econml_tuples(wash_df, target_col=target_col, treatment_col=magnitude_col, 
                                               feature_cols=feature_cols + [rating_col])

In [96]:
wash_est = LinearDML(model_y='auto', model_t='auto')
wash_est.fit(Y, T, X=X, W=W)
print(wash_est.summary())

                                Coefficient Results                                 
                               point_estimate stderr zstat  pvalue ci_lower ci_upper
------------------------------------------------------------------------------------
adult_literacy_pavg                    -0.002  0.001  -1.59  0.112   -0.003      0.0
log_gdp_pc                              0.018   0.01  1.801  0.072    0.002    0.034
health_share_gov_exp_pavg_lag1         -0.004  0.002 -2.697  0.007   -0.007   -0.002
young_population                       -0.005  0.001 -4.903    0.0   -0.006   -0.003
conflict                                0.074  0.038  1.955  0.051    0.012    0.136
freedom_house                          -0.005  0.005 -1.007  0.314   -0.014    0.003
access_water_pavg_lag5                 -0.003  0.001 -2.515  0.012   -0.004   -0.001
log_avg_rating                          0.028  0.013   2.13  0.033    0.006     0.05
                       CATE Intercept Results                    

In [97]:
print(wash_est.summary())

                                Coefficient Results                                 
                               point_estimate stderr zstat  pvalue ci_lower ci_upper
------------------------------------------------------------------------------------
adult_literacy_pavg                    -0.002  0.001  -1.59  0.112   -0.003      0.0
log_gdp_pc                              0.018   0.01  1.801  0.072    0.002    0.034
health_share_gov_exp_pavg_lag1         -0.004  0.002 -2.697  0.007   -0.007   -0.002
young_population                       -0.005  0.001 -4.903    0.0   -0.006   -0.003
conflict                                0.074  0.038  1.955  0.051    0.012    0.136
freedom_house                          -0.005  0.005 -1.007  0.314   -0.014    0.003
access_water_pavg_lag5                 -0.003  0.001 -2.515  0.012   -0.004   -0.001
log_avg_rating                          0.028  0.013   2.13  0.033    0.006     0.05
                       CATE Intercept Results                    

In [98]:
Y, T, X, W = experiment.assemble_econml_tuples(wash_df, target_col=target_col, treatment_col=rating_col, 
                                               feature_cols=feature_cols + [magnitude_col])

est = LinearDML(model_y='auto', model_t='auto')
est.fit(Y, T, X=X, W=W)
print(est.summary())

                                Coefficient Results                                 
                               point_estimate stderr zstat  pvalue ci_lower ci_upper
------------------------------------------------------------------------------------
adult_literacy_pavg                    -0.006  0.002  -3.64    0.0   -0.008   -0.003
log_gdp_pc                              0.049  0.022  2.211  0.027    0.012    0.085
health_share_gov_exp_pavg_lag1          0.018  0.006  2.807  0.005    0.007    0.028
young_population                       -0.009  0.004 -2.282  0.023   -0.016   -0.003
conflict                               -0.001  0.083 -0.015  0.988   -0.138    0.135
freedom_house                           0.053  0.013    4.0    0.0    0.031    0.074
access_water_pavg_lag5                  0.003  0.002  1.735  0.083      0.0    0.005
log_wash_aid                            0.036  0.015  2.507  0.012    0.013     0.06
                       CATE Intercept Results                    

In [99]:
wash_est = LinearDML(model_y='auto', model_t='auto')
wash_est.fit(Y, T, X=X, W=W)
print(wash_est.summary())

                                Coefficient Results                                 
                               point_estimate stderr zstat  pvalue ci_lower ci_upper
------------------------------------------------------------------------------------
adult_literacy_pavg                    -0.005  0.002 -3.119  0.002   -0.008   -0.002
log_gdp_pc                              0.048  0.024  2.018  0.044    0.009    0.087
health_share_gov_exp_pavg_lag1          0.017  0.006  2.671  0.008    0.007    0.028
young_population                        -0.01  0.004 -2.465  0.014   -0.017   -0.003
conflict                               -0.026  0.085 -0.301  0.763   -0.166    0.115
freedom_house                            0.05  0.013  3.861    0.0    0.029    0.071
access_water_pavg_lag5                  0.002  0.001  1.288  0.198   -0.001    0.004
log_wash_aid                            0.043  0.015  2.834  0.005    0.018    0.068
                       CATE Intercept Results                    