In [None]:
%config Completer.use_jedi = False
from importlib import reload

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

pd.set_option("mode.chained_assignment", None)

import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

from econml.dml import LinearDML, SparseLinearDML, NonParamDML
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Project-level aid data; passed to load_util.assemble_sector_ratings in the sector cells below.
project_df = load_util.load_projects() # loads in aid data projects

### Helper methods

In [None]:
def partial_out_crawl(specific_rating_col, outcome_col, feature_cols, sector_data_sets):
    """Run ``experiment.perform_dml_on_df`` once per labelled data set and tabulate the results.

    Parameters
    ----------
    specific_rating_col : forwarded to ``perform_dml_on_df`` after ``outcome_col``
    outcome_col : forwarded to ``perform_dml_on_df`` after the label
    feature_cols : forwarded to ``perform_dml_on_df`` as its last argument
    sector_data_sets : mapping of label -> data set; iterated in its own key order

    Returns
    -------
    pandas.DataFrame
        One row per label, built from the fourth element of each call's
        returned 4-tuple (the result dict).
    """
    def _result_row(label):
        # The first three members of the returned 4-tuple are not needed here.
        _, _, _, row = experiment.perform_dml_on_df(
            sector_data_sets[label], label, outcome_col, specific_rating_col, feature_cols)
        return row

    return pd.DataFrame([_result_row(label) for label in sector_data_sets])

In [None]:
def write_to_text(estimator, filename):
    """Write ``estimator.summary().as_text()`` to ../results/rating_regressions/<filename>.txt.

    The summary text is rendered before the file is opened, so a failure while
    building the summary cannot truncate a previously written result file.

    Parameters
    ----------
    estimator : object exposing ``summary()`` whose result exposes ``as_text()``
    filename : file stem (no extension) inside ../results/rating_regressions/
    """
    summary_text = estimator.summary().as_text()
    with open(f"../results/rating_regressions/{filename}.txt", "w") as file:
        file.write(summary_text)

In [None]:
def write_result_df(result_df, filename):
    """Save ``result_df`` as ../results/rating_regressions/<filename>.csv.

    The index is not written and floats are formatted with four decimals.
    """
    destination = f"../results/rating_regressions/{filename}.csv"
    result_df.to_csv(destination, index=False, float_format='%.4f')

### Education replication, ratings check

1. Load in panel assembled by DG, and country code cross-matches
2. For each country-year, calculate mean growth in education indicators at year + lag

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount]
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [None]:
# Build the education country-year panel used by every education cell below.
# NOTE(review): edu_treatment_df is not referenced again in this notebook.
edu_treatment_df = load_util.assemble_sector_ratings(project_df, 'Education').fillna(0) # sector project ratings, missing values set to 0
edu_df = pd.read_csv('../data/transformed_data/education_df.csv', index_col=0)
    
# Attach project / aid columns (this is where project_completed_year comes from).
edu_df = experiment.add_project_and_aid_cols(edu_df, rated_too=False)
print(edu_df.project_completed_year.value_counts())

# The guards skip regeneration when the CSV already carries the lagged columns.
# presumably lag_variable_simple(df, col, k) adds a column named f"{col}_lag{k}" -- confirm in util.explore
if 'edu_ner_lag5' not in edu_df:
    print('Generating past net enrollment rates')
    edu_df = explore_util.lag_variable_simple(edu_df, 'edu_ner', 5)

if 'future_edu_ner' not in edu_df:
    print('Generating future net enrollment rates')
    # A negative lag is used as a lead; the resulting column is renamed for readability.
    edu_df = explore_util.lag_variable_simple(edu_df, 'edu_ner', -5)
    edu_df = edu_df.rename(columns = { 'edu_ner_lag-5': 'future_edu_ner'})
    
# Five-year period index: (year - 1900) / 5, rounded, shifted so that 1950 maps to 0.
edu_df['period'] = round((edu_df.year - 1900) / 5) - 10
# Ratio of current enrolment to its 5-step lag.
edu_df['prior_ner_growth'] = edu_df['edu_ner'] / edu_df['edu_ner_lag5']
# 5-period "mean" aggregate of edu_ner via rolling_country_agg; this is the regression target below.
edu_df['edu_ner_pavg_5'] = explore_util.rolling_country_agg(edu_df, 'edu_ner', 5, 'mean')

In [None]:
# Outcome and default treatment for the education regressions.
edu_target_col = 'edu_ner_pavg_5'
edu_treatment_col = 'mean_pc_last_5'

# Alternative treatment measures used by the specification grid.
treatment_cols = ['mean_pc_last_5', 'mean_pc_last_5_ppd', 'mean_pc_last_5_wb']

# Project-rating regressors.
rating_cols = ['education_max_proj_5yr', 'education_satisfactory_proj']

# Full regressor list: target, treatment, ratings, then the controls.
data_cols = [
    edu_target_col,
    edu_treatment_col,
    *rating_cols,
    'edu_share_gov_exp',
    'edu_pupil_teacher',
    'young_population',
    'gdp_pc_ppp',
    'cash_surplus_deficit',
    'inflation',
    'trade_share_gdp',
    'freedom_house',
    'prior_ner_growth',
]

# Columns dropped for the first replication run.
initial_drop = [
    'prior_ner_growth',
    'edu_share_gov_exp',
    'prior_4year_growth',
]

In [None]:
# Baseline education replication: log target and log treatment with country
# fixed effects, no constant and no period effects; columns in initial_drop are removed.
edu_est = experiment.evaluate_treatment(
    edu_df, edu_target_col, edu_treatment_col, data_cols,
    remove_feature_cols=initial_drop, 
    add_country_feffects=True, add_constant=False, 
    log_target=True, log_treatment=True, add_period_feffects=False)

# Condense the fitted estimator into a labelled result record.
straight_results = experiment.extract_treatment_results(
    'Replication', edu_est, edu_target_col, edu_treatment_col, data_cols, None
)

print(straight_results)
print(edu_est.summary())

In [None]:
def assemble_args(arg_dict, default_args):
    """Return ``arg_dict`` completed with any keys it lacks from ``default_args``.

    Values already present in ``arg_dict`` win. A new dict is returned and
    neither input is modified, so a specification grid can be re-run after its
    defaults change without keeping stale values (such as an old ``df``) that
    an earlier run copied into it.

    Parameters
    ----------
    arg_dict : dict of per-specification overrides
    default_args : dict of fallback values

    Returns
    -------
    dict
        Keys of ``arg_dict`` first (original order), then the missing defaults.
    """
    merged = dict(arg_dict)
    for key, value in default_args.items():
        merged.setdefault(key, value)
    return merged

In [None]:
def crawl_specifications(search_grid, default_args):
    """Fit one treatment regression per specification and tabulate the results.

    Each entry of ``search_grid`` is completed with ``default_args`` through
    ``assemble_args`` and passed as keyword arguments to
    ``experiment.evaluate_treatment``. The fit is then condensed with
    ``experiment.extract_treatment_results``.

    Parameters
    ----------
    search_grid : dict mapping a specification label to a dict of overrides
    default_args : dict of keyword arguments shared by all specifications;
        together with the overrides it must supply ``target_col`` and
        ``treatment_col``

    Returns
    -------
    (pandas.DataFrame, dict)
        One result row per specification, and the fitted estimators by label.
    """
    treatment_search_result = []
    estimators = {}

    for label, args in search_grid.items():
        all_args = assemble_args(args, default_args)
        est = experiment.evaluate_treatment(**all_args)
        # Report against the specification's own feature list. The notebook-level
        # education ``data_cols`` is only a fallback when none is given; it was
        # previously passed unconditionally, including for the health grid.
        # NOTE(review): confirm how extract_treatment_results uses this argument.
        spec_feature_cols = all_args['feature_cols'] if 'feature_cols' in all_args else data_cols
        results = experiment.extract_treatment_results(
            label, est, all_args['target_col'], all_args['treatment_col'], spec_feature_cols, {}
        )
        treatment_search_result.append(results)
        estimators[label] = est

    gsearch_results = pd.DataFrame(treatment_search_result)
    return gsearch_results, estimators

In [None]:
# Keyword arguments shared by every education specification; each grid entry
# below overrides or extends them.
edu_default_args = {
    'df': edu_df,
    'target_col': edu_target_col,
    'treatment_col': edu_treatment_col,
    'feature_cols': data_cols,
    'add_country_feffects': True,
    'add_constant': False,
    'log_target': True,
    'log_treatment': True,
}

search_grid = {
    # Default treatment; rating columns and alternative aid measures removed.
    'straight_replication': {
        'remove_feature_cols': [
            'prior_ner_growth', 'edu_share_gov_exp',
            'w_avg_rating', 'satisfactory_proj', 'mean_pc_last_5_ppd', 'mean_pc_last_5_wb',
        ],
    },
    # mean_pc_last_5_wb as treatment on the full panel.
    'only_rated_aid_all_data': {
        'treatment_col': 'mean_pc_last_5_wb',
        'remove_feature_cols': [
            'prior_ner_growth', 'edu_share_gov_exp', 'mean_pc_last_5', 'mean_pc_last_5_ppd',
        ],
    },
    # mean_pc_last_5_ppd as treatment, rows with a positive value only.
    'only_rated_aid_narrow_data': {
        'treatment_col': 'mean_pc_last_5_ppd',
        'df': edu_df[edu_df.mean_pc_last_5_ppd > 0],
        'remove_feature_cols': [
            'prior_ner_growth', 'edu_share_gov_exp', 'mean_pc_last_5', 'mean_pc_last_5_wb',
        ],
    },
    # mean_pc_last_5_wb as treatment, rows with a positive value only.
    'only_wb_data_narrow': {
        'treatment_col': 'mean_pc_last_5_wb',
        'df': edu_df[edu_df.mean_pc_last_5_wb > 0],
        'remove_feature_cols': [
            'mean_pc_last_5', 'mean_pc_last_5_ppd',
            'edu_share_gov_exp', 'satisfactory_proj',
            'education_satisfactory_proj', 'prior_ner_growth',
        ],
    },
}

In [None]:
# Fit every education specification; the trailing expression displays the result table.
gsearch_results, estimators = crawl_specifications(search_grid, edu_default_args)
gsearch_results

*Now partialling out*

In [None]:
# doing partialling out first, as EconML results are proving volatile and highly counter-intuitive
# Control variables for the partialling-out regressions.
feature_cols = ['edu_share_gov_exp', 'edu_pupil_teacher', 'young_population', 'gdp_pc_ppp', 
             'cash_surplus_deficit', 'inflation', 'trade_share_gdp', 'freedom_house']

# Same rolling_country_agg call as the one that builds edu_ner_pavg_5, stored under a second name.
edu_df['rolling_mean_edu_ner'] = explore_util.rolling_country_agg(edu_df, 'edu_ner', 5, 'mean')

# presumably these add 'mean_pc_last_5_ppd_lag1' and 'w_avg_rating_lag5', which are used below -- confirm in util.explore
edu_df = explore_util.lag_variable_simple(edu_df, 'mean_pc_last_5_ppd', 1)
edu_df = explore_util.lag_variable_simple(edu_df, 'w_avg_rating', 5)

# Work on a copy so edu_df keeps the unscaled values.
dlm_df = edu_df.copy()

# perform some scaling
# z-score (subtract the mean, divide by the standard deviation) the outcome,
# both rating measures, the aid magnitude and every control.
cols_to_scale = ['rolling_mean_edu_ner', 'w_avg_rating_lag5', 'mean_pc_last_5_ppd_lag1', "education_max_proj_5yr"] + feature_cols
for col in cols_to_scale:
    dlm_df[col] = (dlm_df[col] - dlm_df[col].mean()) / dlm_df[col].std()

In [None]:
rating_col = 'w_avg_rating_lag5'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'rolling_mean_edu_ner'

# Subsets are selected on the UNSCALED values held in edu_df. The columns in
# dlm_df have been z-scored, so "> 0" there would mean "above the sample mean"
# rather than "received aid" / "has a rating". dlm_df is a row-for-row copy of
# edu_df, so a positional boolean mask is safe; a length mismatch raises.
edu_data_sets = { 
    "all_years": dlm_df, 
    "only_from_raters": dlm_df[(edu_df[magnitude_col] > 0).values], 
    "only_rated": dlm_df[(edu_df[rating_col] > 0).values]
}

# NOTE(review): cols_to_scale (which includes the outcome and both rating
# columns) is passed as the feature list; confirm perform_dml_on_df drops the
# outcome and treatment from its features.
max_proj_df = partial_out_crawl("education_max_proj_5yr", target_col, cols_to_scale, edu_data_sets)
last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, edu_data_sets)

# Stack both rating measures into one table, rounded for display.
pout_edu_results = pd.concat((max_proj_df, last_proj_df)).round(2)
pout_edu_results

In [None]:
# Toggle: set to False to run the notebook without overwriting saved education results.
write_edu_results = True

if write_edu_results:
    # Regression summaries as text, result tables as CSV, all under ../results/rating_regressions/.
    write_to_text(estimators['straight_replication'], "edu_outcomes_no_ratings")
    write_to_text(estimators["only_rated_aid_all_data"], "edu_outcomes_only_rated_with_ratings")
    write_to_text(estimators["only_wb_data_narrow"], "edu_outcomes_only_wb_rated_narrow")
    write_result_df(gsearch_results, "education_ratings_search")
    write_result_df(pout_edu_results, "education_partialling_out")

## Health

Process:

1. Repeat outcome variable formation, using lagged construction
2. Construct sectoral aid per capita using utilities
3. Construct specification, using Diana's original notebook

In [None]:
# Build the health country-year panel used by the health and WASH cells below.
health_treatment_df = load_util.assemble_sector_ratings(project_df, 'Health').fillna(0)
health_df = pd.read_csv('../data/transformed_data/health_df.csv', index_col=0)
health_df = experiment.add_project_and_aid_cols(health_df, sector='health', rated_too=True)

# Every rating/treatment column (all but the two key columns) gets missing values set to 0.
# Note: this rebinds the notebook-level name treatment_cols, which the education section also uses.
treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
print("Treatment counts: ", health_df.project_completed_year.value_counts())

# take rolling five year averages of everything, following paper
# note: not doing this for macro variables as justification does not make sense for those (measurement/volatility)
measured_cols = ['mortality_under5', 'fertility', 'hiv_prevalence']

# For each measured outcome: 5-period mean stored as "<col>_pavg", then lagged by one step.
for m_col in measured_cols:
    health_df[f"{m_col}_pavg"] = explore_util.rolling_country_agg(health_df, m_col, 5, "mean")
    health_df = explore_util.lag_variable_simple(health_df, f"{m_col}_pavg", 1)
    
# Macro variables are lagged by one step without averaging.
macro_cols = ["gdp_pc_ppp", "population"]
for m_col in macro_cols:
    health_df = explore_util.lag_variable_simple(health_df, m_col, 1)

# Momentum terms: log of the 5-step-lagged mortality average, and the ratio of current to lagged.
health_df = explore_util.lag_variable_simple(health_df, "mortality_under5_pavg", 5)
health_df["lag_log_mort"] = np.log(health_df["mortality_under5_pavg_lag5"])
health_df['prior_mort_decline'] = health_df['mortality_under5_pavg'] / health_df['mortality_under5_pavg_lag5']
# presumably adds 'w_avg_rating_lag5', listed in rating_cols in the next cell -- confirm in util.explore
health_df = explore_util.lag_variable_simple(health_df, "w_avg_rating", 5)

* Specification 1:

```
qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            hiv_prevalence conflict i.period, r
```

In [None]:
# Column roles for the health regressions.
# Note: treatment_cols and rating_cols rebind names that earlier cells also define.
target_cols = ["mortality_under5_pavg"]
treatment_cols = ["mean_pc_last_5_ppd"]

# Lagged log mortality, built in the health data cell.
momentum_cols = ["lag_log_mort"]
control_cols = [
    "hiv_prevalence_pavg_lag1", 
    "fertility_pavg_lag1", 
    "gdp_pc_ppp_lag1", 
    "population_lag1", 
    "conflict"
]

rating_cols = [
    'health_max_proj_5yr', 
    'w_avg_rating_lag5'
]

# "country" is carried in the feature list alongside the regressors.
# NOTE(review): presumably consumed by evaluate_treatment for fixed effects -- confirm.
health_data_cols = ["country"] + target_cols + treatment_cols + momentum_cols + control_cols + rating_cols

# Defaults shared by every health specification. The target is spelled out
# here and equals target_cols[0].
health_default_args = dict(df=health_df, target_col="mortality_under5_pavg", treatment_col=treatment_cols[0],
                           feature_cols=health_data_cols, add_constant=False, log_target=True, log_treatment=True)

In [None]:
# Baseline health regression: shared defaults, no country fixed effects.
health_est = experiment.evaluate_treatment(**health_default_args,
                                           remove_feature_cols=["prior_mort_decline"], 
                                           add_country_feffects=False)

# NOTE(review): health_results is assigned here but never displayed or used in a later cell.
health_results = experiment.extract_treatment_results('Health Replication', health_est, 'mortality_under5_pavg', 'mean_pc_last_5_ppd', health_data_cols, None)
print(health_est.summary())

* Specification 6

```
qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            lag_physicians_rate  lag_female_adult_literacy ///
            lag_access_water lag_access_sanitation ///
            hiv_prevalence conflict i.period i.nregionname, r
```

In [None]:
# Specification grid for the health regressions; each entry overrides health_default_args.
# NOTE(review): "prior_mort_decline" and "mean_pc_last_5" are not in
# health_data_cols, so these remove_feature_cols entries only matter if
# evaluate_treatment adds those columns itself -- confirm.
health_specifications = {
    "simple_replication": dict(
        remove_feature_cols=["prior_mort_decline"], 
        add_country_feffects=False
    ),
    # add in controls for macro
    "replication_with_macro": dict(
        feature_cols=health_data_cols + ['inflation', 'cash_surplus_deficit', 'trade_share_gdp'],
        treatment_col="mean_pc_last_5_ppd",
        remove_feature_cols=["mean_pc_last_5"]
    ),
    # access / physician controls plus country and period fixed effects
    "replication_full_controls": dict(
        feature_cols=health_data_cols + ['access_water', 'access_sanitation', 'physicians_rate'],
        treatment_col="mean_pc_last_5_ppd",
        remove_feature_cols=["mean_pc_last_5"],
        add_country_feffects=True, add_period_feffects=True
    ),
    # now just with positive rating
    "only_rated_data": dict(
        df=health_df[health_df.w_avg_rating > 0],
        feature_cols=health_data_cols + ['access_water', 'access_sanitation', 'physicians_rate'],
        treatment_col="mean_pc_last_5_ppd",
        remove_feature_cols=["mean_pc_last_5"]
    )
}

In [None]:
# Fit every health specification; the trailing expression displays the result table.
health_gsearch, health_estimators = crawl_specifications(health_specifications, health_default_args)
health_gsearch

In [None]:
# Full regression table for the specification with the widest control set.
print(health_estimators["replication_full_controls"].summary())

In [None]:
# print(health_estimators["only_rated_data"].summary())

### Partialling out and EconML (on health)

In [None]:
# Log-transformed outcome, aid and rating columns.
health_df["log_mort"] = np.log(health_df["mortality_under5_pavg"])
# NOTE(review): rows where mean_pc_last_5 is 0 become -inf here (np.log(0)); confirm this is acceptable.
health_df["log_mean_pc_last_5"] = np.log(health_df["mean_pc_last_5"])
# A zero rating is treated as "no rating": masked before the log, then mapped back to 0.
health_df["log_proj_rating"] = np.log(health_df["w_avg_rating"].replace(0, np.nan)).fillna(0)

In [None]:
# Control variables for the health partialling-out regressions.
health_feature_cols = ['lag_log_mort', 'hiv_prevalence_pavg_lag1', 'fertility_pavg_lag1', 'gdp_pc_ppp_lag1', 
             'population_lag1']

# NOTE(review): w_avg_rating was already lagged by 5 when health_df was built;
# confirm lag_variable_simple overwrites rather than duplicates the column.
health_df = explore_util.lag_variable_simple(health_df, 'mean_pc_last_5_ppd', 1)
health_df = explore_util.lag_variable_simple(health_df, 'w_avg_rating', 5)

# do this manually first, as EconML results are proving volatile and highly counter-intuitive
# Work on a copy so health_df keeps the unscaled values.
hdlm_df = health_df.copy()
# z-score the outcome, both rating measures, the aid magnitude and every control
cols_to_scale = ['mortality_under5_pavg', 'w_avg_rating_lag5', 'health_max_proj_5yr', 'mean_pc_last_5_ppd_lag1'] + health_feature_cols
for col in cols_to_scale:
    hdlm_df[col] = (hdlm_df[col] - hdlm_df[col].mean()) / hdlm_df[col].std()

rating_col = 'health_max_proj_5yr'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'mortality_under5_pavg'

# Subsets are selected on the UNSCALED values in health_df: after z-scoring,
# "> 0" would mean "above the sample mean" rather than "received aid" /
# "has a rating". hdlm_df is a row-for-row copy of health_df, so a positional
# boolean mask is safe; a length mismatch raises.
health_data_sets = { 
    "all_years": hdlm_df, 
    "only_from_raters": hdlm_df[(health_df[magnitude_col] > 0).values], 
    "only_rated": hdlm_df[(health_df[rating_col] > 0).values]
}

# NOTE(review): cols_to_scale (which includes the outcome and both rating
# columns) is passed as the feature list; confirm perform_dml_on_df drops the
# outcome and treatment from its features.
max_proj_df = partial_out_crawl("health_max_proj_5yr", target_col, cols_to_scale, health_data_sets)
last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, health_data_sets)

# Stack both rating measures into one table, rounded for display.
health_pout_results = pd.concat((max_proj_df, last_proj_df)).round(2)

In [None]:
# Display the health partialling-out table.
health_pout_results

In [None]:
# Toggle: set to False to run the notebook without overwriting saved health results.
write_health_results = True

if write_health_results:
    write_to_text(health_estimators["simple_replication"], "health_outcomes_simple_replication")
    write_to_text(health_estimators['replication_full_controls'], "health_outcomes_full_controls")
    write_to_text(health_estimators["only_rated_data"], "health_outcomes_only_wb_rated_narrow")
    # Result tables go through the shared helper, as for education: same
    # directory as the text summaries (../results/rating_regressions/), no
    # index column, four-decimal floats. They were previously written to
    # ../data/results/ with the index included.
    write_result_df(health_gsearch, "health_ratings_search")
    write_result_df(health_pout_results, "health_ratings_pout")

## WASH replications

Access = a0 + a1 * Aid + a2 * Aid^2 + beta * controls + country FE + error term

* Access (country, year) = access_water or access_sanitation  (each of them is separately used as the dependent variable, for Table 4 and Table 5, while the other is included as a lagged term in the controls). They also split urban and rural, but I think we can ignore this for now
* Aid (country, year) = Aid targeted to the water and sanitation sector as a percentage of GDP. So Aid = 100 * commit_wash / (gdp_pc * population)
* Controls (country, year): 
   - adult_literacy; log(gdp_pc), lagged(access_water or access_sanitation) and 3 others that are not exactly in the dataset but have reasonably close proxies:
   - Government spending on health (% of GDP)  is not in the dataset, but a reasonably close one is health_share_gov_exp = Government health expenditure (% of general government expenditure)
   - Age dependency ratio is not in the dataset, but a reasonably close one is young_population
   - Government stability from ICRG is not in the dataset, but reasonably close ones are conflict and freedom_house
* Other details:
   - Period = 1990-2010
   - Sample restricted to SSA countries only

In [None]:
# The WASH panel starts from a copy of the health panel, so it inherits every
# column derived in the health section.
wash_df = health_df.copy()

# NOTE(review): the copied frame already carries the health-sector project/aid
# columns; confirm add_project_and_aid_cols replaces them (e.g. mean_pc_last_5,
# w_avg_rating) with WASH values rather than keeping the health ones.
wash_df = experiment.add_project_and_aid_cols(wash_df, "wash")

# Aid as a percentage of GDP: per-capita aid over per-capita GDP, times 100.
wash_df['wash_aid'] = wash_df['mean_pc_last_5'] * 100 / (wash_df['gdp_pc_ppp'])
wash_df['wash_aid_sq'] = wash_df['wash_aid'] ** 2

wash_df['log_gdp_pc'] = experiment.safe_log(wash_df, 'gdp_pc_ppp')
# presumably adds "<col>_pavg" and "<col>_pavg_lag1", which the control list in the next cell uses -- confirm in util.experiment
wash_df = experiment.take_avg_and_lag(wash_df, "health_share_gov_exp")
wash_df = experiment.take_avg_and_lag(wash_df, "adult_literacy")

# 5-period means of the two access outcomes.
wash_df['access_water_pavg'] = explore_util.rolling_country_agg(wash_df, "access_water", 5, "mean")
wash_df['access_san_pavg'] = explore_util.rolling_country_agg(wash_df, "access_sanitation", 5, "mean")

# 1-step and 5-step lags of both averaged access outcomes.
wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 1)
wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 1)
wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 5)
wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 5)

# Only referenced in a commented-out part of the default arguments below.
wash_df['log_wash_aid_sq'] = experiment.safe_log(wash_df, 'wash_aid_sq')

In [None]:
# Controls shared by the water and sanitation regressions.
wash_controls_constant = [
    'adult_literacy_pavg',
    'log_gdp_pc',
    'health_share_gov_exp_pavg_lag1',
    'young_population',
    'conflict',
    'freedom_house',
]

# Current and 5-step-lagged access measures; each regression drops two of them.
wash_access_cols = [
    'access_water_pavg',
    'access_san_pavg',
    'access_water_pavg_lag5',
    'access_san_pavg_lag5',
]

# Project-rating columns for the WASH sector.
wash_proj_cols = [
    'wash_satisfactory_proj',
    'wash_max_proj_5yr',
]

# Access columns excluded per outcome: the other outcome's current value and
# the outcome's own 5-step lag.
remove_for_water = ["access_san_pavg", "access_water_pavg_lag5"]
remove_for_san = ["access_water_pavg", "access_san_pavg_lag5"]

In [None]:
# Defaults shared by the water and sanitation regressions. No target_col is
# set here; each outcome cell supplies its own (and its own
# remove_feature_cols, which overrides the water default below).
# Sample: rows with gdp_pc_ppp below 10000.
# NOTE(review): the 10000 threshold is a magic number -- document its source.
wash_default_args = dict(
    df=wash_df[wash_df.gdp_pc_ppp < 10000], treatment_col="wash_aid",
                feature_cols=wash_controls_constant + wash_access_cols + wash_proj_cols, # + ['log_wash_aid_sq'],
                remove_feature_cols=remove_for_water,
                log_target=True, log_treatment=True, add_constant=True, add_country_feffects=True
)

In [None]:
# Access-to-water regression: WASH defaults with the water target and its exclusion list.
water_args = dict(target_col="access_water_pavg", remove_feature_cols=remove_for_water)
water_est = experiment.evaluate_treatment(**assemble_args(water_args, wash_default_args))
# print(experiment.extract_treatment_results('Access to Water Estimate', water_est, 'access_water_pavg', 'wash_aid', wash_controls_constant, None))
# 2 ** coefficient - 1: assuming log_target / log_treatment yield a log-log fit,
# this is the proportional change in the target when the treatment doubles.
print("Doubling effect: ", 2 ** (water_est.params['wash_aid']) - 1)
print(water_est.summary())

In [None]:
# Access-to-sanitation regression: WASH defaults with the sanitation target and its exclusion list.
san_args = dict(target_col="access_san_pavg", remove_feature_cols=remove_for_san)
san_est = experiment.evaluate_treatment(**assemble_args(san_args, wash_default_args))

print(san_est.summary())
# 2 ** coefficient - 1: assuming log_target / log_treatment yield a log-log fit,
# this is the proportional change in the target when the treatment doubles.
print("Effect of doubling: ", 2 ** (san_est.params["wash_aid"]) - 1)

In [None]:
# Toggle: WASH summaries are only written to ../results/rating_regressions/ when this is True.
write_wash_results = False

if write_wash_results:
    write_to_text(water_est, "water_initial_fe_linear")
    write_to_text(san_est, "sanitation_initial_fe_linear")

### Partialling out and EconML

In [None]:
# Control variables for the sanitation partialling-out regressions.
wash_feature_cols = ['adult_literacy_pavg', 'log_gdp_pc', 'health_share_gov_exp_pavg_lag1',  
             'young_population', 'conflict', 'freedom_house', 'access_water_pavg_lag5']

# NOTE(review): wash_df was copied from health_df, which may already hold
# these lagged columns; confirm lag_variable_simple overwrites rather than
# duplicates them.
wash_df = explore_util.lag_variable_simple(wash_df, 'mean_pc_last_5_ppd', 1)
wash_df = explore_util.lag_variable_simple(wash_df, 'w_avg_rating', 5)

# do this manually first, as EconML results are proving volatile and highly counter-intuitive
# Work on a copy so wash_df keeps the unscaled values.
wdlm_df = wash_df.copy()

# z-score the outcome, both rating measures, the aid magnitude and every control
cols_to_scale = ['access_san_pavg', 'w_avg_rating_lag5', 'wash_max_proj_5yr', 'mean_pc_last_5_ppd_lag1'] + wash_feature_cols
for col in cols_to_scale:
    wdlm_df[col] = (wdlm_df[col] - wdlm_df[col].mean()) / wdlm_df[col].std()

rating_col = 'wash_max_proj_5yr'
magnitude_col = 'mean_pc_last_5_ppd_lag1'
target_col = 'access_san_pavg'

# Subsets are selected on the UNSCALED values in wash_df: after z-scoring,
# "> 0" would mean "above the sample mean" rather than "received aid" /
# "has a rating". wdlm_df is a row-for-row copy of wash_df, so a positional
# boolean mask is safe; a length mismatch raises.
wash_data_sets = { 
    "all_years": wdlm_df, 
    "only_from_raters": wdlm_df[(wash_df[magnitude_col] > 0).values], 
    "only_rated": wdlm_df[(wash_df[rating_col] > 0).values]
}

# NOTE(review): cols_to_scale (which includes the outcome and both rating
# columns) is passed as the feature list; confirm perform_dml_on_df drops the
# outcome and treatment from its features.
wash_max_proj_df = partial_out_crawl("wash_max_proj_5yr", target_col, cols_to_scale, wash_data_sets)
wash_last_proj_df = partial_out_crawl("w_avg_rating_lag5", target_col, cols_to_scale, wash_data_sets)

# Stack both rating measures into one table, rounded for display.
wash_pout_results = pd.concat((wash_max_proj_df, wash_last_proj_df)).round(2)

In [None]:
# Display the WASH (sanitation) partialling-out table.
wash_pout_results