In [None]:
%config Completer.use_jedi = False
from importlib import reload

In [None]:
import statsmodels.api as sm

from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from IPython.display import Image, display

from dowhy import CausalModel

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warnings for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [None]:
import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

In [None]:
from econml.dml import LinearDML

### Load in data frames

1. Load in panel assembled by DG, and country code cross-matches
2. For each country-year, calculate mean growth in education indicators at year + lag

In [None]:
# Load the project table, then derive the Education treatment frame from it.
project_df = load_util.load_projects() # loads in aid data projects
# Sector ratings for 'Education'; any missing values in the result are replaced with 0.
# NOTE(review): presumably keyed by country_code / end_year (the merge keys used further down) - confirm.
edu_treatment_df = load_util.assemble_sector_ratings(project_df, 'Education').fillna(0) # loads in Education sector ratings
edu_treatment_df.head()

In [None]:
# Toggle: rebuild the education panel from source (True) or read the cached CSV (False).
recreate_df = False

if recreate_df:
    panel_df, panel_source = experiment.assemble_replication_panel('education')
    # Left join: every panel row is kept; treatment values attach where (year, country)
    # matches (end_year, country_code).
    df = panel_df.merge(edu_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    # All treatment-frame columns except the join keys; unmatched country-years get 0.
    treatment_cols = [col for col in edu_treatment_df.columns if col not in ["end_year", "country_code"]]
    df[treatment_cols] = df[treatment_cols].fillna(0)
else:
    # index_col=0: the first CSV column is restored as the index.
    df = pd.read_csv('../data/transformed_data/education_df.csv', index_col=0)

In [None]:
# Remove columns left over from earlier versions of the cached frame
# (quicker than regenerating it). Names that are absent are skipped silently.
legacy_cols = (
    [f"education_lag_{i}_growth" for i in range(1, 10)]
    + [f"education_lag_{i}_count" for i in range(1, 10)]
    + ["education_lag_-4_count", "mean_pc_last_5", "future_edu_ner", "lagged_edu_ner", "ner_growth"]
)
df = df.drop(columns=legacy_cols, errors="ignore")

### Construct lagged indicators, fill in needed columns

In [None]:
# Number of rows for each distinct value of project_completed_year.
df.project_completed_year.value_counts()

Generate some necessary feature and outcome columns

In [None]:
def add_lagged_future_edu_outcomes(df):
    """Ensure the past and future net-enrollment columns exist on `df`.

    Each column is generated only when it is missing, so the function can be
    applied to a frame that already carries one or both of them.

    * 'edu_ner_lag5'   - requested via lag_variable_simple(df, 'edu_ner', 5).
    * 'future_edu_ner' - lag_variable_simple(df, 'edu_ner', -5), whose
      'edu_ner_lag-5' output column is renamed.

    Returns the (possibly new) DataFrame.
    """
    if 'edu_ner_lag5' not in df:
        print('Generating past net enrollment rates')
        df = explore_util.lag_variable_simple(df, 'edu_ner', 5)

    # Nothing more to do when the forward-looking column is already present.
    if 'future_edu_ner' in df:
        return df

    print('Generating future net enrollment rates')
    shifted = explore_util.lag_variable_simple(df, 'edu_ner', -5)
    return shifted.rename(columns={'edu_ner_lag-5': 'future_edu_ner'})

In [None]:
# Add the project/aid columns (default sector), then the lagged and future enrollment outcomes.
df = experiment.add_project_and_aid_cols(df)
df = add_lagged_future_edu_outcomes(df)

In [None]:
# Spot-check rows where the 5-year lag of edu_ner is available.
df[df.edu_ner_lag5.notna()][['year', 'country', 'edu_ner', 'edu_ner_lag5', 'pc_commit_education', 'education_mean_pc_rolling_5', 'education_mean_pc_rolling_5_lag1']].head()

In [None]:
# Toggle: overwrite the cached education CSV with the current frame.
store_df = False

if store_df:
    df.to_csv('../data/transformed_data/education_df.csv')

### Check data coverage, isolating features with a lot of uncovered years/countries

A column is a culprit for an N/A row if it alone is N/A, i.e., it is responsible for the country-year being unusable. If multiple columns are N/A then none are culprits

In [None]:
# Toggle: run the culprit-count coverage analysis below.
conduct_coverage_analysis = False

In [None]:
# panel_source is only created when the panel is rebuilt from scratch;
# otherwise read the raw country panel from disk.
if 'panel_source' not in globals():
    panel_source = pd.read_csv('../data/countrypanel.csv')

In [None]:
if conduct_coverage_analysis:

    # We only want to do this on the data columns, so don't check the others
    panel_label_columns = ['year', 'countrycode', 'regionname', 'fcv_ind', 'lendingtype', 'incomelevel']
    # The original referenced `panel_non_data`, which is never defined in this notebook;
    # the panel label columns defined just above are used in its place.
    # NOTE(review): confirm this is the intended list of non-data columns.
    df_label_columns = ['year', 'country', 'ppd_countrycode', 'wdi_countryname', 'project_completed_year'] + panel_label_columns

    data_cols = [col for col in df.columns if col not in df_label_columns]
    ddf_data_cols = [col for col in panel_source.columns if col not in panel_label_columns]

    culprit_counts, null_df = experiment.extract_culprit_counts(df, data_cols)
    ddf_culprits, ddf_nulls = experiment.extract_culprit_counts(panel_source, ddf_data_cols)

    def non_zero_culps(culpc):
        """Return the entries of `culpc` with a positive count, largest count first.

        The original lambda ignored its argument and always read `culprit_counts`,
        so both prints below showed the same dictionary.
        """
        ranked = sorted(culpc.items(), key=lambda item: item[1], reverse=True)
        return {key: value for key, value in ranked if value > 0}

    print(non_zero_culps(culprit_counts))
    print(non_zero_culps(ddf_culprits))

*Note*: The standard WDI has no cash surplus/deficit series, so net borrowing in the year (which is available) is used instead.

## Education: replicate original, then probe the specification

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount]
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [None]:
# Keep the lagged rolling mean only where the 5-year-lagged commitment is non-null:
# the boolean mask zeroes out the other rows, and every 0 (masked or genuine) becomes NaN.
# NOTE(review): pc_commit_education_lag5 must already exist here; the explicit 5-year lag
# of pc_commit_education is only requested in the next cell - confirm where it comes from.
df['mean_pc_last_5'] = (df.pc_commit_education_lag5.notna() * df.education_mean_pc_rolling_5_lag1).replace({ 0: np.nan })

In [None]:
# Period index: years since 1900 in 5-year units, rounded to the nearest integer, offset by 10.
df['period'] = round((df.year - 1900) / 5) - 10
# Ratio (not difference) of current net enrollment to its 5-year lag.
df['prior_ner_growth'] = df['edu_ner'] / df['edu_ner_lag5']
# 5-year lag of per-capita education commitments.
df = explore_util.lag_variable_simple(df, 'pc_commit_education', 5)

In [None]:
# Spot-check one country (CHN): rows where the lagged rolling mean is available.
df[(df.education_mean_pc_rolling_5_lag1.notna()) & (df.country == 'CHN')][['country', 'year', 'edu_ner', 'mean_pc_last_5', 'pc_commit_education', 'education_mean_pc_rolling_5_lag1']]

In [None]:
# Number of rows with no lagged rolling mean.
df.education_mean_pc_rolling_5_lag1.isna().sum()

In [None]:
# Columns handed to every education regression; each run prunes this list
# through remove_feature_cols.
data_cols = [
    'edu_ner',
    'mean_pc_last_5',
    'edu_share_gov_exp',
    'edu_pupil_teacher',
    'young_population',
    'gdp_pc_ppp',
    'cash_surplus_deficit',
    'inflation',
    'trade_share_gdp',
    'freedom_house',
    'prior_ner_growth',
    'prior_4year_growth',
]

# first go for the paper
edu_treatment_col = 'mean_pc_last_5'

initial_drop = [
    'prior_ner_growth',
    'edu_share_gov_exp',
    'w_avg_rating',
    'satisfactory_proj',
    'prior_4year_growth',
]
r_est = experiment.evaluate_treatment(
    df, 'edu_ner', edu_treatment_col, data_cols,
    remove_feature_cols=initial_drop,
    add_country_feffects=True,
    add_constant=False,
    log_target=True,
    log_treatment=True,
    add_period_feffects=False,
)

# Last expression of the cell, so the extracted result record is displayed.
experiment.extract_treatment_results('Replication', r_est, 'edu_ner', edu_treatment_col, data_cols, None)

In [None]:
# Full summary of the replication estimate.
print(r_est.summary())

In [None]:
# Specification grid: label -> target column, treatment column and the keyword arguments
# forwarded to experiment.evaluate_treatment. The variants differ mainly in which columns
# are listed in remove_feature_cols and in the fixed-effect / constant switches.
# NOTE(review): 'w_avg_rating' and 'satisfactory_proj' are listed in remove_feature_cols but
# are not part of data_cols; unless evaluate_treatment adds them itself, the
# 'Include rating information' variant may fit the same features as the straight replication.
search_grid = {
    # Baseline: same switches and dropped columns as the stand-alone replication fit.
    'Straight replication': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )
    },
    # Baseline without the two rating columns in the removal list.
    'Include rating information': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'prior_4year_growth']
        )
    },
    # Baseline plus period fixed effects; this variant also switches the constant on.
    'Include period fixed effects': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=True, log_target=True, log_treatment=True, add_period_feffects=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )        
    },
    # Baseline with 'prior_4year_growth' no longer removed.
    'Include prior growth across education outcomes': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'edu_share_gov_exp', 'satisfactory_proj', 'w_avg_rating']
        )
    },
    # Baseline with 'edu_share_gov_exp' no longer removed.
    'Properly include govt share spend': {
        'target_col': 'edu_ner', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )
    },
    # Target switched to the enrollment ratio; the level 'edu_ner' is removed from the features.
    'Use growth in NER as target': {
        'target_col': 'prior_ner_growth', 
        'treatment_col': edu_treatment_col,
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=True, log_target=True, log_treatment=True,
            remove_feature_cols=['edu_ner', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )

    }
}

In [None]:
# Fit every specification in the grid, keeping both the fitted estimator (by label)
# and the extracted result record (in grid order).
treatment_search_result = []
estimators = {}
for spec_label, spec in search_grid.items():
    target = spec['target_col']
    treatment = spec['treatment_col']
    fit_kwargs = spec['est_kw_args']

    fitted = experiment.evaluate_treatment(df, target, treatment, data_cols, **fit_kwargs)
    treatment_search_result.append(
        experiment.extract_treatment_results(spec_label, fitted, target, treatment, data_cols, fit_kwargs)
    )
    estimators[spec_label] = fitted

# One row per specification.
gsearch_results = pd.DataFrame(treatment_search_result)

In [None]:
# Table of extracted results, one row per specification in the grid.
gsearch_results

In [None]:
# Result record for the second grid entry ('Include rating information').
treatment_search_result[1]

In [None]:
# Toggle: persist selected regression summaries and the search table to disk.
write_results = False

if write_results:
    # Text summary of the specification that targets the enrollment ratio.
    with open("./growth_not_abs_ner_target.txt", "w") as file:
        file.write(estimators['Use growth in NER as target'].summary().as_text())

    # Text summary of the baseline replication from the grid.
    with open("./base_replication_full.txt", "w") as file:
        file.write(estimators['Straight replication'].summary().as_text())

    # Floats are written with four decimal places.
    gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

In [None]:
# Toggle: print the stand-alone replication summary next to the rating variant from the grid.
display_repl_summary = True

if display_repl_summary:
    print("*** Standard Replication: ")
    print(r_est.summary())
    
    print("*** Replication with average rating: ")
    print(estimators['Include rating information'].summary())

## Health

Process:

1. Repeat outcome variable formation, using lagged construction
2. Construct sectoral aid per capita using utilities
3. Construct specification, using Diana's original notebook

In [None]:
# Sector ratings for 'Health' from the same project table; missing values are replaced with 0.
health_treatment_df = load_util.assemble_sector_ratings(project_df, 'Health').fillna(0)
health_treatment_df.head()

In [None]:
# Toggle: rebuild the health panel from source (True) or read the cached CSV (False).
recreate_health_df = False

if recreate_health_df:
    hp_df, hp_source = experiment.assemble_replication_panel('health')
    # Left join: every panel row is kept; treatment values attach where (year, country)
    # matches (end_year, country_code).
    health_df = hp_df.merge(health_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
    # Fill from health_df itself. The original read these columns from `df` (the education
    # frame), which would copy education values - or fail on missing columns - instead of
    # zero-filling the unmatched health rows.
    health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
else:
    # index_col=0: the first CSV column is restored as the index.
    health_df = pd.read_csv('../data/transformed_data/health_df.csv', index_col=0)

In [None]:
# Add the health-sector project/aid columns to the panel.
health_df = experiment.add_project_and_aid_cols(health_df, sector='health')

# Treatment columns = everything in the treatment frame except the merge keys; gaps become 0.
treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
print("Treatment counts: ", health_df.project_completed_year.value_counts())

In [None]:
# Toggle: overwrite the cached health CSV with the current frame.
store_health_df = False

if store_health_df:
    health_df.to_csv('../data/transformed_data/health_df.csv')

In [None]:
# Following the paper, the measured health outcomes are smoothed with a 5-period
# rolling mean and then lagged by one period.
# Macro variables are only lagged: the measurement/volatility justification for
# averaging does not make sense for them.
measured_cols = ['mortality_under5', 'fertility', 'hiv_prevalence']
for m_col in measured_cols:
    pavg_col = f"{m_col}_pavg"
    health_df[pavg_col] = explore_util.rolling_country_agg(health_df, m_col, 5, "mean")
    health_df = explore_util.lag_variable_simple(health_df, pavg_col, 1)

macro_cols = ["gdp_pc_ppp", "population"]
for m_col in macro_cols:
    health_df = explore_util.lag_variable_simple(health_df, m_col, 1)

In [None]:
# Same construction as for education: keep the lagged rolling mean only where the
# 5-year-lagged commitment is non-null; every 0 (masked or genuine) becomes NaN.
health_df['mean_pc_last_5'] = (
    health_df.pc_commit_health_lag5.notna() * health_df.health_mean_pc_rolling_5_lag1
).replace({ 0: np.nan })

# Momentum terms: log of the smoothed mortality five periods back, and the ratio of
# current to lagged smoothed mortality.
health_df = explore_util.lag_variable_simple(health_df, "mortality_under5_pavg", 5)
health_df["lag_log_mort"] = np.log(health_df["mortality_under5_pavg_lag5"])
health_df['prior_mort_decline'] = health_df['mortality_under5_pavg'] / health_df['mortality_under5_pavg_lag5']

* Specification 1:

```
qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            hiv_prevalence conflict i.period, r
```

In [None]:
# health_df.columns

In [None]:
# Column groups for the health specification.
target_cols = ["mortality_under5_pavg"]
treatment_cols = ["mean_pc_last_5"]

momentum_cols = ["lag_log_mort"]
control_cols = ["hiv_prevalence_pavg_lag1", "fertility_pavg_lag1", "gdp_pc_ppp_lag1", "population_lag1", "conflict"]
rating_cols = ['health_satisfactory_proj', 'w_avg_rating']


# Full column list handed to evaluate_treatment; "country" is included alongside the model columns.
health_data_cols = ["country"] + target_cols + treatment_cols + momentum_cols + control_cols + rating_cols

In [None]:
# first go for the paper
# replication_exclude = ['prior_mort_decline', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth',
#                       'inflation', 'trade_share_gdp', 'freedom_house', 'mean_pc_last_5']

# Pooled fit: no country fixed effects and no constant; target and treatment are logged.
health_est = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5', health_data_cols,
                            remove_feature_cols=["prior_mort_decline"], add_country_feffects=False, 
                            add_constant=False, log_target=True, log_treatment=True)

# Last expression of the cell, so the extracted result record is displayed.
experiment.extract_treatment_results('Health Replication', health_est, 'mortality_under5_pavg', 'mean_pc_last_5', health_data_cols, None)

In [None]:
# Full summary of the first health replication.
print(health_est.summary())

In [None]:
# now add in controls for macro conditions
cols_rep2 = health_data_cols + ['inflation', 'cash_surplus_deficit', 'trade_share_gdp']
# Same switches as the first health fit, on the extended column list.
health_est2 = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5', cols_rep2,
                          add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

experiment.extract_treatment_results('Health Replication with Macro', health_est2, 'mortality_under5_pavg', 'mean_pc_last_5', cols_rep2, None)

* Specification 6

```
qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            lag_physicians_rate  lag_female_adult_literacy ///
            lag_access_water lag_access_sanitation ///
            hiv_prevalence conflict i.period i.nregionname, r
```

In [None]:
# Is the female literacy control named in Specification 6 available as a column?
'female_adult_literacy' in health_df

In [None]:
# 'female_adult_literacy' is left out of the extra controls below.
cols_rep3 = health_data_cols + ['access_water', 'access_sanitation', 'physicians_rate']

# Specification 6 variant: country and period fixed effects, no constant; target and treatment logged.
health_est3 = experiment.evaluate_treatment(health_df, 'mortality_under5_pavg', 'mean_pc_last_5', 
                          cols_rep3, add_country_feffects=True, add_period_feffects=True,
                            add_constant=False, log_target=True, log_treatment=True)

# Extract with the column list the model was actually fit on. The original passed
# health_data_cols here, which omits the three extra controls, whereas the macro
# replication passes its own fitted list.
experiment.extract_treatment_results('Health Replication 6', health_est3, 'mortality_under5_pavg', 'mean_pc_last_5', cols_rep3, None)

In [None]:
# Full summary of the Specification 6 fit.
print(health_est3.summary())

### Now trying EconML (on health)

In [None]:
from econml.dml import LinearDML

In [None]:
# Linear double-ML estimator with all-default settings; it is refit by each cell that calls .fit().
eml_est = LinearDML()

In [None]:
# Log-transformed outcome and treatment used by the EconML fits.
health_df["log_mort"] = np.log(health_df["mortality_under5_pavg"])
health_df["log_mean_pc_last_5"] = np.log(health_df["mean_pc_last_5"])

# Zero ratings are masked before taking the log; masked and missing rows end up as 0.
nonzero_rating = health_df["w_avg_rating"].replace(0, np.nan)
health_df["log_proj_rating"] = np.log(nonzero_rating).fillna(0)

In [None]:
# Add the extra EconML controls exactly once. The original guard tested for "access_water",
# which is never added, so every re-run of this cell appended the two columns again and
# left duplicates in control_cols.
for extra_control in ["physicians_rate", "health_share_gov_exp"]:
    if extra_control not in control_cols:
        control_cols.append(extra_control)

In [None]:
# Outcome = log mortality, treatment = log aid; the controls plus the log project rating
# are passed as the remaining columns (split into X and W by the helper).
Y, T, X, W = experiment.assemble_econml_tuples(health_df, "log_mort", "log_mean_pc_last_5",
                                              control_cols + ["log_proj_rating"])
eml_est.fit(Y, T, X=X, W=W)
eml_est.summary()

In [None]:
# Roles swapped: the (unlogged) project rating is now the treatment and log aid joins the controls.
# The same estimator object is refit, so the previous fit is replaced.
Y, T, X, W = experiment.assemble_econml_tuples(health_df, "log_mort", "w_avg_rating",
                                              control_cols + ["log_mean_pc_last_5"])

eml_est.fit(Y, T, X=X, W=W)
eml_est.summary()

In [None]:
# so far copied from the process directly
# Fits a depth-1 policy tree and then a depth-1 policy forest on the Y, T, X, W left over
# from the previous cell (treatment = w_avg_rating).
# NOTE(review): the DR policy learners are documented for discrete treatments; confirm that
# the rating treatment is suitable here before relying on these results.

from econml.policy import DRPolicyTree, DRPolicyForest
from sklearn.ensemble import RandomForestRegressor

# fit a single binary decision tree policy
policy = DRPolicyTree(max_depth=1, min_impurity_decrease=0.01, honest=True)
policy.fit(Y, T, X=X, W=W)
# predict the recommended treatment
recommended_T = policy.predict(X)
# plot the binary decision tree
plt.figure(figsize=(10,5))
policy.plot()
# get feature importances
importances = policy.feature_importances_

# fit a binary decision forest
# `policy`, `recommended_T` and `importances` are overwritten by the forest results from here on.
policy = DRPolicyForest(max_depth=1, min_impurity_decrease=0.01, honest=True)
policy.fit(Y, T, X=X, W=W)
# predict the recommended treatment
recommended_T = policy.predict(X)
# plot the first tree in the ensemble
plt.figure(figsize=(10,5))
policy.plot(0)
# get feature importances
importances = policy.feature_importances_


## WASH replications

In [None]:
# Access = a0 + a1 * Aid + a2 * Aid^2 + beta * controls + country FE + error term

* Access (country, year) = access_water or access_sanitation  (each of them is separately used as the dependent variable, for Table 4 and Table 5, while the other is included as a lagged term in the controls). They also split urban and rural, but I think we can ignore this for now
* Aid (country, year) = Aid targeted to the water and sanitation sector as a percentage of GDP. So Aid = 100 * commit_wash / (gdp_pc * population)
* Controls (country, year): 
   - adult_literacy; log(gdp_pc), lagged(access_water or access_sanitation) and 3 others that are not exactly in the dataset but have reasonably close proxies:
   - Government spending on health (% of GDP)  is not in the dataset, but a reasonably close one is health_share_gov_exp = Government health expenditure (% of general government expenditure)
   - Age dependency ratio is not in the dataset, but a reasonably close one is young_population
   - Government stability from ICRG is not in the dataset, but reasonably close ones are conflict and freedom_house
* Other details:
   - Period = 1990-2010
   - Sample restricted to SSA countries only

In [None]:
def safe_log(data, col):
    """Natural log of `data[col]` with zeros and undefined results mapped to 0.

    Zeros are masked to NaN before the log is taken, and every NaN in the
    result (masked zeros, missing inputs, logs of negative values) is then
    replaced by 0.
    """
    masked = data[col].replace(0, np.nan)
    logged = np.log(masked)
    return logged.fillna(0)

In [None]:
# The WASH analysis starts from a copy of the education frame, so every column built
# above (including the education 'mean_pc_last_5') is carried over.
wash_df = df.copy()

In [None]:
# List the columns inherited from the education frame.
wash_df.columns

In [None]:
# Add the WASH-sector project/aid columns (the sector is passed positionally here).
wash_df = experiment.add_project_and_aid_cols(wash_df, "wash")

In [None]:
# wash_df began as a copy of the education frame, so the 'mean_pc_last_5' it inherited is
# the education aid measure. Rebuild it from the WASH columns before deriving the aid share;
# otherwise the WASH regressions would use education aid as their treatment.
# NOTE(review): the two column names follow the pattern used for education and health
# (pc_commit_<sector>_lag5, <sector>_mean_pc_rolling_5_lag1) - confirm that
# add_project_and_aid_cols(wash_df, "wash") produces them.
wash_df['mean_pc_last_5'] = (
    wash_df.pc_commit_wash_lag5.notna() * wash_df.wash_mean_pc_rolling_5_lag1
).replace({ 0: np.nan })

# Per-capita aid as a percentage of per-capita GDP, plus its square for the quadratic term.
wash_df['wash_aid'] = wash_df['mean_pc_last_5'] * 100 / (wash_df['gdp_pc_ppp'])
wash_df['wash_aid_sq'] = wash_df['wash_aid'] ** 2

In [None]:
# Spot-check the last rows that have a WASH aid value, with the inputs it was built from.
wash_df[wash_df.wash_aid.notna()][['year', 'country', 'commit_wash', 'population', 'mean_pc_last_5', 'gdp_pc_ppp', 'wash_aid', 'wash_aid_sq']].tail()

In [None]:
def take_avg_and_lag(data, col):
    """Add a 5-period rolling mean of `col` and a one-period lag of that mean.

    The rolling mean is written onto `data` in place as '<col>_pavg'; the frame
    returned by the lag helper (which carries the lagged column) is handed back.
    """
    avg_col = f"{col}_pavg"
    data[avg_col] = explore_util.rolling_country_agg(data, col, 5, "mean")
    return explore_util.lag_variable_simple(data, avg_col, 1)

In [None]:
# Log GDP per capita (zeros and undefined logs map to 0 via safe_log), then smoothed and
# lagged versions of the health-spending and literacy controls.
wash_df['log_gdp_pc'] = safe_log(wash_df, 'gdp_pc_ppp')
wash_df = take_avg_and_lag(wash_df, "health_share_gov_exp")
wash_df = take_avg_and_lag(wash_df, "adult_literacy")

In [None]:
# 5-period rolling means of the two access outcomes (sanitation is stored under the shorter 'access_san' prefix).
wash_df['access_water_pavg'] = explore_util.rolling_country_agg(wash_df, "access_water", 5, "mean")
wash_df['access_san_pavg'] = explore_util.rolling_country_agg(wash_df, "access_sanitation", 5, "mean")

In [None]:
# Earlier one-period-lag variant, kept for reference:
# wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 1)
# wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 1)

# Current choice: lag the smoothed access outcomes by five periods.
wash_df = explore_util.lag_variable_simple(wash_df, 'access_water_pavg', 5)
wash_df = explore_util.lag_variable_simple(wash_df, "access_san_pavg", 5)

In [None]:
# Controls shared by the water and sanitation regressions.
wash_controls_constant = ['adult_literacy_pavg', 'log_gdp_pc', 'health_share_gov_exp_pavg_lag1',
                'young_population', 'conflict', 'freedom_house']

# Both access outcomes and their 5-period lags; each regression removes the ones it should not see.
wash_access_cols = ['access_water_pavg', 'access_san_pavg', 'access_water_pavg_lag5', 'access_san_pavg_lag5']
# Project-level WASH columns.
wash_proj_cols = ['wash_satisfactory_proj', 'wash_max_proj_5yr']

In [None]:
# Log of the squared aid share (zeros and undefined logs map to 0 via safe_log).
wash_df['log_wash_aid_sq'] = safe_log(wash_df, 'wash_aid_sq')

In [None]:
# Columns withheld from the water regression: the smoothed sanitation outcome and the
# 5-period lag of the water outcome.
remove_for_water = ["access_san_pavg", "access_water_pavg_lag5"]
# remove_for_water = []

# Column list the water model is fit on; reused for the result extraction below.
water_feature_cols = wash_controls_constant + wash_access_cols + wash_proj_cols # + ['log_wash_aid_sq']

# Sample restricted to rows with gdp_pc_ppp below 10000; country fixed effects plus a
# constant, with target and treatment logged.
water_est = experiment.evaluate_treatment(wash_df[wash_df.gdp_pc_ppp < 10000], "access_water_pavg", "wash_aid",
                                          water_feature_cols,
                                          remove_feature_cols=remove_for_water,
                                          log_target=True, log_treatment=True, add_constant=True, 
                                          add_country_feffects=True)

# Extract with the same column list the model was fit on; the original passed only
# wash_controls_constant, unlike the education and health calls, which pass the fitted list.
# NOTE(review): confirm how extract_treatment_results uses this argument.
experiment.extract_treatment_results('Access to Water Estimate', water_est, 'access_water_pavg', 'wash_aid', water_feature_cols, None)

In [None]:
# Full summary of the water-access fit.
print(water_est.summary())

In [None]:
# effect of doubling:
# target and treatment are both requested in logs for this fit, so 2 ** coefficient - 1 is
# read as the relative change in access when aid doubles.
2 ** (water_est.params['wash_aid']) - 1

In [None]:
# Columns withheld from the sanitation regression: the smoothed water outcome and the
# 5-period lag of the sanitation outcome (mirror image of the water setup).
remove_for_san = ["access_water_pavg", "access_san_pavg_lag5"]
# remove_for_san = []

# Same sample restriction and switches as the water fit; the project columns are left out here.
san_est = experiment.evaluate_treatment(wash_df[wash_df.gdp_pc_ppp < 10000], "access_san_pavg", "wash_aid",
                                          wash_controls_constant + wash_access_cols, # + wash_proj_cols + ['log_wash_aid_sq'],
                                          remove_feature_cols=remove_for_san,
                                          log_target=True, log_treatment=True, add_constant=True, 
                                          add_country_feffects=True)

print(san_est.summary())

In [None]:
# effect of doubling here
# Reported as a relative change (2 ** coefficient - 1), the same form as the water
# calculation; the original omitted the "- 1" and so showed the ratio instead.
2 ** (san_est.params["wash_aid"]) - 1

In [None]:
# Toggle: write the WASH regression summaries to disk.
# NOTE(review): unlike the other output toggles in this notebook, this one is left enabled,
# so a full run writes files. The target directory is spelled "intial_run" - confirm it
# exists under that name.
write_wash_results = True

if write_wash_results:
    with open("../results/intial_run/water_initial_fe_linear.txt", "w") as file:
        file.write(water_est.summary().as_text())

    with open("../results/intial_run/sanitation_initial_fe_linear.txt", "w") as file:
        file.write(san_est.summary().as_text())

#     gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

### Also trying GMM model

In [None]:
from statsmodels.sandbox.regression.gmm import GMM, LinearIVGMM, NonlinearIVGMM

In [None]:
# Complete-case frame for the GMM experiment: same gdp_pc_ppp < 10000 restriction as the
# fixed-effect fits, limited to the outcome, aid terms, controls and access columns.
gmm_df = wash_df[wash_df.gdp_pc_ppp < 10000][
    ['access_water_pavg', 'wash_aid', 'log_wash_aid_sq'] + wash_controls_constant + wash_access_cols
]
# Drop every row that has a missing value in any of the selected columns.
gmm_df = gmm_df.dropna()
gmm_df.head()

In [None]:
# endog, exog, instrument = map(lambda cols: np.asarray(gmm_df[cols]), [
#     ['access_water_pavg'], ['wash_aid'], ['conflict']
# ])
# gm_water_est = LinearIVGMM(endog, exog, instrument)
# gm_water_est.fit()

In [None]:
# Summary statistics of the smoothed water-access outcome.
wash_df.access_water_pavg.describe()

In [None]:
# Pooled variant of the water fit: full sample (no GDP restriction) and no country fixed effects.
water_est2 = experiment.evaluate_treatment(wash_df, "access_water_pavg", "wash_aid",
                                          wash_controls_constant + wash_access_cols + wash_proj_cols,
                                          remove_feature_cols=remove_for_water, log_target=True, log_treatment=True,
                                          add_constant=True, add_country_feffects=False)

In [None]:
# Full summary of the pooled water fit.
print(water_est2.summary())

In [None]:
# The original call passed a single positional argument, while every other call in this
# notebook passes (label, estimator, target, treatment, columns, kwargs).
# NOTE(review): the pooled estimate fit just above is assumed to be the intended subject;
# the label is new - adjust both if the fixed-effect estimate was meant.
experiment.extract_treatment_results('Access to Water Estimate (no country FE)', water_est2,
                                     'access_water_pavg', 'wash_aid',
                                     wash_controls_constant + wash_access_cols + wash_proj_cols, None)

In [None]:
# Education fit with the satisfactory-project indicator as the treatment.
# evaluate_treatment is only reachable through the `experiment` module alias; the bare name
# used originally raises NameError on a fresh kernel.
n_est = experiment.evaluate_treatment(df, 'edu_ner', 'satisfactory_proj', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True)

In [None]:
# Full summary of the satisfactory-project fit.
print(n_est.summary())

In [None]:
# future: do a pair-wise comparison, e.g., using BERT and others