In [None]:
%config Completer.use_jedi = False
from importlib import reload

In [None]:
import statsmodels.api as sm

from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from IPython.display import Image, display

from dowhy import CausalModel

In [None]:
# Turn off pandas' chained-assignment warning for the whole session.
pd.set_option("mode.chained_assignment", None)

In [None]:
import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

In [None]:
def add_project_and_aid_cols(sector_df, sector='education'):
    """Attach sector-specific aid and project-rating columns to ``sector_df``.

    Each column is only generated when it is not already present, so cached
    frames are not recomputed. The frame is modified in place and returned.

    Columns (``{sector}`` is the ``sector`` argument):

    * ``{sector}_mean_pc_last_5``: ``np.mean`` of ``pc_commit_{sector}`` as
      returned by ``obtain_lagged`` with a lag of -5 years.
    * ``{sector}_satisfactory_proj``: 1 where ``max_rating`` > 3, else 0.
    * ``{sector}_max_proj_5yr``: ``np.max`` of ``w_avg_rating`` as returned by
      ``obtain_lagged`` with a lag of -5 years.

    NOTE(review): ``obtain_lagged`` is defined elsewhere in the notebook and
    decides which frame is actually queried; confirm it matches ``sector_df``.
    """
    def five_year_lookback(source_col, aggregator):
        # One obtain_lagged call per row, aggregating over the -5 year window
        return sector_df.apply(
            lambda rec: obtain_lagged(source_col, rec['country'], rec['year'], -5,
                                      take_agg=True, agg_function=aggregator),
            axis=1,
        )

    avg_commit_name = f"{sector}_mean_pc_last_5"
    if avg_commit_name not in sector_df:
        print('Generating mean per capita commitments over prior years')
        sector_df[avg_commit_name] = five_year_lookback(f'pc_commit_{sector}', np.mean)

    satisfactory_name = f"{sector}_satisfactory_proj"
    if satisfactory_name not in sector_df:
        print('Marking whether a satisfactory project concluded in that year')
        is_satisfactory = sector_df['max_rating'] > 3
        sector_df[satisfactory_name] = is_satisfactory.astype(int)

    best_rating_name = f"{sector}_max_proj_5yr"
    if best_rating_name not in sector_df:
        print('Taking maximum of weighted rating of concluded projects in prior period')
        sector_df[best_rating_name] = five_year_lookback('w_avg_rating', np.max)

    return sector_df

In [None]:
def evaluate_treatment(df, target_col, treatment_col, feature_cols,
                       log_target=False, log_treatment=False, remove_feature_cols=None, # this last is convenience
                       add_country_feffects=True, add_constant=True):
    """Fit an OLS model of ``target_col`` on the treatment and feature columns.

    Parameters
    ----------
    df : DataFrame holding the target, treatment and feature columns. It is
        copied first, so log transforms never touch the caller's frame.
    target_col, treatment_col : column names of the outcome and the treatment.
    feature_cols : candidate regressor names. The treatment is added when it is
        missing. The caller's list is never modified.
    log_target, log_treatment : when True, replace the column with its natural
        log. Zeros are mapped to NaN before the log and every NaN is then set
        to 0, so zero and missing inputs both end up as 0.
    remove_feature_cols : names to leave out of ``feature_cols`` for this run
        (``None`` means nothing is removed).
    add_country_feffects, add_constant : forwarded unchanged to
        ``experiment.plain_vanilla_ols``.

    Returns
    -------
    Whatever ``experiment.plain_vanilla_ols`` returns for the prepared data.
    """
    data = df.copy() # else logs overwrite

    # Work on a private copy: the original `feature_cols += [...]` extended the
    # caller's list in place, leaking the treatment into later specifications.
    candidate_cols = list(feature_cols)
    if treatment_col not in candidate_cols:
        candidate_cols.append(treatment_col)

    excluded = [] if remove_feature_cols is None else list(remove_feature_cols)
    ols_cols = [col for col in candidate_cols if col not in excluded]

    if log_target:
        data[target_col] = np.log(data[target_col].replace(0, np.nan)).fillna(0)
    if log_treatment:
        data[treatment_col] = np.log(data[treatment_col].replace(0, np.nan)).fillna(0)

    est = experiment.plain_vanilla_ols(data, target_col, ols_cols,
                                       add_country_feffects=add_country_feffects,
                                       add_constant=add_constant)

    return est

In [None]:
def extract_treatment_results(label, est, target_col, treatment_col, feature_cols, est_kwards, sig_level=0.05):
    """Condense a fitted regression into a flat dict suitable for a results table.

    Parameters
    ----------
    label : free-text name of the specification.
    est : fitted result exposing ``params``, ``pvalues`` (both indexable by
        parameter name) and ``f_pvalue``.
    target_col : accepted for signature symmetry; not used here.
    treatment_col : name of the treatment parameter in ``est``.
    feature_cols : names treated as ordinary features. Any significant
        parameter outside this list (and other than the treatment) is counted
        as a fixed effect.
    est_kwards : keyword arguments used for the fit, stored verbatim.
    sig_level : p-value threshold for "significant" (strictly below).

    Returns
    -------
    dict with the treatment coefficient and p-value, the significant feature
    coefficients and p-values (rounded to 4 places), and the count and mean
    coefficient of the significant fixed effects (0 when there are none).
    """
    sig_params = [param for param in est.params.keys() if est.pvalues[param] < sig_level]
    sig_features = [param for param in sig_params if param in feature_cols and param != treatment_col]
    sig_coeffs = { feature: round(est.params[feature], 4) for feature in sig_features }
    # The treatment is never a fixed effect, even if the caller's feature list omits it
    sig_f_effects = [param for param in sig_params if param not in feature_cols and param != treatment_col]

    # The key says "Mean": average the coefficients (previously the maximum was reported)
    sig_fe_coeffs = [est.params[param] for param in sig_f_effects]
    mean_fe_coeff = sum(sig_fe_coeffs) / len(sig_fe_coeffs) if sig_fe_coeffs else 0

    return {
        'Label': label,
        'Regression P': est.f_pvalue,
        'Treatment significance': est.pvalues[treatment_col],
        'Treatment coefficient': est.params[treatment_col],
        'Sig feature coefficient': sig_coeffs,
        'Sig feature p-values': { col: round(est.pvalues[col], 4) for col in sig_features },
        'Number significant FE': len(sig_f_effects),
        'Mean coefficient on FE': mean_fe_coeff,
        'Keyword args': est_kwards
    }

### Load in data frames

1. Load in panel assembled by DG, and country code cross-matches
2. For each country-year, calculate mean growth in education indicators at year + lag

In [None]:
# Project data
project_df = load_util.load_projects() # loads in aid data projects
# Sector ratings for education; missing values are filled with 0
edu_treatment_df = load_util.assemble_sector_ratings(project_df, 'Education').fillna(0)
edu_treatment_df.head()

In [None]:
# Toggle: True rebuilds the education panel from its sources, False loads the cached CSV.
recreate_df = False

if recreate_df:
    panel_df, panel_source = experiment.assemble_replication_panel('education')
    # Left-join the sector ratings onto the panel: (year, country) against (end_year, country_code)
    df = panel_df.merge(edu_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    treatment_cols = [col for col in edu_treatment_df.columns if col not in ["end_year", "country_code"]]
    # Country-years without a matching ratings row get 0 in every ratings column
    df[treatment_cols] = df[treatment_cols].fillna(0)
else:
    df = pd.read_csv('../data/transformed_data/education_df.csv', index_col=0)

### Construct lagged indicators, fill in needed columns

In [None]:
# Frequency of each value in the project_completed_year column
df.project_completed_year.value_counts()

In [None]:
def obtain_lagged(col, country, year, lag_years, take_agg=False, agg_function=None, source_df=None):
    """Look up ``col`` for ``country`` at, or aggregated over, a lag from ``year``.

    Parameters
    ----------
    col : column to read.
    country, year : the country-year the lag is measured from.
    lag_years : signed offset in years (negative looks back, positive forward).
    take_agg : when False, return the value at exactly ``year + lag_years``
        (first matching row). When True, aggregate over the half-open window
        between ``year`` and ``year + lag_years``: the earlier bound is
        included and the later one excluded, so a negative lag covers the
        years before ``year`` and excludes ``year`` itself.
    agg_function : callable applied to the window's ``col`` values; required
        when ``take_agg`` is True.
    source_df : frame to search. Defaults to the notebook-level ``df`` so
        existing calls keep working; pass another panel (e.g. the health
        frame) to avoid silently reading the education data.

    Returns
    -------
    The looked-up or aggregated value, or ``np.nan`` when no row matches.

    Raises
    ------
    ValueError : if ``take_agg`` is True and no ``agg_function`` is given.
    """
    if take_agg and agg_function is None:
        raise ValueError("agg_function is required when take_agg=True")

    frame = df if source_df is None else source_df
    same_country = frame['country'] == country

    if take_agg:
        # Window is [min, max) of the two endpoints, whatever the lag's sign
        start_year = min(year, year + lag_years)
        end_year = max(year, year + lag_years)
        window = frame[same_country & (frame['year'] >= start_year) & (frame['year'] < end_year)]
        if len(window) == 0:
            return np.nan
        return agg_function(window[col])

    matches = frame[same_country & (frame['year'] == year + lag_years)]
    if len(matches) == 0:
        return np.nan
    # First match wins; reading the column directly avoids building a row dict
    return matches[col].iloc[0]

Generate some necessary feature and outcome columns

In [None]:
def add_lagged_future_edu_outcomes(df):
    """Add 5-year lagged and 5-year future net enrollment columns to ``df``.

    ``lagged_edu_ner`` and ``future_edu_ner`` are filled row by row from
    ``obtain_lagged('edu_ner', country, year, -5 / +5)``. A column that already
    exists is left as is. The frame is modified in place and returned.

    NOTE(review): ``obtain_lagged`` is defined elsewhere in the notebook and
    decides which frame is queried; confirm it matches the ``df`` passed here.
    """
    column_plan = (
        ('lagged_edu_ner', 'Generating past net enrollment rates', -5),
        ('future_edu_ner', 'Generating future net enrollment rates', 5),
    )

    for new_col, message, offset in column_plan:
        if new_col in df:
            continue
        print(message)
        # Bind offset as a default so each lambda keeps its own lag
        df[new_col] = df.apply(
            lambda rec, offset=offset: obtain_lagged('edu_ner', rec['country'], rec['year'], offset),
            axis=1,
        )

    return df

In [None]:
# Uses the default sector, so the generated columns are prefixed 'education_'
df = add_project_and_aid_cols(df)

In [None]:
# Adds lagged_edu_ner / future_edu_ner unless the loaded frame already has them
df = add_lagged_future_edu_outcomes(df)

In [None]:
# Toggle: set True to overwrite the cached education CSV with the current frame.
store_df = False

if store_df:
    df.to_csv('../data/transformed_data/education_df.csv')

In [None]:
# df.max_proj_rating_5.hist()

### Check data coverage, isolating features with a lot of uncovered years/countries

A column is a culprit for an N/A row if it alone is N/A, i.e., it is responsible for the country-year being unusable. If multiple columns are N/A then none are culprits

In [None]:
# Toggle: set True to run the coverage ("culprit column") analysis cell below.
conduct_coverage_analysis = False

In [None]:
# panel_source is only created when the panel is rebuilt (recreate_df = True);
# otherwise fall back to reading the country panel from disk.
if 'panel_source' not in vars():
    panel_source = pd.read_csv('../data/countrypanel.csv')

In [None]:
if conduct_coverage_analysis:

    # We only want to do this on the data columns, so don't check the others
    panel_label_columns = ['year', 'countrycode', 'regionname', 'fcv_ind', 'lendingtype', 'incomelevel']
    # NOTE(review): this cell referenced an undefined `panel_non_data`; the panel's
    # label columns are used in its place - confirm that was the intended list.
    df_label_columns = ['year', 'country', 'ppd_countrycode', 'wdi_countryname', 'project_completed_year'] + panel_label_columns

    data_cols = [col for col in df.columns if col not in panel_label_columns + df_label_columns]
    ddf_data_cols = [col for col in panel_source.columns if col not in panel_label_columns]

    culprit_counts, null_df = experiment.extract_culprit_counts(df, data_cols)
    ddf_culprits, ddf_nulls = experiment.extract_culprit_counts(panel_source, ddf_data_cols)

    def non_zero_culps(culpc):
        """Return the non-zero entries of ``culpc``, largest count first."""
        # Uses its own argument; the earlier lambda always read culprit_counts,
        # so the panel_source report repeated the df report.
        ranked = sorted(culpc.items(), key=lambda item: item[1], reverse=True)
        return { key: value for key, value in ranked if value > 0 }

    print(non_zero_culps(culprit_counts))
    print(non_zero_culps(ddf_culprits))

*Note*: The standard WDI has no cash surplus/deficit series, so net borrowing in the year (which is available) is used instead.

## Main event, education: replicate original, then probe the specification

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount]
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [None]:
# Adding categorical variables for country, for replication purposes, although means dimensionality explosion (for unclear gain)

In [None]:
# Give the 4-year lagged growth column a sector-neutral name for use in the specifications below
df = df.rename(columns={ 'education_lag_-4_growth': 'prior_4year_growth' })

In [None]:
# Spot check: China rows where the 5-year mean per-capita commitment is available.
# NOTE(review): add_project_and_aid_cols writes 'education_mean_pc_last_5'; the un-prefixed
# 'mean_pc_last_5' used here must already exist in the loaded frame - confirm.
df[(df.mean_pc_last_5.notna()) & (df.country == 'CHN')][['country', 'year', 'edu_ner', 'pc_commit_education', 'mean_pc_last_5']]

In [None]:
# Number of country-years with no 5-year mean per-capita commitment
df.mean_pc_last_5.isna().sum()

In [None]:
# Ratio of current net enrollment to its 5-year lag (values above 1 mean enrollment rose)
df['prior_ner_growth'] = df['edu_ner'] / df['lagged_edu_ner']

# Full pool of columns handed to evaluate_treatment; each specification removes
# the ones it does not want through remove_feature_cols.
data_cols = ['country', 'satisfactory_proj', 'w_avg_rating', 'prior_ner_growth', 'edu_ner',
                        'mean_pc_last_5', 'edu_pupil_teacher', 
                        'young_population', 'gdp_pc_ppp', 'cash_surplus_deficit', 'inflation', 'trade_share_gdp',
                        'freedom_house', 'prior_4year_growth']

# first go for the paper
r_est = evaluate_treatment(df, 'edu_ner', 'mean_pc_last_5', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True)

# Last expression of the cell: the summary dict is shown as the cell output
extract_treatment_results('Replication', r_est, 'edu_ner', 'mean_pc_last_5', data_cols, None)

In [None]:
# Full regression summary for the baseline education replication
print(r_est.summary())

In [None]:
# Specification grid: label -> target column, treatment column and the keyword
# arguments passed on to evaluate_treatment. All variants start from the same
# data_cols pool and differ in what they remove and in the constant term.
search_grid = {
    # Baseline: same settings as the stand-alone replication run
    'Straight replication': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )
    },
    # Baseline, but 'w_avg_rating' is kept as a regressor
    'Include weighted average rating': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'satisfactory_proj', 'prior_4year_growth']
        )
    },
    # Baseline, but with add_constant=True
    'Include constant term in regression': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=True, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )        
    },
    # Baseline, but 'prior_4year_growth' is kept as a regressor
    'Include prior growth across education outcomes': {
        'target_col': 'edu_ner', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True,
            remove_feature_cols=['prior_ner_growth', 'satisfactory_proj', 'w_avg_rating']
        )
    },
    # Target switched to the enrollment growth ratio; the level 'edu_ner' is removed instead
    'Use growth in NER as target': {
        'target_col': 'prior_ner_growth', 
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=True, log_target=True, log_treatment=True,
            remove_feature_cols=['edu_ner', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']
        )

    }
}

In [None]:
# Fit every specification in search_grid, keeping both the summary row and the fitted estimator.
treatment_search_result = []
estimators = {}
for label, args in search_grid.items():
    est = evaluate_treatment(df, args['target_col'], args['treatment_col'], data_cols, **args['est_kw_args'])
    results = extract_treatment_results(label, est, args['target_col'], args['treatment_col'], data_cols, args['est_kw_args'])
    treatment_search_result.append(results)
    estimators[label] = est

# One row per specification
gsearch_results = pd.DataFrame(treatment_search_result)

In [None]:
# Toggle: set True to write two full regression summaries and the grid table to disk.
write_results = False

if write_results:
    with open("./growth_not_abs_ner_target.txt", "w") as file:
        file.write(estimators['Use growth in NER as target'].summary().as_text())

    with open("./base_replication_full.txt", "w") as file:
        file.write(estimators['Straight replication'].summary().as_text())

    # Floats are written with four decimal places
    gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

In [None]:
# Toggle: print the baseline replication next to the variant that keeps the weighted average rating.
display_repl_summary = True

if display_repl_summary:
    print("*** Standard Replication: ")
    print(r_est.summary())
    
    print("*** Replication with average rating: ")
    print(estimators['Include weighted average rating'].summary())

## Now conduct health

Process:

1. Repeat outcome variable formation, using lagged construction
2. Construct sectoral aid per capita using utilities
3. Construct specification, using Diana's original notebook

In [None]:
# Sector ratings for health, built from the same project data; missing values are filled with 0
health_treatment_df = load_util.assemble_sector_ratings(project_df, 'Health').fillna(0)
health_treatment_df.head()

In [None]:
# Toggle: True rebuilds the health panel from its sources, False loads the cached CSV.
recreate_health_df = False

if recreate_health_df:
    hp_df, hp_source = experiment.assemble_replication_panel('health')
    # Left-join the health ratings onto the panel: (year, country) against (end_year, country_code)
    health_df = hp_df.merge(health_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])
    treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
    # Fill from health_df itself; this previously read the education frame `df`,
    # which overwrote the merged health ratings with education values.
    health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
else:
    health_df = pd.read_csv('../data/transformed_data/health_df.csv', index_col=0)

In [None]:
# Generates the 'health_'-prefixed aid and rating columns (skipped if already present).
# NOTE(review): add_project_and_aid_cols goes through obtain_lagged, which by default looks
# rows up in the notebook-level education frame `df`, not in health_df - confirm the
# generated health columns are drawn from the intended panel.
health_df = add_project_and_aid_cols(health_df, sector='health')

# Ratings columns: 0 for country-years with no rated project
treatment_cols = [col for col in health_treatment_df.columns if col not in ["end_year", "country_code"]]
health_df[treatment_cols] = health_df[treatment_cols].fillna(0)
print("Treatment counts: ", health_df.project_completed_year.value_counts())

In [None]:
def _mortality_at_offset(frame, offset_years):
    """Under-5 mortality of the same country ``offset_years`` away; NaN where no such row exists.

    Looks values up in ``frame`` itself. The earlier version went through
    ``obtain_lagged``, which reads the notebook-level education frame ``df``.
    """
    lookup = (
        frame.dropna(subset=['country', 'year'])
        .drop_duplicates(subset=['country', 'year'])  # first matching row wins
        .set_index(['country', 'year'])['mortality_under5']
    )
    wanted = pd.MultiIndex.from_arrays([frame['country'], frame['year'] + offset_years])
    return lookup.reindex(wanted).to_numpy()


if 'lagged_health_mort' not in health_df:
    print('Generating past mortality')
    health_df['lagged_health_mort'] = _mortality_at_offset(health_df, -5)

if 'future_health_mort' not in health_df:
    print('Generating future mortality')
    health_df['future_health_mort'] = _mortality_at_offset(health_df, 5)

In [None]:
# Toggle: set True to overwrite the cached health CSV with the current frame.
store_health_df = False

if store_health_df:
    health_df.to_csv('../data/transformed_data/health_df.csv')

* Specification 1:

```qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            hiv_prevalence conflict i.period, r```

In [None]:
# Same sector-neutral name as used for the education frame
health_df = health_df.rename(columns={ 'health_lag_-4_growth': 'prior_4year_growth' })

In [None]:
# Education and WASH columns sometimes get dragged along into the health frame; drop them
# to make it easier to read. errors='ignore' keeps this safe when some are already absent.
health_df = health_df.drop(columns=[
    'edu_ner', 'edu_aner', 'edu_completion',
    'edu_share_gov_exp', 'edu_pupil_teacher',
    'adult_literacy', 'commit_education', 'pc_commit_wash', 
    'pc_commit_education', 'commit_wash'], errors='ignore')

In [None]:
# Ratio of current under-5 mortality to its 5-year lag (values below 1 mean mortality fell)
health_df['prior_mort_decline'] = health_df['mortality_under5'] / health_df['lagged_health_mort']

In [None]:
# Pool of health columns used to classify regression parameters as features.
# 'prior_mort_decline' was listed twice; each name now appears once.
health_data_cols = ['country', 'health_satisfactory_proj', 'w_avg_rating', 'prior_mort_decline', 'mortality_under5',
                        'health_mean_pc_last_5', 'gdp_pc_ppp', 'cash_surplus_deficit',
                        'inflation', 'trade_share_gdp', 'freedom_house',
                        'fertility', 'hiv_prevalence', 'conflict']

In [None]:
# first go for the paper
# replication_exclude = ['prior_mort_decline', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth',
#                       'inflation', 'trade_share_gdp', 'freedom_house', 'mean_pc_last_5']

# Columns for Specification 1. No country fixed effects and no constant are requested here.
cols_for_rep1 = ['mortality_under5', 'health_mean_pc_last_5', 'lagged_health_mort', 'gdp_pc_ppp', 'fertility', 'population', 'hiv_prevalence', 'conflict']
health_est = evaluate_treatment(health_df, 'mortality_under5', 'health_mean_pc_last_5', cols_for_rep1,
                          add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

# Last expression of the cell: the summary dict is shown as the cell output
extract_treatment_results('Health Replication', health_est, 'mortality_under5', 'health_mean_pc_last_5', cols_for_rep1, None)

In [None]:
# Full regression summary for health Specification 1
print(health_est.summary())

In [None]:
# now add in controls for macro conditions
cols_rep2 = cols_for_rep1 + ['inflation', 'cash_surplus_deficit', 'trade_share_gdp']
# The treatment is the health column, matching the extraction below and the other health
# specifications; this call previously named the education-style 'mean_pc_last_5'.
health_est2 = evaluate_treatment(health_df, 'mortality_under5', 'health_mean_pc_last_5', cols_rep2,
                          add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

extract_treatment_results('Health Replication with Macro', health_est2, 'mortality_under5', 'health_mean_pc_last_5', cols_rep2, None)

* Specification 6

```qui regress mortality_under5 pc_commit_health lag_mortality_under5 ///
            lag_gdp_pc_ppp lag_fertility lag_population ///
            lag_physicians_rate  lag_female_adult_literacy ///
            lag_access_water lag_access_sanitation ///
            hiv_prevalence conflict i.period i.nregionname, r```

In [None]:
# Is the female adult literacy control from Specification 6 present in the health frame?
'female_adult_literacy' in health_df

In [None]:
# 'female_adult_literacy',
cols_rep3 = cols_for_rep1 + ['access_water', 'access_sanitation', 'physicians_rate']

health_est3 = evaluate_treatment(health_df, 'mortality_under5', 'health_mean_pc_last_5', 
                          cols_rep3, add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

# Classify parameters against the columns actually fitted (cols_rep3). Passing
# health_data_cols counted real features such as 'lagged_health_mort' or
# 'access_water' as fixed effects.
extract_treatment_results('Health Replication 6', health_est3, 'mortality_under5', 'health_mean_pc_last_5', cols_rep3, None)

In [None]:
# Full regression summary for health Specification 6
print(health_est3.summary())

In [None]:
# add project ratings
cols_rep4 = cols_rep3 + ['w_avg_rating']

health_est4 = evaluate_treatment(health_df, 'mortality_under5', 'health_mean_pc_last_5', 
                          cols_rep4, add_country_feffects=False, add_constant=False, log_target=True, log_treatment=True)

# Distinct label (the previous one duplicated 'Health Replication 6'), and parameters are
# classified against the columns actually fitted (cols_rep4) rather than health_data_cols.
extract_treatment_results('Health Replication 6 with ratings', health_est4, 'mortality_under5', 'health_mean_pc_last_5', cols_rep4, None)

In [None]:
# Full regression summary for Specification 6 with project ratings added
print(health_est4.summary())

## Next round of simple replications

In [None]:
# first: restrict to projects with satisfactory ratings and better

In [None]:
# Columns available in the education frame
df.columns

In [None]:
# Treatment is the satisfactory-project indicator. It is entered in levels: evaluate_treatment's
# log transform maps 0 -> NaN -> 0 and log(1) = 0, so logging a 0/1 column leaves a constant
# zero regressor. NOTE(review): assumes 'satisfactory_proj' is a 0/1 indicator - confirm.
n_est = evaluate_treatment(df, 'edu_ner', 'satisfactory_proj', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=False)

In [None]:
# Full regression summary for the satisfactory-project treatment
print(n_est.summary())

In [None]:
# second: construct exploratory function

In [None]:
# third: conduct for health

In [None]:
# fourth: repeat for WASH

In [None]:
# fifth: summarize 

In [None]:
# sixth: move onto EconML, and start planning the writing

In [None]:
# future: do a pair-wise comparison, e.g., using BERT and others