In [None]:
from importlib import reload

In [None]:
import statsmodels.api as sm

In [None]:
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt

from IPython.display import Image, display

from dowhy import CausalModel

In [None]:
pd.set_option("mode.chained_assignment", None)

In [None]:
import util.load as load_util
import util.explore as explore_util
import util.experiment as experiment

In [None]:
project_df = load_util.load_projects()
edu_treatment_df = load_util.assemble_sector_ratings(project_df, 'Education').fillna(0)

In [None]:
edu_treatment_df.head()

## First approach: binary treatment, replicating papers on causal structure

In [None]:
# load in panel assembled by DG, and country code cross-matches
# for each country-year, calculate mean growth in education indicators at year + lag

In [None]:
panel_df, panel_source = experiment.assemble_replication_panel('education')

In [None]:
# df = panel_df.merge(edu_treatment_df, how='left', left_on=['year', 'country'], right_on=['end_year', 'country_code'])

In [None]:
df = pd.read_csv('../data/transformed_data/education_df.csv', index_col=0)

In [None]:
# Every column of the sector-ratings frame except end_year / country_code (the two fields the
# commented-out merge above uses as its right-hand join keys).
treatment_cols = [col for col in edu_treatment_df.columns if col not in ["end_year", "country_code"]]
# Missing values in those columns are replaced by 0.
# NOTE(review): presumably NaN here means "no rated project for that country-year" - confirm.
df[treatment_cols] = df[treatment_cols].fillna(0)
# Frequency table of the project_completed_year column, shown as the cell output.
df.project_completed_year.value_counts()

In [None]:
def obtain_lagged(col, country, year, lag_years, take_agg=False, agg_function=None, data=None):
    """Read `col` for `country` at a signed offset of `lag_years` from `year`.

    Parameters
    ----------
    col : str
        Name of the column to read.
    country : object
        Value matched against the ``country`` column.
    year : int
        Reference year, matched against the ``year`` column.
    lag_years : int
        Signed offset in years (negative looks back, positive looks ahead).
    take_agg : bool, default False
        False: return `col` from the first row whose year equals ``year + lag_years``.
        True: apply `agg_function` to `col` over the half-open window
        ``[year + lag_years, year)`` for a negative lag, or ``[year, year + lag_years)``
        otherwise. The upper bound is excluded, so a backward window does not
        contain `year` itself.
    agg_function : callable, optional
        Receives the windowed column (a Series); required when `take_agg` is True.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df`` so existing calls keep working.

    Returns
    -------
    The looked-up or aggregated value, or ``np.nan`` when no row matches.

    Raises
    ------
    TypeError
        If `take_agg` is True and no `agg_function` is supplied.
    """
    if take_agg and agg_function is None:
        raise TypeError("obtain_lagged: take_agg=True requires an agg_function")

    # Fall back to the notebook-global frame only when the caller does not pass one.
    frame = df if data is None else data
    in_country = frame[frame.country == country]

    if take_agg:
        # Order the two bounds so the same filter serves both backward and forward windows.
        start_year, end_year = sorted((year, year + lag_years))
        window = in_country[(in_country.year >= start_year) & (in_country.year < end_year)]
        return agg_function(window[col]) if len(window) else np.nan

    match = in_country[in_country.year == year + lag_years]
    return match[col].iloc[0] if len(match) else np.nan

In [None]:
# obtain_lagged('pc_commit_education', 'GNQ', 2005, -5)
# df[(df.country == 'GNQ') & (df.year > 2000)][['year', 'edu_ner', 'pc_commit_education', 'mean_pc_last_5', 'lagged_edu_ner']]

In [None]:
# Each derived column is computed only when it is absent from df, so a frame that already
# carries the column skips the corresponding row-wise apply.

if 'mean_pc_last_5' not in df:
    print('Generating mean per capita commitments over prior years')
    # obtain_lagged has no `take_mean` keyword; a windowed mean is requested with
    # take_agg=True plus an aggregation function.
    df['mean_pc_last_5'] = df.apply(
        lambda row: obtain_lagged('pc_commit_education', row['country'], row['year'], -5,
                                  take_agg=True, agg_function=np.mean),
        axis=1)

if 'lagged_edu_ner' not in df:
    print('Generating past net enrollment rates')
    # Single-row lookup: edu_ner for the same country five years earlier.
    df['lagged_edu_ner'] = df.apply(lambda row: obtain_lagged('edu_ner', row['country'], row['year'], -5), axis=1)

if 'future_edu_ner' not in df:
    print('Generating future net enrollment rates')
    # Single-row lookup: edu_ner for the same country five years later.
    df['future_edu_ner'] = df.apply(lambda row: obtain_lagged('edu_ner', row['country'], row['year'], 5), axis=1)

if 'satisfactory_proj' not in df:
    print('Marking whether a satisfactory project concluded in that year')
    # 0/1 indicator: 1 where max_rating is strictly above 3.
    # NOTE(review): the threshold 3 presumably marks "satisfactory" on the rating scale - confirm.
    df['satisfactory_proj'] = (df['max_rating'] > 3).astype(int)

if 'max_proj_rating_5' not in df:
    print('Taking maximum of weighted rating of concluded projects in prior period')
    df['max_proj_rating_5'] = df.apply(
        lambda row: obtain_lagged('w_avg_rating', row['country'], row['year'], -5,
                                  take_agg=True, agg_function=np.max),
        axis=1)

In [None]:
store_df = False

if store_df:
    df.to_csv('../data/transformed_data/education_df.csv')

In [None]:
df.max_proj_rating_5.hist()

In [None]:
panel_non_data = ['year', 'countrycode', 'regionname', 'fcv_ind', 'lendingtype', 'incomelevel']
non_data_cols = ['year', 'country', 'ppd_countrycode', 'wdi_countryname', 'project_completed_year'] + panel_non_data
data_cols = [col for col in df.columns if col not in non_data_cols]
ddf_data_cols = [col for col in panel_source.columns if col not in panel_non_data]

In [None]:
def non_zero_culps(culpc):
    """Return the strictly positive entries of `culpc`, largest count first.

    Parameters
    ----------
    culpc : mapping
        Counts keyed by name; only ``.items()`` is used.

    Returns
    -------
    dict
        ``{key: count}`` for every count above zero, in descending order of count.
        Ties keep their original relative order, because ``sorted`` is stable.
    """
    # Read the argument itself; the previous lambda ignored it and always read the
    # global `culprit_counts`, whichever mapping was passed in.
    ranked = sorted(culpc.items(), key=lambda item: item[1], reverse=True)
    return {key: value for key, value in ranked if value > 0}
culprit_counts, null_df = experiment.extract_culprit_counts(df, data_cols)
print(non_zero_culps(culprit_counts))

In [None]:
ddf_culprits, ddf_nulls = experiment.extract_culprit_counts(panel_source, ddf_data_cols)
print(non_zero_culps(ddf_culprits))

*Note*: The standard WDI has no cash surplus/deficit series, so net borrowing in the year (which is available) is used instead.

In [None]:
# Adding categorical variables for country, for replication purposes, although this means a dimensionality explosion (for unclear gain)

Education: Specification 2
```
regress  last_ner <- first_ner pc_commit_education [per capita commitment amount]
        edu_share_gov_exp edu_pupil_teacher young_population
        gdp_pc_ppp cash_surplus_deficit inflation trade_share_gdp
        freedom_house i.period i.ncountrycode if countrytoinclude == 1, r
```

In [None]:
df = df.rename(columns={ 'education_lag_-4_growth': 'prior_4year_growth' })

In [None]:
def evaluate_treatment(df, target_col, treatment_col, feature_cols,
                       log_target=False, log_treatment=False, remove_feature_cols=None,
                       add_country_feffects=True, add_constant=True):
    """Fit the OLS specification for one target/treatment pair and return the estimator.

    Parameters
    ----------
    df : pandas.DataFrame
        Source panel; it is copied, so the caller's frame is never modified.
    target_col : str
        Dependent variable.
    treatment_col : str
        Treatment variable; appended to the regressors when not already listed.
    feature_cols : list of str
        Candidate regressors. The list is not modified.
    log_target, log_treatment : bool
        Replace the column by its natural log. Zeros are turned into NaN before the log
        and every NaN is set to 0 afterwards, so zero and missing inputs both end up as 0.
        A 0/1 indicator therefore becomes all zeros - do not log-transform one.
    remove_feature_cols : iterable of str, optional
        Convenience filter: names dropped from the regressors. None means drop nothing.
    add_country_feffects, add_constant : bool
        Passed through unchanged to ``experiment.plain_vanilla_ols``.

    Returns
    -------
    Whatever ``experiment.plain_vanilla_ols`` returns (callers in this notebook use it as a
    fitted regression result with ``params``, ``pvalues`` and ``summary()``).
    """
    data = df.copy()  # the log transforms below must not overwrite the caller's columns

    # Work on a fresh list: `feature_cols += [...]` extended the caller's list in place,
    # so one call could change the regressors seen by the next.
    candidates = list(feature_cols)
    if treatment_col not in candidates:
        candidates.append(treatment_col)

    excluded = tuple(remove_feature_cols or ())
    ols_cols = [col for col in candidates if col not in excluded]

    if log_target:
        data[target_col] = np.log(data[target_col].replace(0, np.nan)).fillna(0)
    if log_treatment:
        data[treatment_col] = np.log(data[treatment_col].replace(0, np.nan)).fillna(0)

    est = experiment.plain_vanilla_ols(data, target_col, ols_cols,
                                       add_country_feffects=add_country_feffects,
                                       add_constant=add_constant)

    return est

In [None]:
# Ratio of the current edu_ner to lagged_edu_ner; a value above 1 means edu_ner rose.
# NOTE(review): rows where lagged_edu_ner is 0 give inf (or NaN for 0/0), and rows where it is
# missing give NaN - confirm these are handled before the column is used as a regression target.
df['prior_ner_growth'] = df['edu_ner'] / df['lagged_edu_ner']
# Spot check on one country (GNQ, years after 2000).
df[(df.country == 'GNQ') & (df.year > 2000)][['year', 'country', 'edu_ner', 'lagged_edu_ner', 'prior_ner_growth']]

In [None]:
data_cols = ['country', 'satisfactory_proj', 'w_avg_rating', 'prior_ner_growth', 'edu_ner',
                        'mean_pc_last_5', 'edu_pupil_teacher', 
                        'young_population', 'gdp_pc_ppp', 'cash_surplus_deficit', 'inflation', 'trade_share_gdp',
                        'freedom_house', 'prior_4year_growth']

In [None]:
# first go for the paper
r_est = evaluate_treatment(df, 'edu_ner', 'mean_pc_last_5', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True)

In [None]:
def extract_treatment_results(label, est, target_col, treatment_col, feature_cols, est_kwards, sig_level=0.05):
    """Condense one fitted regression into a flat summary row.

    Parameters
    ----------
    label : str
        Name of the specification, stored under 'Label'.
    est : fitted regression result
        Must expose ``params`` and ``pvalues`` (indexable by parameter name, with
        ``keys()``) and ``f_pvalue``.
    target_col : str
        Accepted for signature symmetry with ``evaluate_treatment``; not used here.
    treatment_col : str
        Parameter whose coefficient and p-value are reported separately.
    feature_cols : collection of str
        Names treated as ordinary features; any other parameter is counted as a fixed effect.
    est_kwards : dict or None
        Keyword arguments used for the fit, stored verbatim under 'Keyword args'.
    sig_level : float, default 0.05
        A parameter is significant when its p-value is strictly below this level.

    Returns
    -------
    dict
        One row of results. 'Mean coefficient on FE' is the mean coefficient over the
        significant fixed effects, or NaN when there are none.
    """
    sig_params = [param for param in est.params.keys() if est.pvalues[param] < sig_level]
    sig_features = [param for param in sig_params if param in feature_cols and param != treatment_col]
    sig_coeffs = {feature: round(est.params[feature], 4) for feature in sig_features}

    # Fixed effects are the significant parameters that are neither listed features nor the
    # treatment (the treatment is excluded explicitly in case it is not in feature_cols).
    # NOTE(review): a significant constant term, if present, is counted here as well.
    sig_f_effects = [param for param in sig_params
                     if param not in feature_cols and param != treatment_col]
    fe_coeffs = [est.params[param] for param in sig_f_effects]
    # The key says "Mean": the earlier max() contradicted it and raised ValueError
    # whenever no fixed effect was significant.
    mean_fe_coeff = float(np.mean(fe_coeffs)) if fe_coeffs else np.nan

    return {
        'Label': label,
        'Regression P': est.f_pvalue,
        'Treatment significance': est.pvalues[treatment_col],
        'Treatment coefficient': est.params[treatment_col],
        'Sig feature coefficient': sig_coeffs,
        'Sig feature p-values': {col: round(est.pvalues[col], 4) for col in sig_features},
        'Number significant FE': len(sig_f_effects),
        'Mean coefficient on FE': mean_fe_coeff,
        'Keyword args': est_kwards
    }

In [None]:
extract_treatment_results('Replication', r_est, 'edu_ner', 'mean_pc_last_5', data_cols, None)

In [None]:
# kwargs default: log_target=False, log_treatment=False, remove_feature_cols=[], # this last is convenience 
#                        add_country_feffects=True, add_constant=True

In [None]:
df.columns

In [None]:
# Specification search: each label maps to a target/treatment pair plus the keyword
# arguments handed to evaluate_treatment for that run.
def _spec(target_col, add_constant, remove_feature_cols):
    """Build one grid entry; all runs share the treatment, country fixed effects and log transforms."""
    return {
        'target_col': target_col,
        'treatment_col': 'mean_pc_last_5',
        'est_kw_args': dict(
            add_country_feffects=True, add_constant=add_constant, log_target=True, log_treatment=True,
            remove_feature_cols=remove_feature_cols
        )
    }


search_grid = {
    'Straight replication': _spec(
        'edu_ner', False,
        ['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']),
    'Include weighted average rating': _spec(
        'edu_ner', False,
        ['prior_ner_growth', 'satisfactory_proj', 'prior_4year_growth']),
    'Include constant term in regression': _spec(
        'edu_ner', True,
        ['prior_ner_growth', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']),
    'Include prior growth across education outcomes': _spec(
        'edu_ner', False,
        ['prior_ner_growth', 'satisfactory_proj', 'w_avg_rating']),
    'Use growth in NER as target': _spec(
        'prior_ner_growth', True,
        ['edu_ner', 'w_avg_rating', 'satisfactory_proj', 'prior_4year_growth']),
}

In [None]:
# Fit every specification in search_grid, keeping one summary row per run
# (treatment_search_result) and the fitted estimator itself (estimators, keyed by label).
treatment_search_result = []
estimators = {}
for label, args in search_grid.items():
    # data_cols is the shared candidate-feature list; each spec narrows it via remove_feature_cols.
    est = evaluate_treatment(df, args['target_col'], args['treatment_col'], data_cols, **args['est_kw_args'])
    results = extract_treatment_results(label, est, args['target_col'], args['treatment_col'], data_cols, args['est_kw_args'])
    treatment_search_result.append(results)
    estimators[label] = est
# NOTE(review): `est` remains bound to the last specification's estimator after this loop, and a
# later cell prints est.summary() - prefer estimators[<label>] there to make the choice explicit.

In [None]:
gsearch_results = pd.DataFrame(treatment_search_result)

In [None]:
# Dump the full regression summaries of two specifications to text files next to the notebook.
for out_path, grid_label in (
    ("./growth_not_abs_ner_target.txt", 'Use growth in NER as target'),
    ("./base_replication_full.txt", 'Straight replication'),
):
    with open(out_path, "w") as file:
        file.write(estimators[grid_label].summary().as_text())

In [None]:
gsearch_results.to_csv('../data/results/education_model_crawl.csv', float_format='%.4f')

In [None]:
gsearch_results

In [None]:
2 ** (0.11)

In [None]:
df.edu_ner.describe()

In [None]:
r_est.pvalues['mean_pc_last_5']

In [None]:
print(r_est.summary())

In [None]:
2 ** (0.1101)

In [None]:
# now check if we leave in weighted average rating
# 'w_avg_rating' is deliberately absent from remove_feature_cols here; with it listed, this call
# was identical to the r_est replication run and the rating never entered the regression.
# data_cols += ['max_proj_rating_5']
n_est = evaluate_treatment(df, 'edu_ner', 'mean_pc_last_5', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'satisfactory_proj', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=True)

In [None]:
print(n_est.summary())

In [None]:
df.edu_ner.describe()

In [None]:
df.mean_pc_last_5.describe()

In [None]:
# NOTE(review): at this point `est` is whatever the specification-search loop bound last
# (the final search_grid entry), not r_est or n_est - confirm this is the summary intended.
print(est.summary())

In [None]:
# dml_est.summary()

In [None]:
# print(est.summary())

In [None]:
# print(target_est.summary())

## Next round of simple replications

In [None]:
# first: restrict to projects with satisfactory ratings and better

In [None]:
df.columns

In [None]:
# satisfactory_proj is a 0/1 indicator, so it must not be log-transformed: evaluate_treatment's
# log step turns 0 into NaN -> 0 and log(1) is 0, which would make the treatment column all zeros.
n_est = evaluate_treatment(df, 'edu_ner', 'satisfactory_proj', data_cols,
                          remove_feature_cols=['prior_ner_growth', 'w_avg_rating', 'prior_4year_growth'],
                          add_country_feffects=True, add_constant=False, log_target=True, log_treatment=False)

In [None]:
print(n_est.summary())

In [None]:
# second: construct exploratory function

In [None]:
# third: conduct for health

In [None]:
# fourth: repeat for WASH

In [None]:
# fifth: summarize 

In [None]:
# sixth: move onto EconML, and start planning the writing

In [None]:
# future: do a pair-wise comparison, e.g., using BERT and others

## Now use EconML

In [None]:
from econml.dml import LinearDML
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from econml.inference import BootstrapInference

In [None]:
# treatment_col = 'project_completed_year'

In [None]:
feature_cols = ['pc_commit_education', 'edu_pupil_teacher', 'young_population', 'gdp_pc_ppp', 
                'cash_surplus_deficit', 'inflation', 'trade_share_gdp', 'freedom_house', 'prior_4year_growth']

In [None]:
# NOTE(review): `target_col` and `treatment_col` are not assigned anywhere in the visible notebook
# (the only candidate, `treatment_col = 'project_completed_year'`, is commented out in an earlier
# cell), so this raises NameError on a fresh kernel. Define both explicitly before this call.
# The four outputs are passed to LinearDML.fit below as outcome Y, treatment T, features X and controls W.
Y, T, X, W = experiment.assemble_econml_tuples(df, target_col, treatment_col, feature_cols)

In [None]:
est = LinearDML(model_t=LogisticRegressionCV(max_iter=500), discrete_treatment=True)
# est = LinearDML(model_t=RandomForestClassifier(), discrete_treatment=True)
est.fit(Y, T, X=X, W=W) # W -> high-dimensional confounders, X -> features

In [None]:
print(est.score_)
print(est.summary())

In [None]:
# point = est.const_marginal_effect(X)
# print(point)
# est.effect(X, T0=False, T1=True)

In [None]:
# initiating some crawls, to find anything