In [3]:
import os

# Resolve the project directories from the REPO_DIR environment variable.
# Fail fast with an explicit message: os.environ.get() returns None when the
# variable is unset, which would otherwise surface as an opaque TypeError
# inside os.path.join.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    raise EnvironmentError(
        'Environment variable "REPO_DIR" is not set; '
        "point it at the repository root before running this notebook."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")
# Work from the code directory (presumably so the local-module imports below resolve).
os.chdir(code_dir)

import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
from scipy.stats import spearmanr

import pickle
import pandas as pd
import sklearn 
import sys
import pandas as pd
from importlib import reload
import copy

import warnings

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns

from mosaiks.utils.imports import *

# Key prediction functions are here
from prediction_utils import predict_y_from_kfold_dict, generalized_demean, X_matrix_to_demeaned_X
from prediction_utils import make_train_pred_scatterplot as make_scatterplot


import geopandas as gpd
import shapely

env variable MOSAIKS_HOME not defined; setting to: "/shares/maps100/code/code_LS/hdi_downscaling"
If not desired, please reset os.environ["MOSAIKS_NAME"]


## Subnational IWI Predictions

Predict and evaluate IWI performance at the DHS cluster level. The model is trained in the notebook called `hdi_and_iwi_model_training.ipynb`. IWI data at the DHS cluster level are not publicly available. The hierarchical linking of DHS clusters to GDL ADM1 polygons occurs in the notebook called `hierarchally_link_DHS_cluster_and_adm1.ipynb`.

## Read DHS cluster IWI data and geo link file

In [2]:
# Link table for DHS clusters; the columns used later in this notebook are
# "DHSID", "alpha-3" and "GDL_adm1_parent".
link_df = pd.read_pickle(data_dir + "/int/DHS_to_GDL_ADM1_link/dhsid_to_gdl_adm1_link.p")

# Cluster-level IWI. Rows sharing a DHSID are averaged, so the result is indexed by unique DHSID.
# NOTE(review): hard-coded absolute path outside data_dir — not portable across machines.
iwi = pd.read_csv('/shares/maps100/data/undp/int/GDL_IWI/mean_IWI.csv').groupby("DHSID").mean()

## Read ADM1 IWI data

We create a subnational and national dataframe of IWI. These IWI data are downloaded from the Global Data Lab for the year 2018 with 7 year nearest neighbor interpolation. 

In [3]:
# Global Data Lab export of the mean IWI score by region.
# NOTE(review): hard-coded absolute path — not portable across machines.
path = (("/shares/maps100/data/raw/applications/HDI_Smits/"
        "GDL-Mean-International-Wealth-Index-(IWI)-score-of-region-data.csv"))
df = pd.read_csv(path)

In [4]:
# Keep only subnational rows, drop rows with a missing value in any column,
# and expose the 2018 column under the name "subnat IWI".
subnat = df[df["Level"] == "Subnat"]
subnat = subnat.dropna().rename(columns= {"2018" : "subnat IWI"})

In [5]:
# Country-level IWI used throughout this notebook: the unweighted mean of each
# country's subnational values (not a population-weighted national figure).
nat = subnat.groupby("ISO_Code")["subnat IWI"].mean().rename("nat IWI")

In [6]:
# Reduce to a Series of subnational IWI indexed by GDLCODE.
# Not idempotent: subnat is a Series afterwards, so re-running this cell alone fails.
subnat = subnat.set_index("GDLCODE")["subnat IWI"]

## Step 2 - Evaluate cross country models at the DHS cluster level

In [7]:
def dhs_pred_truth_to_metrics(preds, truth, nat=nat,link_df=link_df, subnat=subnat, write_path=None, 
                              demeaned_input=False, return_df=False, recenter_on = None, extra_clip=False):
    """
    Score DHS-cluster-level predictions against observed values.

    Metrics are computed three ways: on the full variation, after removing each
    country's (ADM0) mean of the DHS observations, and after removing each
    province's (ADM1) mean of the DHS observations.

    Parameters
    ----------
    preds, truth : pd.Series
        Predicted and observed values, indexed by DHSID.
    nat : pd.Series
        Country-level values named "nat IWI", indexed by the "alpha-3" code.
    link_df : pd.DataFrame
        Must contain the columns "DHSID", "alpha-3" and "GDL_adm1_parent".
    subnat : pd.Series
        ADM1 values named "subnat IWI", indexed by the ADM1 code used in
        ``link_df["GDL_adm1_parent"]``.
    write_path : str, optional
        If given, the full working frame is pickled to this path (only after
        the sanity checks below have passed).
    demeaned_input : bool
        If True, ``truth`` is treated as country-demeaned and "nat IWI" is
        added back to it before scoring.
    return_df : bool
        If True, return the working DataFrame instead of the metrics dict.
    recenter_on : {None, "adm0", "adm1", "adm1_ideal"}
        Optional re-centering of the predictions:
        "adm0" - add "nat IWI" back to the predictions. Only implemented for
                 demeaned input. (This is the mean of the ADM1 observations,
                 not the population-weighted country value.)
        "adm1" - shift predictions so that their unweighted mean within each
                 ADM1 polygon equals the observed "subnat IWI" of that polygon.
        "adm1_ideal" - shift predictions so that their unweighted mean within
                 each ADM1 polygon equals the mean of the DHS *truth* there.
                 Not achievable in practice, since it uses the truth.
        Any other value leaves the predictions unchanged.
    extra_clip : bool
        If True, clip predictions to the [min, max] range of the truth. This
        happens after the "adm0" re-centering and before the ADM1 options.

    Returns
    -------
    dict or pd.DataFrame
        Dict with keys "pearson", "spearman", "r2" and their "within_adm0_*" /
        "within_adm1_*" counterparts. The "pearson" entries are *squared*
        Pearson correlations; the "spearman" entries are not squared.

    Raises
    ------
    NotImplementedError
        If ``recenter_on == "adm0"`` with non-demeaned input.
    AssertionError
        If any demeaned truth/prediction column does not have mean zero
        (to 6 decimals).
    """

    df = pd.DataFrame([preds.rename("preds"),truth.rename("truth")]).T

    dhs_id_to_iso = link_df[["DHSID","alpha-3","GDL_adm1_parent"]]

    df = df.merge(dhs_id_to_iso, how="left", left_index=True, right_on="DHSID")
    df = df.merge(nat, how="left", left_on="alpha-3", right_index=True)
    df = df.merge(subnat, how="left", left_on="GDL_adm1_parent", right_index=True)

    # If demeaned, first add back the mean of the ADM1 observations aggregated to the country level
    if demeaned_input:
        df["truth"] = df["truth"] + df["nat IWI"]

    # Re-center preds on ADM0
    if recenter_on == "adm0" and demeaned_input:
        df["preds"] = df["preds"] + df["nat IWI"]
    elif recenter_on == "adm0":
        raise NotImplementedError

    # This extra clipping option will only affect the demeaned model. It ensures that after
    # correction, there are no outliers outside the known min, max.
    if extra_clip:
        df["preds"] = np.clip(df["preds"], df["truth"].min(), df["truth"].max())

    if recenter_on == "adm1":
        # Force the mean of the preds to match the observed ADM1 values.
        # Columns are selected *before* aggregating: averaging the whole frame would
        # include the string columns (DHSID, alpha-3), which pandas >= 2.0 rejects.
        grouped_to_adm1 = df.groupby("GDL_adm1_parent")[["preds"]].mean().rename(columns={
            "preds": "adm1_unweighted_mean_of_preds_from_dhs_obs"})
        df = df.merge(grouped_to_adm1, how="left", left_on="GDL_adm1_parent", right_index=True)

        # Difference between the observed ADM1 data and the preds aggregated to ADM1;
        # used to re-center the preds.
        df["adm1_recentering_adj"] = df["subnat IWI"] - df["adm1_unweighted_mean_of_preds_from_dhs_obs"]
        df["preds"] = df["adm1_recentering_adj"] + df["preds"]
        df = df.drop(columns=["adm1_unweighted_mean_of_preds_from_dhs_obs"])

    if recenter_on == "adm1_ideal":
        # Match the ADM1-level means of DHS truth and preds. This has little effect on the
        # within-country / within-ADM1 metrics, which are computed on demeaned values anyway.
        grouped_to_adm1 = df.groupby("GDL_adm1_parent")[["truth", "preds"]].mean().rename(columns={
            "preds": "adm1_unweighted_mean_of_preds_from_dhs_obs",
            "truth": "adm1_unweighted_mean_of_truth_from_dhs_obs"})
        df = df.merge(grouped_to_adm1, how="left", left_on="GDL_adm1_parent", right_index=True)

        df["adj_ideal"] = (df["adm1_unweighted_mean_of_truth_from_dhs_obs"]
                           - df["adm1_unweighted_mean_of_preds_from_dhs_obs"])
        df["preds"] = df["adj_ideal"] + df["preds"]

        df = df.drop(columns=["adm1_unweighted_mean_of_truth_from_dhs_obs",
                              "adm1_unweighted_mean_of_preds_from_dhs_obs"])

    # Full-variation metrics
    r2 = sklearn.metrics.r2_score(df["truth"],df["preds"])
    pearson = np.corrcoef(df["truth"],df["preds"])[0,1] ** 2
    spearman = spearmanr(df["truth"],df["preds"]).correlation

    # Aggregate DHS obs to ADM0
    adm0_mean_dhs_obs = df.groupby("alpha-3")[["truth","preds"]].mean().rename(columns={
        "truth": "adm0_unweighted_mean_of_truth_from_dhs_obs",
        "preds": "adm0_unweighted_mean_of_preds_from_dhs_obs"})
    df = df.merge(adm0_mean_dhs_obs, how="left", left_on="alpha-3", right_index=True)

    # Aggregate DHS obs to ADM1 (after any re-centering above)
    grouped_to_adm1 = df.groupby("GDL_adm1_parent")[["truth", "preds"]].mean().rename(columns={
        "preds": "adm1_unweighted_mean_of_preds_from_dhs_obs",
        "truth": "adm1_unweighted_mean_of_truth_from_dhs_obs"})
    df = df.merge(grouped_to_adm1, how="left", left_on="GDL_adm1_parent", right_index=True)

    # Remove the mean of the DHS observations for each country
    # See NL super-res notebook for comments on this general procedure
    df["preds_demean_adm0"] = df["preds"] - df["adm0_unweighted_mean_of_preds_from_dhs_obs"]
    df["true_demean_adm0"] = df["truth"] - df["adm0_unweighted_mean_of_truth_from_dhs_obs"]

    within_r2 = sklearn.metrics.r2_score(df["true_demean_adm0"],df["preds_demean_adm0"])
    within_pearson = np.corrcoef(df["true_demean_adm0"],df["preds_demean_adm0"])[0,1] ** 2
    within_spearman = spearmanr(df["true_demean_adm0"],df["preds_demean_adm0"]).correlation

    # Same, for each ADM1 polygon
    df["preds_demean_adm1"] = df["preds"] - df["adm1_unweighted_mean_of_preds_from_dhs_obs"]
    df["true_demean_adm1"] = df["truth"] - df["adm1_unweighted_mean_of_truth_from_dhs_obs"]

    within_r2_adm1 = sklearn.metrics.r2_score(df["true_demean_adm1"],df["preds_demean_adm1"])
    within_pearson_adm1 = np.corrcoef(df["true_demean_adm1"],df["preds_demean_adm1"])[0,1] ** 2
    within_spearman_adm1 = spearmanr(df["true_demean_adm1"],df["preds_demean_adm1"]).correlation

    # For demeaned performance metrics to be correct, demeaned vals MUST have mean zero.
    # Checked before anything is written, so a failed check never leaves a bad pickle behind.
    assert round(df["true_demean_adm1"].mean(),6) ==0
    assert round(df["true_demean_adm0"].mean(),6) ==0

    assert round(df["preds_demean_adm1"].mean(),6) ==0
    assert round(df["preds_demean_adm0"].mean(),6) ==0

    if write_path:
        df.to_pickle(write_path)

    output_dict = {"pearson" : pearson, "spearman" : spearman, "r2" : r2,
                   "within_adm0_pearson": within_pearson, "within_adm0_spearman": within_spearman,
                   "within_adm0_r2": within_r2,
                   "within_adm1_pearson": within_pearson_adm1, "within_adm1_spearman": within_spearman_adm1,
                   "within_adm1_r2": within_r2_adm1,
                   }

    if return_df:
        return df

    return output_dict
        
        

In [8]:
# MOSAIKS (random convolutional feature, "RCF") matrix for the DHS clusters;
# indexed by DHSID (see the .head() output below).
dhs_rcf_X = pd.read_pickle( (data_dir + "/features/mosaiks_features/"
                             "DHS_DHS_dense_DHSID_pop_weight=True.p") )
                           
# Nightlight (NL) feature matrix for the same DHS clusters.
dhs_nl_X = pd.read_pickle( (data_dir + "/features/nl_features/DHS_polygons/"
                            "dmsp_nightlight_features_for_iwi_polygons_20_bins_GPW_pop_weighted.p") )
                           
# Key under which the IWI model is stored in each saved k-fold dictionary.
task = "iwi"
# task = "Sub-national HDI"

## Let's calculate missing data

In [9]:
# DHSIDs that have both an IWI observation and a row in the MOSAIKS feature matrix.
overlap_idxs = iwi.index[iwi.index.isin(dhs_rcf_X.index)]

In [10]:
# IWI observations restricted to clusters with features.
iwi_overlap = iwi.loc[overlap_idxs]

In [11]:
# Number of IWI observations lost because they have no row in the feature matrix.
n_not_in_grid = len(iwi) - len(iwi_overlap)

print( n_not_in_grid, "missing obs from saved DHS dense grid")
print("These observations DO NOT have planet data")

33 missing obs from saved DHS dense grid
These observations DO NOT have planet data


In [12]:
# Boolean mask: which of the remaining clusters appear in the DHS -> ADM1 link table.
overlap_adm1 = iwi_overlap.index.isin(link_df["DHSID"])

In [13]:
# Keep only clusters that could be linked to an ADM1 polygon.
iwi_overlap = iwi_overlap.loc[overlap_adm1 ]

In [14]:
# Additional observations lost in the ADM1-link filter (total loss minus the feature-matrix loss).
n_without_adm1_overlap = len(iwi) - len(iwi_overlap) - n_not_in_grid

print( n_without_adm1_overlap, "obs are dropped because they do not overlap any ADM1 region")

1196 obs are dropped because they do not overlap any ADM1 region


In [15]:
# Align both feature matrices to the retained clusters (same rows, same order as iwi_overlap).
dhs_rcf_X = dhs_rcf_X.loc[iwi_overlap.index]
dhs_nl_X = dhs_nl_X.loc[iwi_overlap.index]

In [16]:
dhs_rcf_X.head()

Unnamed: 0_level_0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,...,X_3990,X_3991,X_3992,X_3993,X_3994,X_3995,X_3996,X_3997,X_3998,X_3999
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AL201700000001,0.263912,0.529834,0.120221,0.221756,0.378645,0.742076,0.220042,0.312819,0.143793,0.560435,...,0.439009,0.246906,0.802624,0.563975,0.351334,1.220489,2.117525,0.768354,0.652577,0.167308
AL201700000002,0.256797,0.502296,0.118349,0.1887,0.358811,0.778503,0.237806,0.321708,0.115251,0.55841,...,0.412139,0.236711,0.788731,0.555154,0.332123,1.188645,2.046447,0.769052,0.626322,0.166969
AL201700000003,0.277972,0.573552,0.125706,0.231508,0.41296,0.771762,0.210382,0.322703,0.146661,0.587726,...,0.488851,0.271301,0.918831,0.622778,0.398386,1.295117,2.351002,0.826812,0.701968,0.181992
AL201700000004,0.258856,0.511748,0.118132,0.204472,0.369289,0.754167,0.229083,0.316454,0.129066,0.553742,...,0.423077,0.240886,0.792644,0.561666,0.340954,1.198411,2.054426,0.76426,0.632968,0.167788
AL201700000005,0.263912,0.529834,0.120221,0.221756,0.378645,0.742076,0.220042,0.312819,0.143793,0.560435,...,0.439009,0.246906,0.802624,0.563975,0.351334,1.220489,2.117525,0.768354,0.652577,0.167308


## Examine performance from cross country models

In [17]:
# Directory holding the pickled k-fold model dictionaries loaded below.
model_directory = data_dir + "/model_data/"

In [18]:
### Adm1, RCF only

path = (model_directory+
           "cross_country_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=True.pkl"
          )
# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    kfold_dict = pickle.load(model_file)

# Apply the province-level (ADM1-trained) MOSAIKS model to the DHS cluster features.
preds = predict_y_from_kfold_dict(dhs_rcf_X,kfold_dict,task)
truth = iwi_overlap["IWI"]


In [19]:
# Total number of locations across all k-fold test splits; reported as the
# province-level n in the summary table.
# NOTE(review): presumably each training observation appears in exactly one test fold — confirm.
n_train_adm1 = len(np.hstack(kfold_dict[task]["locations_test"]))

In [20]:
# Metrics for the province-level MOSAIKS model (no re-centering; inputs are not demeaned).
adm1_rcf_perf_dict = dhs_pred_truth_to_metrics(preds, truth)

In [21]:
adm1_rcf_perf_dict

{'pearson': 0.3685642367086544,
 'spearman': 0.5962937665575114,
 'r2': 0.31149244806519394,
 'within_adm0_pearson': 0.13968959243544718,
 'within_adm0_spearman': 0.36367876645193054,
 'within_adm0_r2': -0.0814394949863011,
 'within_adm1_pearson': 0.07739748516015149,
 'within_adm1_spearman': 0.2592693301453697,
 'within_adm1_r2': -0.32646255651326017}

In [22]:
# Same call, but keep the full working frame for inspection.
# Note: this rebinds `df`, which previously held the raw GDL IWI table.
df = dhs_pred_truth_to_metrics(preds, truth,return_df=True)

In [23]:
### Adm1, NL only

path = (model_directory+
           "cross_country_nl_solve_all_outcomes_country_fold_dmsp_hist_bins_GPW_pop_weighted.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    nl_kfold_dict = pickle.load(model_file)

# Apply the province-level nightlights model to the DHS cluster NL features.
preds = predict_y_from_kfold_dict(dhs_nl_X,nl_kfold_dict,task)
truth = iwi_overlap["IWI"]

In [24]:
# Metrics for the province-level nightlights model.
adm1_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth)

In [25]:
adm1_nl_perf_dict

{'pearson': 0.402450271608525,
 'spearman': 0.6145369659261958,
 'r2': -0.006479248019127359,
 'within_adm0_pearson': 0.3023741437011924,
 'within_adm0_spearman': 0.5509754582659943,
 'within_adm0_r2': 0.022134621147371747,
 'within_adm1_pearson': 0.26532220557504677,
 'within_adm1_spearman': 0.5033089497948544,
 'within_adm1_r2': -0.12430493026821932}

In [26]:
### Adm1, RCF+NL only

path = (model_directory + 
           "cross_country_rcf_and_nl_solve_all_outcomes_country_fold_DENSE_pop_weight=True_dmsp_hist_bins_GPW_pop_weighted.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    nl_and_rcf_kfold_dict = pickle.load(model_file)

# Combined model: the NL features are passed as the extra fourth argument.
preds = predict_y_from_kfold_dict(dhs_rcf_X,nl_and_rcf_kfold_dict,task, dhs_nl_X)
truth = iwi_overlap["IWI"]

In [27]:
# Metrics for the province-level MOSAIKS+NL model.
adm1_rcf_and_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth)

In [28]:
adm1_rcf_and_nl_perf_dict

{'pearson': 0.4993888580507504,
 'spearman': 0.7006971142334537,
 'r2': 0.3797321157144947,
 'within_adm0_pearson': 0.2961418930584032,
 'within_adm0_spearman': 0.5450086651334161,
 'within_adm0_r2': 0.19050095527916555,
 'within_adm1_pearson': 0.1890277784248376,
 'within_adm1_spearman': 0.41869003618283945,
 'within_adm1_r2': -0.0380163916608125}

## Now examine performance from ADM0 cross country models

In [29]:
### Adm0, RCF only

path = (model_directory+
           "kfold_solve_adm0_model_full"
           "_pop_weighted_feats_DENSE.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    adm0_kfold_dict = pickle.load(model_file)

# Apply the country-level (ADM0-trained) MOSAIKS model to the DHS cluster features.
preds = predict_y_from_kfold_dict(dhs_rcf_X,adm0_kfold_dict,task)
truth = iwi_overlap["IWI"]


In [30]:
# Total number of locations across all k-fold test splits of the country-level model;
# reported as the country-level n in the summary table.
n_train_adm0 = len(np.hstack(adm0_kfold_dict[task]["locations_test"]))

In [31]:
# Metrics for the country-level MOSAIKS model.
adm0_rcf_perf_dict = dhs_pred_truth_to_metrics(preds, truth)

In [32]:
adm0_rcf_perf_dict

{'pearson': 0.27291928575362284,
 'spearman': 0.5111963777074391,
 'r2': 0.09054380450325206,
 'within_adm0_pearson': 0.11663749324100793,
 'within_adm0_spearman': 0.3289280921192751,
 'within_adm0_r2': -0.38373078110444414,
 'within_adm1_pearson': 0.08083086902699292,
 'within_adm1_spearman': 0.2559454996275326,
 'within_adm1_r2': -0.6275136178560663}

In [33]:
### Adm0, NL only

# NOTE(review): unlike the other model pickles, this file name carries no model-description
# prefix — confirm it is the intended country-level NL model.
path = (data_dir + "/model_data/"
           "dmsp_hist_bins_GPW_pop_weighted.pkl")
# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    adm0_nl_kfold_dict = pickle.load(model_file)

# Apply the country-level nightlights model to the DHS cluster NL features.
preds = predict_y_from_kfold_dict(dhs_nl_X,adm0_nl_kfold_dict,task)
truth = iwi_overlap["IWI"]

In [34]:
# Metrics for the country-level nightlights model (displayed as the cell output).
adm0_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth)
adm0_nl_perf_dict

{'pearson': 0.4194871231785607,
 'spearman': 0.613991813916459,
 'r2': -0.32796899047743655,
 'within_adm0_pearson': 0.3275726470083765,
 'within_adm0_spearman': 0.5687887688430951,
 'within_adm0_r2': -0.4655227923470937,
 'within_adm1_pearson': 0.2863278555667298,
 'within_adm1_spearman': 0.5154997744813887,
 'within_adm1_r2': -0.733462232030653}

In [35]:
### Adm0, RCF+NL
path = (model_directory+
           "kfold_solve_adm0_level_pop_weighted_feats_rcf_nl_dmsp_hist_bins_GPW_pop_weighted.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    adm0_rcf_nl_kfold_dict = pickle.load(model_file)

# Combined model: the NL features are passed as the extra fourth argument.
preds = predict_y_from_kfold_dict(dhs_rcf_X,adm0_rcf_nl_kfold_dict,task,dhs_nl_X)
truth = iwi_overlap["IWI"]



In [36]:
# Metrics for the country-level MOSAIKS+NL model.
adm0_rcf_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth)

In [37]:
adm0_rcf_nl_perf_dict

{'pearson': 0.40972411731192054,
 'spearman': 0.6161893410887099,
 'r2': -0.2985670087635124,
 'within_adm0_pearson': 0.30000219116896887,
 'within_adm0_spearman': 0.5442045343840687,
 'within_adm0_r2': -0.38993542330628084,
 'within_adm1_pearson': 0.2546827657610343,
 'within_adm1_spearman': 0.4844594913410964,
 'within_adm1_r2': -0.6740837159603865}

## Evaluate performance from within country models

### Create demeaned Xs and Ys

First we have to demean our Xs using the same country means that were used in training. These are the means of the ADM1 observations aggregated to the country (ADM0) level.

In [38]:
# ADM1-polygon MOSAIKS features (the GDLCODE column is dropped before computing means).
adm1_rcf_feats = pd.read_pickle( (data_dir + "/features/mosaiks_features/"
                                  "GDL_ADM1_polygon_X_creation_pop_weight=True.p")).drop(columns="GDLCODE")

# NOTE(review): with return_mean_frame=True this presumably returns the per-country mean
# feature frame rather than the demeaned X — confirm in prediction_utils.
adm0_mean_rcf_feats =  X_matrix_to_demeaned_X(adm1_rcf_feats, return_mean_frame=True)


# ADM1-polygon nightlight features, handled the same way.
adm1_nl_feats = pd.read_pickle( (data_dir + "/features/nl_features/"
                                 "GDL_HDI_polygons/dmsp_nightlight_features_"
                                 "for_hdi_polygons_20_bins_GPW_pop_weighted.p"))
                                
adm0_mean_nl_feats =  X_matrix_to_demeaned_X(adm1_nl_feats , return_mean_frame=True)

In [39]:
# Cluster -> country-code lookup used to attach "alpha-3" to the cluster-level frames below.
dhs_id_to_iso = link_df[["DHSID","alpha-3"]]

In [40]:
## Demean RCF X

# Attach each cluster's country code (keeping DHSID as the index), then demean by country.
# NOTE(review): generalized_demean presumably subtracts the matching "alpha-3" row of
# adm0_mean_rcf_feats from each cluster row — confirm in prediction_utils.
dhs_x_with_country = dhs_rcf_X.reset_index().merge(dhs_id_to_iso,"left", on = "DHSID").set_index("DHSID")
demean_dhs_X_rcf =  generalized_demean(dhs_x_with_country,adm0_mean_rcf_feats,"alpha-3")

In [41]:
# Demean NL X
# Same procedure as for the RCF features: attach the country code, then demean by country.
dhs_nl_with_country = dhs_nl_X.reset_index().merge(dhs_id_to_iso,"left", on = "DHSID").set_index("DHSID")
demean_dhs_X_nl =  generalized_demean(dhs_nl_with_country,adm0_mean_nl_feats,"alpha-3")

In [42]:
#demean y

In [43]:
# Attach each cluster's country code, then that country's IWI ("nat IWI").
iwi_with_country = iwi_overlap.reset_index().merge(dhs_id_to_iso,"left", on = "DHSID").set_index("DHSID")
iwi_demean = iwi_with_country.merge(nat, "left", left_on="alpha-3", right_index=True)

# Drop clusters whose country has no national IWI *here*, before any predictions are
# built from iwi_demean.index. Without this, a fresh top-to-bottom run would pass NaN
# truth values to the metrics. It is a no-op when every cluster has a national value.
iwi_demean = iwi_demean.dropna(subset=["nat IWI"]).copy()

# Country-demeaned outcome used as the truth for the within-country models.
iwi_demean["iwi demean"] = iwi_demean["IWI"] - iwi_demean["nat IWI"]

### Evaluate Performance

In [44]:
### RCF only

In [45]:
path = (model_directory+
           "within_country_demeaned_kfold_solve_all_outcomes_country_fold_DENSE_pop_weight=True.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    demeaned_kfold_dict = pickle.load(model_file)

# Predict the country-demeaned outcome from the country-demeaned MOSAIKS features,
# with rows aligned to iwi_demean.
preds = predict_y_from_kfold_dict(demean_dhs_X_rcf.loc[iwi_demean.index], demeaned_kfold_dict, task)
truth = iwi_demean["iwi demean"]

# Total number of locations across all k-fold test splits; reported as the
# within-country n in the summary table.
n_train_demean = len(np.hstack(demeaned_kfold_dict[task]["locations_test"]))

In [46]:
# Redundant: repeats the n_train_demean assignment from the previous cell with the same inputs.
n_train_demean = len(np.hstack(demeaned_kfold_dict[task]["locations_test"]))

In [47]:
# Where the scored frame for the demeaned MOSAIKS model is pickled (overwritten on every run).
write_path = data_dir + "/preds/demean_iwi_rcf_at_dhs.p"
#write_path = None

In [48]:
# Within-country MOSAIKS model: add the national value back to the truth and re-center the
# predictions on the observed ADM1 values before scoring; the scored frame is written to write_path.
demean_rcf_perf_dict = dhs_pred_truth_to_metrics(preds, truth,write_path=write_path, demeaned_input=True, recenter_on = "adm1")

In [49]:
## NL only

In [50]:
path = (model_directory+
           "within_country_nl_demeaned_solve_all_outcomes_country_fold_dmsp_hist_bins_GPW_pop_weighted.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    nl_demeaned_kfold_dict = pickle.load(model_file)

# Predict the country-demeaned outcome from the country-demeaned NL features,
# with rows aligned to iwi_demean.
preds = predict_y_from_kfold_dict(demean_dhs_X_nl.loc[iwi_demean.index], nl_demeaned_kfold_dict, task)
truth = iwi_demean["iwi demean"]


In [52]:
# Where the scored frame for the demeaned NL model is pickled (overwritten on every run).
write_path = data_dir + "/preds/demean_iwi_nl_at_dhs.p"
#write_path=None


In [53]:
# Within-country NL model, scored with the same settings as the MOSAIKS one
# (demeaned input, predictions re-centered on observed ADM1 values).
demean_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth, write_path=write_path, 
                                                demeaned_input=True, recenter_on="adm1")

In [54]:
### RCF and NL

In [82]:
path = (model_directory+
           "within_country_rcf_and_nl_demeaned_solve_all_outcomes_country_fold"
           "_DENSE_pop_weight=True_dmsp_hist_bins_GPW_pop_weighted.pkl")

# Use a context manager so the file handle is closed instead of leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open(path, "rb") as model_file:
    nl_and_rcf_demeaned_kfold_dict = pickle.load(model_file)

# Combined within-country model: demeaned MOSAIKS features plus demeaned NL features,
# both aligned to iwi_demean.
preds = predict_y_from_kfold_dict(demean_dhs_X_rcf.loc[iwi_demean.index], 
                                  nl_and_rcf_demeaned_kfold_dict, 
                                  task,
                                  demean_dhs_X_nl.loc[iwi_demean.index])
truth = iwi_demean["iwi demean"]

In [83]:
# Recompute the demeaned outcome and drop clusters that have no national IWI.
# NOTE(review): this runs *after* preds/truth were built from iwi_demean, so it cannot affect
# the predictions scored below; it only matters if earlier cells are re-run afterwards.
iwi_demean["iwi demean"] = iwi_demean["IWI"] - iwi_demean["nat IWI"]
iwi_demean = iwi_demean.dropna(subset = ["nat IWI"])

In [84]:
# Redundant: the previous cell already applied this exact dropna, so this is a no-op.
iwi_demean = iwi_demean.dropna(subset = ["nat IWI"])

In [85]:
# Where the scored frame for the demeaned MOSAIKS+NL model is pickled (overwritten on every run).
write_path = data_dir + "/preds/demean_iwi_rcf_nl_at_dhs.p"

In [86]:
# Within-country MOSAIKS+NL model, scored with the same settings as the other demeaned models.
demean_rcf_and_nl_perf_dict = dhs_pred_truth_to_metrics(preds, truth, write_path=write_path, 
                                                        demeaned_input=True, recenter_on="adm1")

In [87]:
demean_rcf_and_nl_perf_dict

{'pearson': 0.7540908910931801,
 'spearman': 0.8668545156227587,
 'r2': 0.7533925038387096,
 'within_adm0_pearson': 0.5901016496311956,
 'within_adm0_spearman': 0.7726875603701354,
 'within_adm0_r2': 0.5899706521557301,
 'within_adm1_pearson': 0.38510021204705425,
 'within_adm1_spearman': 0.5951284707420764,
 'within_adm1_r2': 0.38461623587548777}

In [88]:
# Same scoring call without writing to disk, returning the working frame for plotting.
# Note: this rebinds `df` again.
df = dhs_pred_truth_to_metrics(preds, truth, demeaned_input=True, return_df=True, recenter_on="adm1")
# make_scatterplot("demeaned adm1 scatter", df["true_demean_adm1"], df["preds_demean_adm1"])

# make_scatterplot("scatter", df["truth"], df["preds"])

### Make summary table

In [62]:
# One metrics dict per model. The order fixes the integer row labels 0-8 that the
# table-building cells below rely on: three feature sets per training level.
outcomes = [
    # rows 0-2: trained at country level (ADM0)
    adm0_rcf_perf_dict, adm0_nl_perf_dict, adm0_rcf_nl_perf_dict,
    # rows 3-5: trained at province level (ADM1)
    adm1_rcf_perf_dict, adm1_nl_perf_dict, adm1_rcf_and_nl_perf_dict,
    # rows 6-8: within-country (demeaned) models
    demean_rcf_perf_dict, demean_nl_perf_dict, demean_rcf_and_nl_perf_dict,
]

In [63]:
# One row per model, one column per metric.
metrics_unrounded = pd.DataFrame(outcomes)
table = metrics_unrounded.round(2)

# Flag negatives using the *unrounded* values: a small negative such as -0.004 rounds
# to -0.0, which would slip past a `< 0` test on the rounded table and print as "-0.0".
table[metrics_unrounded < 0] = "$< 0$"

In [64]:
# LaTeX column headers for the summary table.
# Raw strings are required: in a normal string literal "\r" (as in "\rho") is a carriage
# return, so the header would not contain the text "\rho" at all.
rename_dict = {"pearson": r"$\rho^{2}$", "spearman":"Spearman r", "r2":"$R^2$",
               "within_adm0_pearson": r"$\rho^{2}$", "within_adm0_spearman":"Spearman r", "within_adm0_r2":"$R^{2}$",
               "within_adm1_pearson": r"$\rho^{2}$", "within_adm1_spearman":"Spearman r", "within_adm1_r2":"$R^{2}$",
    
}

In [65]:
# Swap metric keys for LaTeX headers. Column labels are no longer unique after this,
# which is why the cells below address columns by position.
table = table.rename(columns = rename_dict)

In [66]:
# Label each group of three rows with the level the model was trained at and its n.
# Raw strings are required: in a normal string literal "\t" (as in "\textbf") is a TAB.
table.loc[0:2,"HDI"] = r"\textbf{Country level} " + "(n={:,})".format(n_train_adm0)
table.loc[3:5,"HDI"] = r"\textbf{Province level} " + "(n={:,})".format(n_train_adm1 )
table.loc[6:8,"HDI"] = r"\textbf{Within-country} " + "(n={:,})".format(n_train_demean)



In [67]:
# Feature-set label for each row, in the same order within every training level.
# Raw strings are required: in a normal string literal "\t" (as in "\textbf") is a TAB.
feature_labels = [r"\textbf{MOSAIKS}", r"\textbf{NL}", r"\textbf{MOSAIKS+NL}"]
table.loc[0:2,""] = feature_labels
table.loc[3:5,""] = feature_labels
table.loc[6:8,""] = feature_labels

In [68]:
# Header row 9: the level the predictions are evaluated at, repeated in every column.
# Raw string is required: in a normal string literal "\t" (as in "\textbf") is a TAB.
table.loc[9] = r"\textbf{Predicted at DHS cluster level} " + "(n={:,})".format(len(iwi_overlap))

In [69]:
# Header row 10: which kind of variation each metric column measures.
# Raw strings avoid the invalid "\e" escape sequence (a warning today, an error in future Python).
table.loc[10] = r"\emph{Full variation performance}"
table.iloc[10,3:6] = r"\emph{Within-country performance}"
table.iloc[10,6:10] = r"\emph{Within-province performance}"

# Keep the rho^2 and R^2 columns of each group plus the two label columns.
table = table.iloc[:,[0,2,3,5,6,8,9,10]] #Remove Spearman


# Header row 11: running column numbers "(1)", "(2)", ...
table.loc[11] = ("(" + pd.Series(np.arange(1,table.shape[1]+1)).astype(str) +")").to_numpy()

In [70]:
# Transpose so each metric column becomes a row, then move the three header rows
# (9: prediction level, 10: variation type, 11: column number) and the metric label
# ("index") into a 4-level index.
table = table.T.reset_index().set_index([9,10,"index",11])

In [71]:
# Transpose back: the 4-level index becomes 4-level column headers, models are rows again.
tab = table.T

In [72]:
# Clear the names of the four column-header levels.
tab.columns.names = ([None, None, None,None])

In [73]:
# Use the last two columns (the "HDI" trained-at label and the "" features label) as the row index.
table = tab.set_index([tab.columns[-2],tab.columns[-1]])

In [74]:
# Raw strings are required: in a normal string literal "\t" (as in "\textbf") is a TAB,
# which is why the rendered header previously showed "extbf{...}".
table.index.names =[r"\textbf{\emph{IWI trained at}}:", r"\textbf{\emph{Features}}"]


In [75]:
# Reorder rows for presentation: within-country, then province-level, then country-level models;
# inside each group MOSAIKS+NL, MOSAIKS, NL.
table = table.iloc[[8,6,7,5,3,4,2,0,1]]
table

Unnamed: 0_level_0,Unnamed: 1_level_0,"\textbf{Predicted at DHS cluster level} (n=51,996)","\textbf{Predicted at DHS cluster level} (n=51,996)","\textbf{Predicted at DHS cluster level} (n=51,996)","\textbf{Predicted at DHS cluster level} (n=51,996)","\textbf{Predicted at DHS cluster level} (n=51,996)","\textbf{Predicted at DHS cluster level} (n=51,996)"
Unnamed: 0_level_1,Unnamed: 1_level_1,\emph{Full variation performance},\emph{Full variation performance},\emph{Within-country performance},\emph{Within-country performance},\emph{Within-province performance},\emph{Within-province performance}
Unnamed: 0_level_2,Unnamed: 1_level_2,$\rho^{2}$,$R^2$,$\rho^{2}$,$R^{2}$,$\rho^{2}$,$R^{2}$
Unnamed: 0_level_3,Unnamed: 1_level_3,(1),(2),(3),(4),(5),(6)
extbf{\emph{IWI trained at}}:,extbf{\emph{Features}},Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4
\textbf{Within-country} (n=863),\textbf{MOSAIKS+NL},0.75,0.75,0.59,0.59,0.39,0.38
\textbf{Within-country} (n=863),\textbf{MOSAIKS},0.69,0.68,0.48,0.47,0.22,0.2
\textbf{Within-country} (n=863),\textbf{NL},0.76,0.76,0.59,0.59,0.39,0.39
\textbf{Province level} (n=864),\textbf{MOSAIKS+NL},0.5,0.38,0.3,0.19,0.19,$< 0$
\textbf{Province level} (n=864),\textbf{MOSAIKS},0.37,0.31,0.14,$< 0$,0.08,$< 0$
\textbf{Province level} (n=864),\textbf{NL},0.4,$< 0$,0.3,0.02,0.27,$< 0$
\textbf{Country level} (n=86),\textbf{MOSAIKS+NL},0.41,$< 0$,0.3,$< 0$,0.25,$< 0$
\textbf{Country level} (n=86),\textbf{MOSAIKS},0.27,0.09,0.12,$< 0$,0.08,$< 0$
\textbf{Country level} (n=86),\textbf{NL},0.42,$< 0$,0.33,$< 0$,0.29,$< 0$


In [76]:
#table.to_pickle = "/shares/maps100/data/output/applications/HDI/journal_tables/hdi_models_on_dhs_table_2022_2_28.p"

In [77]:
# Emit the table as LaTeX. escape=False keeps the LaTeX markup in labels and cells intact;
# column_format has two label columns followed by three pairs of metric columns.
print(table.to_latex(bold_rows=False,column_format="ll||cc|cc|cc",
      escape=False, multicolumn_format="c", na_rep = "-")  )

\begin{tabular}{ll||cc|cc|cc}
\toprule
                              &             & \multicolumn{6}{c}{\textbf{Predicted at DHS cluster level} (n=51,996)} \\
                              &             & \multicolumn{2}{c}{\emph{Full variation performance}} & \multicolumn{2}{c}{\emph{Within-country performance}} & \multicolumn{2}{c}{\emph{Within-province performance}} \\
                              &             &                                         $\rho^{2}$ &  $R^2$ &                        $\rho^{2}$ & $R^{2}$ &                         $\rho^{2}$ & $R^{2}$ \\
                              &             &                                                (1) &    (2) &                               (3) &     (4) &                                (5) &     (6) \\
\textbf{\emph{IWI trained at}}: & \textbf{\emph{Features}} &                                                    &        &                                   &         &                                    &         \\
\mid