In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import pickle
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## run baseline models 

In [7]:
# load the necessary functions from the analysis package

# load the variable names, this allows to access the variables in the feature data in a compact way
from analysis_utils.variable_names import *

# load flagged ids 
from analysis_utils.flagged_uids import flagged_uids

# load the functions to do spatial k-fold CV
from analysis_utils.spatial_CV import split_lsms_spatial

# load the helper functions
from analysis_utils.analysis_helpers import *

# load the random forest trainer and cross_validator
import analysis_utils.RandomForest as rf

# load the combined model
from analysis_utils.CombinedModel import CombinedModel

In [8]:
# global configuration: data locations, seeds and CV setup
root_data_dir = "../../Data"

# label (lsms) data and tabular feature data, both located under the data root
lsms_pth = "/".join([root_data_dir, "lsms", "processed", "labels_cluster_v1.csv"])
feat_data_pth = "/".join([root_data_dir, "feature_data", "tabular_data.csv"])

# seeds: one for model training, a separate one for the spatial fold split
random_seed, spatial_cv_random_seed = 423, 348

# number of folds for k-fold CV
n_folds = 5

In [9]:
# load the label data and remove the flagged ids from the dataset
lsms_df = pd.read_csv(lsms_pth)
# NOTE: reset_index() without drop=True keeps the previous index as an `index` column
lsms_df = lsms_df[~lsms_df.unique_id.isin(flagged_uids)].reset_index()

# cluster-level targets: mean of each outcome over all rows sharing a cluster_id
lsms_df['avg_log_mean_pc_cons_usd_2017'] = lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
lsms_df['avg_mean_asset_index_yeh'] = lsms_df.groupby('cluster_id')['mean_asset_index_yeh'].transform('mean')

# load the feature data
feat_df = pd.read_csv(feat_data_pth)

# merge the label and the feature data to one dataset
# (left join: every remaining label row is kept, features are matched on unique_id)
lsms_vars = ['unique_id', 'n_households',           
             'log_mean_pc_cons_usd_2017', 'avg_log_mean_pc_cons_usd_2017',
             'mean_asset_index_yeh', 'avg_mean_asset_index_yeh']
df = pd.merge(lsms_df[lsms_vars], feat_df, on = 'unique_id', how = 'left')

# get the training and validation split
fold_ids = split_lsms_spatial(lsms_df, n_folds = n_folds, random_seed = spatial_cv_random_seed)

# describe the training data broadly
print(f"Number of observations {len(lsms_df)}")
# count clusters with pandas instead of np.unique: numpy is never imported in this
# notebook, so `np` was only available if it leaked in through a wildcard import
print(f"Number of clusters {lsms_df['cluster_id'].nunique(dropna = False)}")
# presumably the 2 subtracted columns are the non-feature id columns - TODO confirm
print(f"Number of x vars {len(feat_df.columns)-2}")

Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Number of observations 6401
Number of clusters 2128
Number of x vars 109


In [10]:
# define the between (cluster-level) x variables from the imported variable-name lists

# averaged remote-sensing summaries: NDVI, NDWI (Gao) and nightlights
avg_rs_vars = [*avg_ndvi_vars, *avg_ndwi_gao_vars, *avg_nl_vars]

# OpenStreetMap features: distances, counts and road variables
osm_vars = [*osm_dist_vars, *osm_count_vars, *osm_road_vars]

# full feature set; the order of the groups is kept as in the original definition
between_x_vars = [
    *osm_vars,
    *esa_lc_vars,
    *wsf_vars,
    *avg_rs_vars,
    *avg_preciptiation,
    *median_rgb_vars,
]

In [11]:
# report the size of each feature group and of the two baseline feature sets
var_group_counts = [
    ("Number of OSM vars", len(osm_vars)),
    ("Number of Dyn img vars", len(avg_rs_vars)),
    ("Number of Static img vars", len(esa_lc_vars) + len(wsf_vars)),
    ("Number of precipiation vars", len(avg_preciptiation)),
    ("Number of RGB vars", len(median_rgb_vars)),
    # the plain baseline is the full set without the RGB variables
    ("Total number of vars - Baseline", len(between_x_vars) - len(median_rgb_vars)),
    ("Total number of vars - Baseline + RGB", len(between_x_vars)),
]
for description, count in var_group_counts:
    print(f"{description}: {count}")

Number of OSM vars: 25
Number of Dyn img vars: 6
Number of Static img vars: 10
Number of precipiation vars: 1
Number of RGB vars: 25
Total number of vars - Baseline: 42
Total number of vars - Baseline + RGB: 67


### Consumption Expenditure

In [12]:
# run cross validation for consumption expenditure (cluster-level target)
between_target_var = 'avg_log_mean_pc_cons_usd_2017'

# keep the cluster id, the target and the between features; drop repeated rows
# (presumably this leaves one row per cluster - TODO confirm)
cl_df = (
    df[['cluster_id', between_target_var] + between_x_vars]
    .drop_duplicates()
    .reset_index(drop=True)
)

# normalise the feature data; the target is excluded from the standardisation
cl_df_norm = standardise_df(cl_df, exclude_cols=[between_target_var])

# run the between training
print('Between training')
baseline_cons = rf.CrossValidator(
    cl_df_norm,
    fold_ids,
    between_target_var,
    between_x_vars,
    id_var='cluster_id',
    random_seed=random_seed,
)
baseline_cons.run_cv_training(min_samples_leaf=1)

Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 228 seconds


### Asset index

In [None]:
# run cross validation for the asset index (cluster-level target)
between_target_var = 'avg_mean_asset_index_yeh'

# keep the cluster id, the target and the between features; drop repeated rows
# (presumably this leaves one row per cluster - TODO confirm)
cl_df = (
    df[['cluster_id', between_target_var] + between_x_vars]
    .drop_duplicates()
    .reset_index(drop=True)
)

# normalise the feature data; the target is excluded from the standardisation
cl_df_norm = standardise_df(cl_df, exclude_cols=[between_target_var])

# run the between training
print('Between training')
baseline_asset = rf.CrossValidator(
    cl_df_norm,
    fold_ids,
    between_target_var,
    between_x_vars,
    id_var='cluster_id',
    random_seed=random_seed,
)
baseline_asset.run_cv_training(min_samples_leaf=1)

Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

# Make plot

In [None]:
# load the dl consumption and the dl asset results
# Both objects are read from pickle files under the results directory; the later
# cells use their `feat_importance` attribute for plotting.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# model objects produced by this project / a trusted source.
results_dir = "../analysis/results"
dl_cons_pth = f"{results_dir}/model_objects/between_cons.pkl"
dl_asset_pth = f"{results_dir}/model_objects/between_asset.pkl"

# consumption expenditure model object
with open(dl_cons_pth, 'rb') as f:
    dl_cons = pickle.load(f)
    
# asset index model object
with open(dl_asset_pth, 'rb') as f:
    dl_asset = pickle.load(f)

In [None]:
# create labels for the plot: every variable name is mapped to a readable label

# component labels are numbered 1..25
pc_numbers = range(1, 26)

# rgb component variables (1-indexed names); labelled 'ls pc' here, the prefix is
# adjusted right before each plot
rgb_vars = [f'median_rgb_pc_{n}' for n in pc_numbers]
rgb_labels = [f'ls pc {n}' for n in pc_numbers]

# `*_feat_` variables are 0-indexed, their labels are 1-indexed
ls_vars = [f'ls_feat_{n}' for n in range(25)]
rs_feat_vars = [f'rs_feat_{n}' for n in range(25)]
ls_var_labels = [f'ls pc {n}' for n in pc_numbers]
rs_feat_labels = [f'rs pc {n}' for n in pc_numbers]

ls_rs_label_dict = dict(zip(ls_vars + rs_feat_vars + rgb_vars,
                            ls_var_labels + rs_feat_labels + rgb_labels))

# OSM labels: '#' marks a count, 'D' a distance; the text is the part of the
# variable name before the first underscore
osm_count_labels = [f"# {name.split('_')[0]}" for name in osm_count_vars]
osm_dist_labels = [f"D {name.split('_')[0]}" for name in osm_dist_vars]
# NOTE(review): assumes osm_road_vars holds exactly these three variables in this
# order - zip silently truncates otherwise; confirm against variable_names
osm_road_labels = ["road network length", "D paved road", "D primary road"]
osm_label_dict = dict(zip(osm_count_vars + osm_dist_vars + osm_road_vars,
                          osm_count_labels + osm_dist_labels + osm_road_labels))

precip_label_dict = {'avg_precipitation': 'precipitation'}

# tabular remote-sensing variables: strip the 'avg_' prefix, underscores -> spaces
rs_vars = avg_ndvi_vars + avg_ndwi_gao_vars + avg_nl_vars
rs_var_labels = [name.replace('avg_', "").replace("_", " ") for name in rs_vars]
wsf_var_labels = [name.replace("_", " ") for name in wsf_vars]
esa_lc_var_labels = esa_lc_vars

rs_names = rs_vars + wsf_vars + esa_lc_vars
rs_labels = rs_var_labels + wsf_var_labels + esa_lc_var_labels
rs_label_dict = dict(zip(rs_names, rs_labels))

# merge the dictionaries; on duplicate keys the later dictionary wins
label_dict = {
    **ls_rs_label_dict,
    **osm_label_dict,
    **precip_label_dict,
    **rs_label_dict,
}

# change some labels by hand
label_dict.update({
    'fuel_dist': 'D gas station',
    'fuel_count': '# gas station',
    'avg_ndwi_gao_mean': 'ndwi mean',
    'avg_ndwi_gao_std': 'ndwi std',
    'avg_nl_mean': 'nightlights mean',
    'avg_nl_std': 'nightlights std',
    'built_up': 'built up',
})

In [None]:
# create the colors: one named color per feature category
ls_color = 'darkred'
rs_color = 'teal'
remoteness_color = 'sandybrown'
amenities_color = 'darkgrey'

# geography is the fallback category for everything not matched below
geography_color = 'blueviolet'
agriculture_color = 'green'
urbanisation_color = 'blue'

# legend dictionaries (category -> color) for the different plots
baseline_colors = {
    'agriculture': agriculture_color,
    'amenities': amenities_color,
    'geography': geography_color,
    'remoteness': remoteness_color,
    'urbanisation': urbanisation_color,
}

baseline_ls_colors = {
    'agriculture': agriculture_color,
    'amenities': amenities_color,
    'geography': geography_color,
    'LS images': ls_color,
    'remoteness': remoteness_color,
    'urbanisation': urbanisation_color,
}

dl_colors = {
    'agriculture': agriculture_color,
    'amenities': amenities_color,
    'LS images': ls_color,
    'RS images': rs_color,
    'remoteness': remoteness_color,
}


def _feature_color(var_name):
    """Return the category color of a variable, decided by substrings of its name.

    The checks are ordered; the first matching rule wins.
    NOTE(review): these are substring (not prefix) matches, so e.g. any name that
    merely contains 'ls_' or 'rs_' is colored as an image feature - confirm that no
    other variable name matches by accident.
    """
    if 'ls_' in var_name or '_rgb_' in var_name:
        return ls_color
    if 'rs_' in var_name:
        return rs_color
    if 'count' in var_name:
        return amenities_color
    if 'dist' in var_name or 'road' in var_name:
        # road length is colored as urbanisation, all other distances/roads as remoteness
        return urbanisation_color if 'road_length' in var_name else remoteness_color
    if any(tag in var_name for tag in ('precipitation', 'ndvi', 'ndwi', 'cropland')):
        return agriculture_color
    if any(tag in var_name for tag in ('wsf_', 'nl_', 'built_')):
        return urbanisation_color
    return geography_color


# variable name -> bar color, in the same key order as label_dict
color_dict = {var_name: _feature_color(var_name) for var_name in label_dict}

In [None]:
# for the baseline model add rgb to the ls name
# The new label is built from the 'ls pc' marker onwards instead of prepending to
# the whole label. The first run gives the same result ('ls pc 1' -> 'rgb ls pc 1'),
# but re-running this cell (or running it after the 'ms' relabelling of the deep
# learning plot) no longer stacks prefixes such as 'rgb rgb ls pc 1'.
# Only values are replaced while iterating, so the dict size never changes.
for k, v in label_dict.items():
    if 'ls pc' in v:
        label_dict[k] = 'rgb ' + v[v.index('ls pc'):]


# get feature importance for the baseline models
baseline_cons_feat_importance = baseline_cons.get_feature_importance()
baseline_asset_feat_importance = baseline_asset.get_feature_importance()

# attach the readable label of every variable (raises KeyError for a variable
# without an entry in label_dict)
baseline_cons_feat_importance['var_label'] = [label_dict[i] for i in baseline_cons_feat_importance['variable_name']]
baseline_asset_feat_importance['var_label'] = [label_dict[i] for i in baseline_asset_feat_importance['variable_name']]

# attach the category color of every variable
baseline_cons_feat_importance['var_col'] = [color_dict[i] for i in baseline_cons_feat_importance['variable_name']]
baseline_asset_feat_importance['var_col'] = [color_dict[i] for i in baseline_asset_feat_importance['variable_name']]

In [None]:
# plot the feature importance of the two baseline models side by side
# (panel A: consumption expenditure, panel B: asset wealth index) and save the figure
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 14}

# NOTE: this changes the global matplotlib font settings for all later figures
matplotlib.rc('font', **font)

fig, ax = plt.subplots(1,2,figsize = (12,16))

# panel A: one horizontal bar per variable, colored by feature category
ax[0].set_ymargin(0.01)
ax[0].barh(y = baseline_cons_feat_importance['var_label'],
           width = baseline_cons_feat_importance['feat_importance'],
           color = baseline_cons_feat_importance['var_col'])
ax[0].spines[['right', 'top']].set_visible(False)
ax[0].set_xlabel("Relative Feature Importance")
ax[0].set_title('Consumption expenditure')


# panel B
ax[1].set_ymargin(0.01)
ax[1].barh(y = baseline_asset_feat_importance['var_label'],
           width = baseline_asset_feat_importance['feat_importance'],
           color = baseline_asset_feat_importance['var_col'])
ax[1].spines[['right', 'top']].set_visible(False)
ax[1].set_xlabel("Relative Feature Importance")
ax[1].set_title('Asset wealth index')
# no bare ax[1].legend() here: the bars carry no labels, so it only raised a
# "no artists with labels" warning and was replaced by the explicit legend below

# Create legend handles: one colored dot per feature category
handles = [plt.Line2D([0], [0], marker='o', color='w', label=label, markersize=10,
                      markerfacecolor=color) for label, color in baseline_ls_colors.items()]

# Add the legend to the second subplot (ax[1])
ax[1].legend(handles=handles, title='Type of feature', loc='lower right')

# panel letters, positioned in figure coordinates
fig.text(0.05,.97,'A',weight = 'bold')
fig.text(0.55,.97,'B',weight = 'bold')

plt.tight_layout()
pth = '../figures/results/baseline_ls_feature_importance.png'
plt.savefig(pth, dpi = 300, bbox_inches = 'tight')
plt.show()

In [None]:
# add ms to ls features
# The new label is built from the 'ls pc' marker onwards instead of cutting a fixed
# 4 characters. After the baseline relabelling the result is unchanged
# ('rgb ls pc 1' -> 'ms ls pc 1'), but the cell now also works on a fresh
# label_dict ('ls pc 1' previously became 'ms c 1') and can be re-run without
# mangling the labels.
# Only values are replaced while iterating, so the dict size never changes.
for k, v in label_dict.items():
    if 'ls pc' in v:
        label_dict[k] = 'ms ' + v[v.index('ls pc'):]


# get feature importance for the deep learning models
# attach the readable label of every variable (raises KeyError for a variable
# without an entry in label_dict)
dl_cons.feat_importance['var_label'] = [label_dict[i] for i in dl_cons.feat_importance['variable_name']]
dl_asset.feat_importance['var_label'] = [label_dict[i] for i in dl_asset.feat_importance['variable_name']]

# attach the category color of every variable
dl_cons.feat_importance['var_col'] = [color_dict[i] for i in dl_cons.feat_importance['variable_name']]
dl_asset.feat_importance['var_col'] = [color_dict[i] for i in dl_asset.feat_importance['variable_name']]


In [None]:
# plot the feature importance of the two deep learning models side by side
# (panel A: consumption expenditure, panel B: asset wealth index) and save the figure
font = dict(family='sans-serif', weight='normal', size=14)

# this changes the global matplotlib font settings
matplotlib.rc('font', **font)

fig, ax = plt.subplots(1, 2, figsize=(12, 16))

# (axis, importance table, title, x label) of every panel
# NOTE(review): the two panels use different x labels - confirm this is intended
panels = [
    (ax[0], dl_cons.feat_importance,
     'Consumption expenditure', "Relative Feature Importance"),
    (ax[1], dl_asset.feat_importance,
     'Asset wealth index', "Mean relative feature importance"),
]

for panel_ax, importance, panel_title, x_label in panels:
    panel_ax.set_ymargin(0.01)
    # one horizontal bar per variable, colored by feature category
    panel_ax.barh(y=importance['var_label'],
                  width=importance['feat_importance'],
                  color=importance['var_col'])
    panel_ax.spines[['right', 'top']].set_visible(False)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

# legend handles: one colored dot per feature category
handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=category,
               markersize=10, markerfacecolor=category_color)
    for category, category_color in dl_colors.items()
]

# the legend goes into the second subplot (ax[1])
ax[1].legend(handles=handles, title='Type of feature', loc='lower right')

# panel letters, positioned in figure coordinates
for x_pos, letter in ((0.05, 'A'), (0.55, 'B')):
    fig.text(x_pos, .99, letter, weight='bold', size=14)

plt.tight_layout()
pth = '../figures/results/dl_feature_importance.png'
plt.savefig(pth, dpi=300, bbox_inches='tight')
plt.show()