In [1]:
import pandas as pd
import numpy as np
import torch
import pickle

from sklearn.metrics import r2_score

In [2]:
from analysis_utils.CombinedModel import CombinedModel
from analysis_utils.flagged_uids import flagged_uids
from analysis_utils.spatial_CV import *
from analysis_utils.variable_names import *

In [4]:
# Global file locations: everything is resolved relative to the data root.
root_data_dir = "../../Data"
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# Read the LSMS cluster labels, drop every row whose unique_id is flagged,
# and expose unique_id a second time under the name 'delta_id'.
# NOTE(review): reset_index() without drop=True keeps the previous row index
# as an extra 'index' column -- confirm that downstream code expects it.
lsms_df = (
    pd.read_csv(lsms_pth)
    .loc[lambda frame: ~frame.unique_id.isin(flagged_uids)]
    .reset_index()
    .assign(delta_id=lambda frame: frame.unique_id)
)

# Copy of lsms_df in which the consumption target is standardised
# (mean subtracted, divided by the sample standard deviation).
# Only 'log_mean_pc_cons_usd_2017' is rescaled; all other columns are untouched.
lsms_df_norm = lsms_df.assign(
    log_mean_pc_cons_usd_2017=lambda frame: (
        frame['log_mean_pc_cons_usd_2017'] - frame['log_mean_pc_cons_usd_2017'].mean()
    ) / frame['log_mean_pc_cons_usd_2017'].std()
)

In [5]:
def print_results_dict(dictionary):
    """Print per-fold results as a tab-separated table.

    Parameters
    ----------
    dictionary : dict
        Maps a column name to an indexable sequence of numeric per-fold
        values. The number of rows is taken from the first entry, so every
        other entry must be at least that long.

    Output
    ------
    A header line starting with "Fold" followed by one column per key (in
    dictionary order), then one line per fold holding the 1-based fold number
    and each value formatted with four decimals. Works for any number of
    keys; nothing is printed for an empty dictionary.
    """
    keys = list(dictionary)
    if not keys:
        # Nothing to tabulate; avoid indexing into an empty key list.
        return

    # The first entry defines how many folds (rows) are printed.
    num_entries = len(dictionary[keys[0]])

    # Header is built from all keys so it always matches the row columns.
    print(" \t ".join(["Fold"] + [str(key) for key in keys]))

    for i in range(num_entries):
        cells = [f"{i+1}"]
        cells.extend("{:.4f}".format(dictionary[key][i]) for key in keys)
        print("\t\t".join(cells))

## Consumption expenditure - Full model

In [6]:
# Locations of the pickled between- and within-model results (full model, consumption).
results_dir = "results"
between_cons_pth, within_cons_pth = (
    f"{results_dir}/model_objects/between_cons.pkl",
    f"{results_dir}/model_objects/within_cons.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_cons_pth, 'rb') as between_file:
    between_cons_cv = pickle.load(between_file)

with open(within_cons_pth, 'rb') as within_file:
    within_cons_cv = pickle.load(within_file)

# Build the combined model on the un-normalised labels and evaluate it.
combined_cons = CombinedModel(lsms_df, between_cons_cv, within_cons_cv)
combined_cons.evaluate()
results_cons_combined = combined_cons.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_cons_combined['r2']}")

print('\nTex output:')
combined_cons.print_tex('r2')

print(f"\nMSE: {results_cons_combined['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_cons.res_r2)

R2: {'overall': 0.3844386237299032, 'between': 0.4700126573842902, 'within': -0.010345140943119557}

Tex output:
& 0.4700 & -0.0103 & 0.3844

MSE: {'overall': 0.216518287992741, 'between': 0.15630641664181993, 'within': 0.05663912594888358}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.4927		0.5484		-0.1034
2		0.3713		0.4418		0.0152
3		0.4202		0.5111		-0.0158
4		0.3316		0.4263		0.0218
5		0.3010		0.4189		0.0305


## Consumption expenditure -- RGB Landsat

In [7]:
# Locations of the pickled between- and within-model results (Landsat, consumption).
results_dir = "results"
between_cons_ls_pth, within_cons_ls_pth = (
    f"{results_dir}/model_objects/between_cons_LS_cv.pkl",
    f"{results_dir}/model_objects/within_cons_LS_cv.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_cons_ls_pth, 'rb') as between_file:
    between_cons_ls = pickle.load(between_file)

with open(within_cons_ls_pth, 'rb') as within_file:
    within_cons_ls = pickle.load(within_file)

# Build the combined model on the labels with the standardised consumption
# target (lsms_df_norm) and evaluate it.
combined_cons_ls = CombinedModel(lsms_df_norm, between_cons_ls, within_cons_ls)
combined_cons_ls.evaluate()
results_cons_combined_ls = combined_cons_ls.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_cons_combined_ls['r2']}")

print('\nTex output:')
combined_cons_ls.print_tex('r2')

print(f"\nMSE: {results_cons_combined_ls['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_cons_ls.res_r2)

R2: {'overall': 0.06634923112060229, 'between': 0.1785347684882714, 'within': -0.0021438570430164284}

Tex output:
& 0.1785 & -0.0021 & 0.0663

MSE: {'overall': 0.8584840942857698, 'between': 0.7566866652017697, 'within': 1.0022619}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.0918		0.1326		-0.0133
2		0.1152		0.2318		0.0046
3		-0.1345		0.0605		-0.0288
4		0.1540		0.2486		0.0050
5		0.1152		0.2239		0.0217


## Consumption expenditure -- RS images

In [8]:
# Locations of the pickled between- and within-model results (RS images, consumption).
results_dir = "results"
between_cons_rs_pth, within_cons_rs_pth = (
    f"{results_dir}/model_objects/between_cons_RS_cv.pkl",
    f"{results_dir}/model_objects/within_cons_RS_cv.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_cons_rs_pth, 'rb') as between_file:
    between_cons_rs = pickle.load(between_file)

with open(within_cons_rs_pth, 'rb') as within_file:
    within_cons_rs = pickle.load(within_file)
# The original cell carried a commented-out override that assigned
# lsms_df.unique_id to within_cons_rs.predictions['delta_id']; it stays disabled.

# Build the combined model on the labels with the standardised consumption
# target (lsms_df_norm) and evaluate it.
combined_cons_rs = CombinedModel(lsms_df_norm, between_cons_rs, within_cons_rs)
combined_cons_rs.evaluate()
results_cons_combined_rs = combined_cons_rs.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_cons_combined_rs['r2']}")
print('\nTex output:')
combined_cons_rs.print_tex('r2')

print(f"\nMSE: {results_cons_combined_rs['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_cons_rs.res_r2)

R2: {'overall': 0.3057057835118867, 'between': 0.38905982455411, 'within': 0.008462319196773826}

Tex output:
& 0.3891 & 0.0085 & 0.3057

MSE: {'overall': 0.6264404668330583, 'between': 0.5496754105573982, 'within': 0.9925707}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.4274		0.4492		0.0011
2		0.2577		0.3747		0.0135
3		0.3862		0.4564		0.0047
4		0.2332		0.3190		0.0123
5		0.2158		0.3421		0.0107


## Asset index - Full model

In [9]:
# Locations of the pickled between- and within-model results (full model, asset index).
results_dir = "results"
between_asset_pth, within_asset_pth = (
    f"{results_dir}/model_objects/between_asset.pkl",
    f"{results_dir}/model_objects/within_asset.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_asset_pth, 'rb') as between_file:
    between_asset_cv = pickle.load(between_file)

with open(within_asset_pth, 'rb') as within_file:
    within_asset_cv = pickle.load(within_file)

# Build the combined model on the un-normalised labels and evaluate it.
combined_asset = CombinedModel(lsms_df, between_asset_cv, within_asset_cv)
combined_asset.evaluate()
results_asset_combined = combined_asset.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_asset_combined['r2']}")

print('\nTex output:')
combined_asset.print_tex('r2')

print(f"\nMSE: {results_asset_combined['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_asset.res_r2)

R2: {'overall': 0.5086660544330386, 'between': 0.5417230937392483, 'within': 0.02370752397174105}

Tex output:
& 0.5417 & 0.0237 & 0.5087

MSE: {'overall': 1.0851358498039378, 'between': 1.1072535025546855, 'within': 0.07405648002968442}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.6245		0.6633		-0.0029
2		0.6049		0.6152		0.0178
3		0.4609		0.4909		0.0393
4		0.4988		0.5293		0.0448
5		0.3522		0.4029		0.0195


## Asset index -- RGB Landsat images

In [10]:
# Locations of the pickled between- and within-model results (Landsat, asset index).
results_dir = "results"
between_asset_ls_pth, within_asset_ls_pth = (
    f"{results_dir}/model_objects/between_asset_LS_cv.pkl",
    f"{results_dir}/model_objects/within_asset_LS_cv.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_asset_ls_pth, 'rb') as between_file:
    between_asset_ls = pickle.load(between_file)

with open(within_asset_ls_pth, 'rb') as within_file:
    within_asset_ls = pickle.load(within_file)

# NOTE(review): lsms_df_norm is derived from lsms_df by standardising only
# 'log_mean_pc_cons_usd_2017'; no asset column is rescaled there. Confirm that
# the asset target seen by CombinedModel is on the same scale as these models'
# predictions, otherwise the 'overall' metrics are not comparable.
combined_asset_ls = CombinedModel(lsms_df_norm, between_asset_ls, within_asset_ls)
combined_asset_ls.evaluate()
results_asset_combined_ls = combined_asset_ls.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_asset_combined_ls['r2']}")

print('\nTex output:')
combined_asset_ls.print_tex('r2')

print(f"\nMSE: {results_asset_combined_ls['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_asset_ls.res_r2)

R2: {'overall': 0.3072283701228052, 'between': 0.39582788988194384, 'within': -0.001205724074963399}

Tex output:
& 0.3958 & -0.0012 & 0.3072

MSE: {'overall': 1.541819144567217, 'between': 0.552586328042181, 'within': 1.0028496}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.4115		0.5925		-0.0265
2		0.4161		0.5437		0.0151
3		0.3064		0.3478		0.0073
4		0.3047		0.3827		-0.0059
5		0.0921		0.0960		0.0040


## Asset index -- RS images

In [11]:
# Locations of the pickled between- and within-model results (RS images, asset index).
results_dir = "results"
between_asset_rs_pth, within_asset_rs_pth = (
    f"{results_dir}/model_objects/between_asset_RS_cv.pkl",
    f"{results_dir}/model_objects/within_asset_RS_cv.pkl",
)

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(between_asset_rs_pth, 'rb') as between_file:
    between_asset_rs = pickle.load(between_file)

with open(within_asset_rs_pth, 'rb') as within_file:
    within_asset_rs = pickle.load(within_file)

# NOTE(review): lsms_df_norm is derived from lsms_df by standardising only
# 'log_mean_pc_cons_usd_2017'; no asset column is rescaled there. Confirm that
# the asset target seen by CombinedModel is on the same scale as these models'
# predictions, otherwise the 'overall' metrics are not comparable.
combined_asset_rs = CombinedModel(lsms_df_norm, between_asset_rs, within_asset_rs)
combined_asset_rs.evaluate()
results_asset_combined_rs = combined_asset_rs.compute_overall_performance(use_fold_weights=True)

# Report: aggregate R2, its TeX row, aggregate MSE, then the per-fold R2 table.
print(f"R2: {results_asset_combined_rs['r2']}")

print('\nTex output:')
combined_asset_rs.print_tex('r2')

print(f"\nMSE: {results_asset_combined_rs['mse']}")

print('\nFold Performance (R2):')
print_results_dict(combined_asset_rs.res_r2)

R2: {'overall': 0.15634533188287117, 'between': 0.14527235985910825, 'within': -0.01275680362642222}

Tex output:
& 0.1453 & -0.0128 & 0.1563

MSE: {'overall': 1.8414238780478103, 'between': 0.7665750399736085, 'within': 1.0149889}

Fold Performance (R2):
Fold 	 overall 	 between 	 within
1		0.4496		0.5880		-0.0252
2		0.3472		0.4575		-0.0490
3		-0.0829		-0.2795		0.0172
4		0.2601		0.3512		0.0097
5		-0.1931		-0.4154		-0.0165
