In [29]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
# load the necessary functions from the analysis package

# load the variable names, this allows to access the variables in the feature data in a compact way
from analysis_utils.variable_names import *

# load the functions to do spatial k-fold CV
from analysis_utils.spatial_CV import *

# load the helper functions
from analysis_utils.analysis_helpers import *

# load the random forest trainer and cross_validator
import analysis_utils.RandomForest as rf

In [3]:
# reproducibility and cross-validation settings
random_seed = 348   # seed handed to the fold split and to the trainers
n_folds = 5         # number of folds used in the k-fold CV

# input files, all located below the shared data directory
root_data_dir = "../../Data"
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

In [4]:
# read the label data and add the per-cluster mean of the log consumption as a new column
lsms_df = pd.read_csv(lsms_pth)
cluster_mean_cons = lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
lsms_df['avg_log_mean_pc_cons_usd_2017'] = cluster_mean_cons

# read the tabular feature data
feat_df = pd.read_csv(feat_data_pth)

# print a short summary of the training data
# (two columns of feat_df are not counted as x vars)
print(f"Number of observations {len(lsms_df)}")
print(f"Number of clusters {len(np.unique(lsms_df.cluster_id))}")
print(f"Number of x vars {len(feat_df.columns)-2}")

Number of observations 7141
Number of clusters 2255
Number of x vars 63


In [5]:
# split the label data into n_folds folds for the k-fold CV
fold_ids = split_lsms_spatial(
    lsms_df,
    n_folds = n_folds,
    random_seed = random_seed,
)

Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18


In [6]:
# attach the features to the selected label columns; the left join keeps every label row
label_cols = ['unique_id', 'log_mean_pc_cons_usd_2017', 'avg_log_mean_pc_cons_usd_2017','n_households']
df = lsms_df[label_cols].merge(feat_df, on = 'unique_id', how = 'left')

# Between model
This model takes as input all static variables, i.e. the OSM, ESA Landcover and WSF variables. In addition, it uses the mean over time of every dynamic variable. The dynamic variables are Nightlights, NDVI, NDWI_Gao and NDWI_McF.

The idea is that the between model captures variation between clusters; its target variable is therefore the cluster mean $\bar{w}_c = \frac{1}{T_c}\sum_{t=1}^{T_c} w_{c,t}$.

In [7]:
# target and feature columns of the between model
between_target_var = 'avg_log_mean_pc_cons_usd_2017'
between_x_vars = osm_dist_pca_vars + osm_count_pca_vars + esa_lc_vars + wsf_vars + rs_mean_vars + ['avg_precipitation']

# keep one row per distinct combination of cluster id, target and features
between_cols = ['cluster_id', between_target_var] + between_x_vars
cl_df = df[between_cols].drop_duplicates().reset_index(drop = True)

# normalise the data; the target column is passed as excluded
cl_df_norm = standardise_df(cl_df, exclude_cols = [between_target_var])

In [8]:
# cross-validate the between model on the cluster-level data
between_cv_trainer = rf.CrossValidator(
    cl_df_norm,
    fold_ids,
    between_target_var,
    between_x_vars,
    id_var = 'cluster_id',
    random_seed = random_seed,
)
between_cv_trainer.run_cv_training()

# last expression of the cell: the overall performance is shown as cell output
between_cv_trainer.compute_overall_performance()

Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 121 seconds


{'r2': 0.4638394412893702, 'mse': 0.15337512765665054}

# Within model
The goal of this model is to predict the deviations from the cluster mean for each year, i.e. the model should capture variation within each cluster. To do so, the target variable is $\tilde{w}_{ct} = w_{ct} - \bar{w}_{c}$.

For cluster $c$ in time period $t$, the feature vector is defined as $\tilde{\boldsymbol{x}}_{ct} = \boldsymbol{x}_{ct} - \bar{\boldsymbol{x}}_{c}$, where $\bar{\boldsymbol{x}}_{c} \in \mathbb{R}^{k\times1}$ is the cluster mean of the feature vector.

To predict $\tilde{w}_{ct}$, I rely on $\tilde{\boldsymbol{x}}_{ct}$. This allows me to interpret the performance metric as the within R2, i.e. the share of the variance the model captures within clusters. 


(This does not help at all, thus disregard.)
To augment the number of training observations, I train the model on deltas rather than on the demeaned variables. This substantially increases the number of training observations and covers a wider range of differences, making the training dataset more versatile and robust. Ideally, this increases the out-of-sample performance when predicting $\tilde{w}_{ct}$.

In [16]:
# target and feature columns of the within model
within_target_var = 'log_mean_pc_cons_usd_2017'
within_x_vars = rs_dyn_vars + ['precipitation']

# restrict the merged data to the ids, the target and the within features
within_cols = ['cluster_id','unique_id', within_target_var] + within_x_vars
within_df = df[within_cols]

# demean first, then normalise; the target column is passed as excluded
demeaned_df = demean(within_df)
demeaned_df_norm = standardise_df(demeaned_df, exclude_cols = [within_target_var])

In [18]:
# cross-validate the within model on the demeaned data
within_cv_trainer = rf.CrossValidator(
    demeaned_df_norm,
    fold_ids,
    within_target_var,
    within_x_vars,
    id_var = 'unique_id',
    random_seed = random_seed,
)
within_cv_trainer.run_cv_training(min_samples_leaf = 15)

# last expression of the cell: the overall performance is shown as cell output
within_cv_trainer.compute_overall_performance()

Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 295 seconds


{'r2': 0.010477796846323209, 'mse': 0.05445850180218004}

# Overall model
This model combines the within and between model to make the overall predictions on the validation samples

In [24]:
# column that holds the ground truth of the overall model
target_var = 'log_mean_pc_cons_usd_2017'

# within predictions, extended by the cluster id of each unique_id
within_preds = pd.DataFrame(within_cv_trainer.predictions)
within_preds = within_preds.merge(df[['cluster_id','unique_id']], on = 'unique_id')

# between predictions, reduced to the cluster id and the prediction
between_preds = pd.DataFrame(between_cv_trainer.predictions)[['cluster_id', 'y_hat']]

# join both on the cluster: y_hat_change is the within part, y_hat_mn the between part
preds = within_preds[['unique_id', 'cluster_id', 'y_hat']].merge(
    between_preds,
    on = 'cluster_id',
    how = 'left',
    suffixes = ('_change', '_mn'),
)

# the overall prediction is the sum of both parts
preds['y_hat'] = preds['y_hat_change'] + preds['y_hat_mn']

# add the observed target value
preds = preds.merge(df[['unique_id', target_var]], on = 'unique_id')

In [30]:
# R2 of the combined predictions, computed separately on each fold's validation ids
r2 = []
for splits in fold_ids.values():
    # only the validation part of the split is needed for the evaluation
    _, val_df = split_lsms_ids(preds, splits['val_ids'])
    r2.append(r2_score(val_df[target_var], val_df['y_hat']))

In [36]:
# weight of a fold = share of its validation ids among all of its ids
fold_weights = []
for split in fold_ids.values():
    n_val, n_train = len(split['val_ids']), len(split['train_ids'])
    fold_weights.append(n_val / (n_val + n_train))

# weighted mean of the per-fold R2 values, shown as cell output
np.average(r2, weights = fold_weights)

0.3976245560985697

In [37]:
# write the combined predictions to disk so they can be plotted later
baseline_preds_pth = "../results/predictions/baseline_preds.csv"
preds.to_csv(baseline_preds_pth, index = False)

# Delta model

In [None]:
# target and feature columns of the within model (same definitions as in the within section)
within_target_var = 'log_mean_pc_cons_usd_2017'
within_x_vars = rs_dyn_vars + ['precipitation']

# restrict the merged data to the ids, the target and the within features
within_cols = ['cluster_id','unique_id', within_target_var] + within_x_vars
within_df = df[within_cols]

In [None]:
# demeaned observations; their id column is renamed to 'delta_id', the id used for training below
demeaned_df = demean(within_df).rename(columns = {'unique_id': 'delta_id'})

# delta observations built from the same within data
raw_delta_df = make_delta_df(within_df)

# stack the deltas on top of the demeaned observations
delta_df = pd.concat([raw_delta_df, demeaned_df]).reset_index(drop = True)

# NOTE(review): unlike the other standardise_df calls in this notebook, no
# exclude_cols is passed here - confirm the target is meant to be handled
# the same way as the features.
delta_df_norm = standardise_df(delta_df)

In [None]:
# cross-validate the within model on the combined delta / demeaned data
within_cv_trainer = rf.CrossValidator(
    delta_df_norm,
    fold_ids,
    within_target_var,
    within_x_vars,
    id_var = 'delta_id',
    random_seed = random_seed,
)
within_cv_trainer.run_cv_training(min_samples_leaf = 15)

# last expression of the cell: the overall performance is shown as cell output
within_cv_trainer.compute_overall_performance()

In [None]:
# score the delta-trained model on the demeaned observations only
demeaned_eval_df = standardise_df(demeaned_df)
within_evaluator = rf.CV_Evaluator(
    demeaned_eval_df,
    fold_ids,
    within_cv_trainer,
    id_var = 'delta_id',
)
within_evaluator.evaluate()

# last expression of the cell: the overall performance is shown as cell output
within_evaluator.compute_overall_performance()

# Calculate for different number of households

In [None]:
# household thresholds to evaluate: 1, 2, ..., 15
n_households = list(range(1, 16))

# between model: target, features and data
between_target_var = 'avg_log_mean_pc_cons_usd_2017'
between_x_vars = osm_dist_pca_vars + osm_count_pca_vars + esa_lc_vars + wsf_vars + rs_mean_vars + ['avg_precipitation']
# NOTE(review): n_households is part of the drop_duplicates key, so a cluster whose
# n_households differs across rows keeps several rows here - confirm this is intended.
between_cols = ['cluster_id','n_households', between_target_var] + between_x_vars
between_df = df[between_cols].drop_duplicates().reset_index(drop = True)

# within model: target, features and data
within_target_var = 'log_mean_pc_cons_usd_2017'
within_x_vars = rs_dyn_vars + ['precipitation']
within_cols = ['cluster_id','n_households','unique_id', within_target_var] + within_x_vars
within_df = df[within_cols]

# one R2 per threshold, in the order of n_households
between_r2 = []
within_r2 = []

for i in n_households:
    print(f"Training on at least {i} Households per cluster")

    # keep the rows with at least i households and drop the filter column afterwards
    keep_between = between_df['n_households'] >= i
    keep_within = within_df['n_households'] >= i
    between_df_sub = between_df[keep_between].drop(columns = 'n_households').reset_index(drop = True)
    within_df_sub = within_df[keep_within].drop(columns = 'n_households').reset_index(drop = True)

    # normalise both subsets; the within subset is demeaned first
    between_df_norm = standardise_df(between_df_sub, exclude_cols = [between_target_var])
    demeaned_df_norm = standardise_df(demean(within_df_sub), exclude_cols = [within_target_var])

    # cross-validate the between model on the subset
    print('\nBetween model')
    between_cv_trainer = rf.CrossValidator(
        between_df_norm,
        fold_ids,
        between_target_var,
        between_x_vars,
        id_var = 'cluster_id',
        random_seed = random_seed,
    )
    between_cv_trainer.run_cv_training(min_samples_leaf = 5)
    between_res = between_cv_trainer.compute_overall_performance()
    print(between_res)

    # cross-validate the within model on the subset
    print('\nWithin Model')
    within_cv_trainer = rf.CrossValidator(
        demeaned_df_norm,
        fold_ids,
        within_target_var,
        within_x_vars,
        id_var = 'unique_id',
        random_seed = random_seed,
    )
    within_cv_trainer.run_cv_training(min_samples_leaf = 15)
    within_res = within_cv_trainer.compute_overall_performance()
    print(within_res)

    # record the R2 of both models for this threshold
    between_r2.append(between_res['r2'])
    within_r2.append(within_res['r2'])
    print('\n\n\n')

In [None]:
# plot the between and within R2 against the minimum number of households per cluster
plt.figure(figsize = (4,4))

# use the thresholds that were actually evaluated as x values, so they always
# match the length of the R2 lists filled in the loop
plt.plot(n_households, between_r2, label = 'Between $R^2$')
plt.plot(n_households, within_r2, label = 'Within $R^2$')

# NOTE(review): legend() is called before axhline(), so the 'y = 0' label below
# does not appear in the legend - move legend() below axhline() if it should.
plt.legend()
plt.xlabel("Minimum number of households per cluster")
plt.ylabel("$R^2$")

# red dotted reference line at R2 = 0
plt.axhline(y=0, color='red', linestyle='dotted', label='y = 0')

# one tick per evaluated threshold (no tick at 0, where there is no data)
plt.xticks(n_households)

plt.savefig("../figures/results/R2_vs_households.png", dpi = 300, bbox_inches = 'tight')
plt.show()