# Load the files and packages


In [1]:
# Mount Google Drive (this cell is specific to Google Colab) and make the
# project's analysis folder the working directory.
# NOTE(review): the relative paths used further down ("../../Data",
# "../results/model_objects") and the `analysis_utils` imports presumably
# rely on this working directory -- confirm before running outside of Colab.
from google.colab import drive
drive.mount('/content/gdrive/')
%cd "/content/gdrive/MyDrive/master_thesis/predicting_poverty/analysis"


Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
/content/gdrive/MyDrive/master_thesis/predicting_poverty/analysis


In [2]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [3]:
# import the torch_framework package
from analysis_utils.torch_framework.CrossValidator import CrossValidator
from analysis_utils.torch_framework.ResNet18 import *
from analysis_utils.torch_framework.torch_helpers import *
from analysis_utils.torch_framework.SatDataset import SatDataset
from analysis_utils.torch_framework.BetweenModel import BetweenModel

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [4]:
# ---- global file paths ----
root_data_dir = "../../Data"

# LSMS labels
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# tabular feature data (OSM + precipitation)
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# ---- run settings ----
# seed that is handed to the fold split, the model initialisation,
# the CrossValidator objects and the BetweenModel
random_seed = 348

# number of folds used for k-fold CV
n_folds = 5

# number of training epochs per fold
n_epochs = 2

# ---- training device ----
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

Training device: cpu


In [5]:
# Set the paths for the satellite images and their image statistics
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# median LS (Landsat) images at the cluster level:
# image folder and the file holding the image statistics (.pkl)
LS_median_img_dir = f"{sat_img_dir}/LS/LS_median_cluster"
LS_median_stats_pth = f"{sat_img_dir}/LS/LS_median_img_stats.pkl"

# the RS v2 images at the cluster level:
# image folder and the file holding the image statistics (.pkl)
RS_v2_between_img_dir = f"{sat_img_dir}/RS_v2/RS_v2_between"
RS_v2_between_stats_pth = f"{sat_img_dir}/RS_v2/RS_v2_between_img_stats.pkl"

# Load the data

In [13]:
# Number of LSMS rows kept for a quick test run.
# Set to None to use the complete LSMS data set.
n_debug_rows = 100

# load the LSMS data and the feature data (OSM and precipitation)
# .copy() makes lsms_df an independent frame, so the column added below is
# never written into a slice of the frame returned by read_csv
lsms_df = pd.read_csv(lsms_pth).iloc[:n_debug_rows, :].copy()
feat_df = pd.read_csv(feat_data_pth)

# add the mean of the target variable at the cluster level
# (every row of a cluster receives the mean over that cluster's rows)
lsms_df['avg_log_mean_pc_cons_usd_2017'] = (
    lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
)

# left-merge the features onto the LSMS rows; how = 'left' keeps every LSMS row
# even when no matching feature row exists
df = pd.merge(lsms_df, feat_df, on = ['unique_id', 'cluster_id'], how = 'left')

In [14]:
# name of the target column (the cluster-level mean added to lsms_df)
between_target_var = 'avg_log_mean_pc_cons_usd_2017'

# tabular predictors that are kept: the OSM distance and count variables
# plus the average precipitation
between_x_vars = osm_dist_vars + osm_count_vars + ['avg_precipitation']

# build the cluster-level data set step by step:
# select the identifier/target/predictor columns, drop repeated rows and
# renumber the index from 0
_between_id_cols = ['cluster_id', 'lat', 'lon', 'country', between_target_var]
between_df = df[_between_id_cols + between_x_vars]
between_df = between_df.drop_duplicates()
between_df = between_df.reset_index(drop = True)

# assign the observations to k spatial folds
# NOTE(review): the split is computed on lsms_df while the models below are
# trained on between_df -- presumably the fold ids refer to clusters; confirm.
fold_ids = split_lsms_spatial(lsms_df, n_folds = n_folds, random_seed = random_seed)

Fold 0, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.08


In [15]:
# image statistics per band for the Landsat (LS) and the RS images,
# aggregated for the clusters contained in between_df
LS_img_stats = get_agg_img_stats(LS_median_stats_pth, between_df, id_var = 'cluster_id')
RS_img_stats = get_agg_img_stats(RS_v2_between_stats_pth, between_df, id_var = 'cluster_id')

# reduce them to the statistics needed per band (i.e. the mean, std, min, max)
LS_feat_stats = get_feat_stats(LS_img_stats)
RS_feat_stats = get_feat_stats(RS_img_stats)

# The last two RS channels (WSF and ESA LC) are not normalised:
# normalisation brings no advantage there or yields meaningless numbers.
# Setting mean to 0 and std to 1 for both channels turns Normalize into a no-op.
for _stat_name, _neutral_value in (('mean', 0), ('std', 1)):
    RS_feat_stats[_stat_name][-2:] = [_neutral_value, _neutral_value]

# statistics of the target variable (its 'mean' and 'std' are used to standardise the target)
# NOTE(review): these are computed on df, whereas the image statistics use
# between_df -- confirm that this is intended.
between_target_stats = get_target_stats(df, between_target_var)

In [16]:
def _flip_and_normalise(feat_stats):
    """Return the image pipeline: random vertical flip, random horizontal flip
    (each with probability 0.5), then normalisation with the 'mean' and 'std'
    entries of feat_stats."""
    pipeline_steps = [
        torchvision.transforms.RandomVerticalFlip(.5),
        torchvision.transforms.RandomHorizontalFlip(.5),
        transforms.Normalize(feat_stats['mean'], feat_stats['std']),
    ]
    return torchvision.transforms.Compose(pipeline_steps)


# transform for the target: standardise it with the target mean and std
# (between_target_stats is looked up when the transform is called)
target_transform = transforms.Compose([
    torchvision.transforms.Lambda(
        lambda t: standardise(t, between_target_stats['mean'], between_target_stats['std'])
    ),
])

# transform for the Landsat images
LS_transforms = _flip_and_normalise(LS_feat_stats)

# transform for the RS images
RS_transforms = _flip_and_normalise(RS_feat_stats)

In [17]:
# display the last step of the Landsat pipeline (the Normalize transform)
# to check the per-band means and stds that are applied
LS_transforms.transforms[-1]

Normalize(mean=[0.14900175, 0.11407725, 0.07228344, 0.2512577, 0.27929834, 291.98022], std=[0.0433814236899497, 0.028872849398045144, 0.017511038528906585, 0.050454133618907694, 0.08556264748962224, 29.26664892412566])

# Run the Landsat model

In [11]:
# names and identifiers for the Landsat run
model_name = 'between_cons_LS'            # passed to the CrossValidator as model_name
cv_object_name = 'between_cons_LS_cv'     # name under which the CV object is saved
data_type = 'LS'                          # data type passed to SatDataset / CrossValidator
id_var = 'cluster_id'                     # column identifying one observation (cluster)

# set settings for the ResNet (arguments of init_resnet)
input_channels = 6      # matches the six per-band means/stds of the Landsat Normalize transform
ms = True               # NOTE(review): presumably "multispectral" -- confirm against init_resnet
random_weights = False  # NOTE(review): presumably False = start from pretrained weights -- confirm

# set hyper-parameters (handed to run_cv)
# NOTE(review): 'alpha' presumably is a regularisation strength and
# 'step_size'/'gamma' presumably configure a learning-rate scheduler -- confirm
# against the CrossValidator implementation.
hyper_params = {
    'lr': 1e-3,
    'batch_size': 128,
    'alpha': 0,
    'step_size': 1,
    'gamma': 0.96,
    'n_epochs': n_epochs
}


In [12]:
# load the data into RAM first
# this reduces training times by ~60%...
# NOTE(review): only the first batch (batch_size observations) is fetched here.
# With more observations than one batch the remaining images are not touched;
# if every image should be read up front, the whole loader has to be iterated -- confirm.
_dat = SatDataset(between_df, LS_median_img_dir, data_type,
                  between_target_var, id_var,
                  LS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
# fetch one batch and discard it (both elements are bound to the throwaway name `_`)
_, _ = next(iter(_loader))

In [13]:
# build the ResNet that is handed to the cross-validation
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed = random_seed)

# collect all CrossValidator settings of the Landsat run in one place
_ls_cv_settings = {
    'model': ResNet18,
    'lsms_df': between_df,
    'fold_ids': fold_ids,
    'img_dir': LS_median_img_dir,
    'data_type': data_type,
    'target_var': between_target_var,
    'id_var': id_var,
    'feat_transform': LS_transforms,
    'target_transform': target_transform,
    'device': device,
    'model_name': model_name,
    'random_seed': random_seed,
}
ls_cv = CrossValidator(**_ls_cv_settings)

# train and evaluate on every fold with the hyper-parameters defined above
ls_cv.run_cv(hyper_params)

# store the CV object under cv_object_name
ls_cv.save_object(name = cv_object_name)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 110MB/s]


  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.5425 - Train R2 -2.2749 - Val MSE: 26.2905 - Val R2 -31.4349
	EPOCH 1 - Train MSE: 20.3888 - Train R2 -17.8486 - Val MSE: 66.8984 - Val R2 -81.5334
Finished training after 19 seconds
Lowest loss on validation set in epoch 0: 26.290501
Maximum R2 on validation set in epoch 0: -31.434917
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.0672 - Train R2 -1.5969 - Val MSE: 1.8382 - Val R2 -1.5655
	EPOCH 1 - Train MSE: 19.5637 - Train R2 -15.5642 - Val MSE: 25.4624 - Val R2 -34.5366
Finished training after 15 seconds
Lowest loss on validation set in epoch 0: 1.838228
Maximum R2 on validation set in epoch 0: -1.565523
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0667 - Train R2 -1.3565 - Val MSE: 9.7685 - Val R2 -7.2843
	EPOCH 1 - Train MSE: 33.0435 - Train R2 -36.6773 - Val MSE: 104.8273 - Val R2 -87.9003
Finished training after 14 seconds
Lowest loss on validation set in epoch 0: 9.768533
Maximum R2 on validation set in epoch 0: -7.284343
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.2345 - Train R2 -1.8237 - Val MSE: 11.5567 - Val R2 -9.5383
	EPOCH 1 - Train MSE: 19.1279 - Train R2 -15.6989 - Val MSE: 56.1841 - Val R2 -50.2330
Finished training after 15 seconds
Lowest loss on validation set in epoch 0: 11.556669
Maximum R2 on validation set in epoch 0: -9.538265
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.9094 - Train R2 -3.0115 - Val MSE: 2.4467 - Val R2 -2.0500
	EPOCH 1 - Train MSE: 29.4602 - Train R2 -29.2290 - Val MSE: 40.4105 - Val R2 -49.3746
Finished training after 18 seconds
Lowest loss on validation set in epoch 0: 2.446676
Maximum R2 on validation set in epoch 0: -2.049954
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 86 seconds


In [14]:
# output the overall performance of the Landsat model across all folds
# (returns a dict with the keys 'r2' and 'mse'; use_fold_weights = True is
# presumably a weighting of the folds by their size -- confirm)
ls_cv.compute_overall_performance(use_fold_weights = True)

{'r2': -12.649627808851768, 'mse': 9.687755054897732}

# Run the RS model

In [15]:
# names and identifiers for the RS run
# (these overwrite the values that were set for the Landsat run)
model_name = 'between_cons_RS'            # passed to the CrossValidator as model_name
cv_object_name = 'between_cons_RS_cv'     # name under which the CV object is saved
data_type = 'RS_v2'                       # data type passed to SatDataset / CrossValidator
id_var = 'cluster_id'                     # column identifying one observation (cluster)
img_dir = RS_v2_between_img_dir           # folder with the RS v2 images

# set settings for the ResNet (arguments of init_resnet)
input_channels = 6
ms = False              # NOTE(review): presumably "multispectral" -- confirm against init_resnet
random_weights = True   # NOTE(review): presumably True = no pretrained weights -- confirm

# set hyper-parameters (handed to run_cv; same values as for the Landsat run)
hyper_params = {
    'lr': 1e-3,
    'batch_size': 128,
    'alpha': 0,
    'step_size': 1,
    'gamma': 0.96,
    'n_epochs': n_epochs
}

In [16]:
# load the data into RAM first
# this reduces training times by ~60%...
# NOTE(review): only the first batch (batch_size observations) is fetched here.
# With more observations than one batch the remaining images are not touched;
# if every image should be read up front, the whole loader has to be iterated -- confirm.
_dat = SatDataset(between_df, img_dir, data_type, between_target_var, id_var,
                  RS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
# fetch one batch and discard it (both elements are bound to the throwaway name `_`)
_, _ = next(iter(_loader))

In [17]:
# build the ResNet that is handed to the cross-validation
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed = random_seed)

# collect all CrossValidator settings of the RS run in one place
_rs_cv_settings = {
    'model': ResNet18,
    'lsms_df': between_df,
    'fold_ids': fold_ids,
    'img_dir': img_dir,
    'data_type': data_type,
    'target_var': between_target_var,
    'id_var': id_var,
    'feat_transform': RS_transforms,
    'target_transform': target_transform,
    'device': device,
    'model_name': model_name,
    'random_seed': random_seed,
}
rs_cv = CrossValidator(**_rs_cv_settings)

# train and evaluate on every fold with the hyper-parameters defined above
rs_cv.run_cv(hyper_params)

# store the CV object under cv_object_name
rs_cv.save_object(name = cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0872 - Train R2 -0.9295 - Val MSE: 863197.0625 - Val R2 -1064935.9975
	EPOCH 1 - Train MSE: 185.5464 - Train R2 -170.5300 - Val MSE: 28724.4121 - Val R2 -35436.6652
Finished training after 15 seconds
Lowest loss on validation set in epoch 1: 28724.412109
Maximum R2 on validation set in epoch 1: -35436.665237
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.1962 - Train R2 -0.8594 - Val MSE: 999849.1875 - Val R2 -1395438.4305
	EPOCH 1 - Train MSE: 159.1697 - Train R2 -133.7658 - Val MSE: 5560.2856 - Val R2 -7759.2119
Finished training after 15 seconds
Lowest loss on validation set in epoch 1: 5560.285645
Maximum R2 on validation set in epoch 1: -7759.211894
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.3242 - Train R2 -0.5099 - Val MSE: 288294.0938 - Val R2 -244490.9244
	EPOCH 1 - Train MSE: 140.0256 - Train R2 -158.6619 - Val MSE: 8951.6465 - Val R2 -7590.5712
Finished training after 14 seconds
Lowest loss on validation set in epoch 1: 8951.646484
Maximum R2 on validation set in epoch 1: -7590.571222
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.7469 - Train R2 -0.5251 - Val MSE: 383390.4375 - Val R2 -349604.0698
	EPOCH 1 - Train MSE: 120.8659 - Train R2 -104.5178 - Val MSE: 1886.5184 - Val R2 -1719.2735
Finished training after 25 seconds
Lowest loss on validation set in epoch 1: 1886.518433
Maximum R2 on validation set in epoch 1: -1719.273539
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0445 - Train R2 -1.0978 - Val MSE: 34038.5703 - Val R2 -42430.4773
	EPOCH 1 - Train MSE: 7.2672 - Train R2 -6.4569 - Val MSE: 536310.1250 - Val R2 -668547.3321
Finished training after 22 seconds
Lowest loss on validation set in epoch 0: 34038.570312
Maximum R2 on validation set in epoch 0: -42430.477290
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 96 seconds


In [18]:
# output the overall performance of the RS model across all folds
# (returns a dict with the keys 'r2' and 'mse'; use_fold_weights = True is
# presumably a weighting of the folds by their size -- confirm)
rs_cv.compute_overall_performance(use_fold_weights = True)

{'r2': -12599.14241216957, 'mse': 10677.938761393229}

# Run the between model

In [18]:
# define the name to save the between object in
between_object_name = 'between_cons'

# initialise the Between model from the two saved cross-validation objects
# NOTE(review): these paths are hard-coded and presumably have to match the
# location and names used by save_object for 'between_cons_LS_cv' and
# 'between_cons_RS_cv' -- confirm when renaming the CV objects.
ls_cv_pth = "../results/model_objects/between_cons_LS_cv.pkl"
rs_cv_pth = "../results/model_objects/between_cons_RS_cv.pkl"
between_model = BetweenModel(ls_cv_pth, rs_cv_pth, between_df, between_target_var,
                             between_x_vars, fold_ids, device, random_seed)

# run the between model
# n_components = 1: the training log reports the variance explained by the
# first component of the extracted Landsat and RS features
# NOTE(review): min_samples_leaf presumably configures a tree-based model -- confirm
between_model.train(min_samples_leaf = 10, n_components = 1)

Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2059
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.3344
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9879
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9921


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1902
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2368
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9856
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9909


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1768
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.4506
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9931
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9730


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1991
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.3362
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9856
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9918


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2311
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.8530
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9772
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9999


Finished training after 75 seconds


In [21]:
# compute the overall performance of the between model across all folds and
# keep the result; in the original cell it was discarded because it was not
# the last expression of the cell
between_performance = between_model.compute_overall_performance(use_fold_weights = True)

# save the between model object (same call order as before, so anything the
# performance computation stores on the object is saved as well)
between_model.save_object(between_object_name)

# last expression of the cell: display the performance, as done for the
# Landsat and RS models
between_performance