In [1]:
import numpy as npc
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [2]:
# import the torch_framework package
from analysis_utils.torch_framework.CrossValidator import CrossValidator
from analysis_utils.torch_framework.ResNet18 import *
from analysis_utils.torch_framework.torch_helpers import *
from analysis_utils.torch_framework.SatDataset import SatDataset
from analysis_utils.torch_framework.BetweenModel import BetweenModel

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [3]:
# ---- global configuration used by the cells below ----

# root folder of all input data (relative to this notebook)
root_data_dir = "../../Data"

# cluster-level LSMS labels
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# tabular feature data (OSM + precipitation)
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# random seed, number of folds for k-fold CV, number of training epochs
random_seed, n_folds, n_epochs = 348, 5, 5

# train on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

Training device: cpu


In [4]:
# Paths of the satellite images and of their image-statistics files
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# Landsat (LS): median images at the cluster level, plus the matching statistics file
LS_median_img_dir, LS_median_stats_pth = (
    f"{sat_img_dir}/LS/LS_median_cluster",
    f"{sat_img_dir}/LS/LS_median_img_stats.pkl",
)

# RS v2: images at the cluster level, plus the matching statistics file
RS_v2_between_img_dir, RS_v2_between_stats_pth = (
    f"{sat_img_dir}/RS_v2/RS_v2_between",
    f"{sat_img_dir}/RS_v2/RS_v2_between_img_stats.pkl",
)

In [12]:
# Load the LSMS labels and the tabular feature data (OSM and precipitation).
lsms_df = pd.read_csv(lsms_pth)
feat_df = pd.read_csv(feat_data_pth)

# Attach the cluster-level mean of the log consumption to every row of its cluster.
# This column is the target variable of the between model.
lsms_df['avg_log_mean_pc_cons_usd_2017'] = (
    lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
)

# Left-join the features onto the LSMS rows. The join keys are passed as a list:
# `on` is documented as "label or list", and a tuple can be read as one single label.
df = pd.merge(lsms_df, feat_df, on = ['unique_id', 'cluster_id'], how = 'left')

In [13]:
# Split the LSMS data into k folds with the spatial CV splitter
fold_ids = split_lsms_spatial(
    lsms_df,
    n_folds=n_folds,
    random_seed=random_seed,
)

Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18


In [8]:
# Target of the between model: the cluster-level average of the log consumption
between_target_var = 'avg_log_mean_pc_cons_usd_2017'

# Tabular x_vars kept in the data: OSM distances, OSM counts and precipitation
between_x_vars = osm_dist_vars + osm_count_vars + ['avg_precipitation']

# Cluster-level dataset: keep the identifying columns, the target and the x_vars,
# drop repeated rows and renumber the index from 0.
_between_cols = ['cluster_id', 'lat', 'lon', 'country', between_target_var] + between_x_vars
between_df = df[_between_cols]
between_df = between_df.drop_duplicates()
between_df = between_df.reset_index(drop = True)

In [9]:
# Get the aggregated per-band image statistics for the Landsat (LS) and the RS v2 images,
# read from the statistics files and matched to the clusters in between_df via 'cluster_id'.
# NOTE(review): between_df holds all clusters, so these statistics presumably also cover the
# clusters that are held out in each CV fold — confirm this is intended.
LS_img_stats = get_agg_img_stats(LS_median_stats_pth, between_df, id_var = 'cluster_id')
RS_img_stats = get_agg_img_stats(RS_v2_between_stats_pth, between_df, id_var = 'cluster_id')

# extract the relevant statistics for each band (i.e. the mean, std, min, max) and get them as a list
LS_feat_stats = get_feat_stats(LS_img_stats) 
RS_feat_stats = get_feat_stats(RS_img_stats)

# For the RS feat stats, alter the mean and std of the last two channels (WSF and ESA LC).
# For these two channels normalisation does not introduce any advantage or yields meaningless numbers.
# Thus just set mean and std for both channels to 0 and 1 (which effectively avoids normalisation).
# Assumes the 'mean' and 'std' entries are mutable sequences with those two channels last — TODO confirm.
RS_feat_stats['mean'][-2:] = [0,0]
RS_feat_stats['std'][-2:] = [1,1]

# Get the stats for the target variable (its 'mean' and 'std' are used to standardise the target).
# NOTE(review): the stats are computed on `df`, not on the de-duplicated `between_df` the models
# are trained on; presumably clusters with more rows weigh more — confirm this is intended.
between_target_stats = get_target_stats(df, between_target_var)

In [None]:
def _standardise_target(t):
    """Standardise a target value with the mean and std stored in `between_target_stats`."""
    return standardise(t, between_target_stats['mean'], between_target_stats['std'])


def _flip_then_normalise(feat_stats):
    """Chain random vertical and horizontal flips (p = 0.5 each) with a normalisation
    that uses the 'mean' and 'std' entries of `feat_stats`."""
    return torchvision.transforms.Compose(
        [torchvision.transforms.RandomVerticalFlip(.5),
        torchvision.transforms.RandomHorizontalFlip(.5),
        transforms.Normalize(feat_stats['mean'], feat_stats['std'])]
    )


# transform for the target --> handed to the dataset objects in the cells below
target_transform = transforms.Compose([
        torchvision.transforms.Lambda(_standardise_target),
    ])

# transforms for the Landsat and the RS images (random flips + normalisation)
LS_transforms = _flip_then_normalise(LS_feat_stats)
RS_transforms = _flip_then_normalise(RS_feat_stats)

## Run the Landsat model

In [10]:
# names and identifiers for the Landsat (LS) run
model_name, cv_object_name = 'between_cons_LS', 'between_cons_LS_cv'
data_type, id_var = 'LS', 'cluster_id'

# settings for the ResNet (passed to init_resnet)
input_channels, ms, random_weights = 6, True, False

# hyper-parameters handed to the cross-validation run
hyper_params = dict(
    lr = 1e-3,
    batch_size = 128,
    alpha = 0,
    step_size = 1,
    gamma = 0.96,
    n_epochs = n_epochs,
)


In [11]:
# Load the Landsat data into RAM first (reported to reduce training times by ~60%).
# The whole loader is iterated rather than a single batch: fetching only the first batch
# touches at most `batch_size` clusters, so with a larger dataset the remaining images
# would not be read here at all.
# NOTE(review): presumably SatDataset reads the images when items are requested and the
# speed-up comes from caching of those reads — confirm against SatDataset.
_dat = SatDataset(between_df, LS_median_img_dir, data_type, between_target_var, id_var,
                  LS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
for _batch in _loader:
    # the batches are discarded; only the act of reading them matters
    pass

In [12]:
# Build the ResNet and the CrossValidator object for the Landsat images
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed = random_seed)

_ls_cv_settings = dict(
    model = ResNet18,
    lsms_df = between_df,
    fold_ids = fold_ids,
    img_dir = LS_median_img_dir,
    data_type = data_type,
    target_var = between_target_var,
    id_var = id_var,
    feat_transform = LS_transforms,
    target_transform = target_transform,
    device = device,
    model_name = model_name,
    random_seed = random_seed,
)
ls_cv = CrossValidator(**_ls_cv_settings)

# run the k-fold CV with the hyper-parameters, then save the cv object under its name
ls_cv.run_cv(hyper_params)
ls_cv.save_object(name = cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.5425 - Train R2 -2.2749 - Val MSE: 26.2618 - Val R2 -31.3995
	EPOCH 1 - Train MSE: 20.3883 - Train R2 -17.8482 - Val MSE: 66.7920 - Val R2 -81.4021
	EPOCH 2 - Train MSE: 21.7474 - Train R2 -19.1046 - Val MSE: 255.9488 - Val R2 -314.7672
	EPOCH 3 - Train MSE: 6.4311 - Train R2 -4.9452 - Val MSE: 149.4732 - Val R2 -183.4070
	EPOCH 4 - Train MSE: 1.9466 - Train R2 -0.7996 - Val MSE: 11.4001 - Val R2 -13.0644
Finished training after 38 seconds
Lowest loss on validation set in epoch 4: 11.400091
Maximum R2 on validation set in epoch 4: -13.064434
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.0672 - Train R2 -1.5969 - Val MSE: 1.8405 - Val R2 -1.5686
	EPOCH 1 - Train MSE: 19.5629 - Train R2 -15.5635 - Val MSE: 25.1064 - Val R2 -34.0398
	EPOCH 2 - Train MSE: 23.0227 - Train R2 -18.4929 - Val MSE: 72.9961 - Val R2 -100.8770
	EPOCH 3 - Train MSE: 26.0182 - Train R2 -21.0290 - Val MSE: 77.1862 - Val R2 -106.7249
	EPOCH 4 - Train MSE: 18.5402 - Train R2 -14.6976 - Val MSE: 101.9160 - Val R2 -141.2391
Finished training after 38 seconds
Lowest loss on validation set in epoch 0: 1.840458
Maximum R2 on validation set in epoch 0: -1.568635
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0667 - Train R2 -1.3565 - Val MSE: 9.7342 - Val R2 -7.2553
	EPOCH 1 - Train MSE: 33.0528 - Train R2 -36.6879 - Val MSE: 105.0355 - Val R2 -88.0768
	EPOCH 2 - Train MSE: 15.5135 - Train R2 -16.6890 - Val MSE: 13.3542 - Val R2 -10.3252
	EPOCH 3 - Train MSE: 7.0662 - Train R2 -7.0571 - Val MSE: 59.6572 - Val R2 -49.5931
	EPOCH 4 - Train MSE: 4.5963 - Train R2 -4.2408 - Val MSE: 114.8850 - Val R2 -96.4299
Finished training after 42 seconds
Lowest loss on validation set in epoch 0: 9.734236
Maximum R2 on validation set in epoch 0: -7.255258
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.2345 - Train R2 -1.8237 - Val MSE: 11.6959 - Val R2 -9.6652
	EPOCH 1 - Train MSE: 19.1163 - Train R2 -15.6888 - Val MSE: 55.4577 - Val R2 -49.5706
	EPOCH 2 - Train MSE: 19.6745 - Train R2 -16.1761 - Val MSE: 3.9774 - Val R2 -2.6269
	EPOCH 3 - Train MSE: 24.5234 - Train R2 -20.4093 - Val MSE: 7.8060 - Val R2 -6.1181
	EPOCH 4 - Train MSE: 30.3057 - Train R2 -25.4573 - Val MSE: 6.2410 - Val R2 -4.6910
Finished training after 42 seconds
Lowest loss on validation set in epoch 2: 3.977365
Maximum R2 on validation set in epoch 2: -2.626869
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.9094 - Train R2 -3.0115 - Val MSE: 2.4478 - Val R2 -2.0514
	EPOCH 1 - Train MSE: 29.4754 - Train R2 -29.2446 - Val MSE: 40.2339 - Val R2 -49.1543
	EPOCH 2 - Train MSE: 15.6601 - Train R2 -15.0688 - Val MSE: 28.8059 - Val R2 -34.9085
	EPOCH 3 - Train MSE: 12.8414 - Train R2 -12.1765 - Val MSE: 85.2654 - Val R2 -105.2894
	EPOCH 4 - Train MSE: 13.4875 - Train R2 -12.8395 - Val MSE: 93.7637 - Val R2 -115.8831
Finished training after 55 seconds
Lowest loss on validation set in epoch 0: 2.447798
Maximum R2 on validation set in epoch 0: -2.051353
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 221 seconds


In [None]:
# Output the overall performance of the Landsat model over the cross-validation run,
# with fold weighting switched on (use_fold_weights = True).
ls_cv.compute_overall_performance(use_fold_weights = True)

## Run the RS model

In [13]:
# names and identifiers for the RS v2 run
model_name, cv_object_name = 'between_cons_RS', 'between_cons_RS_cv'
data_type, id_var = 'RS_v2', 'cluster_id'
img_dir = RS_v2_between_img_dir

# settings for the ResNet (passed to init_resnet)
input_channels, ms, random_weights = 6, False, True

# hyper-parameters handed to the cross-validation run
hyper_params = dict(
    lr = 1e-3,
    batch_size = 128,
    alpha = 0,
    step_size = 1,
    gamma = 0.96,
    n_epochs = n_epochs,
)

In [14]:
# Load the RS data into RAM first (reported to reduce training times by ~60%).
# The whole loader is iterated rather than a single batch: fetching only the first batch
# touches at most `batch_size` clusters, so with a larger dataset the remaining images
# would not be read here at all.
# NOTE(review): presumably SatDataset reads the images when items are requested and the
# speed-up comes from caching of those reads — confirm against SatDataset.
_dat = SatDataset(between_df, img_dir, data_type, between_target_var, id_var,
                  RS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
for _batch in _loader:
    # the batches are discarded; only the act of reading them matters
    pass

In [15]:
# Build the ResNet and the CrossValidator object for the RS images
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed = random_seed)

_rs_cv_settings = dict(
    model = ResNet18,
    lsms_df = between_df,
    fold_ids = fold_ids,
    img_dir = img_dir,
    data_type = data_type,
    target_var = between_target_var,
    id_var = id_var,
    feat_transform = RS_transforms,
    target_transform = target_transform,
    device = device,
    model_name = model_name,
    random_seed = random_seed,
)
rs_cv = CrossValidator(**_rs_cv_settings)

# run the k-fold CV with the hyper-parameters, then save the cv object under its name
rs_cv.run_cv(hyper_params)
rs_cv.save_object(name = cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0872 - Train R2 -0.9295 - Val MSE: 861281.3750 - Val R2 -1062572.5152
	EPOCH 1 - Train MSE: 185.5162 - Train R2 -170.5021 - Val MSE: 28484.6133 - Val R2 -35140.8213
	EPOCH 2 - Train MSE: 11.9891 - Train R2 -10.0834 - Val MSE: 2590.0142 - Val R2 -3194.3325
	EPOCH 3 - Train MSE: 33.7181 - Train R2 -30.1710 - Val MSE: 5984.0225 - Val R2 -7381.5632
	EPOCH 4 - Train MSE: 15.8119 - Train R2 -13.6174 - Val MSE: 10084.7168 - Val R2 -12440.6410
Finished training after 37 seconds
Lowest loss on validation set in epoch 2: 2590.014160
Maximum R2 on validation set in epoch 2: -3194.332545
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.1962 - Train R2 -0.8595 - Val MSE: 1001271.1250 - Val R2 -1397422.9029
	EPOCH 1 - Train MSE: 159.1958 - Train R2 -133.7879 - Val MSE: 5618.6724 - Val R2 -7840.6998
	EPOCH 2 - Train MSE: 6.2770 - Train R2 -4.3146 - Val MSE: 5525.3223 - Val R2 -7710.4148
	EPOCH 3 - Train MSE: 13.3253 - Train R2 -10.2822 - Val MSE: 10810.8008 - Val R2 -15087.0935
	EPOCH 4 - Train MSE: 7.3294 - Train R2 -5.2057 - Val MSE: 30120.3125 - Val R2 -42036.4089
Finished training after 46 seconds
Lowest loss on validation set in epoch 2: 5525.322266
Maximum R2 on validation set in epoch 2: -7710.414758
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.3242 - Train R2 -0.5099 - Val MSE: 288446.9688 - Val R2 -244620.5781
	EPOCH 1 - Train MSE: 140.0170 - Train R2 -158.6520 - Val MSE: 8664.1729 - Val R2 -7346.7755
	EPOCH 2 - Train MSE: 8.8408 - Train R2 -9.0805 - Val MSE: 12650.9209 - Val R2 -10727.7944
	EPOCH 3 - Train MSE: 2.3223 - Train R2 -1.6479 - Val MSE: 463.1288 - Val R2 -391.7630
	EPOCH 4 - Train MSE: 1.2248 - Train R2 -0.3966 - Val MSE: 2065.0098 - Val R2 -1750.2611
Finished training after 42 seconds
Lowest loss on validation set in epoch 3: 463.128845
Maximum R2 on validation set in epoch 3: -391.763032
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.7469 - Train R2 -0.5251 - Val MSE: 384275.1875 - Val R2 -350410.9032
	EPOCH 1 - Train MSE: 120.8588 - Train R2 -104.5115 - Val MSE: 1826.5724 - Val R2 -1664.6101
	EPOCH 2 - Train MSE: 8.0744 - Train R2 -6.0491 - Val MSE: 308.1467 - Val R2 -279.9920
	EPOCH 3 - Train MSE: 4.3428 - Train R2 -2.7913 - Val MSE: 257.4507 - Val R2 -233.7635
	EPOCH 4 - Train MSE: 3.0625 - Train R2 -1.6736 - Val MSE: 160.4022 - Val R2 -145.2671
Finished training after 37 seconds
Lowest loss on validation set in epoch 4: 160.402161
Maximum R2 on validation set in epoch 4: -145.267106
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0445 - Train R2 -1.0978 - Val MSE: 33906.1328 - Val R2 -42265.3842
	EPOCH 1 - Train MSE: 7.2598 - Train R2 -6.4492 - Val MSE: 536711.4375 - Val R2 -669047.6086
	EPOCH 2 - Train MSE: 77.7291 - Train R2 -78.7575 - Val MSE: 49295.2969 - Val R2 -61449.0629
	EPOCH 3 - Train MSE: 2.6644 - Train R2 -1.7339 - Val MSE: 98804.0938 - Val R2 -123165.2612
	EPOCH 4 - Train MSE: 5.9719 - Train R2 -5.1277 - Val MSE: 242780.2969 - Val R2 -302641.7184
Finished training after 46 seconds
Lowest loss on validation set in epoch 0: 33906.132812
Maximum R2 on validation set in epoch 0: -42265.384160
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 212 seconds


In [None]:
# Output the overall performance of the RS model over the cross-validation run,
# with fold weighting switched on (use_fold_weights = True).
rs_cv.compute_overall_performance(use_fold_weights = True)

## Run the between model

In [10]:
# Paths of the two saved cross-validation objects (Landsat and RS)
ls_cv_pth = "model_results/between_cons_LS_cv.pkl"
rs_cv_pth = "model_results/between_cons_RS_cv.pkl"

# Set up the Between model from both CV objects, the cluster-level data,
# the target variable and the tabular x_vars
between_model = BetweenModel(
    ls_cv_pth,
    rs_cv_pth,
    between_df,
    between_target_var,
    between_x_vars,
    fold_ids,
    device,
    random_seed,
)

In [11]:
# Train the between model. Judging by the printed output, this extracts features several
# times and reports the variance explained by the first 5 components after each extraction.
between_model.train_between_model()

	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 5 components: 0.6972
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 5 components: 0.9146
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 5 components: 0.9986
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 5 components: 0.9998
Sat feat shape: (28, 10)
Index(['school_dist', 'restaurant_dist', 'cafe_dist', 'fuel_dist',
       'marketplace_dist', 'hospital_dist', 'pharmacy_dist', 'bar_dist',
       'clinic_dist', 'university_dist', 'library_dist', 'restaurant_count',
       'bar_count', 'cafe_count', 'marketplace_count', 'school_count',
       'university_count', 'library_count', 'fuel_count', 'pharmacy_count',
       'hospital_count', 'clinic_count', 'avg_precipitation', 'cluster_id'],
      dtype='object')
