In [1]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [2]:
# import the torch_framework package
from analysis_utils.torch_framework.CrossValidator import CrossValidator
from analysis_utils.torch_framework.ResNet18 import *
from analysis_utils.torch_framework.torch_helpers import *
from analysis_utils.torch_framework.SatDataset import SatDataset
from analysis_utils.torch_framework.BetweenModel import BetweenModel

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [3]:
# ---- global configuration --------------------------------------------------
# Root folder holding every input data set (relative to this notebook).
root_data_dir = "../../Data"

# Cluster-level LSMS labels.
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# Tabular feature data (OSM + precipitation).
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# Seed handed to the fold split, the model initialisation and the cross-validation.
random_seed = 348

# Number of folds for k-fold CV.
n_folds = 5

# Number of training epochs.
n_epochs = 2

# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

Training device: cpu


In [4]:
# Set the paths for the satellite images and their image statistics
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# median LS images at the cluster level (image folder + pickled per-image statistics)
LS_median_img_dir = f"{sat_img_dir}/LS/LS_median_cluster"
LS_median_stats_pth = f"{sat_img_dir}/LS/LS_median_img_stats.pkl"

# the RS v2 images at the cluster level (image folder + pickled per-image statistics)
RS_v2_between_img_dir = f"{sat_img_dir}/RS_v2/RS_v2_between"
RS_v2_between_stats_pth = f"{sat_img_dir}/RS_v2/RS_v2_between_img_stats.pkl"

In [5]:
# Load the LSMS labels and the tabular feature data (OSM and precipitation).
# NOTE(review): only the first 100 LSMS rows are kept, which looks like a
# quick-run / debugging subset -- confirm before relying on the results below.
lsms_df = pd.read_csv(lsms_pth).iloc[:100, :]
feat_df = pd.read_csv(feat_data_pth)

# Add the cluster-level mean of the log per-capita consumption to every row
# (`transform` keeps the row count, so each row receives its cluster's mean).
lsms_df['avg_log_mean_pc_cons_usd_2017'] = (
    lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
)

# Left-join the features onto the labels. The join keys are passed as a list,
# which is the documented form for multiple key columns in `pd.merge`
# (a tuple only works incidentally and reads like a single tuple-valued label).
df = pd.merge(lsms_df, feat_df, on=['unique_id', 'cluster_id'], how='left')

In [6]:
# Split the LSMS data into k folds for cross-validation
# (split_lsms_spatial reports the specified vs. actual test ratio per fold).
fold_ids = split_lsms_spatial(
    lsms_df,
    n_folds=n_folds,
    random_seed=random_seed,
)

Fold 0, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.22
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.08


In [7]:
# Target variable: the cluster-level average of the log per-capita consumption.
between_target_var = 'avg_log_mean_pc_cons_usd_2017'

# Tabular predictors to keep: OSM distances, OSM counts and average precipitation.
between_x_vars = osm_dist_vars + osm_count_vars + ['avg_precipitation']

# Cluster-level data set: keep the id, location, country, target and predictors,
# drop repeated rows and renumber the index from zero.
between_df = (
    df[['cluster_id', 'lat', 'lon', 'country', between_target_var, *between_x_vars]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# get the image statistics for the Landsat (LS) and RS images for each band,
# using the clusters in between_df (matched on 'cluster_id')
LS_img_stats = get_agg_img_stats(LS_median_stats_pth, between_df, id_var = 'cluster_id')
RS_img_stats = get_agg_img_stats(RS_v2_between_stats_pth, between_df, id_var = 'cluster_id')

# extract the relevant statistics for each band (i.e. the mean, std, min, max) and get them as a list
LS_feat_stats = get_feat_stats(LS_img_stats) 
RS_feat_stats = get_feat_stats(RS_img_stats)

# For the RS feat stats, alter the mean and std of the last two channels (WSF and ESA LC)
# For these two channels normalisation does not introduce any advantage or yields meaningless numbers
# Thus just set mean and std for both channels to 0 and 1 (which effectively avoids normalisation)
# (in-place slice assignment: only the last two entries of each list are overwritten)
RS_feat_stats['mean'][-2:] = [0,0]
RS_feat_stats['std'][-2:] = [1,1]

# get the stats for the target variable
# NOTE(review): the target stats are computed on `df`, whereas the image stats above
# use the de-duplicated `between_df` -- confirm that `df` is the intended frame here.
between_target_stats = get_target_stats(df, between_target_var)

In [9]:
# Target transform (passed to SatDataset / CrossValidator): standardise the
# target with the mean and std stored in between_target_stats.
target_transform = transforms.Compose(
    [
        torchvision.transforms.Lambda(
            lambda t: standardise(t, between_target_stats['mean'], between_target_stats['std'])
        ),
    ]
)


def _flip_then_normalise(feat_stats):
    """Build an image pipeline: random vertical flip, random horizontal flip
    (each created with 0.5), then normalisation with feat_stats['mean'] and
    feat_stats['std']."""
    return torchvision.transforms.Compose(
        [
            torchvision.transforms.RandomVerticalFlip(.5),
            torchvision.transforms.RandomHorizontalFlip(.5),
            transforms.Normalize(feat_stats['mean'], feat_stats['std']),
        ]
    )


# Image transforms for the Landsat images ...
LS_transforms = _flip_then_normalise(LS_feat_stats)

# ... and the same pipeline for the RS images.
RS_transforms = _flip_then_normalise(RS_feat_stats)

## Run the Landsat model

In [10]:
# Names and identifiers for the Landsat (LS) run
model_name = 'between_cons_LS'
cv_object_name = 'between_cons_LS_cv'
data_type = 'LS'
id_var = 'cluster_id'

# ResNet settings (forwarded to init_resnet)
input_channels = 6
ms = True
random_weights = False

# Hyper-parameters handed to CrossValidator.run_cv
hyper_params = dict(
    lr=1e-3,
    batch_size=128,
    alpha=0,
    step_size=1,
    gamma=0.96,
    n_epochs=n_epochs,
)


In [11]:
# load the data into RAM first
# this reduces training times by ~60%...
# NOTE(review): only a single batch is drawn below; whether that brings the whole
# data set into memory depends on the number of rows relative to batch_size and on
# how SatDataset reads/caches the images -- confirm.
_dat = SatDataset(between_df, LS_median_img_dir, data_type, between_target_var, id_var,
                  LS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
# fetch the first batch and discard it (the unpacking expects exactly two elements per batch)
_, _ = next(iter(_loader))

In [12]:
# Build the ResNet and wrap it in a CrossValidator for the Landsat images.
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed=random_seed)
ls_cv = CrossValidator(
    model=ResNet18,
    lsms_df=between_df,
    fold_ids=fold_ids,
    img_dir=LS_median_img_dir,
    data_type=data_type,
    target_var=between_target_var,
    id_var=id_var,
    feat_transform=LS_transforms,
    target_transform=target_transform,
    device=device,
    model_name=model_name,
    random_seed=random_seed,
)

# Run the k-fold cross-validation with the current hyper-parameters.
ls_cv.run_cv(hyper_params)

# Save the cross-validation object under cv_object_name.
ls_cv.save_object(name=cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.5425 - Train R2 -2.2749 - Val MSE: 26.2618 - Val R2 -31.3995
	EPOCH 1 - Train MSE: 20.3883 - Train R2 -17.8482 - Val MSE: 66.7920 - Val R2 -81.4021
Finished training after 16 seconds
Lowest loss on validation set in epoch 0: 26.261824
Maximum R2 on validation set in epoch 0: -31.399537
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.0672 - Train R2 -1.5969 - Val MSE: 1.8405 - Val R2 -1.5686
	EPOCH 1 - Train MSE: 19.5629 - Train R2 -15.5635 - Val MSE: 25.1064 - Val R2 -34.0398
Finished training after 17 seconds
Lowest loss on validation set in epoch 0: 1.840458
Maximum R2 on validation set in epoch 0: -1.568635
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0667 - Train R2 -1.3565 - Val MSE: 9.7342 - Val R2 -7.2553
	EPOCH 1 - Train MSE: 33.0528 - Train R2 -36.6879 - Val MSE: 105.0355 - Val R2 -88.0768
Finished training after 16 seconds
Lowest loss on validation set in epoch 0: 9.734236
Maximum R2 on validation set in epoch 0: -7.255258
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.2345 - Train R2 -1.8237 - Val MSE: 11.6959 - Val R2 -9.6652
	EPOCH 1 - Train MSE: 19.1163 - Train R2 -15.6888 - Val MSE: 55.4577 - Val R2 -49.5706
Finished training after 17 seconds
Lowest loss on validation set in epoch 0: 11.695909
Maximum R2 on validation set in epoch 0: -9.665236
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 3.9094 - Train R2 -3.0115 - Val MSE: 2.4478 - Val R2 -2.0514
	EPOCH 1 - Train MSE: 29.4754 - Train R2 -29.2446 - Val MSE: 40.2339 - Val R2 -49.1543
Finished training after 20 seconds
Lowest loss on validation set in epoch 0: 2.447798
Maximum R2 on validation set in epoch 0: -2.051353
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 90 seconds


In [13]:
# NOTE(review): bare literal left over from inspecting results (it equals the fold-0
# validation R2 printed by the LS run); the cell only echoes the number -- consider removing it.
-31.3995

-31.3995

In [14]:
# Overall LS performance across the folds (fold weights enabled); displays r2 and mse.
ls_cv.compute_overall_performance(use_fold_weights=True)

{'r2': -12.6599205087245, 'mse': 9.690663178761799}

## Run the RS model

In [15]:
# Names and identifiers for the RS run
model_name = 'between_cons_RS'
cv_object_name = 'between_cons_RS_cv'
data_type = 'RS_v2'
id_var = 'cluster_id'
img_dir = RS_v2_between_img_dir

# ResNet settings (forwarded to init_resnet)
input_channels = 6
ms = False
random_weights = True

# Hyper-parameters handed to CrossValidator.run_cv
hyper_params = dict(
    lr=1e-3,
    batch_size=128,
    alpha=0,
    step_size=1,
    gamma=0.96,
    n_epochs=n_epochs,
)

In [16]:
# load the data into RAM first
# this reduces training times by ~60%...
# NOTE(review): only a single batch is drawn below; whether that brings the whole
# data set into memory depends on the number of rows relative to batch_size and on
# how SatDataset reads/caches the images -- confirm.
_dat = SatDataset(between_df, img_dir, data_type, between_target_var, id_var,
                  RS_transforms, target_transform)
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
# fetch the first batch and discard it (the unpacking expects exactly two elements per batch)
_, _ = next(iter(_loader))

In [None]:
# Build the ResNet and wrap it in a CrossValidator for the RS images.
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed=random_seed)
rs_cv = CrossValidator(
    model=ResNet18,
    lsms_df=between_df,
    fold_ids=fold_ids,
    img_dir=img_dir,
    data_type=data_type,
    target_var=between_target_var,
    id_var=id_var,
    feat_transform=RS_transforms,
    target_transform=target_transform,
    device=device,
    model_name=model_name,
    random_seed=random_seed,
)

# Run the k-fold cross-validation with the current hyper-parameters.
rs_cv.run_cv(hyper_params)

# Save the cross-validation object under cv_object_name.
rs_cv.save_object(name=cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0872 - Train R2 -0.9295 - Val MSE: 861281.3750 - Val R2 -1062572.5152


In [None]:
# Overall RS performance across the folds (fold weights enabled).
rs_cv.compute_overall_performance(use_fold_weights=True)

## Run the between model

In [None]:
# Locations of the saved LS and RS cross-validation objects
# (presumably the files written by save_object in the cells above -- confirm).
ls_cv_pth = "../results/model_objects/between_cons_LS_cv.pkl"
rs_cv_pth = "../results/model_objects/between_cons_RS_cv.pkl"

# Initialise the Between model from both CV objects and the cluster-level tabular data.
between_model = BetweenModel(
    ls_cv_pth,
    rs_cv_pth,
    between_df,
    between_target_var,
    between_x_vars,
    fold_ids,
    device,
    random_seed,
)

In [None]:
# Train the Between model.
# NOTE(review): min_samples_leaf and n_components are hard-coded here; their exact
# meaning is defined by BetweenModel.train -- confirm the chosen values.
between_model.train(
    min_samples_leaf=10,
    n_components=1,
)

In [None]:
# Display the best-model paths stored on the RS CrossValidator
# (NOTE(review): looks like a leftover inspection cell; requires the RS run above to have completed).
rs_cv.best_model_paths