In [1]:
import numpy as np
import pandas as pd
import pickle
from torch.utils.data import DataLoader, Dataset

In [2]:
# import the torch_framework package
from analysis_utils.torch_framework.CrossValidator import CrossValidator
from analysis_utils.torch_framework.ResNet18 import *
from analysis_utils.torch_framework.torch_helpers import *
from analysis_utils.torch_framework.torch_helpers import standardise as torch_standardise
from analysis_utils.torch_framework.SatDataset import SatDataset

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# import the helpers to demean the data and get the deltas etc. 
from analysis_utils.analysis_helpers import demean_df, make_delta_df

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [3]:
# root directory of all data (relative path)
root_data_dir = "../../Data"

# path to the processed LSMS cluster labels
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# The tabular feature data (OSM + precipitation) is deliberately not loaded here:
# the precipitation data is most likely covered by the NDWI already, and using it
# would require an additional model step (combining the precipitation data and
# the sat features).
# feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# settings shared by all models in this notebook
random_seed = 348  # random seed
n_folds = 2        # number of folds for k-fold CV
n_epochs = 2       # number of training epochs

# training device: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

Training device: cpu


In [4]:
# Set the paths for the satellite images and their image statistics
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# define the delta image directory (includes demeaned images)
delta_img_dir = f"{sat_img_dir}/RS_v2/RS_v2_delta"
# pickle file holding the image statistics for the images in the delta directory
# (content inferred from the file name -- TODO confirm)
delta_stats_pth = f"{sat_img_dir}/RS_v2/RS_v2_delta_img_stats.pkl"

# Load the data

In [5]:
# load the LSMS data
# NOTE(review): only the first 100 rows are kept -- presumably a small subset for a
# quick test run; drop the .iloc slice to use the full data set.
lsms_df = pd.read_csv(lsms_pth).iloc[:100,:]

# cluster-level information: one row per unique (cluster_id, country, lat, lon)
cl_df = lsms_df[['cluster_id', 'country', 'lat', 'lon']].copy().drop_duplicates().reset_index(drop = True)
# explicit copy (as for cl_df) so that within_df is an independent frame and
# not a slice that pandas still associates with lsms_df
within_df = lsms_df[['cluster_id', 'unique_id', 'log_mean_pc_cons_usd_2017']].copy()

# demean the df; the id column is renamed to 'delta_id' so that it lines up with
# the delta df when both are concatenated below
demeaned_df = demean_df(within_df)
demeaned_df = demeaned_df.rename(columns = {'unique_id':'delta_id'})
demeaned_df = pd.merge(demeaned_df, cl_df, on = 'cluster_id', how = 'left')

# create delta df and attach the cluster-level information
print("Make Delta DF")
delta_df = make_delta_df(within_df)
delta_df = pd.merge(delta_df, cl_df, on = 'cluster_id', how = 'left')

# add the demeaned df to the delta df
# (demeaned_df already carries the 'delta_id' column, so no further renaming is needed)
delta_df = pd.concat([delta_df, demeaned_df]).reset_index(drop = True)

Make Delta DF


  0%|          | 0/36 [00:00<?, ?it/s]

In [6]:
# derive the fold ids for spatial cross-validation from the LSMS data
fold_ids = split_lsms_spatial(
    lsms_df,
    n_folds=n_folds,
    random_seed=random_seed,
)

Fold 0, specified test ratio: 0.5 - Actual test ratio 0.50
Fold 1, specified test ratio: 0.5 - Actual test ratio 0.50


# Demeaned model

In [7]:
# column used as the target variable of the demeaned model
demeaned_target_var = "log_mean_pc_cons_usd_2017"

In [8]:
# image statistics for the demeaned images, and the feature statistics derived from them
demeaned_img_stats = get_agg_img_stats(
    delta_stats_pth,
    demeaned_df,
    id_var='delta_id',
)
demeaned_feat_stats = get_feat_stats(demeaned_img_stats)

# statistics of the target variable (the 'mean' and 'std' entries are used below)
demeaned_target_stats = get_target_stats(demeaned_df, demeaned_target_var)

# transform for the target --> is used in the DataLoader object;
# standardises the target with the target mean and std
demeaned_target_transform = transforms.Compose([
    torchvision.transforms.Lambda(
        lambda t: torch_standardise(
            t,
            demeaned_target_stats['mean'],
            demeaned_target_stats['std'],
        )
    ),
])

# transform for the Landsat image: random vertical + horizontal flips,
# followed by normalisation with the feature mean and std
demeaned_feat_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomVerticalFlip(.5),
    torchvision.transforms.RandomHorizontalFlip(.5),
    transforms.Normalize(demeaned_feat_stats['mean'], demeaned_feat_stats['std']),
])

## Run the model

In [9]:
# settings for the ResNet
input_channels = 4
ms = False
random_weights = True

# identifiers of this run
model_name = 'within_cons_demeaned'
cv_object_name = 'within_cons_demeaned_cv'
data_type = 'RS_v2'
id_var = 'delta_id'

# hyper-parameters handed to the cross-validation run
hyper_params = {
    'lr': 1e-2,
    'batch_size': 128,
    'alpha': 1e-2,
    'step_size': 1,
    'gamma': 0.96,
    'n_epochs': n_epochs,
}

In [10]:
# Warm-up: load the data into RAM before training
# (this reduces training times by ~60%...).
# NOTE(review): only the first batch (up to batch_size samples, unshuffled) is
# fetched here -- confirm that this is enough when the data set is larger than
# one batch.
_dat = SatDataset(
    demeaned_df,
    delta_img_dir,
    data_type,
    demeaned_target_var,
    id_var,
    demeaned_feat_transform,
    demeaned_target_transform,
)
_loader = DataLoader(
    _dat,
    batch_size=hyper_params['batch_size'],
    shuffle=False,
)
_, _ = next(iter(_loader))

In [11]:
# build the network and wrap it in a CrossValidator object
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed=random_seed)
demeaned_cv = CrossValidator(
    # model and run settings
    model=ResNet18,
    model_name=model_name,
    device=device,
    random_seed=random_seed,
    # data
    lsms_df=demeaned_df,
    fold_ids=fold_ids,
    img_dir=delta_img_dir,
    data_type=data_type,
    target_var=demeaned_target_var,
    id_var=id_var,
    # transforms
    feat_transform=demeaned_feat_transform,
    target_transform=demeaned_target_transform,
)

# run k-fold-cv with the hyper-parameters defined above
demeaned_cv.run_cv(hyper_params)

# save the cv object under the configured name
demeaned_cv.save_object(name=cv_object_name)

  0%|          | 0/2 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.8473 - Train R2 -0.2278 - Val MSE: 16041940619949506560.0000 - Val R2 -60691460168588320768.0000
	EPOCH 1 - Train MSE: 481.4398 - Train R2 -318.9831 - Val MSE: 927789440040960.0000 - Val R2 -3510105321484388.5000
Finished training after 27 seconds
Lowest loss on validation set in epoch 1: 927789440040960.000000
Maximum R2 on validation set in epoch 1: -3510105321484388.500000
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 0.9881 - Train R2 -0.3356 - Val MSE: 745447580594339840.0000 - Val R2 -857666476063516416.0000
	EPOCH 1 - Train MSE: 520.7944 - Train R2 -702.9431 - Val MSE: 2301825713700864.0000 - Val R2 -2648340364346970.5000
Finished training after 27 seconds
Lowest loss on validation set in epoch 1: 2301825713700864.000000
Maximum R2 on validation set in epoch 1: -2648340364346970.500000
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 64 seconds


In [12]:
# overall performance of the demeaned model across the CV folds (fold weights enabled)
demeaned_cv.compute_overall_performance(use_fold_weights=True)

{'r2': -3442083925671059.0, 'mse': 3359902584012800.0}

# The Delta model

In [13]:
# column used as the target variable of the delta model
delta_target_var = "log_mean_pc_cons_usd_2017"

In [14]:
# image statistics for the images referenced in delta_df, and the feature
# statistics derived from them
delta_img_stats = get_agg_img_stats(
    delta_stats_pth,
    delta_df,
    id_var='delta_id',
)
delta_feat_stats = get_feat_stats(delta_img_stats)

# statistics of the target variable (the 'mean' and 'std' entries are used below)
delta_target_stats = get_target_stats(delta_df, delta_target_var)

# transform for the target --> is used in the DataLoader object;
# standardises the target with the target mean and std
delta_target_transform = transforms.Compose([
    torchvision.transforms.Lambda(
        lambda t: torch_standardise(
            t,
            delta_target_stats['mean'],
            delta_target_stats['std'],
        )
    ),
])

# transform for the Landsat image: random vertical + horizontal flips,
# followed by normalisation with the feature mean and std
delta_feat_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomVerticalFlip(.5),
    torchvision.transforms.RandomHorizontalFlip(.5),
    transforms.Normalize(delta_feat_stats['mean'], delta_feat_stats['std']),
])

## Run the model

In [15]:
# settings for the ResNet
input_channels = 4
ms = False
random_weights = True

# identifiers of this run
model_name = 'within_cons_delta'
cv_object_name = 'within_cons_delta_cv'
data_type = 'RS_v2'
id_var = 'delta_id'

# hyper-parameters handed to the cross-validation run
hyper_params = {
    'lr': 1e-2,
    'batch_size': 128,
    'alpha': 1e-2,
    'step_size': 1,
    'gamma': 0.96,
    'n_epochs': n_epochs,
}

In [16]:
# Warm-up: load the data into RAM before training
# (this reduces training times by ~60%...).
# NOTE(review): only the first batch (up to batch_size samples, unshuffled) is
# fetched here -- confirm that this is enough when the data set is larger than
# one batch.
_dat = SatDataset(
    delta_df,
    delta_img_dir,
    data_type,
    delta_target_var,
    id_var,
    delta_feat_transform,
    delta_target_transform,
)
_loader = DataLoader(
    _dat,
    batch_size=hyper_params['batch_size'],
    shuffle=False,
)
_, _ = next(iter(_loader))

In [17]:
# build the network and wrap it in a CrossValidator object
ResNet18 = init_resnet(input_channels, ms, random_weights, random_seed=random_seed)
delta_cv = CrossValidator(
    # model and run settings
    model=ResNet18,
    model_name=model_name,
    device=device,
    random_seed=random_seed,
    # data
    lsms_df=delta_df,
    fold_ids=fold_ids,
    img_dir=delta_img_dir,
    data_type=data_type,
    target_var=delta_target_var,
    id_var=id_var,
    # transforms
    feat_transform=delta_feat_transform,
    target_transform=delta_target_transform,
)

# run k-fold-cv with the hyper-parameters defined above
delta_cv.run_cv(hyper_params)

# save the cv object under the configured name
delta_cv.save_object(name=cv_object_name)

  0%|          | 0/2 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0793 - Train R2 -0.3739 - Val MSE: 10065925598196269056.0000 - Val R2 -37539036359982571520.0000
	EPOCH 1 - Train MSE: 558.6413 - Train R2 -368.1187 - Val MSE: 945620399423488.0000 - Val R2 -3526519057229571.0000
Finished training after 59 seconds
Lowest loss on validation set in epoch 1: 945620399423488.000000
Maximum R2 on validation set in epoch 1: -3526519057229571.000000
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.1245 - Train R2 -0.5992 - Val MSE: 12966173389478690816.0000 - Val R2 -14731146685712551936.0000
	EPOCH 1 - Train MSE: 647.5916 - Train R2 -919.9939 - Val MSE: 949644381126656.0000 - Val R2 -1078911262110028.8750
Finished training after 58 seconds
Lowest loss on validation set in epoch 1: 949644381126656.000000
Maximum R2 on validation set in epoch 1: -1078911262110028.875000
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 148 seconds


In [18]:
# overall performance of the delta model across the CV folds (fold weights enabled)
delta_cv.compute_overall_performance(use_fold_weights=True)

{'r2': -2480164330503185.5, 'mse': 2266446793015296.0}