# Load the files and packages


In [1]:
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [2]:
# import the torch_framework package
from analysis_utils.torch_framework.CrossValidator import CrossValidator
from analysis_utils.torch_framework.ResNet18 import *
from analysis_utils.torch_framework.torch_helpers import *
from analysis_utils.torch_framework.SatDataset import SatDataset
from analysis_utils.torch_framework.BetweenModel import BetweenModel
from analysis_utils.torch_framework.EarlyStopper import EarlyStopper

# load the variable names of the tabular feature data
from analysis_utils.variable_names import *

# load the functions to do spatial CV
from analysis_utils.spatial_CV import *

In [3]:
# ---- global configuration ----
# root folder of all input data (relative path)
root_data_dir = "../../Data"

# LSMS data
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"

# tabular feature data (OSM + precipitation)
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# random seed handed to the splitting, dataset and model code further down
random_seed = 348

# number of folds for the k-fold cross-validation
n_folds = 2

# train on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")

Training device: cpu


In [4]:
# Paths to the satellite images and to their image-statistics files
sat_img_dir = f"{root_data_dir}/satellite_imgs"

# Landsat (LS): median images at the cluster level and the matching stats pickle
_ls_root = f"{sat_img_dir}/LS"
LS_median_img_dir = f"{_ls_root}/LS_median_cluster"
LS_median_stats_pth = f"{_ls_root}/LS_median_img_stats.pkl"

# RS v2: images at the cluster level and the matching stats pickle
_rs_root = f"{sat_img_dir}/RS_v2"
RS_v2_between_img_dir = f"{_rs_root}/RS_v2_between"
RS_v2_between_stats_pth = f"{_rs_root}/RS_v2_between_img_stats.pkl"

# Load the data

In [5]:
# read the LSMS data (only its first 100 rows) and the feature data (OSM and precipitation)
# NOTE(review): the 100-row cap looks like a debugging subset -- confirm before a full run
lsms_df = pd.read_csv(lsms_pth).iloc[:100, :]
feat_df = pd.read_csv(feat_data_pth)

# attach the per-cluster mean of the log consumption variable to every row
_cons_by_cluster = lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017']
lsms_df['avg_log_mean_pc_cons_usd_2017'] = _cons_by_cluster.transform('mean')

# left-join the feature data onto the LSMS rows via both id columns
df = lsms_df.merge(feat_df, on = ['unique_id', 'cluster_id'], how = 'left')

In [6]:
# target variable: the cluster-level average added to lsms_df in the loading cell
between_target_var = 'avg_log_mean_pc_cons_usd_2017'

# tabular predictors that are handed to the between model later on
between_x_vars = osm_dist_vars + osm_count_vars + ['avg_precipitation']

# keep the id / location columns, the target and the predictors, then drop duplicated rows
_between_cols = ['cluster_id', 'lat', 'lon', 'country', between_target_var] + between_x_vars
between_df = df[_between_cols].drop_duplicates().reset_index(drop = True)

# split the data into n_folds folds (helper comes from the spatial_CV module)
fold_ids = split_lsms_spatial(lsms_df, n_folds = n_folds, random_seed = random_seed)

Fold 0, specified test ratio: 0.5 - Actual test ratio 0.50
Fold 1, specified test ratio: 0.5 - Actual test ratio 0.50


In [7]:
# Landsat: aggregated image statistics, then the per-band mean / std / min / max as lists
LS_img_stats = get_agg_img_stats(LS_median_stats_pth, between_df, id_var = 'cluster_id')
LS_feat_stats = get_feat_stats(LS_img_stats)

# RS: same two steps
RS_img_stats = get_agg_img_stats(RS_v2_between_stats_pth, between_df, id_var = 'cluster_id')
RS_feat_stats = get_feat_stats(RS_img_stats)

# The last two RS channels (WSF and ESA LC) gain nothing from normalisation, or it
# yields meaningless numbers. Setting their mean to 0 and their std to 1 leaves
# these channels effectively un-normalised.
for _stat, _neutral in (('mean', 0), ('std', 1)):
    RS_feat_stats[_stat][-2:] = [_neutral, _neutral]

# statistics of the target variable
# NOTE(review): computed on `df`, not on the de-duplicated `between_df` -- confirm this is intended
between_target_stats = get_target_stats(df, between_target_var)

In [8]:
def _standardise_target(t):
    """Standardise a target value with the mean and std held in between_target_stats."""
    return standardise(t, between_target_stats['mean'], between_target_stats['std'])


def _flip_and_normalise(feat_stats):
    """Chain a random vertical flip, a random horizontal flip (p = .5 each) and a
    Normalize step that uses feat_stats['mean'] and feat_stats['std']."""
    return torchvision.transforms.Compose([
        torchvision.transforms.RandomVerticalFlip(.5),
        torchvision.transforms.RandomHorizontalFlip(.5),
        transforms.Normalize(feat_stats['mean'], feat_stats['std']),
    ])


# transform for the target --> is used in the DataLoader object
target_transform = transforms.Compose([
    torchvision.transforms.Lambda(_standardise_target),
])

# image transforms for the Landsat and the RS images
# (for a run without augmentation, build a Compose that holds only the Normalize step)
LS_transforms = _flip_and_normalise(LS_feat_stats)
RS_transforms = _flip_and_normalise(RS_feat_stats)

# Run the Landsat model

In [9]:
# names and identifiers for the Landsat (LS) run
model_name, cv_object_name = 'between_cons_LS', 'between_cons_LS_cv'
data_type, id_var = 'LS', 'cluster_id'

# settings for the ResNet
input_channels = 6
scaled_weight_init = True
pretrained_weights = True

# hyper-parameters: every entry is a list of candidate values
hyper_params = dict(
    lr = [1e-3],
    batch_size = [128],
    alpha = [1e-1, 1e-2],
    step_size = [1],
    gamma = [0.96],
    n_epochs = [150],
    patience = [10],
)

In [15]:
# Build the Landsat dataset and pull one batch through a DataLoader.
# (author's note: loading the data into RAM first reduces training times by ~60%)
_dat = SatDataset(
    between_df, LS_median_img_dir, data_type,
    between_target_var, id_var,
    LS_transforms, target_transform, random_seed,
)
_loader = DataLoader(
    dataset = _dat,
    batch_size = hyper_params['batch_size'][0],   # first candidate batch size
    shuffle = False,
)
_batch_iter = iter(_loader)
_, _ = next(_batch_iter)

In [19]:
# initialise the model and the CrossValidator object
# The ResNet settings are taken from the Landsat settings cell (input_channels,
# pretrained_weights, scaled_weight_init) instead of being hard-coded here, so that
# changing them in that cell actually takes effect.
resnet18 = ResNet18(
    input_channels = input_channels,
    pretrained_weights = pretrained_weights,
    scaled_weight_init = scaled_weight_init,
    random_seed = random_seed
)

# cross-validator over the folds in fold_ids, using the Landsat images and transforms
ls_cv = CrossValidator(
    model_class = resnet18,
    lsms_df = between_df,
    fold_ids = fold_ids,
    img_dir = LS_median_img_dir,
    data_type = data_type,
    target_var = between_target_var,
    id_var = id_var,
    feat_transform = LS_transforms,
    target_transform = target_transform,
    device = device,
    model_name = model_name,
    random_seed = random_seed
)

# run k-fold-cv with hyper-parameter tuning over the candidate lists in hyper_params
ls_cv.run_cv(hyper_params, tune_hyper_params=True)

# save the cv object
ls_cv.save_object(name = cv_object_name)

  0%|          | 0/2 [00:00<?, ?it/s]



Training on fold 0
doing random seed stuff yihaaa
	Tune Hyper-parameters


	------------------------------------------------------
	Combination 1 of 2  -- Hyperparameters: {'lr': 0.001, 'batch_size': 128, 'alpha': 0.1, 'step_size': 1, 'gamma': 0.96, 'n_epochs': 150, 'patience': 10}
		Initialising training
		EPOCH 1 - Train MSE: 7.6469 - Train R2 -6.5194 - Val MSE: 69.6436 - Val R2 -125.8402
		EPOCH 2 - Train MSE: 58.2718 - Train R2 -56.3001 - Val MSE: 10.9482 - Val R2 -18.9396


KeyboardInterrupt: 

In [17]:
# output the overall performance of the Landsat model (use_fold_weights enabled)
# NOTE(review): the result displayed below stems from execution In [17], i.e. from before
# the interrupted CV run In [19] above -- re-run this cell after a completed CV run
ls_cv.compute_overall_performance(use_fold_weights = True)

{'r2': 0.32722959523439704, 'mse': 0.5872688862278298}

# Run the RS model

In [18]:
model_name = 'between_cons_RS'
cv_object_name = 'between_cons_RS_cv'
data_type = 'RS_v2'
id_var = 'cluster_id'
img_dir = RS_v2_between_img_dir

# set settings for the ResNet
input_channels = 6
ms = False
random_weights = True

# set hyper-parameters
hyper_params = {
    'lr': 1e-3,
    'batch_size': 128,
    'alpha': 1e-1,
    'step_size': 1,
    'gamma': 0.96,
    'n_epochs': n_epochs
}

In [None]:
# load the data into RAM first
# this reduces training times by ~60%...
# random_seed is passed as the last argument, exactly as in the Landsat preload cell,
# so that both datasets are constructed with the same set of arguments.
_dat = SatDataset(between_df, img_dir, data_type, between_target_var, id_var,
                  RS_transforms, target_transform, random_seed)
# the RS hyper_params hold single values, hence no [0] index on the batch size
_loader = DataLoader(_dat, batch_size = hyper_params['batch_size'], shuffle = False)
_, _ = next(iter(_loader))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-584f860a86c7>", line 6, in <cell line: 6>
    _, _ = next(iter(_loader))
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 633, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 677, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/content/gdrive/MyDrive/master_thesis/predicting_po

In [None]:
# initialise the model and the CrossValidator object
# The network is bound to `rs_resnet18` rather than to `ResNet18`: the Landsat section
# calls the imported name `ResNet18(...)`, and rebinding it here would break that cell
# on any later re-run.
rs_resnet18 = init_resnet(input_channels, ms, random_weights, random_seed = random_seed)

# NOTE(review): the Landsat section constructs CrossValidator with `model_class=` and
# calls run_cv(..., tune_hyper_params=True); this cell uses `model=` and a plain
# run_cv(hyper_params) -- confirm which form the current CrossValidator expects.
rs_cv = CrossValidator(model = rs_resnet18,
                       lsms_df = between_df,
                       fold_ids = fold_ids,
                       img_dir = img_dir,
                       data_type = data_type,
                       target_var = between_target_var,
                       id_var = id_var,
                       feat_transform = RS_transforms,
                       target_transform = target_transform,
                       device = device,
                       model_name = model_name,
                       random_seed = random_seed)

# run k-fold-cv
rs_cv.run_cv(hyper_params)

# save the cv object
rs_cv.save_object(name = cv_object_name)

  0%|          | 0/5 [00:00<?, ?it/s]


Training on fold 0
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0872 - Train R2 -0.9295 - Val MSE: 863197.0625 - Val R2 -1064935.9975
	EPOCH 1 - Train MSE: 185.5464 - Train R2 -170.5300 - Val MSE: 28724.4121 - Val R2 -35436.6652
Finished training after 15 seconds
Lowest loss on validation set in epoch 1: 28724.412109
Maximum R2 on validation set in epoch 1: -35436.665237
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 1
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.1962 - Train R2 -0.8594 - Val MSE: 999849.1875 - Val R2 -1395438.4305
	EPOCH 1 - Train MSE: 159.1697 - Train R2 -133.7658 - Val MSE: 5560.2856 - Val R2 -7759.2119
Finished training after 15 seconds
Lowest loss on validation set in epoch 1: 5560.285645
Maximum R2 on validation set in epoch 1: -7759.211894
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 2
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.3242 - Train R2 -0.5099 - Val MSE: 288294.0938 - Val R2 -244490.9244
	EPOCH 1 - Train MSE: 140.0256 - Train R2 -158.6619 - Val MSE: 8951.6465 - Val R2 -7590.5712
Finished training after 14 seconds
Lowest loss on validation set in epoch 1: 8951.646484
Maximum R2 on validation set in epoch 1: -7590.571222
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 3
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 1.7469 - Train R2 -0.5251 - Val MSE: 383390.4375 - Val R2 -349604.0698
	EPOCH 1 - Train MSE: 120.8659 - Train R2 -104.5178 - Val MSE: 1886.5184 - Val R2 -1719.2735
Finished training after 25 seconds
Lowest loss on validation set in epoch 1: 1886.518433
Maximum R2 on validation set in epoch 1: -1719.273539
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]


Training on fold 4
Initialising training


  0%|          | 0/2 [00:00<?, ?it/s]

	EPOCH 0 - Train MSE: 2.0445 - Train R2 -1.0978 - Val MSE: 34038.5703 - Val R2 -42430.4773
	EPOCH 1 - Train MSE: 7.2672 - Train R2 -6.4569 - Val MSE: 536310.1250 - Val R2 -668547.3321
Finished training after 22 seconds
Lowest loss on validation set in epoch 0: 34038.570312
Maximum R2 on validation set in epoch 0: -42430.477290
Predicting values


  0%|          | 0/1 [00:00<?, ?it/s]

Finished Cross-validation after 96 seconds


In [None]:
# output the overall performance of the RS model (use_fold_weights enabled);
# being the cell's last expression, the returned value is displayed
rs_cv.compute_overall_performance(use_fold_weights = True)

{'r2': -12599.14241216957, 'mse': 10677.938761393229}

# Run the between model

In [None]:
# name under which the between-model object gets saved
between_object_name = 'between_cons'

# paths of the stored LS and RS cross-validation objects
# (presumably the files written by the two save_object calls above -- verify)
ls_cv_pth = "../results/model_objects/between_cons_LS_cv.pkl"
rs_cv_pth = "../results/model_objects/between_cons_RS_cv.pkl"

# set up the Between model from both CV objects, the cluster data and the folds
between_model = BetweenModel(
    ls_cv_pth, rs_cv_pth,
    between_df, between_target_var, between_x_vars,
    fold_ids, device, random_seed,
)

# fit the between model
between_model.train(min_samples_leaf = 10, n_components = 1)

Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2059
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.3344
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9879
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9921


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1902
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2368
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9856
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9909


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1768
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.4506
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9931
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9730


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.1991
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.3362
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9856
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9918


Landsat Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.2311
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.8530
RS Feature Extraction
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9772
	Extracting Features


  0%|          | 0/1 [00:00<?, ?it/s]

	Total variance explained by first 1 components: 0.9999


Finished training after 75 seconds


In [None]:
# compute the overall performance of the between model (use_fold_weights enabled)
# NOTE(review): this call is followed by another statement, so its return value is
# not displayed by the notebook -- assign / display it explicitly if it should be shown
between_model.compute_overall_performance(use_fold_weights = True)
# store the between-model object under the name held in between_object_name
between_model.save_object(between_object_name)