## Demo

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local project modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json

import pickle
import numpy as np
import pandas as pd
import torch 
import torchmetrics
import torchvision
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from models.from_config import build_from_config
from models.double_branch import DoubleBranchCNN
from data_handlers.csv_dataset import CustomDatasetFromDataFrame
from utils import utils
from utils import transfer_learning as tl
from train import train, dual_train
from test import test

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Root folders, relative to the notebook's working directory.
_DATA_ROOT = 'data'
_CONFIG_ROOT = 'configs'

# Tabular inputs: the full survey table and the Madagascar train/test tables.
CSV_PATH = os.path.join(_DATA_ROOT, 'dataset.csv')
TEST_CSV = os.path.join(_DATA_ROOT, 'madagascar_test_dataset.csv')
TRAIN_CSV = os.path.join(_DATA_ROOT, 'madagascar_train_dataset.csv')

# Image tile directory; the empty last component keeps a trailing separator.
DATA_DIR = os.path.join(_DATA_ROOT, 'landsat_7', '')

# Pickled fold definitions used for cross-validation.
FOLD_PATH = os.path.join(_DATA_ROOT, 'dhs_incountry_folds.pkl')

# JSON model/training configurations for the "ms" and "msnl" setups.
CONFIG_FILE_MS = os.path.join(_CONFIG_ROOT, 'resnet18_ms_e2e_l7_yeh.json')
CONFIG_FILE_MSNL = os.path.join(_CONFIG_ROOT, 'resnet18_msnl_e2e_l7_yeh.json')

# Per-channel lower/upper bounds handed to the dataset as tile_min / tile_max.
TILE_MIN = [
    -0.0994, -0.0574, -0.0318, -0.0209,
    -0.0102, -0.0152, 0.0, -0.07087274,
]
TILE_MAX = [
    2.0, 2.0, 2.0, 2.0,
    2.0, 2.0, 316.7, 3104.1401,
]

In [4]:
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path) as handle:
        return json.load(handle)


# Training configurations for the two model setups.
config_ms = _read_json(CONFIG_FILE_MS)
config_msnl = _read_json(CONFIG_FILE_MSNL)

# Full survey table; reset_index() keeps the previous index as an "index" column.
# Optional clean-ups, currently disabled:
#   csv.drop("bounding_box", axis=1, inplace=True)
#   csv = csv.loc[:, ~csv.columns.str.contains('^Unnamed')]
csv = pd.read_csv(CSV_PATH).reset_index()
csv.head()

Unnamed: 0,index,country,year,cluster,lat,lon,households,wealthpooled
0,0,angola,2011,1,-12.350257,13.534922,36,2.312757
1,1,angola,2011,2,-12.360865,13.551494,32,2.010293
2,2,angola,2011,3,-12.613421,13.413085,36,0.877744
3,3,angola,2011,4,-12.581454,13.397711,35,1.066994
4,4,angola,2011,5,-12.578135,13.418748,37,1.750153


In [5]:
# Compute the per-channel mean and std of the normalised images over the complete dataset.
# This only needs to be executed once (TODO: move it to a standalone script).

# TEST_TRANSFORM  = torch.nn.Sequential(
#         torchvision.transforms.CenterCrop(size=224),
#     )
# dummy_dataset = CustomDatasetFromDataFrame(csv,
#                                            DATA_DIR,
#                                            transform=TEST_TRANSFORM,
#                                            tile_max=TILE_MAX,
#                                            tile_min=TILE_MIN)
# dummy_loader = torch.utils.data.DataLoader(
#         dummy_dataset, 
#         batch_size=64
#     )

# def compute_mean_and_std(dataloader, batch_size):
#     channels_sum, channels_squared_sum, num_batches = 0, 0, 0
#     for data, _ in dataloader:
#         if data is not None:
#             weight = data.size()[0] / batch_size
#             # Mean over batch, height and width, but not over the channels
#             channels_sum += weight*torch.mean(data, dim=[0,2,3])
#             channels_squared_sum += weight*torch.mean(data**2, dim=[0,2,3])
#             num_batches += weight
#     mean = channels_sum / num_batches
#     # std = sqrt(E[X^2] - (E[X])^2)
#     std = (channels_squared_sum / num_batches - mean ** 2) ** 0.5
#     return mean, std

# means, stds = compute_mean_and_std(dummy_loader, 64)

In [6]:
# Hard-coded per-channel statistics (7 channels) consumed by the Normalize
# transforms; presumably the output of the commented-out
# compute_mean_and_std run -- TODO confirm.
# NOTE(review): stds near 9.5 look large if the inputs are clipped to
# [TILE_MIN, TILE_MAX] -- worth re-checking.
_CHANNEL_MEANS = [0.6952, 0.6890, 0.6851, 0.6834, 0.6818, 0.6826, 0.0043]
_CHANNEL_STDS = [9.5266, 9.7209, 9.8435, 9.8968, 9.9495, 9.9249, 0.0632]

means = torch.tensor(_CHANNEL_MEANS)
stds = torch.tensor(_CHANNEL_STDS)

In [7]:
# Side length of the square centre crop applied to every tile.
_CROP_SIZE = 224


def _make_normalize():
    """Build a Normalize transform from the notebook-level `means` and `stds`."""
    return torchvision.transforms.Normalize(mean=means, std=stds)


# Training pipeline: centre crop, random horizontal flip (p=0.5), then
# per-channel normalisation.
TRAIN_TRANSFORM = torch.nn.Sequential(
    torchvision.transforms.CenterCrop(size=_CROP_SIZE),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    _make_normalize(),
)

# Evaluation pipeline: the same crop and normalisation, without the random flip.
TEST_TRANSFORM = torch.nn.Sequential(
    torchvision.transforms.CenterCrop(size=_CROP_SIZE),
    _make_normalize(),
)

In [8]:
# Single-split run of the "ms" model: train on the Madagascar train CSV and
# validate on the Madagascar test CSV (the per-fold loop is not used here).

# NOTE(review): pickle.load can execute arbitrary code; only open fold files
# from a trusted source. `folds` is not used by this cell.
with open(FOLD_PATH, 'rb') as f:
    folds = pickle.load(f)
results = dict()
device = "cuda" if torch.cuda.is_available() else "cpu"
writer = SummaryWriter()
r2 = torchmetrics.R2Score().to(device=device)

# Index split: both splits are index arrays over their own CSV.
# The paths come from the constants defined in the configuration cell.
csv_train = pd.read_csv(TRAIN_CSV)
train_split = np.arange(len(csv_train))
# NOTE(review): only the test table gets reset_index() (adds an "index"
# column); the train table does not -- confirm this asymmetry is intended.
csv_test = pd.read_csv(TEST_CSV).reset_index()
val_split = np.arange(len(csv_test))

# CSV split: each table is used in full.
train_df = csv_train
val_df = csv_test

# Datasets
train_dataset = CustomDatasetFromDataFrame(
    train_df,
    DATA_DIR,
    transform=TRAIN_TRANSFORM,
    tile_max=TILE_MAX,
    tile_min=TILE_MIN,
)
val_dataset = CustomDatasetFromDataFrame(
    val_df,
    DATA_DIR,
    transform=TEST_TRANSFORM,
    tile_max=TILE_MAX,
    tile_min=TILE_MIN,
)

# DataLoaders
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config_ms['batch_size'],
    shuffle=True,
    num_workers=8,
    pin_memory=True
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=config_ms['batch_size'],
    shuffle=False,  # evaluation does not need shuffling; keeps batches deterministic
    num_workers=8,
    pin_memory=True
)

# Model: pretrained ResNet-18 adapted by build_from_config using the "ms" config.
base_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT')
ms_branch = build_from_config(base_model=base_model, config_file=CONFIG_FILE_MS)
model = ms_branch.to(device=device)

# Loss, optimizer and LR scheduler
loss_fn = utils.configure_loss(config_ms)
optimizer = utils.configure_optimizer(config_ms, model)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)

# One checkpoint path, shared by train() and the final save below.
ckpt_file = config_ms['checkpoint_path'] + '_fold_' + 'all' + ".pth"

print("Training on fold (All)")
try:
    results = train(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_fn=loss_fn,
        epochs=config_ms['n_epochs'],
        batch_size=config_ms['batch_size'],
        in_channels=config_ms['in_channels'],
        writer=writer,
        device=device,
        ckpt_path=ckpt_file,
        r2=r2
    )
finally:
    # Flush and release the TensorBoard event file even if training is interrupted.
    writer.close()

# NOTE(review): train() is given the same ckpt_path; if it stores a checkpoint
# there during training, this final save overwrites it -- confirm intent.
torch.save(model.state_dict(), ckpt_file)

2023-05-11 16:37:24.034781: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 16:37:24.129310: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Training on fold (All)


 37%|███▋      | 185/498 [08:04<03:19,  1.57it/s]

In [None]:
# Display `results`, the value assigned from train() in the single-split cell.
results

[autoreload of certifi failed: Traceback (most recent call last):
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 273, in check
    superreload(m, reload, self.old_objects)
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 471, in superreload
    module = reload(module)
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/importlib/__init__.py", line 166, in reload
    spec = module.__spec__ = _bootstrap._find_spec(name, pkgpath, target)
  File "<frozen importlib._bootstrap>", line 945, in _find_spec
  File "<frozen importlib._bootstrap_external>", line 1439, in find_spec
  File "<frozen importlib._bootstrap_external>", line 1408, in _get_spec
  File "<frozen importlib._bootstrap_external>", line 1366, in _path_importer_cache
OSError: [Errno 5] Input/output error
]
[autoreload of utils.utils failed: Traceback (most recent call last):
  File "/home/

{}

In [None]:
# Spatially aware cross-validation of the double-branch model: for every fold
# in the pickled fold file, build fresh datasets/loaders/model, train with
# dual_train, save the weights, then average the per-fold results.

# NOTE(review): pickle.load can execute arbitrary code; only open fold files
# from a trusted source.
with open(FOLD_PATH, 'rb') as f:
    folds = pickle.load(f)
results = dict()
device = "cuda" if torch.cuda.is_available() else "cpu"
for fold in folds:
    writer = SummaryWriter()
    r2 = torchmetrics.R2Score().to(device=device)
    # Index split
    train_split = folds[fold]['train']
    val_split = folds[fold]['val']
    test_split = folds[fold]['test']
    # CSV split
    train_df = csv.iloc[train_split]
    # Fixed: validation rows come from val_split (previously train_split, so
    # validation metrics were computed on the training data).
    val_df = csv.iloc[val_split]
    test_df = csv.iloc[test_split]
    # Datasets: tile bounds are passed explicitly, as in the single-split cell.
    # NOTE(review): confirm the dataset's tile_min/tile_max defaults differed.
    train_dataset = CustomDatasetFromDataFrame(
        train_df, DATA_DIR, transform=TRAIN_TRANSFORM,
        tile_max=TILE_MAX, tile_min=TILE_MIN,
    )
    val_dataset = CustomDatasetFromDataFrame(
        val_df, DATA_DIR, transform=TEST_TRANSFORM,
        tile_max=TILE_MAX, tile_min=TILE_MIN,
    )
    test_dataset = CustomDatasetFromDataFrame(
        test_df, DATA_DIR, transform=TEST_TRANSFORM,
        tile_max=TILE_MAX, tile_min=TILE_MIN,
    )
    # DataLoaders: only the training loader shuffles.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config_msnl['batch_size'],
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=config_msnl['batch_size'],
        shuffle=False,
        num_workers=8,
        pin_memory=True
    )
    # Not consumed inside the loop; after the loop it refers to the last fold.
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config_msnl['batch_size'],
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    # Model: two branches combined into a single-output DoubleBranchCNN.
    base_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT')
    ms_branch = build_from_config(base_model=base_model, config_file=CONFIG_FILE_MSNL)
    nl_branch = tl.update_single_layer(torchvision.models.resnet18())
    model = DoubleBranchCNN(b1=ms_branch, b2=nl_branch, output_features=1)
    model = model.to(device=device)
    # Loss, optimizer and LR scheduler
    loss_fn = utils.configure_loss(config_msnl)
    optimizer = utils.configure_optimizer(config_msnl, model)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)
    print(f"Training on fold {fold}")
    try:
        results[fold] = dual_train(
            model=model,
            train_dataloader=train_loader,
            val_dataloader=val_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            loss_fn=loss_fn,
            epochs=config_msnl['n_epochs'],
            batch_size=config_msnl['batch_size'],
            in_channels=config_msnl['in_channels'],
            writer=writer,
            device=device,
            r2=r2
        )
    finally:
        # Each fold opens its own SummaryWriter; close it so event files are
        # flushed and handles do not accumulate across folds.
        writer.close()
    torch.save(model.state_dict(), config_msnl['checkpoint_path']+'_fold_'+str(fold)+".pth")
final_results_nl = utils.compute_average_crossval_results(results=results)

In [None]:
# Display the output of utils.compute_average_crossval_results for the cross-validation run.
final_results_nl

## 3. Test Results

In [None]:
# test_r2, Y_true, Y_pred = test(model=model, dataloader=val_loader, device=device)
# # Y_true = [ utils.denormalize_asset(asset) for asset in Y_true]
# # Y_pred = [ utils.denormalize_asset(asset) for asset in Y_pred]
# results = pd.DataFrame({
#     'true index':np.array(Y_true),
#     'predicted index':np.array(Y_pred)
# })
# from scipy.stats import pearsonr
# import seaborn as sns
# sns.set_palette("rocket")
# sns.regplot(x='true index', y='predicted index', data=results).set(title='R2 = '+str(test_r2))