## Demo

In [1]:
%load_ext autoreload
%autoreload 

In [36]:
import os
import json

import pickle
import numpy as np
import pandas as pd
import torch 
import torchmetrics
import torchvision
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from models.from_config import build_from_config
from models.double_branch import DoubleBranchCNN
from data_handlers.csv_dataset import CustomDatasetFromDataFrame
from utils import utils
from utils import transfer_learning as tl
from train import train, dual_train
from test import test

In [39]:
# Root folders, relative to the directory the notebook is launched from.
_DATA_ROOT = 'data'
_CONFIG_ROOT = 'configs'

# CSV table, cross-validation fold pickle, and image directory.
CSV_PATH = os.path.join(_DATA_ROOT, 'wealth_index.csv')
FOLD_PATH = os.path.join(_DATA_ROOT, 'dhs_incountry_folds.pkl')
# The empty last component makes the joined path end with a separator.
DATA_DIR = os.path.join(_DATA_ROOT, 'landsat_7', '')

# JSON configuration files for the two experiments run below.
CONFIG_FILE_MS, CONFIG_FILE_MSNL = (
    os.path.join(_CONFIG_ROOT, config_name)
    for config_name in (
        'resnet18_ms_e2e_l7_yeh.json',
        'resnet18_msnl_e2e_l7_yeh.json',
    )
)


In [40]:
def _read_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path) as handle:
        return json.load(handle)


# One configuration dict per experiment.
config_ms = _read_json(CONFIG_FILE_MS)
config_msnl = _read_json(CONFIG_FILE_MSNL)

# Disabled: derive the tile filename from the lat/lon columns.
# csv['filename']=csv.apply(
#     lambda row:  str(row['lat'])[:5].replace('.','_')+"_"+str(row.lon)[:5].replace('.','_')+'.tif', axis=1
# )

# Read the table, discard the "bounding_box" column, then discard every
# column whose name starts with "Unnamed".
csv = pd.read_csv(CSV_PATH).drop(columns="bounding_box")
unnamed_mask = csv.columns.str.contains('^Unnamed')
csv = csv.loc[:, ~unnamed_mask]
csv.head()

Unnamed: 0,country,year,cluster,lat,lon,households,wealthpooled,geometry,filename
0,angola,2011,1,-12.350257,13.534922,36,2.312757,POINT (1506700.58557273 -1385596.0684884773),-12_3_13_53.tif
1,angola,2011,2,-12.360865,13.551494,32,2.010293,POINT (1508545.372017885 -1386804.9130245172),-12_3_13_55.tif
2,angola,2011,3,-12.613421,13.413085,36,0.877744,POINT (1493137.790366379 -1415600.6075743325),-12_6_13_41.tif
3,angola,2011,4,-12.581454,13.397711,35,1.066994,POINT (1491426.3440705661 -1411954.2588894619),-12_5_13_39.tif
4,angola,2011,5,-12.578135,13.418748,37,1.750153,POINT (1493768.1835246533 -1411575.617279712),-12_5_13_41.tif


In [41]:
# Side length, in pixels, of the square centre crop applied to every tile.
_CROP_SIZE = 224

# Training pipeline: centre crop, random horizontal flip, colour jitter.
# NOTE(review): ColorJitter() is built with its default arguments, which
# per the torchvision docs apply no jitter -- confirm this is intended.
_train_ops = [
    torchvision.transforms.CenterCrop(size=_CROP_SIZE),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    torchvision.transforms.ColorJitter(),
]
TRAIN_TRANSFORM = torch.nn.Sequential(*_train_ops)

# Evaluation pipeline: centre crop only, no augmentation.
_eval_ops = [
    torchvision.transforms.CenterCrop(size=_CROP_SIZE),
]
TEST_TRANSFORM = torch.nn.Sequential(*_eval_ops)

In [42]:
# Spatially Aware Cross-Validation -- single-branch model built from CONFIG_FILE_MS.
#
# For every fold in the pickle at FOLD_PATH: build train/val/test datasets
# from the fold's row positions, build the model, train it with `train`,
# store the returned value in `results[fold]`, and save a checkpoint.
#
# NOTE(review): pickle.load can execute arbitrary code; only load fold
# files from a trusted source.
with open(FOLD_PATH, 'rb') as f:
    folds = pickle.load(f)
results = dict()
device = "cuda" if torch.cuda.is_available() else "cpu"
for fold in folds:
    r2 = torchmetrics.R2Score().to(device=device)
    # Index split
    train_split = folds[fold]['train']
    val_split = folds[fold]['val']
    test_split = folds[fold]['test']
    # CSV split (positional row selection)
    train_df = csv.iloc[train_split]
    # Fix: validation rows come from the fold's 'val' indices; the previous
    # code reused `train_split`, so validation ran on the training rows.
    val_df = csv.iloc[val_split]
    test_df = csv.iloc[test_split]
    # Datasets
    # A tile file missing under DATA_DIR surfaces as a RasterioIOError raised
    # from the DataLoader workers once iteration starts.
    train_dataset = CustomDatasetFromDataFrame(train_df, DATA_DIR, transform=TRAIN_TRANSFORM)
    val_dataset = CustomDatasetFromDataFrame(val_df, DATA_DIR, transform=TEST_TRANSFORM)
    test_dataset = CustomDatasetFromDataFrame(test_df, DATA_DIR, transform=TEST_TRANSFORM)
    # DataLoaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config_ms['batch_size'],
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=config_ms['batch_size'],
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    # Built for completeness; not consumed inside this loop.
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config_ms['batch_size'],
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    base_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT')
    # base_model = torchgeo.models.resnet18(weights=torchgeo.models.ResNet18_Weights.SENTINEL2_ALL_MOCO)
    ms_branch = build_from_config( base_model=base_model, config_file=CONFIG_FILE_MS )
    # nl_branch = tl.update_single_layer(torchvision.models.resnet18())
    # model = DoubleBranchCNN(b1=ms_branch, b2=nl_branch, output_features=1)
    model = ms_branch.to(device=device)
    # CONFIGURE LOSS, OPTIM
    loss_fn = utils.configure_loss( config_ms )
    optimizer = utils.configure_optimizer( config_ms, model )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)
    print(f"Training on fold {fold}")
    # One TensorBoard writer per fold, closed even when training fails so
    # pending events are flushed and the file handle is released.
    writer = SummaryWriter()
    try:
        results[fold] = train(
            model=model,
            train_dataloader=train_loader,
            val_dataloader=val_loader,
            optimizer=optimizer,
            scheduler=scheduler,
            loss_fn=loss_fn,
            epochs=config_ms['n_epochs'],
            batch_size=config_ms['batch_size'],
            in_channels=config_ms['in_channels'],
            writer=writer,
            device=device,
            r2=r2
        )
        # One checkpoint per fold: <checkpoint_path>_fold_<fold>.pth
        torch.save(model.state_dict(), config_ms['checkpoint_path']+'_fold_'+str(fold)+".pth")
    finally:
        writer.close()
final_results = utils.compute_average_crossval_results(results=results)

Training on fold A


RasterioIOError: Caught RasterioIOError in DataLoader worker process 3.
Original Traceback (most recent call last):
  File "rasterio/_base.pyx", line 308, in rasterio._base.DatasetBase.__init__
  File "rasterio/_base.pyx", line 219, in rasterio._base.open_dataset
  File "rasterio/_err.pyx", line 221, in rasterio._err.exc_wrap_pointer
rasterio._err.CPLE_OpenFailedError: data/landsat_7/2_531_34_60.tif: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/media/matthieu/LaCie/2-mpa/data_handlers/csv_dataset.py", line 36, in __getitem__
    tile = np.array(rio.open(tile_name).read())
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/rasterio/env.py", line 451, in wrapper
    return f(*args, **kwds)
  File "/home/matthieu/anaconda3/envs/mpa_env/lib/python3.10/site-packages/rasterio/__init__.py", line 304, in open
    dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
  File "rasterio/_base.pyx", line 310, in rasterio._base.DatasetBase.__init__
rasterio.errors.RasterioIOError: data/landsat_7/2_531_34_60.tif: No such file or directory


In [None]:
# Display the value returned by utils.compute_average_crossval_results
# for the cross-validation run configured by config_ms.
final_results

In [None]:
# Spatially Aware Cross-Validation
with open(FOLD_PATH, 'rb') as f:
    folds = pickle.load(f)
results = dict()
device = "cuda" if torch.cuda.is_available() else "cpu"
for fold in folds:
    writer = SummaryWriter()
    r2 = torchmetrics.R2Score().to(device=device)
    # Index split
    train_split = folds[fold]['train']
    val_split = folds[fold]['val']
    test_split = folds[fold]['test']
    # CSV split
    train_df = csv.iloc[train_split]
    val_df = csv.iloc[train_split]
    test_df = csv.iloc[test_split]
    # Datasets
    train_dataset = CustomDatasetFromDataFrame(train_df, DATA_DIR,transform=TRAIN_TRANSFORM )
    val_dataset = CustomDatasetFromDataFrame(val_df, DATA_DIR, transform=TEST_TRANSFORM )
    test_dataset  = CustomDatasetFromDataFrame(test_df, DATA_DIR, transform=TEST_TRANSFORM )
    # DataLoaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=config_msnl['batch_size'], 
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=config_msnl['batch_size'],
        shuffle=True,
        num_workers=8,
        pin_memory=True
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config_msnl['batch_size'],
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    base_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT')
    # base_model = torchgeo.models.resnet18(weights=torchgeo.models.ResNet18_Weights.SENTINEL2_ALL_MOCO)
    ms_branch = build_from_config( base_model=base_model, config_file=CONFIG_FILE_MSNL )
    nl_branch = tl.update_single_layer(torchvision.models.resnet18())
    model = DoubleBranchCNN(b1=ms_branch, b2=nl_branch, output_features=1)
    model = model.to(device=device)
    # CONFIGURE LOSS, OPTIM
    loss_fn = utils.configure_loss( config_msnl )
    optimizer = utils.configure_optimizer( config_msnl, model )
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)
    print(f"Training on fold {fold}")
    results[fold] = dual_train(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        loss_fn=loss_fn,
        epochs=config_msnl['n_epochs'],
        batch_size=config_msnl['batch_size'],
        in_channels=config_msnl['in_channels'],
        writer=writer,
        device=device,
        r2=r2
    )
    torch.save(model.state_dict(), config_msnl['checkpoint_path']+'_fold_'+str(fold)+".pth")
final_results_nl = utils.compute_average_crossval_results(results=results)

In [None]:
# Display the value returned by utils.compute_average_crossval_results
# for the cross-validation run configured by config_msnl.
final_results_nl

## 3. Test Results

In [None]:
# test_r2, Y_true, Y_pred = test(model=model, dataloader=val_loader, device=device)
# # Y_true = [ utils.denormalize_asset(asset) for asset in Y_true]
# # Y_pred = [ utils.denormalize_asset(asset) for asset in Y_pred]
# results = pd.DataFrame({
#     'true index':np.array(Y_true),
#     'predicted index':np.array(Y_pred)
# })
# from scipy.stats import pearsonr
# import seaborn as sns
# sns.set_palette("rocket")
# sns.regplot(x='true index', y='predicted index', data=results).set(title='R2 = '+str(test_r2))