### VERGE: Vector-mode Regional Geospatial Encoding
# Masked Geospatial Model Implementation

Here we build and train a "masked geospatial model".
This is a model in which each input is a set of encoded geospatial entities,
consisting of a concatenation of a multi-point proximity encoding and a one-hot label vector.
Modeling consists of masking the labels for a random selection of entities,
passing the data through an encoder-based architecutre to predicte the labels of masked entities.
The idea is that the encodings then capture information about the region.


## Processing Setup

In [1]:
# Google colab setup
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install geo_encodings



In [3]:
# Local processing setup
# project_home = '..'

## Source

In [4]:
# The name of the ROI to use.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

## Setup

In [5]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json
from geo_encodings import MPPEncoder

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


In [None]:
np.random.seed(5)

## Parameters

In [6]:
# Read the ROI definition.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

tile_size = roi['tile_size']
encoding_resolution = roi['encoding_resolution']

# We need the dimension of the encoding.
encoder = MPPEncoder(
    region=[0, 0, tile_size, tile_size],
    resolution=encoding_resolution,
    center=True
)
geo_encoding_dim = len(encoder)
print('%d elements in encodings' % geo_encoding_dim)


400 elements in encodings


In [7]:
# A unique identifier for this run. This will be a component of any
# output file names.
run_id = '007'

# What type of device to train on.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device', device)

# Fraction of cases to use for training.
train_fraction = 0.8

# Number of epochs to run.
epoch_count = 5

using device cuda


## Preliminaries

In [8]:
# Read the list of labels.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
n_classes = len(labels)
print('%d labels in this dataset' % n_classes)

label_id_lookup = {
    z['label']: z['id']
    for z in labels.to_dict('records')
}

label_name_lookup = {
    z['id']: z['label']
    for z in labels.to_dict('records')
}

22 labels in this dataset


In [9]:
# Read the file that gives class probabilities.
fname = '%s/class_info.csv' % roi_home
class_info = pd.read_csv(fname)
print('%d class info records' % len(class_info))

22 class info records


## Load data
The data exist as NPZ files containing features and label vectors.
Each is in a sub-folder for its AOI.
We want to divide into train / validation splits according to the AOI,
not the individual tile. This reduces autocorrelation effects that could
bias performance assessments.

In [10]:
# Get a list of AOI folders.
globstring = '%s/encodings/*' % roi_home
aoi_dnames = glob.glob(globstring)

# Loop over those, adding their files to either the train or val sets.
train_fnames = []
val_fnames = []
split_records = []
np.random.seed(5)
for aoi_dname in aoi_dnames:
    globstring = '%s/*.npz' % aoi_dname
    tile_fnames = glob.glob(globstring)

    # Hack: downsample the tile file names.
    tile_fnames = list(np.random.choice(tile_fnames, int(len(tile_fnames) * 0.2)))

    if np.random.random() < train_fraction:
        split = 'train'
        train_fnames += tile_fnames
    else:
        split = 'val'
        val_fnames += tile_fnames
    # print('added %d files to the %s set' % (len(tile_fnames), split))
    split_records.append({'aoi': aoi_dname, 'split': split})

print('%d training instances' % len(train_fnames))
print('%d validation instances' % len(val_fnames))

# Save the split records
fname = '%s/splits-%s.csv' % (roi_home, run_id)
pd.DataFrame(split_records).to_csv(fname, index=False)

2524 training instances
548 validation instances


In [11]:
# # Test that.
# dataset = VergeDataset(train_tiles, n_classes, mask_fraction=0.15)
# batch = [dataset[k] for k in [0, 12, 17, 23]]
# batch_features, batch_labels, batch_attention_mask = verge_collate_fn(batch)
# print('test:')
# print('batch_features.shape', batch_features.shape)
# print('batch_labels.shape', batch_labels.shape)
# print('batch_attention_mask.shape', batch_attention_mask.shape)


In [12]:
# The dataset constructor requires a lookup table for class probabilities.
class_prob_lookup = {
    z['label']: z['prob']
    for z in class_info.to_dict('records')
}

# Initialize training and validation datasets.
train_dataset = VergeDataset(train_fnames, n_classes, mask_fraction=0.15, class_prob=class_prob_lookup)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64, # Tune depending on GPU memory
    shuffle=True,
    num_workers=4,
    collate_fn=verge_collate_fn,
    drop_last=False
)

val_dataset = VergeDataset(val_fnames, n_classes, mask_fraction=0.15, class_prob=class_prob_lookup)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=64, # Tune depending on GPU memory
    shuffle=True,
    num_workers=4,
    collate_fn=verge_collate_fn,
    drop_last=False
)

loading instance 0 / 2524
loading instance 1000 / 2524


KeyboardInterrupt: 

## Model definition

In [None]:
model = GeospatialTransformer(
    feature_dim = geo_encoding_dim + n_classes,
    model_dim=128,
    num_heads=4,
    num_layers=5,
    num_classes=n_classes,
    dropout=0.2
)
n_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('%d trainable parameters in model' % n_param)

In [None]:
# Testing
# dataset = VergeDataset(train_tiles, n_classes, mask_fraction=0.15)
# dataloader = DataLoader(
#     dataset,
#     batch_size=2,            # Tune depending on GPU memory
#     shuffle=True,
#     collate_fn=verge_collate_fn,   # Key for padding variable-length instances
#     drop_last=False
# )

# features, labels, attention_mask = dataloader.__iter__().__next__()
# print(features.shape, labels.shape, attention_mask.shape)

In [None]:
# model(features, attention_mask)

### Training loop

In [None]:
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

losses = []

model.train()
for epoch in range(epoch_count):

    # Training.
    model.train()
    ibatch = 0
    for features, labels, attention_mask in train_dataloader:
        ibatch += 1
        if ibatch % 10 == 0:
            print('epoch %d, batch %d' % (epoch, ibatch))

        features = features.to(device)
        labels = labels.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(features, attention_mask)
        loss = criterion(
            logits.view(-1, n_classes),
            labels.view(-1)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loss
    model.eval()
    with torch.no_grad():
        for features, labels, attention_mask in val_dataloader:
            features = features.to(device)
            labels = labels.to(device)
            attention_mask = attention_mask.to(device)
            logits = model(features, attention_mask)
            val_loss = criterion(
                logits.view(-1, n_classes),
                labels.view(-1)
            )

    losses.append({
        'epoch': epoch,
        'train_loss': loss.item(),
        'val_loss': val_loss.item()
    })

    print(f"Epoch {epoch+1}, train loss: {loss.item():.4f}, val_loss: {val_loss.item():.4f}")


In [None]:
# Save the model.
model_fname = '%s/model-%s' % (roi_home, run_id)
torch.save(model, model_fname)
print('saved %s' % model_fname)

## Loss history

In [None]:
import plotly
from plotly.subplots import make_subplots
from plotly.graph_objects import Scatter

epochs = [d['epoch'] for d in losses]
train_losses = [d['train_loss'] for d in losses]
val_losses = [d['val_loss'] for d in losses]

fig = make_subplots(rows=1, cols=1)
trace = Scatter(
    x=epochs, y=train_losses, name='training loss',
    mode='markers+lines', marker_color='blue'
)
fig.append_trace(trace, 1, 1)

trace = Scatter(
    x=epochs, y=val_losses, name='validation loss',
    mode='markers+lines', marker_color='green'
)
fig.append_trace(trace, 1, 1)

fig

## Validation Visualization

In [None]:
# Process the validation dataset, getting the class probability predictions
# for every instance.
model.to(device)
cases = []
model.eval()
for features, labels, attention_mask in val_dataloader:

    features = features.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    logits = model(features, attention_mask)

    batch_size = logits.shape[0]
    for i in range(batch_size):
        case_logits = logits[i]
        case_probs = torch.softmax(case_logits, dim=1)
        case_labels = labels[i]
        entity_count = len(case_labels)
        for k in range(entity_count):
            if case_labels[k].item() >= 0: # Skips the "-100" labels.
                cases.append({
                    'true_label': case_labels[k].item(),
                    'probs': torch.Tensor.cpu(case_probs[k, :]).detach().numpy()
                })

print('compiled prediction probabilities for %d validation instances' % len(cases))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

class_count = max(d["true_label"] for d in cases) + 1
probs_by_class = defaultdict(list)

for d in cases:
    label = d["true_label"]
    probs = np.array(d["probs"])
    probs_by_class[label].append(probs)

# For each true class, compute the mean probability vector
mean_probs = []
for t in range(class_count):
    if probs_by_class[t]:
        mean = np.stack(probs_by_class[t]).mean(axis=0)
    else:
        mean = np.zeros(class_count)  # if no samples for this class
    mean_probs.append(mean)

# Convert to 2D array: [true_class, predicted_class]
matrix = np.stack(mean_probs)  # shape [C, C]

# Plot heatmap
fig, ax = plt.subplots(figsize=(9, 6))
im = ax.imshow(matrix, cmap='viridis', aspect='auto')

plt.colorbar(im, ax=ax, label='Avg Predicted Probability')
ax.set_title("Mean Predicted Probabilities by True Class")
ax.set_xlabel("Predicted Class")
ax.set_ylabel("True Class")
ax.set_xticks(range(class_count))
ax.set_yticks(range(class_count))
ax.set_yticklabels([label_name_lookup[i] for i in range(class_count)])
plt.tight_layout()
plt.show()

In [None]:
# class_count = max(d["true_label"] for d in cases) + 1
# print(class_count)
# cmat = np.zeros((class_count, class_count))

# for d in cases:
#     true_label = d["true_label"]
#     pred_label = np.argmax(d["probs"])
#     cmat[true_label, pred_label] += 1

# cmat = np.sqrt(cmat)

# # Plot heatmap
# fig, ax = plt.subplots(figsize=(12, 8))
# im = ax.imshow(cmat, cmap='viridis', aspect='auto')

# plt.colorbar(im, ax=ax, label='Number Of Cases')
# ax.set_title("Confusion Matrix")
# ax.set_xlabel("Predicted Class")
# ax.set_ylabel("True Class")
# ax.set_xticks(range(class_count))
# ax.set_yticks(range(class_count))
# ax.set_yticklabels(['%s [%d]' % (label_name_lookup[i], i) for i in range(class_count)])
# plt.tight_layout()
# plt.show()
