### VERGE: Vector-Mode Regional Geospatial Encoding
# Initial Embeddings 

Back in the "02" folder, we trained a Masked Geospatial Model.
That model can compute a set of embeddings for any tile, 
for which the inputs are a set of geospatial entities. 
These embeddings are permutation-equivariant ("perm-e") 
with respect to the input features. In this folder we are building 
a fully permutation-invariant ("perm-i") aggregation of the perm-e
outputs. 

In this notebook, we compute those perm-e embeddings for all instances
in our training and validation datasets.


## Processing Setup

In [1]:
# Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [2]:
# !pip install geo_encodings

In [3]:
# Running locally: the project root is the parent of this notebook's folder.
project_home = ".."

## Notebook Setup

In [4]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json
from geo_encodings import MPPEncoder

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


## Parameters

In [5]:
# Region of interest (ROI) to process.
roi_name = 'ne-laptop'

# Top-level data directory, shared across ROIs.
data_home = '%s/data' % (project_home)

# Data directory belonging to the selected ROI.
roi_home = '%s/data/%s' % (project_home, roi_name)

# Identifier of the model run; used in file names built from it.
run_id = '101'

# Prefer a GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('using device', device)


using device cpu


## Preliminaries

In [6]:
# Load the ROI definition from its JSON file.
fname = '%s/roi.json' % roi_home
with open(fname) as roi_file:
    roi = json.load(roi_file)

# Pull out the two values needed to configure the encoder.
tile_size = roi['tile_size']
encoding_resolution = roi['encoding_resolution']

# Build an encoder over a single tile-sized square region; its length
# is the dimension of one geometry encoding.
tile_region = [0, 0, tile_size, tile_size]
encoder = MPPEncoder(
    region=tile_region,
    resolution=encoding_resolution,
    center=True,
)
geo_encoding_dim = len(encoder)
print('%d elements in encodings' % geo_encoding_dim)


400 elements in encodings


In [7]:
# Load the label table; the number of rows is the number of classes.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
n_classes = len(labels)
print('%d labels in this dataset' % n_classes)

# Build both directions of the label <-> id mapping in a single pass
# over the table's records.
label_id_lookup = {}
label_name_lookup = {}
for record in labels.to_dict('records'):
    label_id_lookup[record['label']] = record['id']
    label_name_lookup[record['id']] = record['label']

22 labels in this dataset


In [8]:
# Load the per-class information table for this ROI (it carries the
# class probabilities used further down).
fname = '%s/class_info.csv' % roi_home
class_info = pd.read_csv(fname)
n_class_info = len(class_info)
print('%d class info records' % n_class_info)

22 class info records


## Load data
We determine which files to read by loading the associated "split" file.

In [9]:
# The splits file for this run has an 'aoi_tag' column; collect its
# distinct values.
fname = '%s/models/splits-%s.csv' % (roi_home, run_id)
splits = pd.read_csv(fname)
aoi_tag_column = splits['aoi_tag']
aoi_tags = np.unique(aoi_tag_column)

## Prep model and data

In [10]:
# Map each label to its probability. This lookup table is handed to the
# dataset constructor.
class_prob_lookup = dict(
    (record['label'], record['prob'])
    for record in class_info.to_dict('records')
)

In [11]:
# Load the trained model for this run.
# NOTE(security): weights_only=False unpickles arbitrary Python objects and
# can therefore execute code. Only load model files from a trusted source.
model_fname = '%s/models/model-%s' % (roi_home, run_id)

# map_location remaps the saved tensors onto the device selected for this
# session, so a model saved on a GPU machine still loads on a CPU-only one.
model = torch.load(model_fname, map_location=device, weights_only=False)
print('loaded %s' % model_fname)

# Place the model on the target device and put it in evaluation mode.
# The last expression is left bare so the notebook displays the model layout.
model.to(device)
model.eval()

loaded ../data/ne-laptop/models/model-101


GeospatialTransformer(
  (input_proj): Linear(in_features=422, out_features=128, bias=True)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (output_head): Linear(in_features=128, out_features=22, bias=True)
)

In [12]:
# Loop over encoded input files. For each one, define a dataset,
# run it through the model, and save an initial set of embeddings
# (one record per tile) to a pickle file under "initials/".
for aoi_tag in aoi_tags:

    encoding_fname = '%s/encodings/%s.pkl' % (roi_home, aoi_tag)
    print('\n', encoding_fname)

    # Define a dataset and dataloader for this input file.
    # Note that we set the batch size to 1. This effectively removes all
    # padding, as the dataloader pads to the largest object
    # in the batch.
    # Shuffling is off: this is inference only, and a fixed order makes the
    # saved file reproducible from run to run.
    dataset = VergeDataset([encoding_fname], n_classes, mask_fraction=0.0, class_prob=class_prob_lookup)
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=verge_collate_fn,
        drop_last=False
    )

    # Get embeddings for every tile in this AOI.
    embeddings_for_this_aoi = []

    # No gradients are needed here; disabling autograd avoids building and
    # retaining a computation graph for every tile.
    with torch.no_grad():
        # The per-batch labels are not used for embedding. They are bound to
        # `_tile_labels` so the `labels` DataFrame read earlier in the
        # notebook is not overwritten.
        for features, _tile_labels, attention_mask, idents in dataloader:
            features = features.to(device)
            attention_mask = attention_mask.to(device)
            embeddings = model.embed(features, attention_mask)

            # Identifiers look like "<aoi_tag>:<tile_tag>"; with a batch
            # size of 1 there is exactly one identifier per batch.
            ident_parts = idents[0].split(':')
            embeddings_for_this_aoi.append({
                'aoi_tag': ident_parts[0],
                'tile_tag': ident_parts[1],
                # Store a graph-free CPU tensor so the pickle can be loaded
                # on machines without the device used here.
                'embedding': embeddings.detach().cpu()
            })

    # Save those.
    ofname = '%s/initials/%s.pkl' % (roi_home, aoi_tag)
    os.makedirs(os.path.dirname(ofname), exist_ok=True)
    with open(ofname, 'wb') as dest:
        pickle.dump(embeddings_for_this_aoi, dest)
    print('wrote %s' % ofname)



 ../data/ne-laptop/encodings/0731w-413n.pkl
loaded 63 instances from ../data/ne-laptop/encodings/0731w-413n.pkl
wrote ../data/ne-laptop/initials/0731w-413n.pkl

 ../data/ne-laptop/encodings/0731w-414n.pkl
loaded 63 instances from ../data/ne-laptop/encodings/0731w-414n.pkl
wrote ../data/ne-laptop/initials/0731w-414n.pkl

 ../data/ne-laptop/encodings/0731w-415n.pkl
loaded 63 instances from ../data/ne-laptop/encodings/0731w-415n.pkl
wrote ../data/ne-laptop/initials/0731w-415n.pkl

 ../data/ne-laptop/encodings/0732w-413n.pkl
loaded 63 instances from ../data/ne-laptop/encodings/0732w-413n.pkl
wrote ../data/ne-laptop/initials/0732w-413n.pkl

 ../data/ne-laptop/encodings/0732w-415n.pkl
loaded 60 instances from ../data/ne-laptop/encodings/0732w-415n.pkl
wrote ../data/ne-laptop/initials/0732w-415n.pkl

 ../data/ne-laptop/encodings/0732w-418n.pkl
loaded 36 instances from ../data/ne-laptop/encodings/0732w-418n.pkl
wrote ../data/ne-laptop/initials/0732w-418n.pkl

 ../data/ne-laptop/encodings/0732

In [13]:
# Spot check of the identifier parsing: shows the AOI-tag part (the text
# before the first ':') of the first identifier in `idents`.
# NOTE(review): `idents` is whatever the final dataloader iteration of the
# embedding loop left behind, so this cell only works after that loop has run.
idents[0].split(':')[0]

'0735w-446n'

In [14]:
# # Initialize training and validation datasets.
# train_dataset = VergeDataset(train_fnames, n_classes, mask_fraction=0.0, class_prob=class_prob_lookup)
# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=64, # Tune depending on GPU memory
#     shuffle=True,
#     collate_fn=verge_collate_fn,
#     drop_last=False
# )

# val_dataset = VergeDataset(val_fnames, n_classes, mask_fraction=0.0, class_prob=class_prob_lookup)
# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=64, # Tune depending on GPU memory
#     shuffle=True,
#     collate_fn=verge_collate_fn,
#     drop_last=False
# )

In [15]:
# val_dataset = VergeDataset(val_fnames, n_classes, mask_fraction=0.0, class_prob=class_prob_lookup)
# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=64, # Tune depending on GPU memory
#     shuffle=True,
#     collate_fn=verge_collate_fn,
#     drop_last=False
# )

## QA