### VERGE: Vector-mode Regional Geospatial Embeddings

# Encode geospatial data for VERGE processing

Elsewhere we assembled geospatial data for a bunch of tiles.
Here we come up with encodings for each such entitiy.
The encodings will consist of a concatenation of a Multi-Point Proximity (MPP) encoding
and a one-hot vector indicating the entity type.

## Processing setup

In [1]:
# Google colab setup
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install geo-encodings



In [3]:
# # Local processing setup
# project_home = '..'

## Source

In [4]:
# The name of the ROI to use.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Setup

In [5]:
import pandas as pd
import numpy as np
import glob
import geopandas
import os
import pickle
import json
from geo_encodings import MPPEncoder

## Data prep

In [6]:
# Read the ROI definition.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

tile_size = roi['tile_size']
tile_shift = roi['tile_shift']
encoding_resolution = roi['encoding_resolution']

In [7]:
# Read the list of labels.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
label_count = len(labels)
print('%d labels' % label_count)

label_id_lookup = {
    z['label']: z['id']
    for z in labels.to_dict('records')
}

22 labels


In [8]:
# Set up the MPP encoder.
encoder = MPPEncoder(
    region=[0, 0, tile_size, tile_size],
    resolution=encoding_resolution,
    center=True
)
print('%d elements in encodings' % len(encoder))

400 elements in encodings


## Processing

In [9]:
# Get a list of AOI tags.
globstring = '%s/tiles/*' % roi_home
tile_dnames = glob.glob(globstring)
aoi_names = [
    z[z.rfind('/') + 1 :]
    for z in tile_dnames
]
print('%d AOIs in this ROI (%s)' % (len(aoi_names), roi_name))

263 AOIs in this ROI (newengland)


In [10]:
# Loop over tiles

for k, aoi_name in enumerate(aoi_names):

    print('\nhandling AOI %s (%d / %d)' % (aoi_name, k, len(aoi_names)))

    # This will hold all encodings for this AOI
    tile_encodings_for_aoi = []

    # Get a list of all tiles for this AOI.
    globstring = '%s/tiles/%s/*.pq' % (roi_home, aoi_name)
    fnames = glob.glob(globstring)
    print('%d tiles' % len(fnames))
    for fname in fnames:
        tile_vectors = []
        gdf = geopandas.read_parquet(fname)
        # for _, rec in gdf.iterrows():
        for _, rec in gdf.sample(frac=1).iterrows():
            encoding = encoder.encode(rec['geometry']).values()
            etype = np.zeros(label_count)
            label_string = '%s : %s' % (rec['category'], rec['label'])
            label_id = label_id_lookup[label_string]
            etype[label_id] = 1.0

            # Concatenate the encoding and type vectors.
            vector = np.hstack((etype, encoding))
            tile_vectors.append(vector)
        tile_encoding = np.vstack(tile_vectors)
        tile_encodings_for_aoi.append(tile_encoding)

    # Save all of the encodings for this tile.
    fname = '%s/encodings/%s.pkl' % (roi_home, aoi_name)
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, 'wb') as dest:
        pickle.dump(tile_encodings_for_aoi, dest)
    print('wrote %s' % (fname))



handling AOI 0735w-413n (0 / 263)
63 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0735w-413n.pkl

handling AOI 0735w-414n (1 / 263)
63 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0735w-414n.pkl

handling AOI 0735w-415n (2 / 263)
58 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0735w-415n.pkl

handling AOI 0735w-446n (3 / 263)
39 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0735w-446n.pkl

handling AOI 0734w-414n (4 / 263)
63 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0734w-414n.pkl

handling AOI 0733w-413n (5 / 263)
63 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0733w-413n.pkl

handling AOI 0733w-414n (6 / 263)
62 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0733w-414n.pkl

handling AOI 0733w-423n (7 / 263)
41 tiles
wrote /content/drive/MyDrive/Projects/verge/data/newe

## QA / visualizations

In [11]:
# import numpy as np
# import matplotlib.pyplot as plt

# # Create a 2D array (example data)
# data = tile_encodings_for_aoi[0]

# # Plot as heatmap
# plt.imshow(data, cmap='viridis', origin='upper')
# plt.colorbar(label='Intensity')
# plt.title('encodings')
# plt.ylabel('entity number')
# plt.xlabel('encoding element')
# plt.show()


In [12]:
# gdf[['category', 'label']].value_counts().sort_index()