### VERGE: Vector-mode Regional Geospatial Embeddings

# Encode geospatial data for VERGE processing

Elsewhere we assembled geospatial data for a bunch of tiles.
Here we compute an encoding for each geospatial entity in each tile.
Each encoding is the concatenation of a one-hot vector indicating the entity type
and a Multi-Point Proximity (MPP) encoding of the entity's geometry.

## Outputs:
* `data/<roi_name>/encodings/<aoi_tag>.pkl`: encodings for tiles; one file per AOI


## Processing setup

In [2]:
# Google colab setup
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)
!pip install geo-encodings

Mounted at /content/drive
Collecting geo-encodings
  Downloading geo_encodings-1.0.4-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading geo_encodings-1.0.4-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: geo-encodings
Successfully installed geo-encodings-1.0.4


In [3]:
# Local processing setup
# Alternative to the Colab cell above: presumably uncommented when running
# outside Colab, so that project_home is the parent of the directory the
# notebook runs in -- TODO confirm the intended local layout.
# project_home = '..'

## Notebook setup

In [4]:
import pandas as pd
import numpy as np
import glob
import geopandas
import os
import pickle
import json
from geo_encodings import MPPEncoder

## Parameters

In [5]:
# Region of interest (ROI) whose tiles will be encoded.
roi_name = 'newengland'

# Data directory shared by all ROIs.
data_home = '%s/data' % project_home

# Data directory holding the files specific to the selected ROI.
roi_home = '%s/data/%s' % (project_home, roi_name)

# Currently unused: a unique identifier for this run.
# run_id = '201'


## Data prep

In [6]:
# Load the ROI definition, a JSON document stored in the ROI directory.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

# Unpack the tiling and encoding parameters from the ROI definition.
tile_size, tile_shift, encoding_resolution = (
    roi[key] for key in ('tile_size', 'tile_shift', 'encoding_resolution')
)

In [7]:
# Load the table of labels from the shared data directory.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)

# Map each label string to its id, one entry per row of the label table.
label_id_lookup = {
    row['label']: row['id'] for row in labels.to_dict('records')
}

# The number of classes is the number of rows in the label table.
num_classes = labels.shape[0]
print('%d labels' % num_classes)

22 labels


In [8]:
# Build the Multi-Point Proximity (MPP) encoder over a square region whose
# side is one tile, at the resolution given in the ROI definition.
encoder = MPPEncoder(
    resolution=encoding_resolution,
    region=[0, 0, tile_size, tile_size],
    center=True,
)

# Report the length of the encoder, i.e. the number of elements per encoding.
print('%d elements in encodings' % len(encoder))

400 elements in encodings


## Processing

In [9]:
# Load the table of tiles for this ROI and report how many rows it has.
fname = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles in this ROI' % tile_info.shape[0])

# Preview the first few rows.
tile_info.head(3)


15793 tiles in this ROI


Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726


In [11]:
# Distinct AOI tags present in the tile table (np.unique returns them sorted).
aoi_tags = np.unique(tile_info.loc[:, 'aoi_tag'].values)

In [12]:
# Loop over AOIs, encoding every tile and writing one pickle file per AOI.
# An AOI whose output file already exists is skipped, so an interrupted run
# can be resumed by re-running this cell.

for k, aoi_tag in enumerate(aoi_tags):

    # Check whether the output file already exists.
    fname = '%s/encodings/%s.pkl' % (roi_home, aoi_tag)
    if os.path.exists(fname):
        print('\nskipping %s' % (fname))
        continue

    # Report 1-based progress, so that the last AOI reads "N / N".
    print('\nhandling AOI %s (%d / %d)' % (aoi_tag, k + 1, len(aoi_tags)))

    # Get a list of the tile tags for this AOI.
    iok = tile_info['aoi_tag'] == aoi_tag
    tile_tags = tile_info['tile_tag'].values[iok]

    # This will hold all encodings for this AOI: one dict per tile.
    tile_encodings_for_this_aoi = []

    # Loop over tiles.
    for tile_tag in tile_tags:

        print('tile tag %s' % tile_tag)

        tile_vectors = []
        tile_labels = []
        tile_fname = '%s/tiles/%s/%s.pq' % (roi_home, aoi_tag, tile_tag)
        gdf = geopandas.read_parquet(tile_fname)

        # Visit the tile's rows in shuffled order.
        # NOTE(review): the shuffle is unseeded, so the row order within each
        # tile differs from run to run; pass random_state to sample() if
        # reproducible output files are required.
        for _, rec in gdf.sample(frac=1).iterrows():

            # Geometric encoding vector
            encoding = encoder.encode(rec['geometry']).values()

            # One-hot label vector. The label id is used directly as the
            # index of the "hot" element, so ids must lie in [0, num_classes).
            etype = np.zeros(num_classes)
            label_string = '%s : %s' % (rec['category'], rec['label'])
            label_id = label_id_lookup[label_string]
            etype[label_id] = 1.0

            # Save the label as an integer too. This will make training easier.
            tile_labels.append(label_id)

            # Concatenate the type and encoding vectors: the one-hot type
            # comes first, followed by the geometric encoding.
            vector = np.hstack((etype, encoding))
            tile_vectors.append(vector)

        # np.vstack raises ValueError on an empty list, which would abort the
        # run and discard every tile already encoded for this AOI. A tile with
        # no features has nothing to encode, so skip it instead.
        if not tile_vectors:
            print('no features in tile %s; skipping it' % tile_tag)
            continue

        # Combine all per-feature vectors into one big feature matrix,
        # one row per feature.
        tile_features = np.vstack(tile_vectors)

        tile_encodings_for_this_aoi.append({
            'aoi_tag': aoi_tag,
            'tile_tag': tile_tag,
            'features': tile_features,
            'labels': np.array(tile_labels),
        })

    # Save all of the encodings for this AOI. Write to a temporary file first
    # and then rename it into place: if the run is interrupted mid-write, no
    # partial file is left at the final path, where the exists-check at the
    # top of the loop would otherwise treat it as a finished AOI.
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tmp_fname = '%s.tmp' % fname
    with open(tmp_fname, 'wb') as dest:
        pickle.dump(tile_encodings_for_this_aoi, dest)
    os.replace(tmp_fname, fname)
    print('wrote %s' % (fname))



skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0687w-449n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0688w-447n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0688w-448n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0689w-447n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0697w-438n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0697w-445n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0698w-442n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0698w-443n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0699w-439n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0700w-417n.pkl

skipping /content/drive/MyDrive/Projects/verge/data/newengland/encodings/0700w-418n.pkl

skipping /content/dr

## QA and visualizations

In [15]:
# Count the per-AOI encoding files present on disk. (glob is already
# imported in the notebook's imports cell, so it is not re-imported here.)
globstring = '%s/encodings/*.pkl' % roi_home
fnames = glob.glob(globstring)
print(len(fnames))

263
