### VERGE: Vector-mode Regional Geospatial Embeddings

# Encode geospatial data for VERGE processing

Elsewhere we assembled geospatial data for a set of tiles.
Here we compute an encoding for each entity in those tiles.
Each encoding is the concatenation of a one-hot vector indicating the entity type
and a Multi-Point Proximity (MPP) encoding of the entity's geometry.

## Processing setup

In [1]:
# Google colab setup (disabled for local runs).
# Uncomment the lines below when running on Colab: they mount Google Drive
# and change the working directory to the project folder on the drive.
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [2]:
# Uncomment to install the package that provides MPPEncoder (imported below).
# !pip install geo-encodings

In [3]:
# Local processing setup
# Base directory for local runs; the data directories below are built relative to it.
project_home = '..'

## Notebook setup

In [4]:
import pandas as pd
import numpy as np
import glob
import geopandas
import os
import pickle
import json
from geo_encodings import MPPEncoder

## Parameters

In [5]:
# General-purpose data directory (holds files that are not specific to one ROI).
data_home = '%s/data' % project_home

# Region of interest (ROI) processed by this notebook.
roi_name = 'ne-laptop'

# Data directory for the selected ROI; it sits inside the general data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

# # A unique identifier for this run.
# run_id = '101'


## Data prep

In [6]:
# Load the ROI definition from its JSON file.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.loads(source.read())

# Pull out the tiling and encoding settings that later cells refer to.
tile_size, tile_shift, encoding_resolution = (
    roi[key] for key in ('tile_size', 'tile_shift', 'encoding_resolution')
)

In [7]:
# Load the label table; the number of rows is the number of classes.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
num_classes = labels.shape[0]
print('%d labels' % num_classes)

# Map each label string to its integer id.
label_id_lookup = dict(
    (row['label'], row['id']) for row in labels.to_dict('records')
)

22 labels


In [8]:
# Build the MPP encoder over a region spanning 0..tile_size on both axes
# (presumably tile-local coordinates -- confirm against the tiling step).
tile_region = [0, 0, tile_size, tile_size]
encoder = MPPEncoder(region=tile_region, resolution=encoding_resolution, center=True)

# Report the length of the geometric part of each encoding.
print('%d elements in encodings' % len(encoder))

400 elements in encodings


## Processing

In [9]:
# Load the table of tiles for this ROI and preview the first few rows.
fname = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles in this ROI' % tile_info.shape[0])
tile_info.head(3)


1056 tiles in this ROI


Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726


In [10]:
# Histogram of class labels, keyed by integer label id and starting at zero.
# The encoding loop below fills it in; it is then saved so that model
# training can use it to balance sampling.
class_counts = dict.fromkeys(range(num_classes), 0)

In [11]:
# Sorted array of the distinct AOI tags that appear in the tile table.
aoi_tags = np.unique(tile_info['aoi_tag'].to_numpy())
aoi_tags

array(['0731w-413n', '0731w-414n', '0731w-415n', '0732w-413n',
       '0732w-415n', '0732w-418n', '0732w-426n', '0732w-444n',
       '0732w-445n', '0733w-413n', '0733w-414n', '0733w-423n',
       '0733w-424n', '0733w-444n', '0733w-445n', '0734w-414n',
       '0735w-413n', '0735w-414n', '0735w-415n', '0735w-446n'],
      dtype=object)

In [12]:
# Loop over AOIs, encoding every tile and writing one pickle file per AOI.
#
# The class histogram is rebuilt from scratch each time this cell runs, and it
# also covers AOIs whose encodings were already on disk. Otherwise a re-run
# that skips finished AOIs would leave their features out of the histogram,
# and running the cell twice would count everything twice.
class_counts = {z: 0 for z in range(num_classes)}


def _add_class_counts(tile_records, counts):
    """Add the label occurrences of every tile in `tile_records` to `counts`.

    tile_records: list of per-tile dicts in the format written by this cell;
        only the 'labels' entry (array of integer label ids) is read.
    counts: dict mapping label id -> running count; updated in place.
    """
    for tile_record in tile_records:
        ids, occurrences = np.unique(tile_record['labels'], return_counts=True)
        for label_id, occurrence in zip(ids, occurrences):
            counts[int(label_id)] += int(occurrence)


for k, aoi_tag in enumerate(aoi_tags):

    fname = '%s/encodings/%s.pkl' % (roi_home, aoi_tag)

    # Check whether the output file already exists. If it does, reuse it
    # instead of re-encoding, but still fold its labels into the histogram.
    # NOTE: pickle.load must only be used on trusted files; these files are
    # the ones written by an earlier run of this same cell.
    cached_records = None
    if os.path.exists(fname):
        try:
            with open(fname, 'rb') as cached:
                cached_records = pickle.load(cached)
        except (EOFError, pickle.UnpicklingError):
            # A truncated or corrupt file: fall through and re-encode the AOI.
            print('\nre-encoding %s (existing file is unreadable)' % (fname))
    if cached_records is not None:
        print('\nskipping %s' % (fname))
        _add_class_counts(cached_records, class_counts)
        continue

    print('\nhandling AOI %s (%d / %d)' % (aoi_tag, k, len(aoi_tags)))

    # Get a list of the tile tags for this AOI.
    iok = tile_info['aoi_tag'] == aoi_tag
    tile_tags = tile_info['tile_tag'].values[iok]

    # This will hold all encodings for this AOI
    tile_encodings_for_this_aoi = []

    # Loop over tiles.
    for tile_tag in tile_tags:

        print('tile tag %s' % tile_tag)

        tile_vectors = []
        tile_labels = []
        tile_fname = '%s/tiles/%s/%s.pq' % (roi_home, aoi_tag, tile_tag)
        gdf = geopandas.read_parquet(tile_fname)

        # Visit the tile's features in random order.
        for _, rec in gdf.sample(frac=1).iterrows():

            # Geometric encoding vector
            encoding = encoder.encode(rec['geometry']).values()

            # One-hot label vector
            etype = np.zeros(num_classes)
            label_string = '%s : %s' % (rec['category'], rec['label'])
            label_id = label_id_lookup[label_string]
            etype[label_id] = 1.0

            # Save the label as an integer too. This will make training easier.
            tile_labels.append(label_id)

            # Concatenate the type and encoding vectors (one-hot part first).
            vector = np.hstack((etype, encoding))
            tile_vectors.append(vector)

        # np.vstack raises on an empty list, so a tile without any features
        # is left out rather than aborting the whole AOI.
        if not tile_vectors:
            print('no features in tile %s; skipping it' % tile_tag)
            continue

        # Combine all per-feature vectors into one big feature matrix.
        tile_features = np.vstack(tile_vectors)

        tile_encodings_for_this_aoi.append({
            'aoi_tag': aoi_tag,
            'tile_tag': tile_tag,
            'features': tile_features,
            'labels': np.array(tile_labels),
        })

    # Save all of the encodings for this AOI. Write to a temporary name and
    # rename afterwards, so that an interrupted run never leaves behind a
    # partial file that a later run would treat as finished.
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tmp_fname = fname + '.tmp'
    with open(tmp_fname, 'wb') as dest:
        pickle.dump(tile_encodings_for_this_aoi, dest)
    os.replace(tmp_fname, fname)
    print('wrote %s' % (fname))

    # Keep track of class counts across all tiles. Counting only after the
    # file is safely written keeps the histogram consistent with what is on disk.
    _add_class_counts(tile_encodings_for_this_aoi, class_counts)



handling AOI 0731w-413n (0 / 20)
tile tag 039-004
tile tag 039-005
tile tag 039-006
tile tag 039-007
tile tag 039-008
tile tag 039-009
tile tag 039-010
tile tag 039-011
tile tag 039-012
tile tag 040-004
tile tag 040-005
tile tag 040-006
tile tag 040-007
tile tag 040-008
tile tag 040-009
tile tag 040-010
tile tag 040-011
tile tag 040-012
tile tag 041-004
tile tag 041-005
tile tag 041-006
tile tag 041-007
tile tag 041-008
tile tag 041-009
tile tag 041-010
tile tag 041-011
tile tag 041-012
tile tag 042-004
tile tag 042-005
tile tag 042-006
tile tag 042-007
tile tag 042-008
tile tag 042-009
tile tag 042-010
tile tag 042-011
tile tag 042-012
tile tag 043-004
tile tag 043-005
tile tag 043-006
tile tag 043-007
tile tag 043-008
tile tag 043-009
tile tag 043-010
tile tag 043-011
tile tag 043-012
tile tag 044-004
tile tag 044-005
tile tag 044-006
tile tag 044-007
tile tag 044-008
tile tag 044-009
tile tag 044-010
tile tag 044-011
tile tag 044-012
tile tag 045-004
tile tag 045-005
tile tag 045-0

In [13]:
# Save the label histogram and per-class probabilities for this ROI.
# Each output row holds the integer label id, the number of encoded features
# with that label, and that count as a fraction of all counted features.
total = float(sum(class_counts.values()))

# With nothing counted the probabilities are undefined. Stop with a clear
# message instead of a bare ZeroDivisionError, and before touching the file.
if total == 0:
    raise ValueError(
        'class_counts is empty (no features were counted); '
        'run the encoding loop before saving class statistics'
    )

records = [
    {
        'label': z,
        'count': class_counts[z],
        'prob': class_counts[z] / total
    } for z in range(num_classes)
]
fname = '%s/class_info.csv' % roi_home
pd.DataFrame(records).to_csv(fname, index=False)

## QA and visualizations

In [14]:
# Count the per-AOI encoding files that were written.
# (glob is already imported in the notebook's import cell.)
globstring = '%s/encodings/*.pkl' % roi_home
# glob returns matches in arbitrary order; sort so the list is reproducible.
fnames = sorted(glob.glob(globstring))
print(len(fnames))

20
