### VERGE: Vector-mode Regional Geospatial Embeddings

# Encode geospatial data for VERGE processing

Elsewhere we assembled geospatial data for a bunch of tiles.
Here we come up with encodings for each such entitiy.
The encodings will consist of a concatenation of a Multi-Point Proximity (MPP) encoding
and a one-hot vector indicating the entity type.

## Processing setup

In [1]:
# # Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [2]:
# !pip install geo-encodings

In [3]:
# Local processing setup
project_home = '..'

## Source

In [4]:
# The name of the ROI to use.
roi_name = 'ne-dev'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Setup

In [5]:
import pandas as pd
import numpy as np
import glob
import geopandas
import os
import pickle
import json
from geo_encodings import MPPEncoder

## Data prep

In [6]:
# Read the ROI definition.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

tile_size = roi['tile_size']
tile_shift = roi['tile_shift']
encoding_resolution = roi['encoding_resolution']

In [7]:
# Read the list of labels.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
num_classes = len(labels)
print('%d labels' % num_classes)

label_id_lookup = {
    z['label']: z['id']
    for z in labels.to_dict('records')
}

22 labels


In [8]:
# Set up the MPP encoder.
encoder = MPPEncoder(
    region=[0, 0, tile_size, tile_size],
    resolution=encoding_resolution,
    center=True
)
print('%d elements in encodings' % len(encoder))

400 elements in encodings


## Processing

In [9]:
# Get a list of AOI tags.
globstring = '%s/tiles/*' % roi_home
tile_dnames = glob.glob(globstring)
aoi_names = [
    z[z.rfind('/') + 1 :]
    for z in tile_dnames
]
print('%d AOIs in this ROI (%s)' % (len(aoi_names), roi_name))

20 AOIs in this ROI (ne-dev)


In [10]:
# In he loop below, we will also accumulate a histogram of class labels. 
# This will be saved and used during model training to balance sampling.
class_counts = {z: 0 for z in range(num_classes)}

In [12]:
# Loop over tiles

for k, aoi_name in enumerate(aoi_names[:5]):

    print('\nhandling AOI %s (%d / %d)' % (aoi_name, k, len(aoi_names)))

    # # This will hold all encodings for this AOI
    # tile_encodings_for_aoi = []

    # Get a list of all tiles for this AOI.
    globstring = '%s/tiles/%s/*.pq' % (roi_home, aoi_name)
    fnames = glob.glob(globstring)
    print('%d tiles' % len(fnames))

    # # Loop over tiles. 
    # for fname in fnames:
    #     tile_vectors = []
    #     gdf = geopandas.read_parquet(fname)
    #     # for _, rec in gdf.iterrows():
    #     for _, rec in gdf.sample(frac=1).iterrows():
    #         encoding = encoder.encode(rec['geometry']).values()
    #         etype = np.zeros(label_count)
    #         label_string = '%s : %s' % (rec['category'], rec['label'])
    #         label_id = label_id_lookup[label_string]
    #         etype[label_id] = 1.0

    #         # Concatenate the encoding and type vectors.
    #         vector = np.hstack((etype, encoding))
    #         tile_vectors.append(vector)
    #     tile_encoding = np.vstack(tile_vectors)
    #     tile_encodings_for_aoi.append(tile_encoding)
        
    # Loop over tiles. 
    for fname in fnames:

        # Extract the base name for this tile.
        ix0 = fname.rfind('/') + 1
        ix1 = fname.rfind('.')
        tile_name = fname[ix0 : ix1] 
        
        tile_vectors = []
        tile_labels = []
        gdf = geopandas.read_parquet(fname)
        for _, rec in gdf.sample(frac=1).iterrows():
            
            # Geometric encoding vector
            encoding = encoder.encode(rec['geometry']).values()

            # One-hot label vector
            etype = np.zeros(num_classes)
            label_string = '%s : %s' % (rec['category'], rec['label'])
            label_id = label_id_lookup[label_string]
            etype[label_id] = 1.0

            # Save the label as an integer too. This will make training easier. 
            tile_labels.append(label_id)
            class_counts[label_id] += 1

            # Concatenate the encoding and type vectors.
            vector = np.hstack((etype, encoding))
            tile_vectors.append(vector)

        # Combine all per-feature vectors into one big feature matrix.
        tile_features = np.vstack(tile_vectors)

        # Save the encodings as a NPZ file.
        ofname = '%s/encodings/%s/%s.npz' % (roi_home, aoi_name, tile_name)
        os.makedirs(os.path.dirname(ofname), exist_ok=True)
        np.savez_compressed(ofname, features=tile_features, labels=tile_labels)
        # print('wrote: %s' % ofname)


    # # Save all of the encodings for this tile.
    # fname = '%s/encodings/%s.pkl' % (roi_home, aoi_name)
    # os.makedirs(os.path.dirname(fname), exist_ok=True)
    # with open(fname, 'wb') as dest:
    #     pickle.dump(tile_encodings_for_aoi, dest)
    # print('wrote %s' % (fname))



handling AOI 0732w-415n (0 / 20)
60 tiles

handling AOI 0732w-444n (1 / 20)
61 tiles

handling AOI 0731w-415n (2 / 20)
63 tiles

handling AOI 0732w-426n (3 / 20)
22 tiles

handling AOI 0732w-418n (4 / 20)
36 tiles


In [15]:
# Save the label probabilities.
total = 0.0
for key in class_counts:
    total += class_counts[key]

records = [
    {
        'label': z,q
        'count': class_counts[z],
        'prob': class_counts[z] / total
    } for z in range(num_classes)
]
fname = '%s/class_info.csv' % roi_home
pd.DataFrame(records).to_csv(fname, index=False)

## QA and visualizations

In [None]:
import glob
globstring = '%s/encodings/*/*.npz' % (roi_home)
fnames = glob.glob(globstring)
print('%d files' % len(fnames))
print(fnames[0])

In [None]:
data = np.load(fnames[0])
print(data['features'].shape, data['labels'].shape)

In [None]:
data['labels']

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt

# # Create a 2D array (example data)
# data = tile_encodings_for_aoi[0]

# # Plot as heatmap
# plt.imshow(data, cmap='viridis', origin='upper')
# plt.colorbar(label='Intensity')
# plt.title('encodings')
# plt.ylabel('entity number')
# plt.xlabel('encoding element')
# plt.show()


In [None]:
# gdf[['category', 'label']].value_counts().sort_index()