### VERGE processing
# Assemble geospatial entities for a ROI

We start by reading in a definition of a region of interest. 
We loop over "sub-ROIs" which are lon/lat boxes of a size specified below.
For each one, we use `osmnx` to pull various geospatial features for that sub-ROI. 
Then we apply some filtering for entities that we are interested in, do some
reformatting, and save the results for further analysis. 

Output is in the folder `gents`. Every file contains the geospatial entities for
one sub-ROI, which is a lon/lat box of dimensions defined below.

## Processing Setup

In [24]:
# AWS / SageMaker
import sagemaker
project_home = 's3://odyssey-geospatial/verge'

In [25]:
# # Google colab
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)
# !pip install geo_encodings osmnx

In [26]:
# Local processing setup
# project_home = '..'

## Notebook Setup

In [27]:
import os
import sys

import pandas as pd
import numpy as np
import pyproj
import boto3
import json
import s3fs


In [28]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset
# from typing import List, Tuple, Optional

# import pickle
# import json
import copy
# import pandas as pd
# import numpy as np
import shapely
import osmnx
import geopandas

# import sys
# sys.path.append('%s/03-embeddings' % project_home)
# from embedderv5 import *

sys.path.append('..')
from utils.verge import rules


## Parameters

In [29]:
# The name of the ROI to use.
roi_name = 'ne-test'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

# # The unique identifier of the model to be used.
# transformer_run_id = '301b'
# collector_run_id = '301b'

## Preliminaries

In [30]:
role = sagemaker.get_execution_role()
print(role)

arn:aws:iam::445567107167:role/service-role/AmazonSageMaker-ExecutionRole-20250825T121753


In [31]:
# Read the ROI definition

# Initialize S3 client
s3 = boto3.client('s3')

# Read the JSON file that defines the region of interest.
bucket_name = 'odyssey-geospatial'
file_key = 'verge/data/newengland/roi.json'

response = s3.get_object(Bucket=bucket_name, Key=file_key)
roi = json.loads(response['Body'].read().decode('utf-8'))
print(json.dumps(roi, indent=2))


{
  "name": "newengland",
  "lon0": -73.564321,
  "lat0": 41.253746,
  "lon1": -68.058533,
  "lat1": 45.116468,
  "proj_def": "\n+proj=tmerc +lat_0=43.185107 +lon_0=-70.81142700000001\n+k=1.0 +x_0=231000.0 +y_0=211000.0 +datum=WGS84 +units=m +no_defs\n",
  "tile_size": 2000,
  "tile_shift": 1000,
  "encoding_resolution": 100
}


In [32]:
# Read the file containing labels for different type of geospatial entities
# that we are interested in.

fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)

# Make a lookup table to get a numerical label from a text label.
label_lookup = {
    z['label']: z['id']
    for z in labels.to_dict('records')
}
label_count = len(label_lookup)
label_lookup

{'amenity : commercial': 0,
 'amenity : food and drink': 1,
 'amenity : parking lot': 2,
 'amenity : recreation': 3,
 'landuse : agricultural': 4,
 'landuse : commercial': 5,
 'landuse : forest': 6,
 'landuse : industrial': 7,
 'landuse : meadow': 8,
 'landuse : recreation': 9,
 'landuse : residential': 10,
 'landuse : retail': 11,
 'railway : rail': 12,
 'railway : rail stop': 13,
 'route : highway': 14,
 'route : primary road': 15,
 'route : residential road': 16,
 'route : secondary road': 17,
 'route : tertiary road': 18,
 'waterway : lakes and ponds': 19,
 'waterway : land': 20,
 'waterway : rivers and streams': 21}

In [33]:
# Define a local map projection, using the definition from the ROI file.
def get_projections(proj_def):
    ltm_crs = pyproj.CRS.from_proj4(proj_def)
    wgs84_crs = pyproj.CRS.from_epsg(4326)
    proj_forward = pyproj.Transformer.from_crs(wgs84_crs, ltm_crs, always_xy=True).transform
    proj_inverse = pyproj.Transformer.from_crs(ltm_crs, wgs84_crs, always_xy=True).transform
    return proj_forward, proj_inverse


proj_forward, proj_inverse = get_projections(roi['proj_def'])

In [34]:
# Read the coastline file.
fname = '%s/coastlines/coastlines.shp' % (roi_home)
print(fname)
coastlines_gdf = geopandas.read_file(fname)
print('%d coastline polygons' % len(coastlines_gdf))


s3://odyssey-geospatial/verge/data/ne-test/coastlines/coastlines.shp
3514 coastline polygons


In [35]:
def get_land_water(bounds, features):

    # Create a baseline polygon consisting of the whole AOI.
    landwater = copy.deepcopy(bounds)

    # Intersect that with the coastlines data.
    coastlines = shapely.union_all(coastlines_gdf['geometry'].values)
    landwater = landwater.intersection(coastlines)

    # subtract out any polygonal water feature.
    for _, f in features.iterrows():
        if f['geometry'].geom_type in ['Polygon', 'MultiPolygon']:
            if f['natural'] == 'water':
                landwater = shapely.difference(landwater, f['geometry'])

    return landwater

## Processing


### Pull OSM data for the area around this location

In [36]:
tags = {
    'landuse': True,
    'place': True,
    'highway': True,
    'railway': True,
    'bridge': True,
    'tunnel': True,
    'natural': True,
    'waterway': True,
    'landcover': True,
    'amenity': True,
    'shop': True,
    'leisure': True
}


In [37]:
# We need to break up the ROI into smaller chunks for querying.

dd = 0.2  # size of query boxes, in degrees
sub_rois = []

lon0 = np.ceil(roi['lon0'] / dd) * dd
while lon0 < roi['lon1'] - dd:
    lon1 = lon0 + dd

    lat0 = np.ceil(roi['lat0'] / dd) * dd
    while lat0 < roi['lat1'] - dd:
        lat1 = lat0 + dd

        query_bounds = [lon0, lat0, lon1, lat1]
        sub_rois.append(query_bounds)

        lat0 += dd

    lon0 += dd

print('will process %d sub-rois' % len(sub_rois))


will process 468 sub-rois


In [38]:
# This function does the processing for one sub-ROI. Below we
# will run this in parallel.

def process_sub_roi(sub_roi):

    # Check whether an output file has already been generated for this sub-roi.
    ofname = '%s/gents/gents_%+.1f_%+.1f.csv' % (roi_home, sub_roi[0], sub_roi[1])
    print(ofname)
    print(sub_roi)
    fs = s3fs.S3FileSystem()
    if fs.exists(ofname):
        print('--> %s already exists; skipping' % ofname)
        return ofname

    query_bounds = sub_roi
    try:
        sub_roi_features = osmnx.features.features_from_bbox(query_bounds, tags=tags).reset_index()
        print('%d features from OSM' % len(sub_roi_features))
    except:
        print('OSM query failed')
        return "fail"

    # Just retain the relevant columns.
    columns_in_rules = set(['id', 'geometry', 'amenity', 'highway', 'landuse', 'railway', 'water', 'waterway', 'natural'])
    columns_in_features = set(sub_roi_features.columns)
    columns_to_keep = list(columns_in_rules.intersection(columns_in_features))
    sub_roi_features = sub_roi_features[columns_to_keep]

    # Down-select and re-format any relevant geospatial entities ("gents").
    sub_roi_gents = []
    for feature in sub_roi_features.to_dict('records'):

        gtype = feature['geometry'].geom_type

        for rule in rules:
            if gtype == rule['gtype']:
                osm_key = rule['osm_key']
                if osm_key in feature:
                    osm_value = str(feature[osm_key])
                    if osm_value in rule['osm_values']:
                        geomxy = shapely.ops.transform(proj_forward, feature['geometry'])
                        if geomxy.is_empty:
                            continue
                        sub_roi_gents.append({
                            'id': feature['id'],
                            'category': rule['gent_category'],
                            'label': rule['gent_label'],
                            'geom': feature['geometry'],
                            'geomxy': geomxy,
                            'gtype': gtype
                        })

    # Create a "land/water" polygon.
    try:
        lon0, lat0, lon1, lat1 = sub_roi
        lons = [lon0, lon1, lon1, lon0, lon0]
        lats = [lat0, lat0, lat1, lat1, lat0]
        lonlat_bounds = shapely.Polygon(list(zip(lons, lats)))
        landwater = get_land_water(lonlat_bounds, sub_roi_features)
        landwaterxy = shapely.ops.transform(proj_forward, landwater)
        sub_roi_gents.append({
          'id': None,
          'category': 'waterway',
          'label': 'land',
          'geom': None,
          'geomxy': landwaterxy,
          'gtype': landwaterxy.geom_type
        })
    except:
        pass

    os.makedirs(os.path.dirname(ofname), exist_ok=True)
    sub_roi_df = pd.DataFrame(sub_roi_gents)
    sub_roi_df.to_csv(ofname, index=False)
    print('%d records to %s' % (len(sub_roi_df), ofname))
    return ofname


In [39]:
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = {
        executor.submit(process_sub_roi, sub_roi): sub_roi
        for sub_roi in sub_rois
    }

    for future in as_completed(futures):
        try:
            result = future.result()
            # print(f"==> completed: {result}")
        except Exception as e:
            print(f"Error: {e}")


s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.4.csv
[-73.4, 41.400000000000006, -73.2, 41.60000000000001]
s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.6.csv
[-73.4, 41.60000000000001, -73.2, 41.80000000000001]
s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.8.csv
[-73.4, 41.80000000000001, -73.2, 42.000000000000014]
--> s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.4.csv already exists; skipping
s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+42.0.csv
[-73.4, 42.000000000000014, -73.2, 42.20000000000002]
--> s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.8.csv already exists; skipping
s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+42.2.csv
[-73.4, 42.20000000000002, -73.2, 42.40000000000002]
--> s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+41.6.csv already exists; skipping
s3://odyssey-geospatial/verge/data/ne-test/gents/gents_-73.4_+42.4.csv
[-73.4