# VERGE batch processing 1:
Assemble geospatial entities for a ROI.

Output is written to the folder `batch/gents` inside the ROI data directory. Each file contains the
geospatial entities for one sub-ROI, which is a lon/lat box whose size is defined below.



## Processing Setup

In [None]:
# Google colab
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)
!pip install geo_encodings osmnx

Mounted at /content/drive
Collecting geo_encodings
  Downloading geo_encodings-1.0.4-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting osmnx
  Downloading osmnx-2.0.6-py3-none-any.whl.metadata (4.9 kB)
Downloading geo_encodings-1.0.4-py2.py3-none-any.whl (6.9 kB)
Downloading osmnx-2.0.6-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m585.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geo_encodings, osmnx
Successfully installed geo_encodings-1.0.4 osmnx-2.0.6


In [None]:
# Local processing setup
# project_home = '..'

## Notebook Setup

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from typing import List, Tuple, Optional

import pickle
import json
import copy
import pandas as pd
import numpy as np
import pyproj
import shapely
import osmnx
import geopandas

import sys
sys.path.append('%s/03-embeddings' % project_home)
from embedderv5 import *

sys.path.append(project_home)
from utils.verge import rules


## Parameters

In [None]:
# Name of the region of interest (ROI) to process.
roi_name = 'newengland'

# General-purpose data directory, shared by all ROIs.
data_home = '{}/data'.format(project_home)

# Data directory holding the files that belong to this ROI.
roi_home = '{}/data/{}'.format(project_home, roi_name)

# Unique run identifiers of the models to be used.
transformer_run_id = collector_run_id = '301b'

## Preliminaries

In [None]:
# Load the ROI definition, a JSON document stored in the ROI data directory.
fname = '/'.join((roi_home, 'roi.json'))
with open(fname) as source:
    roi = json.loads(source.read())

# Keep the tiling and encoding settings in their own variables.
tile_size, encoding_resolution = roi['tile_size'], roi['encoding_resolution']

# Display the full definition as the cell output.
roi

{'name': 'newengland',
 'lon0': -73.564321,
 'lat0': 41.253746,
 'lon1': -68.058533,
 'lat1': 45.116468,
 'proj_def': '\n+proj=tmerc +lat_0=43.185107 +lon_0=-70.81142700000001\n+k=1.0 +x_0=231000.0 +y_0=211000.0 +datum=WGS84 +units=m +no_defs\n',
 'tile_size': 2000,
 'tile_shift': 1000,
 'encoding_resolution': 100}

In [None]:
# Shrink the ROI, because processing the original extent takes too long.
# The reduced box only covers the more urbanized part of southern New England.
roi.update({
    'lat0': 41.2,
    'lon0': -73.4,
    'lat1': 43.4,
    'lon1': -69.7,
})

# # even smaller:
# roi['lat1'] = 42.0
# roi['lon1'] = -72.3


In [None]:
# Load the table of labels from the general-purpose data directory.
fname = '/'.join((data_home, 'labels.csv'))
labels = pd.read_csv(fname)

# Lookup table: text label -> numerical label id.
label_lookup = dict(
    (record['label'], record['id'])
    for record in labels.to_dict('records')
)
label_count = len(label_lookup)

# Display the lookup table as the cell output.
label_lookup

{'amenity : commercial': 0,
 'amenity : food and drink': 1,
 'amenity : parking lot': 2,
 'amenity : recreation': 3,
 'landuse : agricultural': 4,
 'landuse : commercial': 5,
 'landuse : forest': 6,
 'landuse : industrial': 7,
 'landuse : meadow': 8,
 'landuse : recreation': 9,
 'landuse : residential': 10,
 'landuse : retail': 11,
 'railway : rail': 12,
 'railway : rail stop': 13,
 'route : highway': 14,
 'route : primary road': 15,
 'route : residential road': 16,
 'route : secondary road': 17,
 'route : tertiary road': 18,
 'waterway : lakes and ponds': 19,
 'waterway : land': 20,
 'waterway : rivers and streams': 21}

In [None]:
# Set up the transforms between lon/lat and the local map projection whose
# definition comes from the ROI file.
def get_projections(proj_def):
    """Build the forward and inverse coordinate transforms for a projection.

    Parameters
    ----------
    proj_def : str
        PROJ.4 definition of the local projection.

    Returns
    -------
    tuple
        ``(proj_forward, proj_inverse)``: the ``transform`` methods of two
        pyproj transformers, the first going from EPSG:4326 to the local
        projection and the second going back. Both are created with
        ``always_xy=True``.
    """
    local_crs = pyproj.CRS.from_proj4(proj_def)
    geographic_crs = pyproj.CRS.from_epsg(4326)

    def make_transform(source_crs, target_crs):
        transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
        return transformer.transform

    to_local = make_transform(geographic_crs, local_crs)
    to_geographic = make_transform(local_crs, geographic_crs)
    return to_local, to_geographic

proj_forward, proj_inverse = get_projections(roi['proj_def'])

In [None]:
# Read the coastline file.
fname = '%s/coastlines' % (roi_home)
coastlines_gdf = geopandas.read_file(fname)
print('%d coastline polygons' % len(coastlines_gdf))

# Holder for the merged coastline geometry. It is emptied every time this cell
# is re-run, so it always corresponds to the coastlines_gdf loaded just above.
_coastline_union_cache = {}


def _get_coastline_union():
    """Return the union of all coastline polygons, computing it at most once."""
    if 'union' not in _coastline_union_cache:
        _coastline_union_cache['union'] = shapely.union_all(coastlines_gdf['geometry'].values)
    return _coastline_union_cache['union']


def get_land_water(bounds, features):
    """Clip ``bounds`` to the coastline polygons and remove polygonal water features.

    Parameters
    ----------
    bounds : shapely geometry
        The area of interest. It is not modified.
    features : DataFrame
        Feature table with a 'geometry' column. A 'natural' column is
        optional; when it is missing no water features are subtracted.

    Returns
    -------
    shapely geometry
        The intersection of ``bounds`` with the union of the geometries in
        ``coastlines_gdf``, minus every Polygon or MultiPolygon feature whose
        'natural' value is 'water'.
    """
    # The union of the coastline polygons does not depend on the arguments, so
    # it is computed once and re-used by every call.
    landwater = bounds.intersection(_get_coastline_union())

    # Without a 'natural' column there is nothing to identify as water.
    if 'natural' not in features.columns:
        return landwater

    # Subtract out any polygonal water feature.
    water_geoms = features.loc[features['natural'] == 'water', 'geometry']
    for geom in water_geoms:
        if geom is not None and geom.geom_type in ('Polygon', 'MultiPolygon'):
            landwater = shapely.difference(landwater, geom)

    return landwater

3514 coastline polygons


## Processing


### Pull OSM data for the area around this location

In [None]:
# OSM tag keys to request from OSMnx. This dict is passed as the `tags`
# argument of the feature query; a value of True requests every feature that
# carries the key, whatever its value. Commented-out keys are currently
# disabled. (osmnx itself is imported in the notebook setup cell.)
tags = {
    'landuse': True,
    'place': True,
    'highway': True,
    'railway': True,
    #'aeroway': True,
    'bridge': True,
    'tunnel': True,
    #'power': True,
    'natural': True,
    'waterway': True,
    'landcover': True,
    #'building': True,
    'amenity': True,
    'shop': True,
    'leisure': True
}


In [None]:
# We need to break up the ROI into smaller chunks for querying.

dd = 0.2 # size of query boxes, in degrees


def make_sub_rois(lon_min, lat_min, lon_max, lat_max, step):
    """Tile a lon/lat box with square query boxes of side ``step`` degrees.

    Box edges are computed as ``origin + index * step`` rather than by
    repeatedly adding ``step``, so floating-point error does not accumulate
    and cannot add or drop a row/column of boxes.

    Parameters
    ----------
    lon_min, lat_min, lon_max, lat_max : float
        Extent to cover, in degrees.
    step : float
        Side length of each box, in degrees. Must be positive.

    Returns
    -------
    list of [lon0, lat0, lon1, lat1]
        Boxes ordered by longitude first, then latitude. The last row/column
        may extend past ``lon_max`` / ``lat_max``; boxes are not clipped.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be positive, got %r' % (step,))

    # The small tolerance keeps a span that is an exact multiple of `step`
    # (up to round-off) from producing one extra row or column.
    tolerance = 1e-9
    n_lon = max(int(np.ceil((lon_max - lon_min) / step - tolerance)), 0)
    n_lat = max(int(np.ceil((lat_max - lat_min) / step - tolerance)), 0)

    boxes = []
    for i in range(n_lon):
        west = lon_min + i * step
        for j in range(n_lat):
            south = lat_min + j * step
            boxes.append([west, south, west + step, south + step])
    return boxes


sub_rois = make_sub_rois(roi['lon0'], roi['lat0'], roi['lon1'], roi['lat1'], dd)

print('will process %d sub-rois' % len(sub_rois))


will process 209 sub-rois


In [None]:
# Columns of the OSM feature table that are kept for rule matching.
columns_in_rules = {'id', 'geometry', 'amenity', 'highway', 'landuse', 'railway', 'water', 'waterway', 'natural'}

# Columns of every output file, in order. Naming them explicitly means that a
# sub-ROI without any entity still produces a CSV with a header row.
gent_columns = ['id', 'category', 'label', 'geom', 'geomxy', 'gtype']

# For every sub-ROI: query OSM, keep the features that match a rule, add a
# land polygon, and write the result to one CSV file.
for k, sub_roi in enumerate(sub_rois):

    print('\n%d/%d' % (k+1, len(sub_rois)))

    # Check whether an output file has already been generated for this sub-roi.
    # This is what lets an interrupted run be resumed.
    ofname = '%s/batch/gents/gents_%+.1f_%+.1f.csv' % (roi_home, sub_roi[0], sub_roi[1])
    if os.path.exists(ofname):
        print('%s already exists; skipping' % ofname)
        continue

    print(sub_roi)
    try:
        sub_roi_features = osmnx.features.features_from_bbox(sub_roi, tags=tags).reset_index()
        print('%d features from OSM' % len(sub_roi_features))
    except Exception as exc:
        # Best effort: report the problem and move on to the next sub-ROI.
        # No file is written, so this sub-ROI is retried on the next run.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print('OSM query failed: %r' % (exc,))
        continue

    # Just retain the relevant columns, in their original order.
    columns_to_keep = [c for c in sub_roi_features.columns if c in columns_in_rules]
    sub_roi_features = sub_roi_features[columns_to_keep]

    # Down-select and re-format any relevant geospatial entities ("gents").
    # A feature yields one gent for every rule it matches.
    sub_roi_gents = []
    for feature in sub_roi_features.to_dict('records'):

        gtype = feature['geometry'].geom_type

        for rule in rules:
            if gtype != rule['gtype']:
                continue
            osm_key = rule['osm_key']
            if osm_key not in feature:
                continue
            # Missing values become the string 'nan', which matches no rule.
            osm_value = str(feature[osm_key])
            if osm_value not in rule['osm_values']:
                continue
            geomxy = shapely.ops.transform(proj_forward, feature['geometry'])
            if geomxy.is_empty:
                continue
            sub_roi_gents.append({
                'id': feature['id'],
                'category': rule['gent_category'],
                'label': rule['gent_label'],
                'geom': feature['geometry'],
                'geomxy': geomxy,
                'gtype': gtype
            })

    # Create a "land/water" polygon.
    try:
        lon0, lat0, lon1, lat1 = sub_roi
        lons = [lon0, lon1, lon1, lon0, lon0]
        lats = [lat0, lat0, lat1, lat1, lat0]
        lonlat_bounds = shapely.Polygon(list(zip(lons, lats)))
        landwater = get_land_water(lonlat_bounds, sub_roi_features)
        landwaterxy = shapely.ops.transform(proj_forward, landwater)
        sub_roi_gents.append({
            'id': None,
            'category': 'waterway',
            'label': 'land',
            'geom': None,
            'geomxy': landwaterxy,
            'gtype': landwaterxy.geom_type
        })
    except Exception as exc:
        # Best effort: the file is still written, just without the land
        # polygon. Say so instead of failing silently.
        print('land/water polygon failed: %r' % (exc,))

    # Write to a temporary name and rename afterwards, so that an interrupted
    # write never leaves a partial file that the existence check would accept.
    os.makedirs(os.path.dirname(ofname), exist_ok=True)
    sub_roi_df = pd.DataFrame(sub_roi_gents, columns=gent_columns)
    tmp_ofname = ofname + '.tmp'
    sub_roi_df.to_csv(tmp_ofname, index=False)
    os.replace(tmp_ofname, ofname)
    print('%d records to %s' % (len(sub_roi_df), ofname))



1/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+41.2.csv already exists; skipping

2/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+41.4.csv already exists; skipping

3/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+41.6.csv already exists; skipping

4/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+41.8.csv already exists; skipping

5/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+42.0.csv already exists; skipping

6/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+42.2.csv already exists; skipping

7/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+42.4.csv already exists; skipping

8/209
/content/drive/MyDrive/Projects/verge/data/newengland/batch/gents/gents_-73.4_+42.6.csv already exists; skipping

9/209
/content/drive/MyDrive/Projects/v