### VERGE: Vector-mode Regional Geospatial Embeddings

# Assemble geographical data for the VERGE project

This notebook pulls and organizes the geospatial data that we will use in this effort.

We have a list of areas of interest (AOIs), defined as lon/lat bounding boxes. Here we pull OpenStreetMap data for each one,
and divide it up into tiles of a given size, saving one file per tile.

## Processing setup

In [18]:
# Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [19]:
# !pip install osmnx pygeohash geo-encodings

In [20]:
# Local processing setup: the project root, given relative to the
# directory this notebook runs in.
project_home = '..'

In [21]:
!pwd

/Users/john/dev/projects/verge/02-mgm


## Notebook setup

In [22]:
import numpy as np
import pandas as pd
import pyproj
import shapely
import osmnx
import pygeohash
import geopandas as gpd
import os
import copy
import json

import plotly
from plotly.subplots import make_subplots
from plotly.graph_objects import Scatter

from geo_encodings import draw_shape

import sys
sys.path.append(project_home)
from utils.verge import rules


## Parameters

In [23]:
# Which ROI to process.
roi_name = 'ne-laptop'

# General-purpose data directory, and the ROI-specific directory inside it.
data_home = '%s/data' % (project_home)
roi_home = '%s/%s' % (data_home, roi_name)

# Tiles holding fewer entities than this are not retained.
min_entity_count = 20

# True to include the overall land/water polygon in each tile.
include_land_water = True

# Maximum number of AOIs to process; None means process all of them.
aoi_limit = 20


## Preliminaries

In [24]:
# Load the ROI definition and pull out the tiling / encoding parameters.
fname = '%s/roi.json' % roi_home
with open(fname) as roi_file:
    roi = json.load(roi_file)

tile_size, tile_shift, encoding_resolution = (
    roi[key] for key in ('tile_size', 'tile_shift', 'encoding_resolution')
)

In [25]:
# Define a local map projection, using the definition from the ROI file.
def get_projections(proj_def):
    """
    Build the coordinate transforms between WGS84 lon/lat and a local projection.

    Parameters
    ----------
    proj_def : str
        PROJ.4 definition string of the local projection.

    Returns
    -------
    (forward, inverse) : tuple of callables
        `forward(lon, lat)` converts WGS84 (EPSG:4326) coordinates to the local
        projection; `inverse(x, y)` converts back. Both use x/y (lon/lat) axis
        order because the transformers are created with `always_xy=True`.
    """
    geographic_crs = pyproj.CRS.from_epsg(4326)
    local_crs = pyproj.CRS.from_proj4(proj_def)
    to_local = pyproj.Transformer.from_crs(geographic_crs, local_crs, always_xy=True)
    to_geographic = pyproj.Transformer.from_crs(local_crs, geographic_crs, always_xy=True)
    return to_local.transform, to_geographic.transform

proj_forward, proj_inverse = get_projections(roi['proj_def'])

In [26]:
# Support for building an overall land/water polygon for an AOI. It is built
# by considering both the "coastlines" shapefile and all polygonal water features.

# Read the coastline file.
if include_land_water:
    fname = '%s/coastlines' % (roi_home)
    coastlines_gdf = gpd.read_file(fname)
    print('%d coastline polygons' % len(coastlines_gdf))

def get_land_water(bounds, features):
    """
    Return the land part of an AOI as a single shapely geometry.

    The result is `bounds` clipped to the coastline polygons, minus every
    polygonal feature tagged natural=water.

    Parameters
    ----------
    bounds : shapely polygon
        The AOI outline. It is intersected directly with the coastline
        polygons, so it is assumed to be in the same coordinate system as
        the coastlines file -- TODO confirm.
    features : DataFrame
        Features with a 'geometry' column and, optionally, a 'natural' column.

    Returns
    -------
    shapely geometry
        The land area inside `bounds`; empty if no land falls inside it.

    Notes
    -----
    Reads the module-level `coastlines_gdf`, which is only loaded when
    `include_land_water` is True.
    """
    # Only coastline polygons that intersect the AOI can contribute to the
    # result, so union just those instead of the whole coastline file.
    nearby = coastlines_gdf[coastlines_gdf.intersects(bounds)]
    coastlines = shapely.union_all(nearby['geometry'].values)

    # Shapely geometries are immutable, so no copy of `bounds` is needed.
    landwater = bounds.intersection(coastlines)

    # A query that returned no "natural" features has no such column.
    if 'natural' not in features:
        return landwater

    # Collect every polygonal water feature and subtract them in one step.
    water = [
        geom
        for geom, natural in zip(features['geometry'], features['natural'])
        if natural == 'water' and geom.geom_type in ('Polygon', 'MultiPolygon')
    ]
    if water:
        landwater = landwater.difference(shapely.union_all(water))

    return landwater

3514 coastline polygons


## Processing

In [27]:
# Load the table of AOIs; every row becomes one dict.
fname = '%s/aois.csv' % (roi_home)
aoi_table = pd.read_csv(fname)
aois = aoi_table.to_dict(orient='records')
# aois = np.random.permutation(aois)
print('%d areas of interest' % len(aois))

263 areas of interest


In [28]:
aois[0]

{'aoi_tag': '0735w-413n',
 'lon0': -73.5,
 'lat0': 41.3,
 'lon1': -73.4,
 'lat1': 41.4}

In [29]:
# Pull OSM data for each AOI, turn it into geospatial entities ("gents"), cut
# the entities into tiles of size `tile_size` stepped by `tile_shift`, and save
# every tile that holds at least `min_entity_count` entities.

# This will save some extra info that we will need.
tile_info = []

aoi_list = aois if aoi_limit is None else aois[:aoi_limit]

# Padding, in degrees, added on every side of an AOI for the OSM query. The query
# goes a little farther out than necessary to avoid edge artifacts from map projections.
buffer = 0.01

# The OSM keys to query for. They are the same for every AOI.
tags = {
    'landuse': True,
    'place': True,
    'highway': True,
    'railway': True,
    #'aeroway': True,
    'bridge': True,
    'tunnel': True,
    #'power': True,
    'natural': True,
    'waterway': True,
    'landcover': True,
    #'building': True,
    'amenity': True,
    'shop': True,
    'leisure': True
}

for k, aoi in enumerate(aoi_list):

    # Report progress as a 1-based position in the list.
    print('\nprocessing AOI %s (%d / %d)' % (aoi['aoi_tag'], k + 1, len(aoi_list)))

    # Buffered lon/lat bounds for the query below.
    lon0 = aoi['lon0'] - buffer
    lat0 = aoi['lat0'] - buffer
    lon1 = aoi['lon1'] + buffer
    lat1 = aoi['lat1'] + buffer
    query_bounds = [lon0, lat0, lon1, lat1]

    # Query for all the geospatial entities we need within the bounding box.
    features = osmnx.features.features_from_bbox(query_bounds, tags=tags).reset_index()
    print('%d features from OSM' % len(features))

    # Re-format and filter everything.
    # BTW, "gents" is "geospatial entities".
    gents = []
    for feature in features.to_dict('records'):

        geomxy = shapely.ops.transform(proj_forward, feature['geometry'])
        if geomxy.is_empty:
            continue
        gtype = geomxy.geom_type

        # A feature yields one entity for every rule it matches: the rule's
        # geometry type must match, and the feature's value for the rule's
        # OSM key must be one of the rule's accepted values.
        for rule in rules:
            if gtype != rule['gtype']:
                continue
            osm_key = rule['osm_key']
            if osm_key not in feature:
                continue
            if str(feature[osm_key]) in rule['osm_values']:
                gents.append({
                    'feature': feature,
                    'category': rule['gent_category'],
                    'label': rule['gent_label'],
                    'geomxy': geomxy,
                    'gtype': gtype
                })
    print('%d features selected' % len(gents))

    # We need some special handling to create a general "land/water" polygon.
    if include_land_water:
        lons = [lon0, lon1, lon1, lon0, lon0]
        lats = [lat0, lat0, lat1, lat1, lat0]
        lonlat_bounds = shapely.Polygon(list(zip(lons, lats)))
        landwater = get_land_water(lonlat_bounds, features)
        landwaterxy = shapely.ops.transform(proj_forward, landwater)
        gents.append({
            'category': 'waterway',
            'label': 'land',
            'geomxy': landwaterxy,
            'gtype': landwaterxy.geom_type
        })

    # Spatial index over this AOI's entities, so that each tile only has to be
    # intersected with the entities that actually reach into it.
    gent_tree = shapely.STRtree([gent['geomxy'] for gent in gents])

    # Determine the AOI bounds (without the query buffer) in projected coordinates.
    aoi_x0, aoi_y0 = proj_forward(aoi['lon0'], aoi['lat0'])
    aoi_x1, aoi_y1 = proj_forward(aoi['lon1'], aoi['lat1'])

    # Loop over tiles within this AOI.
    x0 = aoi_x0
    while x0 + tile_size < aoi_x1:

        y0 = aoi_y0
        while y0 + tile_size < aoi_y1:

            x1 = x0 + tile_size
            y1 = y0 + tile_size

            # Figure out the lon/lat corners and center of this tile. We will
            # need them for some later processing.
            tile_lon0, tile_lat0 = proj_inverse(x0, y0)
            tile_lon1, tile_lat1 = proj_inverse(x1, y1)
            tile_lon_center = (tile_lon0 + tile_lon1) / 2
            tile_lat_center = (tile_lat0 + tile_lat1) / 2

            # Define a bounding box in projected coordinates. It will be used
            # to clip geospatial entities below.
            xx = [x0, x1, x1, x0, x0]
            yy = [y0, y0, y1, y1, y0]
            tile_bbox = shapely.Polygon(list(zip(xx, yy)))

            # Clip the candidate entities to the tile and shift them so that the
            # tile's lower-left corner is the origin. Candidate indices are
            # sorted to keep the entities in their original order.
            tile_gents = []
            for i in sorted(gent_tree.query(tile_bbox, predicate='intersects')):
                gent = gents[i]
                geomxy = shapely.affinity.translate(
                    gent['geomxy'].intersection(tile_bbox),
                    xoff=-x0, yoff=-y0
                )
                if geomxy.is_empty:
                    continue
                tile_gents.append({
                    'category': gent['category'],
                    'label': gent['label'],
                    'geometry': geomxy,
                    # NOTE(review): this is the type of the unclipped entity;
                    # clipping can change the geometry's type. Confirm that is intended.
                    'gtype': gent['gtype'],
                    'xoff': x0,
                    'yoff': y0,
                })

            # Save the tile if it's big enough. The count is checked again after
            # duplicates are dropped, so that a saved tile never holds fewer than
            # min_entity_count entities.
            if len(tile_gents) >= min_entity_count:
                gdf = gpd.GeoDataFrame(tile_gents).drop_duplicates()
                if len(gdf) >= min_entity_count:
                    # NOTE(review): '%03d' truncates toward zero, so tiles on either
                    # side of projected coordinate 0 could receive the same tag.
                    tx = '%03d' % (x0 / tile_shift)
                    ty = '%03d' % (y0 / tile_shift)
                    tile_tag = '%s-%s' % (tx, ty)
                    fname = '%s/tiles/%s/%s.pq' % (roi_home, aoi['aoi_tag'], tile_tag)
                    os.makedirs(os.path.dirname(fname), exist_ok=True)
                    gdf.to_parquet(fname, index=False, compression="zstd")
                    print('saved %s (%4d entities)' % (fname, len(gdf)))

                    # Save the tile info.
                    tile_info.append({
                        'aoi_tag': aoi['aoi_tag'],
                        'tile_tag': tile_tag,
                        'lon0': tile_lon0,
                        'lat0': tile_lat0,
                        'lon1': tile_lon1,
                        'lat1': tile_lat1,
                        'center_lon': tile_lon_center,
                        'center_lat': tile_lat_center,
                    })

            y0 += tile_shift
        x0 += tile_shift

# Save the tile info.
fname = '%s/tiles.csv' % roi_home
pd.DataFrame(tile_info).to_csv(fname, index=False)
print('\nsaved tile info in %s' % fname)



processing AOI 0735w-413n (0 / 20)
11830 features from OSM
3256 features selected
saved ../data/ne-laptop/tiles/0735w-413n/005-005.pq (  71 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-006.pq (  57 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-007.pq (  68 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-008.pq (  87 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-009.pq (  48 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-010.pq (  23 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-011.pq (  53 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-012.pq ( 130 entities)
saved ../data/ne-laptop/tiles/0735w-413n/005-013.pq ( 210 entities)
saved ../data/ne-laptop/tiles/0735w-413n/006-005.pq (  81 entities)
saved ../data/ne-laptop/tiles/0735w-413n/006-006.pq (  64 entities)
saved ../data/ne-laptop/tiles/0735w-413n/006-007.pq (  53 entities)
saved ../data/ne-laptop/tiles/0735w-413n/006-008.pq (  45 entities)
saved ../data/ne-laptop/tiles/073

In [30]:
# for g in tile_gents:
#     print(g['geometry'].bounds)

In [31]:
# For debugging.

# def pr_feature(f):
#     print('feature')
#     for k in f:
#         if str(f[k]) != 'nan':
#             print('    ', k, f[k])

# def pr_gent(g):
#     for k in g:
#         if k == 'feature':
#             pr_feature(g[k])
#         else:
#             print(k, g[k])
#         geomxy = shapely.ops.transform(proj_inverse, feature['geometry'])

# pr_gent(gents[121])

In [32]:
# Count entities per category / label / geometry type.
# Note: `gents` is rebuilt for every AOI, so this covers only the last AOI processed.
pd.DataFrame(gents).value_counts(subset=['category', 'label', 'gtype']).sort_index()

category  label               gtype       
amenity   commercial          Point             31
          food and drink      Point              9
          parking lot         Polygon          667
          recreation          Point              3
landuse   agricultural        Polygon            4
          commercial          Polygon            3
          industrial          Polygon           24
          meadow              Polygon            8
          recreation          Polygon            8
          residential         Polygon           14
          retail              Polygon           16
railway   rail                LineString        74
          rail stop           Point              5
route     highway             LineString       353
          primary road        LineString       153
          residential road    LineString      2302
          secondary road      LineString       388
          tertiary road       LineString       194
waterway  lakes and ponds     Polygon  

## QA Check

In [33]:
# # Take a look at the land/water polygons.
# import folium
# center_lon = (lon0 + lon1) / 2.0
# center_lat = (lat0 + lat1) / 2.0

# map_center = [center_lat, center_lon]
# m = folium.Map(location=map_center, zoom_start=10)
# geo_json = folium.GeoJson(landwater)
# geo_json.add_to(m)
# m


In [34]:
# import folium
# center_lon = (lon0 + lon1) / 2.0
# center_lat = (lat0 + lat1) / 2.0

# map_center = [center_lat, center_lon]
# m = folium.Map(location=map_center, zoom_start=10)
# for gent in tile_gents:
#     g0 = gent['geometry']
#     g1 = shapely.affinity.translate(g0, xoff=gent['xoff'], yoff=gent['yoff'])
#     g2 = shapely.ops.transform(proj_inverse, g1)
#     geo_json = folium.GeoJson(g2)
#     geo_json.add_to(m)
# m
