### VERGE: Vector-mode Regional Geospatial Embeddings

# Assemble geographical data for the VERGE project

This notebook pulls and organizes the geospatial data that we will use in this effort.

We have a list of areas of interest (AOIs), each defined as a lon/lat bounding box. Here we pull
OpenStreetMap data for each one and divide it up into tiles of a given size.

## Processing setup

In [1]:
# Google colab setup
# Mount Google Drive and make the project directory the working directory.
# NOTE(review): the chdir is presumably what lets `utils.verge` be imported
# later in the notebook -- confirm.
import os
from google.colab import drive
drive.mount('/content/drive')
# Root of the project on the mounted Drive; the data paths below derive from it.
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install osmnx pygeohash geo-encodings



In [3]:
# # Local processing setup
# project_home = '..'

In [4]:
!pwd

/content/drive/MyDrive/Projects/verge


## Code

In [5]:
# The name of the ROI to use.
# It selects the sub-directory of the data directory that this notebook reads and writes.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
# The ROI definition, AOI list, coastlines, and output tiles all live under it.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Setup

In [6]:
import numpy as np
import pandas as pd
import pyproj
import shapely
import osmnx
import pygeohash
import geopandas as gpd
import os
import copy
import json

import plotly
from plotly.subplots import make_subplots
from plotly.graph_objects import Scatter

from geo_encodings import draw_shape

import sys
from utils.verge import rules


## Parameters

In [7]:
# A tile has to contain this many entities in order to be retained.
# Tiles with fewer entities are not written to disk or listed in the tile info.
min_entity_count = 20

# Set to True if we want to include the overall land/water polygon for each tile.
# When True, the coastline file is read and a "land" entity is added for each AOI.
include_land_water = True


In [8]:
# Load the ROI definition (a JSON document) from the ROI directory.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

# Pull out the tiling and encoding parameters that the ROI definition carries.
tile_size, tile_shift, encoding_resolution = (
    roi[key] for key in ('tile_size', 'tile_shift', 'encoding_resolution')
)

## Preliminaries

In [9]:
# Define a local map projection
def get_projections(proj_def):
    """Build forward and inverse coordinate transforms for a local projection.

    Parameters
    ----------
    proj_def : str
        PROJ.4 definition string of the local projected CRS.

    Returns
    -------
    (proj_forward, proj_inverse)
        Two callables. `proj_forward` maps WGS84 (EPSG:4326) coordinates to
        the local CRS, and `proj_inverse` maps back. Both use x/y (lon/lat)
        axis order because the transformers are built with `always_xy=True`.
    """
    local_crs = pyproj.CRS.from_proj4(proj_def)
    geographic_crs = pyproj.CRS.from_epsg(4326)

    def _xy_transform(src_crs, dst_crs):
        # Bound `transform` method of a transformer between the two CRSs.
        return pyproj.Transformer.from_crs(src_crs, dst_crs, always_xy=True).transform

    to_local = _xy_transform(geographic_crs, local_crs)
    to_geographic = _xy_transform(local_crs, geographic_crs)
    return to_local, to_geographic

proj_forward, proj_inverse = get_projections(roi['proj_def'])

In [10]:
# Build an overall land/water polygon for an AOI by considering both the
# "coastlines" shapefile and all polygonal water features.

# Read the coastline file, and merge its polygons once up front so that
# get_land_water() does not have to re-union them for every AOI.
if include_land_water:
    fname = '%s/coastlines' % (roi_home)
    coastlines_gdf = gpd.read_file(fname)
    print('%d coastline polygons' % len(coastlines_gdf))
    coastlines_union = shapely.union_all(coastlines_gdf['geometry'].values)

def get_land_water(bounds, features):
    """Return the part of an AOI that is covered by land.

    The result is the AOI polygon intersected with the merged coastline
    polygons, minus every polygonal feature tagged `natural=water`.

    Parameters
    ----------
    bounds : shapely geometry
        The AOI outline. Callers in this notebook pass a lon/lat rectangle.
        NOTE(review): assumes the coastline file uses the same lon/lat
        coordinates -- confirm.
    features : GeoDataFrame
        Features for the AOI, with a 'geometry' column and, usually, a
        'natural' column.

    Returns
    -------
    shapely geometry
        The remaining land geometry (possibly empty).

    Notes
    -----
    Relies on the module-level `coastlines_union`, which only exists when
    `include_land_water` is True.
    """

    # Start from the whole AOI and keep only what the coastline polygons cover.
    # Geometries are immutable, so no defensive copy of `bounds` is needed.
    landwater = bounds.intersection(coastlines_union)

    # A query result with no "natural" features has no such column at all;
    # in that case there is nothing to subtract.
    if 'natural' not in features.columns:
        return landwater

    # Subtract out any polygonal water feature.
    water_features = features[features['natural'] == 'water']
    for geom in water_features['geometry']:
        if geom.geom_type in ['Polygon', 'MultiPolygon']:
            landwater = shapely.difference(landwater, geom)

    return landwater

3514 coastline polygons


## Processing

In [11]:
# Load the AOI list: one dict per row of the CSV file.
fname = '%s/aois.csv' % (roi_home)
aoi_table = pd.read_csv(fname)
aois = aoi_table.to_dict(orient='records')
# aois = np.random.permutation(aois)
print('%d areas of interest' % len(aois))

263 areas of interest


In [12]:
# Build the tile files for every AOI and record where each tile is.
#
# For each AOI: query OSM, keep the features matched by `rules`, optionally
# add a "land" polygon, then cut everything into square tiles of side
# `tile_size`, stepping by `tile_shift` (both in projected coordinates).

# This will save some extra info that we will need.
tile_info = []

# Pad the lon/lat query bounds by this much (in degrees) on every side, so
# that they go a little farther out than necessary and avoid edge artifacts
# from map projections.
buffer = 0.01

# OSM keys to query for; True means "any value of this key".
tags = {
    'landuse': True,
    'place': True,
    'highway': True,
    'railway': True,
    #'aeroway': True,
    'bridge': True,
    'tunnel': True,
    #'power': True,
    'natural': True,
    'waterway': True,
    'landcover': True,
    #'building': True,
    'amenity': True,
    'shop': True,
    'leisure': True
}

for k, aoi in enumerate(aois):

    # Report progress as a 1-based position so the last AOI reads "N / N".
    print('\nprocessing AOI %s (%d / %d)' % (aoi['name'], k + 1, len(aois)))

    # Define the buffered lon/lat bounds for the query below.
    lon0 = aoi['lon0'] - buffer
    lat0 = aoi['lat0'] - buffer
    lon1 = aoi['lon1'] + buffer
    lat1 = aoi['lat1'] + buffer
    query_bounds = [lon0, lat0, lon1, lat1]

    # print('re-projected query bounds, with buffer:')
    # print(proj_forward(lon0, lat0))
    # print(proj_forward(lon1, lat1))

    # Query for all the geospatial entities we need within the bounding box.
    print('getting OSM features')
    features = osmnx.features.features_from_bbox(query_bounds, tags=tags).reset_index()
    print('%d features from OSM' % len(features))

    # Re-format and filter everything.
    # BTW, "gents" is "geospatial entities".
    gents = []
    for feature in features.to_dict('records'):

        # Work in projected coordinates from here on.
        geomxy = shapely.ops.transform(proj_forward, feature['geometry'])
        if geomxy.is_empty:
            continue
        gtype = geomxy.geom_type

        # A feature is kept once for every rule it satisfies: the geometry
        # type must match and the feature's value for the rule's OSM key must
        # be one of the rule's accepted values.
        for rule in rules:
            if gtype == rule['gtype']:
                osm_key = rule['osm_key']
                if osm_key in feature:
                    osm_value = str(feature[osm_key])
                    if osm_value in rule['osm_values']:
                        gents.append({
                            'feature': feature,
                            'category': rule['gent_category'],
                            'label': rule['gent_label'],
                            'geomxy': geomxy,
                            'gtype': gtype
                        })
    print('%d features selected' % len(gents))

    # We need some special handling to create a general "land/water" polygon.
    if include_land_water:
        lons = [lon0, lon1, lon1, lon0, lon0]
        lats = [lat0, lat0, lat1, lat1, lat0]
        lonlat_bounds = shapely.Polygon(list(zip(lons, lats)))
        landwater = get_land_water(lonlat_bounds, features)
        landwaterxy = shapely.ops.transform(proj_forward, landwater)
        gents.append({
            'category': 'waterway',
            'label': 'land',
            'geomxy': landwaterxy,
            'gtype': landwaterxy.geom_type
        })

    # Spatial index over the entities, so that each tile only has to be
    # intersected with the entities whose bounding box reaches it. Entities
    # whose bounding box misses the tile would intersect to empty anyway.
    gent_tree = shapely.STRtree([gent['geomxy'] for gent in gents])

    # Determine the AOI bounds in projected coordinates.
    aoi_x0, aoi_y0 = proj_forward(aoi['lon0'], aoi['lat0'])
    aoi_x1, aoi_y1 = proj_forward(aoi['lon1'], aoi['lat1'])

    # Loop over tiles within this AOI.
    x0 = aoi_x0
    while x0 + tile_size < aoi_x1:

        y0 = aoi_y0
        while y0 + tile_size < aoi_y1:

            # Figure out the lon/lat center of this tile. We will need it
            # for some later processing.
            xc = x0 + tile_size / 2
            yc = y0 + tile_size / 2
            tile_lon, tile_lat = proj_inverse(xc, yc)

            x1 = x0 + tile_size
            y1 = y0 + tile_size
            xx = [x0, x1, x1, x0, x0]
            yy = [y0, y0, y1, y1, y0]
            tile_bbox = shapely.Polygon(list(zip(xx, yy)))

            # Clip each candidate entity to the tile and shift it so that the
            # tile's lower-left corner becomes the origin. Candidate indices
            # are sorted so that entities keep their original order.
            tile_gents = []
            for i in np.sort(gent_tree.query(tile_bbox)):
                gent = gents[i]
                geomxy = shapely.affinity.translate(
                    gent['geomxy'].intersection(tile_bbox),
                    xoff=-x0, yoff=-y0
                )
                if geomxy.is_empty:
                    continue
                tile_gents.append({
                    'category': gent['category'],
                    'label': gent['label'],
                    'geometry': geomxy,
                    'gtype': gent['gtype'],
                    'xoff': x0,
                    'yoff': y0,
                })

            # Save that if it's big enough.
            if len(tile_gents) >= min_entity_count:
                # Floor (rather than truncate toward zero) the tile position:
                # truncation maps quotients in (-1, 0) and [0, 1) to the same
                # "000", so tiles on either side of a projection axis would
                # overwrite each other.
                tx = '%03d' % int(np.floor(x0 / tile_shift))
                ty = '%03d' % int(np.floor(y0 / tile_shift))
                fname = '%s/tiles/%s/%s-%s.pq' % (roi_home, aoi['name'], tx, ty)
                os.makedirs(os.path.dirname(fname), exist_ok=True)
                gdf = gpd.GeoDataFrame(tile_gents).drop_duplicates()
                gdf.to_parquet(fname, index=False, compression="zstd")
                # print('wrote %s (%d)' % (fname, len(gdf)))

                # Save the tile info.
                tile_info.append({
                    'fname': fname,
                    'center_lon': tile_lon,
                    'center_lat': tile_lat
                })

            y0 += tile_shift
        x0 += tile_shift

# Save the tile info. Naming the columns keeps the CSV header intact even
# when no tile was retained.
fname = '%s/tile_info.csv' % roi_home
tile_info_columns = ['fname', 'center_lon', 'center_lat']
pd.DataFrame(tile_info, columns=tile_info_columns).to_csv(fname, index=False)
print('saved tile info in %s' % fname)



processing AOI: 0735w-413n
getting OSM features
11827 features from OSM
3254 features selected

processing AOI: 0735w-414n
getting OSM features
16283 features from OSM
3349 features selected

processing AOI: 0735w-415n
getting OSM features
6819 features from OSM
1550 features selected

processing AOI: 0735w-446n
getting OSM features
3536 features from OSM
938 features selected

processing AOI: 0734w-414n
getting OSM features
11773 features from OSM
2084 features selected

processing AOI: 0733w-413n
getting OSM features
8413 features from OSM
2041 features selected

processing AOI: 0733w-414n
getting OSM features
9025 features from OSM
2048 features selected

processing AOI: 0733w-423n
getting OSM features
3077 features from OSM
927 features selected

processing AOI: 0733w-424n
getting OSM features
5860 features from OSM
2191 features selected

processing AOI: 0733w-444n
getting OSM features
13191 features from OSM
2271 features selected

processing AOI: 0733w-445n
getting OSM features

In [13]:
# for g in tile_gents:
#     print(g['geometry'].bounds)

In [14]:
# For debugging.

# def pr_feature(f):
#     print('feature')
#     for k in f:
#         if str(f[k]) != 'nan':
#             print('    ', k, f[k])

# def pr_gent(g):
#     for k in g:
#         if k == 'feature':
#             pr_feature(g[k])
#         else:
#             print(k, g[k])
#         geomxy = shapely.ops.transform(proj_inverse, feature['geometry'])

# pr_gent(gents[121])

In [15]:
# Count the selected entities by category, label, and geometry type.
# Note: `gents` is rebuilt for every AOI in the processing loop, so this
# summarizes only the AOI that was processed last.
pd.DataFrame(gents)[['category', 'label', 'gtype']].value_counts().sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
category,label,gtype,Unnamed: 3_level_1
amenity,commercial,Point,7
amenity,food and drink,Point,2
amenity,parking lot,Polygon,210
landuse,agricultural,Polygon,7
landuse,forest,Polygon,18
landuse,industrial,Polygon,6
landuse,meadow,Polygon,1
landuse,recreation,Polygon,1
landuse,residential,Polygon,17
landuse,retail,Polygon,1


## QA Check

In [16]:
# # Take a look at the land/water polygons.
# import folium
# center_lon = (lon0 + lon1) / 2.0
# center_lat = (lat0 + lat1) / 2.0

# map_center = [center_lat, center_lon]
# m = folium.Map(location=map_center, zoom_start=10)
# geo_json = folium.GeoJson(landwater)
# geo_json.add_to(m)
# m


In [17]:
# import folium
# center_lon = (lon0 + lon1) / 2.0
# center_lat = (lat0 + lat1) / 2.0

# map_center = [center_lat, center_lon]
# m = folium.Map(location=map_center, zoom_start=10)
# for gent in tile_gents:
#     g0 = gent['geometry']
#     g1 = shapely.affinity.translate(g0, xoff=gent['xoff'], yoff=gent['yoff'])
#     g2 = shapely.ops.transform(proj_inverse, g1)
#     geo_json = folium.GeoJson(g2)
#     geo_json.add_to(m)
# m
