# Feature vectors for training an embedding model

In another notebook, we created
embeddings for our tiles, using a masked geospatial model.
They consist of a set of vectors that are still associated with the individual
geospatial entities in each tile. And the transformation is permutation-equivariant,
which is not what we want for a regional embedding.

We seek a model that aggregates these initial embeddings,
using contrastive loss. But that means we need some feature vector by which
we can judge different tiles as "similar" or "dissimilar".
In this notebook, we create such feature vectors.

## Processing Setup

In [7]:
# Google Colab setup: mount Drive, then work from the project directory.
import os
from google.colab import drive

# Root of the project inside the mounted Drive; later cells build paths from it.
project_home = '/content/drive/MyDrive/Projects/verge'

drive.mount('/content/drive')
os.chdir(project_home)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Local processing setup
# project_home = '..'

## Notebook Setup

In [9]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json
import geopandas

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


## Parameters

In [10]:
# Region of interest (ROI) whose tiles are processed in this notebook.
roi_name = 'newengland'

# Unique identifier of the model to be used.
run_id = '201b'

# General-purpose data directory, shared across ROIs.
data_home = '%s/data' % (project_home,)

# Data directory specific to the selected ROI.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Preliminaries

In [11]:
# Read the label table from the shared data directory and preview it.
labels_path = "%s/labels.csv" % data_home
labels = pd.read_csv(labels_path)
labels.head(3)

Unnamed: 0,id,label
0,0,amenity : commercial
1,1,amenity : food and drink
2,2,amenity : parking lot


In [12]:
# Load the tile index for this ROI, report how many tiles it has, and preview it.
tiles_path = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(tiles_path)
n_tiles = len(tile_info)
print('%d tiles' % n_tiles)
tile_info.head(3)

15793 tiles


Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726


In [13]:
# Collect the distinct AOI tags found in the tile index
# (np.unique returns them sorted) and report how many there are.
aoi_tags = np.unique(tile_info['aoi_tag'])
n_aois = len(aoi_tags)
print('%d unique AOIs' % n_aois)

263 unique AOIs


## Processing

In [16]:
# Build one feature file per AOI. Each file holds a list with one record per
# tile: the tile's identifiers, its corner coordinates, and a feature vector
# with one coverage total per label.

# Fix the label order once. It defines which entry of the feature vector
# belongs to which label, and it is the same for every tile.
label_names = sorted(labels['label'].values)

for k_aoi, aoi_tag in enumerate(aoi_tags):

    # If the output file already exists, skip it.
    ofname = '%s/features/%s.pkl' % (roi_home, aoi_tag)
    if os.path.exists(ofname):
        print('skipping %s [%d/%d]' % (ofname, k_aoi, len(aoi_tags)))
        continue

    # This will hold the feature info for each tile in this AOI.
    feature_info = []

    iok = tile_info['aoi_tag'] == aoi_tag
    aoi_tiles = tile_info[iok]

    for ti in aoi_tiles.to_dict('records'):

        tile_tag = ti['tile_tag']

        # Read the tile data.
        fname = "%s/tiles/%s/%s.pq" % (roi_home, aoi_tag, tile_tag)
        tile = geopandas.read_parquet(fname)

        # Get a tally of the coverage of each geospatial entity type -- point,
        # linestring, or polygon -- accumulated per label.
        coverages = {z: 0.0 for z in label_names}

        for rec in tile.to_dict('records'):
            if rec['gtype'] == 'Polygon':
                # Presumably m^2 -> km^2; assumes the tile CRS is in meters -- TODO confirm.
                numerator = rec['geometry'].area / 1000000.0
            elif rec['gtype'] == 'LineString':
                # Presumably m -> km, under the same CRS assumption -- TODO confirm.
                numerator = rec['geometry'].length / 1000.0
            else:
                # Every other geometry type counts as one occurrence.
                # NOTE(review): any Multi* geometries would also land here --
                # confirm that is intended.
                numerator = 1.0

            # A "category : label" combination that is missing from the labels
            # table raises KeyError here rather than being silently dropped.
            label = '%s : %s' % (rec['category'], rec['label'])
            coverages[label] += numerator

        # Get a feature vector and add it to the list.
        v = np.array([coverages[z] for z in label_names])
        feature_info.append({
            'aoi_tag': aoi_tag,
            'tile_tag': tile_tag,
            'lon0': ti['lon0'],
            'lat0': ti['lat0'],
            'lon1': ti['lon1'],
            'lat1': ti['lat1'],
            'features': v
        })

    # Save that. Write to a temporary file and rename it into place, so an
    # interrupted run never leaves a truncated file that the "already exists"
    # check above would then skip forever.
    os.makedirs(os.path.dirname(ofname), exist_ok=True)
    tmp_ofname = ofname + '.tmp'
    with open(tmp_ofname, 'wb') as dest:
        pickle.dump(feature_info, dest)
    os.replace(tmp_ofname, ofname)
    print('%d feature records written to %s [%d/%d]' % (len(feature_info), ofname, k_aoi, len(aoi_tags)))


skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0687w-449n.pkl [0/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0688w-447n.pkl [1/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0688w-448n.pkl [2/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0689w-447n.pkl [3/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0697w-438n.pkl [4/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0697w-445n.pkl [5/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0698w-442n.pkl [6/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0698w-443n.pkl [7/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0699w-439n.pkl [8/263]
skipping /content/drive/MyDrive/Projects/verge/data/newengland/features/0700w-417n.pkl [9/263]
skipping /content/drive/MyDrive/Projects/verge/dat

In [17]:
# Reload one AOI's saved records from disk instead of relying on whatever the
# processing loop left in memory: when every AOI is skipped as already done,
# the loop never assigns `feature_info`, and this cell would fail on a fresh
# kernel. The pickle was written by this notebook, so loading it is trusted.
sample_ofname = '%s/features/%s.pkl' % (roi_home, aoi_tags[-1])
with open(sample_ofname, 'rb') as source:
    feature_info = pickle.load(source)
len(feature_info)

39

In [18]:
# Show the first record: tile identifiers, corner coordinates, and the feature vector.
feature_info[0]

{'aoi_tag': '0735w-446n',
 'tile_tag': '017-371',
 'lon0': -73.5,
 'lat0': 44.60000000000001,
 'lon1': -73.47565874649815,
 'lat1': 44.61856841903063,
 'features': array([0.        , 0.        , 0.00747865, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 4.29397661,
        1.83464746, 4.22998132, 0.        , 0.10437164, 0.01677498,
        1.        , 0.        ])}