# Feature vectors for training an embedding model

In another notebook, we created
embeddings for our tiles, using a masked geospatial model.
They consist of a set of vectors that are still associated with the individual
geospatial entities in each tile. And the transformation is permutation-equivariant,
which is not what we want for a regional embedding.

We seek a model that aggregates these initial embeddings,
using contrastive loss. But that means we need some feature vector by which
we can judge different tiles as "similar" or "dissimilar". 
In this notebook, we create such feature vectors.

## Processing Setup

In [None]:
# Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [None]:
# Local runs: the project root sits one level above this notebook.
# This path is the base for the data directories and is appended to sys.path
# so that the project's `utils` package can be imported.
project_home = ".."

## Notebook Setup

In [None]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json
import geopandas

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


## Parameters

In [None]:
# Unique identifier of the model run to be used.
run_id = '102'

# Name of the region of interest (ROI) to process.
roi_name = 'ne-laptop'

# General-purpose data directory, shared by all ROIs.
data_home = '%s/data' % project_home

# Data directory specific to the selected ROI.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Preliminaries

In [None]:
# Read the label table from the shared data directory and preview it.
# Its 'label' column defines the entries of every feature vector built below.
labels = pd.read_csv("%s/labels.csv" % data_home)
labels.head(3)

In [None]:
# Read the tile index for this ROI, report how many tiles it lists,
# and preview the first few rows.
tile_info = pd.read_csv('%s/tiles.csv' % roi_home)
print('%d tiles' % len(tile_info))
tile_info.head(3)

In [None]:
# Collect the distinct AOI tags found in the tile index
# (np.unique returns them sorted) and report how many there are.
aoi_tags = np.unique(tile_info['aoi_tag'])
n_aois = len(aoi_tags)
print('%d unique AOIs' % n_aois)

## Processing

In [None]:
# Build one feature vector per tile and write one pickle file per AOI.
#
# Each vector has one entry per label in labels.csv and records how much of
# that label the tile contains. The entry order is the sorted label list; it
# is the same for every tile, so it is computed once, outside the loops.
label_names = labels['label'].values
label_order = sorted(label_names)

for aoi_tag in aoi_tags:

    # This will hold the feature info for each tile in this AOI.
    # It is reset for every AOI, so after the loop it holds only the last one.
    feature_info = []

    aoi_tiles = tile_info[tile_info['aoi_tag'] == aoi_tag]

    for ti in aoi_tiles.to_dict('records'):

        tile_tag = ti['tile_tag']
        # print('handling tile %s' % tile_tag)

        # Read the tile data.
        fname = "%s/tiles/%s/%s.pq" % (roi_home, aoi_tag, tile_tag)
        tile = geopandas.read_parquet(fname)

        # Tally the coverage of each label within this tile. Every known
        # label starts at zero so labels absent from the tile keep a slot.
        coverages = dict.fromkeys(label_names, 0.0)

        for rec in tile.to_dict('records'):
            # The geometry type only selects how an entity is measured:
            # polygons by area, linestrings by length, anything else as a
            # count of one.
            # NOTE(review): the divisors suggest m^2 -> km^2 and m -> km,
            # which assumes a projected CRS in meters -- confirm.
            # NOTE(review): multi-part geometries (e.g. MultiPolygon), if
            # present, fall into the count branch -- confirm that is intended.
            if rec['gtype'] == 'Polygon':
                amount = rec['geometry'].area / 1000000.0
            elif rec['gtype'] == 'LineString':
                amount = rec['geometry'].length / 1000.0
            else:
                amount = 1.0

            # Keys have the form "<category> : <label>". A pair that is not
            # listed in labels.csv raises KeyError rather than being dropped.
            label = '%s : %s' % (rec['category'], rec['label'])
            coverages[label] += amount

        # Get a feature vector and add it to the list.
        v = np.array([coverages[z] for z in label_order])
        feature_info.append({
            'aoi_tag': aoi_tag,
            'tile_tag': tile_tag,
            'lon0': ti['lon0'],
            'lat0': ti['lat0'],
            'lon1': ti['lon1'],
            'lat1': ti['lat1'],
            'features': v
        })

    # Save this AOI's records, creating the output directory if needed.
    ofname = '%s/features/%s.pkl' % (roi_home, aoi_tag)
    os.makedirs(os.path.dirname(ofname), exist_ok=True)
    with open(ofname, 'wb') as dest:
        pickle.dump(feature_info, dest)
    print('%d feature records written to %s' % (len(feature_info), ofname))
    

In [None]:
# Number of feature records for the last AOI processed by the loop above
# (feature_info is reset at the start of every AOI iteration).
len(feature_info)

In [None]:
# Show the first feature record (from the last AOI processed) to check the
# record layout: tile identifiers, bounding coordinates, and the feature vector.
feature_info[0]