# Feature vectors for training an embedding model

In another notebook, we created
embeddings for our tiles, using a masked geospatial model.
They consist of a set of vectors that are still associated with the individual 
geospatial entities in each tile. And the transformation is permutation-equivariant, 
which is not what we want for a regional embedding.

We seek a model that aggregates these initial embeddings,
using contrastive loss. But that means we need some feature vector by which
we can judge different tiles as "similar" or "dissimilar". 
In this notebook, we create such feature vectors.

## Processing Setup

In [None]:
# Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [None]:
# Local processing setup: the project root is one level up from the
# current working directory.
project_home = ".."

## Notebook Setup

In [None]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


## Parameters

In [None]:
# Unique identifier of the model to be used.
run_id = '102'

# Name of the region of interest (ROI) to process.
roi_name = 'ne-laptop'

# General-purpose data directory, located under the project root.
data_home = '%s/data' % project_home

# ROI-specific data directory: a sub-directory of the data directory
# named after the ROI.
roi_home = '%s/data/%s' % (project_home, roi_name)


## Preliminaries

In [None]:
# Read the label table from the general-purpose data directory and
# preview its first rows.
labels = pd.read_csv("%s/labels.csv" % data_home)
labels.head(3)

In [None]:
# Read the per-tile table for this ROI, report how many tiles it lists,
# and preview its first rows.
tile_info = pd.read_csv('%s/tiles.csv' % roi_home)
print('%d tiles' % tile_info.shape[0])
tile_info.head(3)

In [None]:
# Collect the distinct AOI tags present in the tile table
# (np.unique returns them sorted) and report how many there are.
aoi_tags = np.unique(tile_info['aoi_tag'].to_numpy())
print('%d unique AOIs' % aoi_tags.size)

## Processing

In [None]:
# Build one coverage feature vector per tile, walking the tiles AOI by AOI.
#
# Every geospatial entity in a tile adds to the tally of its label:
# polygons add area / 1e6, line strings add length / 1e3, and any other
# geometry type counts as 1.
# NOTE(review): the divisors presumably convert m^2 -> km^2 and m -> km,
# which assumes the tiles are stored in a metric projected CRS -- confirm.
import geopandas

# Start from empty accumulators so the cell runs on a fresh kernel and
# re-running it does not append to results left over from an earlier run.
feature_vectors = []
center_lons = []
center_lats = []

# Fixed column order of the feature vectors; computed once, not per tile.
label_order = sorted(labels['label'].values)

for aoi_tag in aoi_tags:
    aoi_tiles = tile_info[tile_info['aoi_tag'] == aoi_tag]
    print(aoi_tag, len(aoi_tiles))

    for ti in aoi_tiles.to_dict('records'):

        tile_tag = ti['tile_tag']
        print('handling tile %s' % tile_tag)

        # Read the tile data.
        fname = "%s/%s/%s.pq" % (roi_home, aoi_tag, tile_tag)
        tile = geopandas.read_parquet(fname)

        # Make a label column that matches the format of the "labels" data frame.
        tile['full_label'] = tile['category'] + ' : ' + tile['label']

        # Tally the coverage of each label; labels absent from the tile stay 0.
        coverages = {z: 0.0 for z in label_order}

        for rec in tile.to_dict('records'):
            if rec['gtype'] == 'Polygon':
                numerator = rec['geometry'].area / 1000000.0
            elif rec['gtype'] == 'LineString':
                numerator = rec['geometry'].length / 1000.0
            else:
                numerator = 1.0

            # A label that is missing from the label table raises KeyError here.
            coverages[rec['full_label']] += numerator

        # Get a feature vector and add it to the list, together with the
        # tile center taken from this tile's own row of the tile table.
        feature_vectors.append(np.array([coverages[z] for z in label_order]))
        center_lons.append(ti['center_lon'])
        center_lats.append(ti['center_lat'])
    

In [None]:
# Get feature vectors for each tile.
#
# Every geospatial entity in a tile adds to the tally of its label:
# polygons add area / 1e6, line strings add length / 1e3, and any other
# geometry type counts as 1.
# NOTE(review): the divisors presumably convert m^2 -> km^2 and m -> km,
# which assumes the tiles are stored in a metric projected CRS -- confirm.
import geopandas

feature_vectors = []
center_lons = []
center_lats = []

# Fixed column order of the feature vectors; computed once, not per tile.
label_order = sorted(labels['label'].values)

for k, ti in enumerate(tile_info.to_dict('records')):

    if k % 1000 == 0:
        print('handling tile %d / %d' % (k, len(tile_info)))

    # NOTE(review): this path used to be built from `data_dname` and
    # ti['fname'][6:], but `data_dname` is not defined anywhere in this
    # notebook. The <roi_home>/<aoi_tag>/<tile_tag>.pq layout below is the
    # one the per-AOI processing cell reads from -- confirm it is the
    # intended location of the tile files.
    fname = "%s/%s/%s.pq" % (roi_home, ti['aoi_tag'], ti['tile_tag'])
    center_lon = ti['center_lon']
    center_lat = ti['center_lat']

    # Read the tile data.
    tile = geopandas.read_parquet(fname)

    # Make a label column that matches the format of the "labels" data frame.
    tile['full_label'] = tile['category'] + ' : ' + tile['label']

    # Tally the coverage of each label; labels absent from the tile stay 0.
    coverages = {z: 0.0 for z in label_order}

    for rec in tile.to_dict('records'):
        if rec['gtype'] == 'Polygon':
            numerator = rec['geometry'].area / 1000000.0
        elif rec['gtype'] == 'LineString':
            numerator = rec['geometry'].length / 1000.0
        else:
            numerator = 1.0

        # A label that is missing from the label table raises KeyError here.
        coverages[rec['full_label']] += numerator

    # Get a feature vector and add it to the list.
    feature_vectors.append(np.array([coverages[z] for z in label_order]))
    center_lons.append(center_lon)
    center_lats.append(center_lat)
    

In [None]:
# Stack the per-tile vectors row-wise into a single matrix
# (one row per tile) and report its shape.
features = np.vstack(feature_vectors)
print('feature matrix shape', np.shape(features))

In [None]:
# Standardize each feature column: subtract the column mean and divide by
# the column standard deviation. Only columns are scaled; rows are left
# as they are. The small constant added to the denominator avoids a
# division by zero for columns that never vary.
m = np.mean(features, axis=0)
s = np.std(features, axis=0)
zzz = np.divide(features - m, s + 0.0001)

## TSNE plot of feature vectors

In [None]:
# Embed the standardized feature matrix with t-SNE (default settings);
# the resulting coordinates are what the scatter plot draws.
from sklearn.manifold import TSNE

tsne = TSNE()
xy = tsne.fit_transform(X=zzz)

## Run a KMeans clustering on the feature vectors.

In [None]:
from sklearn.cluster import KMeans

# Partition the tiles into 12 clusters with k-means.
# A fixed random_state keeps the cluster ids reproducible from run to run,
# which matters because the map cell picks a cluster by a hard-coded id.
# NOTE(review): this clusters the raw feature vectors, whereas the t-SNE
# projection uses the standardized matrix `zzz` -- confirm that the
# unscaled input is intended.
model = KMeans(n_clusters=12, random_state=0)
model.fit(feature_vectors)
print('%d clusters' % len(set(model.labels_)))

In [None]:
import plotly
from plotly.subplots import make_subplots
from plotly.graph_objects import Scatter

# Scatter plot of the t-SNE coordinates, one trace (and color) per cluster.

# Palette of distinct colors; it is cycled if there are more clusters
# than entries.
colors = [
    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
    "#ffff33", "#a65628", "#f781bf", "#999999", "#66c2a5",
    "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f",
    "#e5c494", "#b3b3b3", "#1b9e77", "#d95f02", "#7570b3",
    "#e7298a", "#66a61e", "#e6ab02", "#a6761d", "#666666",
    "#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3",
    "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd",
    "#ccebc5", "#ffed6f", "#a1dab4", "#41b6c4", "#2c7fb8"
]

# NOTE(review): this rebinds `labels`, which earlier held the label data
# frame, to the array of cluster assignments. The name is kept because the
# map cell reads it, but the feature-vector cells can no longer be re-run
# after this one without reloading the label table.
labels = model.labels_

# Sort the cluster ids so that the color of each cluster and the order of
# the legend entries are the same on every run.
ulabels = sorted(set(labels))

fig = make_subplots(rows=1, cols=1)
for k, ulabel in enumerate(ulabels):
    iok = labels == ulabel
    # Name each trace after its cluster id so the legend can be used to
    # pick a cluster for the map.
    tr = Scatter(x=xy[iok, 0], y=xy[iok, 1], mode='markers',
                 name='cluster %d' % ulabel,
                 marker={'color': colors[k % len(colors)]})
    fig.add_trace(tr, row=1, col=1)

fig.update_layout(width=800, height=500)
fig


## Make a map showing the locations of all elements of a cluster

In [None]:
import folium

# Draw the tile centers of one cluster on an interactive map.

# Id of the cluster to display.
label = 7

# Read the assignments straight from the fitted model, so this cell does not
# depend on `labels` having been rebound by the plotting cell.
iok = model.labels_ == label
if not iok.any():
    # Fail with a clear message instead of an IndexError on coords[0].
    raise ValueError('no tiles were assigned to cluster %d' % label)

coords = list(zip(np.array(center_lons)[iok], np.array(center_lats)[iok]))
# To show every tile instead of a single cluster:
# coords = list(zip(np.array(center_lons), np.array(center_lats)))

# Center the map on the first tile of the cluster. The location is passed
# as [lat, lon], while coords holds (lon, lat) pairs.
cluster_map = folium.Map(location=[coords[0][1], coords[0][0]], zoom_start=5)

# Add solid circle markers for each coordinate
for lon, lat in coords:
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,               # Small circle
        color='black',          # Circle border color
        fill=True,
        fill_color='green',     # Solid fill color
        fill_opacity=1.0        # Fully opaque
    ).add_to(cluster_map)

cluster_map
