# Apply the trained embedding model to selected locations

The real purpose of this notebook is to develop an end-to-end process
to pull geo data, get MPP encodings, and then apply the initial and
final embedding models.

## Processing Setup

In [9]:
# Google Colab setup: mount Google Drive, switch the working directory to the
# project folder, and install the geo_encodings package.
import os
from google.colab import drive
drive.mount('/content/drive')
# Root of the project on the mounted drive; later cells build the data and
# module search paths from this value.
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)
# NOTE(review): the install is unpinned; the recorded run resolved to
# geo_encodings 1.0.4 -- consider pinning that version for reproducibility.
!pip install geo_encodings

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting geo_encodings
  Downloading geo_encodings-1.0.4-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading geo_encodings-1.0.4-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: geo_encodings
Successfully installed geo_encodings-1.0.4


In [None]:
# Local processing setup
# project_home = '..'

## Notebook Setup

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from typing import List, Tuple, Optional

import pickle
import json
import pandas as pd
import numpy as np

import sys
sys.path.append('%s/03-embeddings' % project_home)
from embedderv5 import *

## Parameters

In [3]:
# --- Identifiers ------------------------------------------------------------
# Name of the region of interest (ROI) to use.
roi_name = 'newengland'
# Unique identifier of the model to be used.
run_id = '201b'
# Identifier of the splits file.
splits_id = '201'

# --- Directories ------------------------------------------------------------
# General-purpose data directory.
data_home = '%s/data' % project_home
# ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)



## Preliminaries

In [6]:
# Load the ROI definition from its JSON file.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.loads(source.read())

# Pull out the two settings used to configure the encoder.
tile_size, encoding_resolution = roi['tile_size'], roi['encoding_resolution']

# Show the full ROI definition.
roi


{'name': 'newengland',
 'lon0': -73.564321,
 'lat0': 41.253746,
 'lon1': -68.058533,
 'lat1': 45.116468,
 'proj_def': '\n+proj=tmerc +lat_0=43.185107 +lon_0=-70.81142700000001\n+k=1.0 +x_0=231000.0 +y_0=211000.0 +datum=WGS84 +units=m +no_defs\n',
 'tile_size': 2000,
 'tile_shift': 1000,
 'encoding_resolution': 100}

In [10]:
# Build the MPP encoder used for tiles.
from geo_encodings import MPPEncoder

# The encoded region is [0, 0, tile_size, tile_size].
tile_region = [0, 0, tile_size, tile_size]
encoder = MPPEncoder(region=tile_region, resolution=encoding_resolution, center=True)

# The encoder's length is taken as the dimension of an encoding.
geo_encoding_dim = len(encoder)
print('%d elements in encodings' % geo_encoding_dim)


400 elements in encodings


## Processing


In [11]:
# Lon/lat for the center of a tile (Stratham BMW).
center_lat = 43.000659
center_lon = -70.921196


In [None]:
# We will divide into training and validation sets based on AOI.
# The splits have already been determined, before training the initial MGM.
# Here we look them up and re-organize things a bit.
fname = '%s/models/splits-%s.csv' % (roi_home, splits_id)
splits = pd.read_csv(fname)
print('%d splits' % len(splits))

# A bare expression in the middle of a cell is silently discarded, so the
# preview has to be displayed explicitly.
display(splits.head(3))

# Map "aoi_tag : tile_tag" -> split, using the same key format as the tile
# info lookup. Records are taken with to_dict('records') rather than
# iterrows() so values keep their per-column types (iterrows() coerces each
# row to a single dtype) and no unused row index is carried around.
splits_lookup = {
    '%s : %s' % (z['aoi_tag'], z['tile_tag']): z['split']
    for z in splits.to_dict('records')
}
print('%d elements in splits lookup' % len(splits_lookup))


In [None]:
# Read the table of tiles for this ROI.
fname = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles' % len(tile_info))

# Index each tile record by its "aoi_tag : tile_tag" identifier.
tile_info_lookup = {}
for tile_rec in tile_info.to_dict('records'):
    tile_key = '%s : %s' % (tile_rec['aoi_tag'], tile_rec['tile_tag'])
    tile_info_lookup[tile_key] = tile_rec

In [None]:
# Collect the distinct AOI tags that appear in the tile table.
aoi_column = tile_info['aoi_tag']
aoi_tags = np.unique(aoi_column)
print('%d unique AOIs' % aoi_tags.size)

In [None]:
# Load final embeddings.
# NOTE(review): embeddings_lookup is initialized here but is never populated
# or read in the visible cells.
embeddings_lookup = {}

fname = '%s/embeddings/embeddings-%s.pkl' % (roi_home, run_id)
# NOTE(review): pickle.load can execute arbitrary code during unpickling;
# only load embedding files that come from a trusted source.
with open(fname, 'rb') as source:
    a = pickle.load(source)

# The length of the loaded object is reported as the number of embeddings.
print('%d total embeddings' % len(a))


In [None]:
# Take the embedding dimension from the last axis of the first record's embedding.
first_embedding = a[0]['embedding']
embedding_dim = first_embedding.shape[-1]
print('dimension of embeddings is %d' % embedding_dim)

In [None]:
# Inspect the first loaded record.
a[0]

In [None]:
# Split the loaded records into parallel structures: a list of
# "aoi_tag : tile_tag" identifiers and a stacked array of embeddings.
idents = ['%s : %s' % (rec['aoi_tag'], rec['tile_tag']) for rec in a]
embedding_list = [rec['embedding'] for rec in a]

embeddings = np.vstack(embedding_list)
print(embeddings.shape)
print(len(idents))

In [None]:
# Pick a random embedding.
n = len(idents)
ix = np.random.choice(n)
print(ix)

# Compute distance between the selected embedding and all other embeddings.
dd = np.linalg.norm(embeddings - embeddings[ix], axis=1)

# Indices of all embeddings ordered from nearest to farthest.
reix = np.argsort(dd)
print(reix[:20])
print(dd[reix[:20]])

# The selected embedding is at distance zero from itself, but so is any exact
# duplicate, and argsort does not promise which of them sorts first. Drop the
# selected index explicitly so the neighbor is never the query itself.
ix0 = ix
ix1 = reix[reix != ix0][0]

ident_0 = idents[ix0]
ident_1 = idents[ix1]
# Use the selected indices; these were previously hard-coded to rows 0 and 1,
# which did not correspond to ident_0 / ident_1.
embed_0 = embeddings[ix0]
embed_1 = embeddings[ix1]
info_0 = tile_info_lookup[ident_0]
info_1 = tile_info_lookup[ident_1]
print(info_0)
print(info_1)

In [None]:
import folium

# Center the map on the midpoint between the two tiles.
center_lat = (info_0['center_lat'] + info_1['center_lat']) / 2
center_lon = (info_0['center_lon'] + info_1['center_lon']) / 2
m = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=10,
    width=800,
    height=500,
)

# Draw the bounding box of each tile: info_0 in blue, info_1 in red, with the
# tile identifier as the tooltip.
for _info, _color, _label in (
    (info_0, 'blue', ident_0),
    (info_1, 'red', ident_1),
):
    _corner_a = [_info['lat0'], _info['lon0']]
    _corner_b = [_info['lat1'], _info['lon1']]
    folium.Rectangle(
        bounds=[_corner_a, _corner_b],
        color=_color,
        fill=True,
        fill_color=_color,
        fill_opacity=0.2,
        tooltip=_label,
    ).add_to(m)

# Display the map
display(m)

## Clustering

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to partition the embeddings into.
n_clusters = 5

# Configure KMeans with a fixed random_state, then fit it on the embeddings
# and take the cluster label assigned to each one.
kmeans_options = dict(n_clusters=n_clusters, random_state=0, n_init=10)
kmeans = KMeans(**kmeans_options)
cluster_labels = kmeans.fit_predict(embeddings)

# Show the assigned labels.
print(cluster_labels)

In [None]:
import folium
import pandas as pd

# Create the map, centered on the first tile as a starting point (this is not
# the true center of the data).
if idents:
    first_tile_info = tile_info_lookup[idents[0]]
    m = folium.Map(
        location=[first_tile_info['center_lat'], first_tile_info['center_lon']],
        zoom_start=6,
        width=800,
        height=500,
    )
else:
    m = folium.Map(location=[0, 0], zoom_start=2, width=800, height=500) # Default map if no idents

# Define a color map for the clusters.
# Every entry must be a valid CSS color name because it is used to style the
# rectangles. 'salmon' and 'indigo' replace 'lightred' and 'darkpurple',
# which are marker-icon color names rather than CSS colors.
colors = [
    'red', 'blue', 'green', 'purple', 'orange', 'darkred',
    'salmon', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
    'indigo', 'white', 'pink', 'lightblue', 'lightgreen',
    'gray', 'black', 'lightgray', 'gold'
]

# Ensure enough colors for the number of clusters
if n_clusters > len(colors):
    print(f"Warning: Not enough colors defined for {n_clusters} clusters. Using repeating colors.")
    colors = (colors * ((n_clusters // len(colors)) + 1))[:n_clusters]


# Add each tile to the map with its cluster color.
# The per-tile record is bound to tile_rec, not tile_info, so the tile_info
# DataFrame loaded earlier in the notebook is not overwritten by this loop.
for i, ident in enumerate(idents):
    tile_rec = tile_info_lookup[ident]
    cluster = cluster_labels[i]
    color = colors[cluster]

    folium.Rectangle(
        bounds=[[tile_rec['lat0'], tile_rec['lon0']], [tile_rec['lat1'], tile_rec['lon1']]],
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.4,
        tooltip=f"Tile: {ident}, Cluster: {cluster}"
    ).add_to(m)

# Display the map
display(m)