# Apply the trained embedding model to the entire dataset

## Processing Setup

In [20]:
# Google colab
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Local processing setup
# project_home = '..'

## Notebook Setup

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from typing import List, Tuple, Optional

import pickle
import pandas as pd

import sys
sys.path.append('%s/03-embeddings' % project_home)
from embedderv5 import *

## Parameters

In [23]:
# The name of the ROI to use.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

# The unique identifier of the model to be used.
run_id = '201b'

# Identifier of the splits file.
splits_id = '201'

# What type of device to train on.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device', device)


using device cuda


## Load and organize data
We have two data sources that we need to associate with one another:
a set of initial embeddings ("initials") and a set of feature vectors
to be used for similarity assessments ("features").

In [24]:
# We will divide into training and validation sets based on AOI.
# The splits have already been determined, before training the initial MGM.
# Here we look them up and re-organize things a bit.
fname = '%s/models/splits-%s.csv' % (roi_home, splits_id)
splits = pd.read_csv(fname)
print('%d splits' % len(splits))
splits.head(3)

splits_lookup = {
    '%s : %s' % (z['aoi_tag'], z['tile_tag']): z['split']
    for k, z in splits.iterrows()
}
print('%d elements in splits lookup' % len(splits_lookup))


15793 splits
15793 elements in splits lookup


In [25]:
# Get a list of tiles.
fname = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles' % len(tile_info))
tile_info.head(3)

15793 tiles


Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726


In [26]:
# Get the list of AOI tags.
aoi_tags = np.unique(tile_info['aoi_tag'])
print('%d unique AOIs' % len(aoi_tags))

263 unique AOIs


In [27]:
# Load initial embeddings. Put them into a lookup table based on aoi/tile identifiers.
embeddings_lookup = {}

for k, aoi_tag in enumerate(aoi_tags):

    if k % 10 == 0:
      print('%d/%d' % (k, len(aoi_tags)))

    fname = '%s/initials/%s.pkl' % (roi_home, aoi_tag)
    with open(fname, 'rb') as source:
        a = pickle.load(source)
    for b in a:
        key = '%s : %s' % (b['aoi_tag'], b['tile_tag'])
        e = b['embedding']
        embeddings_lookup[key] = e

print('%d total embeddings' % len(embeddings_lookup))


0/263
10/263
20/263
30/263
40/263
50/263
60/263
70/263
80/263
90/263
100/263
110/263
120/263
130/263
140/263
150/263
160/263
170/263
180/263
190/263
200/263
210/263
220/263
230/263
240/263
250/263
260/263
15793 total embeddings


In [28]:
embedding_dim = e.shape[-1]
print('dimension of embeddings is %d' % embedding_dim)

dimension of embeddings is 128


In [29]:
# Load initial Features. Ditto.
features_lookup = {}

for k, aoi_tag in enumerate(aoi_tags):

    if k % 10 == 0:
      print('%d/%d' % (k, len(aoi_tags)))

    fname = '%s/features/%s.pkl' % (roi_home, aoi_tag)
    with open(fname, 'rb') as source:
        a = pickle.load(source)
    for b in a:
        key = '%s : %s' % (b['aoi_tag'], b['tile_tag'])
        f = b['features']
        features_lookup[key] = f

print('%d total feature vectors' % len(features_lookup))


0/263
10/263
20/263
30/263
40/263
50/263
60/263
70/263
80/263
90/263
100/263
110/263
120/263
130/263
140/263
150/263
160/263
170/263
180/263
190/263
200/263
210/263
220/263
230/263
240/263
250/263
260/263
15793 total feature vectors


In [30]:
# Organize the data the way the model expects it.
sequences = []
features = []
idents = []
for key in features_lookup.keys():
    f = features_lookup[key]
    if key in embeddings_lookup:
        e = embeddings_lookup[key].squeeze().detach().cpu().numpy()
        sequences.append(e)
        features.append(f)
        idents.append(key)
    else:
        print('key mismatch: %s' % key)

print('%d sequences' % len(sequences))

15793 sequences


In [31]:
# # # This was the Claude-generated code to generate test data for the model.
# sequences, similarity_features = generate_sample_data(
#     num_instances=1000, min_R=5, max_R=20, C=32, similarity_dim=16
# )
# print(type(sequences))
# print(type(sequences[0]))
# print(sequences[0].shape)

## Model
The model code, including data loaders, the model itself, loss function, and all that,
were generated by Claude via a lot of iterative prompting and debugging.


In [32]:
# Claude generated code for training, which includes a collate function that
# deals with positive and negative exmaples for the contrastive loss function.
# To compute embeddings, we don't need that. So this is my re-write, which (1) only
# deals with the anchor examples, and (2) returns identifiers for each instance as well.

from embedderv5 import _pad_sequences


def embedding_collate_fn(batch):
    """
    Collate function for triplet data (anchor, positive, negatives).
    """
    # anchors, positives, negatives_list = [], [], []
    anchors = []

    for item in batch:
        anchor_seq, anchor_sim, anchor_ident = item['anchor']
        # pos_seq, pos_sim = item['positive']
        # neg_items = item['negatives']

        anchors.append((anchor_seq, anchor_sim, anchor_ident))
        # positives.append((pos_seq, pos_sim))
        # negatives_list.append(neg_items)

    # Process anchors
    anchor_seqs, anchor_sims, anchor_idents = zip(*anchors)
    anchor_padded, anchor_masks = _pad_sequences(anchor_seqs)
    anchor_sims = np.array(anchor_sims)

    # Process positives
    # pos_seqs, pos_sims = zip(*positives)
    # pos_padded, pos_masks = _pad_sequences(pos_seqs)
    # pos_sims = np.array(pos_sims)

    # Process negatives (flatten all negatives from all batch items)
    # all_neg_seqs, all_neg_sims = [], []
    # neg_batch_indices = []  # Track which batch item each negative belongs to

    # for batch_idx, neg_items in enumerate(negatives_list):
    #     for neg_seq, neg_sim in neg_items:
    #         all_neg_seqs.append(neg_seq)
    #         all_neg_sims.append(neg_sim)
    #         neg_batch_indices.append(batch_idx)

    # neg_padded, neg_masks = _pad_sequences(all_neg_seqs)
    # all_neg_sims = np.array(all_neg_sims)
    # neg_batch_indices = np.array(neg_batch_indices)

    return {
        'anchor': (
            torch.FloatTensor(anchor_padded),
            torch.BoolTensor(anchor_masks),
            torch.FloatTensor(anchor_sims),
            anchor_idents
        ),
        # 'positive': (torch.FloatTensor(pos_padded), torch.BoolTensor(pos_masks),
        #              torch.FloatTensor(pos_sims)),
        # 'negatives': (torch.FloatTensor(neg_padded), torch.BoolTensor(neg_masks),
        #               torch.FloatTensor(all_neg_sims), torch.LongTensor(neg_batch_indices))
    }


In [33]:
# Create dataset and dataloader with explicit triplet sampling
dataset = ContrastivePairDataset(
    sequences,
    features,
    idents,
    similarity_threshold=0.5,  # Adjust based on your similarity features
    num_negatives=2  # Number of negatives per anchor
)
# loader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=triplet_collate_fn)
loader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=embedding_collate_fn)



In [34]:
# Initialize model
model = PermutationInvariantModel(
    input_dim=embedding_dim,
    hidden_dim=128,
    embedding_dim=embedding_dim,
    num_attention_heads=8,
    num_linear_layers=3,
    dropout=0.1
)

print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")


Model has 181768 parameters


In [35]:
# Define the filename of the trained model.
model_fname = '%s/models/model-%s.pth' % (roi_home, run_id)

# Load the trained model.
model.load_state_dict(torch.load(model_fname))
model.eval()
print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")


Model has 181768 parameters


In [36]:
model = model.to(device)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
# criterion = TripletContrastiveLoss(margin=0.5, temperature=0.1)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.5)

all_embeddings = []

model.eval()
with torch.no_grad():
    for batch_idx, batch_data in enumerate(loader):

        # Unpack batch data
        anchor_seqs, anchor_masks, anchor_sims, anchor_idents = batch_data['anchor']

        # Move to device
        anchor_seqs, anchor_masks = anchor_seqs.to(device), anchor_masks.to(device)

        # Forward pass
        anchor_emb = model(anchor_seqs, anchor_masks)
        print(f"Batch anchor embeddings shape: {anchor_emb.shape}")

        # Unpack and re-associate embeddings with their identifiers.
        for i in range(len(anchor_idents)):
            ident = anchor_idents[i]
            aoi_tag, tile_tag = ident.split(' : ')
            embedding = anchor_emb[i].detach().cpu().numpy()
            all_embeddings.append({
                'aoi_tag': aoi_tag,
                'tile_tag': tile_tag,
                'embedding': embedding
            })


ofname = '%s/embeddings/embeddings-%s.pkl' % (roi_home, run_id)
os.makedirs(os.path.dirname(ofname), exist_ok=True)
with open(ofname, 'wb') as target:
    pickle.dump(all_embeddings, target)
print('%d embeddings to %s' % (len(all_embeddings), ofname))


Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shape: torch.Size([256, 128])
Batch anchor embeddings shap

In [37]:
len(anchor_idents)

177

In [38]:

# # Example inference
# model.eval()
# with torch.no_grad():
#     sample_batch = next(iter(loader))
#     # Unpack the dictionary structure from triplet data loader
#     anchor_seqs, anchor_masks, anchor_sims = sample_batch['anchor']
#     anchor_seqs = anchor_seqs.to(device)
#     anchor_masks = anchor_masks.to(device)

#     # Generate embeddings for anchor samples
#     embeddings = model(anchor_seqs, anchor_masks)
#     print(f"Generated embeddings shape: {embeddings.shape}")
#     print(f"Sample embedding norm: {torch.norm(embeddings[0]).item():.4f}")

#     # Get embeddings for positives
#     pos_seqs, pos_masks, pos_sims = sample_batch['positive']
#     pos_seqs, pos_masks = pos_seqs.to(device), pos_masks.to(device)
#     pos_embeddings = model(pos_seqs, pos_masks)
#     print(f"Positive embeddings shape: {pos_embeddings.shape}")

#     # Get embeddings for negatives
#     neg_seqs, neg_masks, neg_sims, neg_batch_indices = sample_batch['negatives']
#     neg_seqs, neg_masks = neg_seqs.to(device), neg_masks.to(device)
#     neg_batch_indices = neg_batch_indices.to(device)
#     neg_embeddings = model(neg_seqs, neg_masks)
#     print(f"Negative embeddings shape: {neg_embeddings.shape}")

#     # Check similarity between anchors and positives
#     pos_similarities = F.cosine_similarity(embeddings, pos_embeddings, dim=1)
#     print(f"Anchor-Positive similarities: {pos_similarities.mean().item():.4f} ± {pos_similarities.std().item():.4f}")

#     # Check similarity between anchors and negatives
#     batch_size = embeddings.shape[0]
#     neg_similarities_all = []

#     for i in range(batch_size):
#         # Get negatives for this anchor
#         neg_mask = neg_batch_indices == i
#         if neg_mask.sum() > 0:
#             anchor_i = embeddings[i:i+1]  # (1, embedding_dim)
#             negatives_i = neg_embeddings[neg_mask]  # (num_negs, embedding_dim)

#             # Compute similarities between this anchor and its negatives
#             neg_sims_i = F.cosine_similarity(
#                 anchor_i.expand_as(negatives_i), negatives_i, dim=1
#             )
#             neg_similarities_all.extend(neg_sims_i.cpu().tolist())

#     if len(neg_similarities_all) > 0:
#         neg_similarities = torch.tensor(neg_similarities_all)
#         print(f"Anchor-Negative similarities: {neg_similarities.mean().item():.4f} ± {neg_similarities.std().item():.4f}")

#         # Show the difference (should be positive if model is learning well)
#         print(f"Positive vs Negative similarity difference: {pos_similarities.mean().item() - neg_similarities.mean().item():.4f}")
#     else:
#         print("No negative samples found in this batch")
