# Towards a contrastive loss model for regional embeddings

OK so we have a few sample tiles. 
For each, we have computed vector embeddings (under the "embeddings" folder),
and we have a baseline methodology for computing similarity vectors for contrastive loss
(in the EDA notebook in this folder). 
Here I'm looking into putting these pieces together. The goal is
* compute the perm-equivariant transformation using the trained MGM model,
* Train an aggregator model using contrastive loss
The aggregator will use a layer that computes the weighted sum of the elements in its input set.


In [None]:
# Google colab setup
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [1]:
# Local processing setup
project_home = '..'

## Code

In [10]:
import pandas as pd
import numpy as np
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torch.utils.data import DataLoader
import copy
import json
from geo_encodings import MPPEncoder

import sys
sys.path.append(project_home)
from utils.geo_transformer_mem import VergeDataset, verge_collate_fn, GeospatialTransformer


In [5]:
# The name of the ROI to use.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)


In [11]:
# Read the ROI definition.
fname = '%s/roi.json' % roi_home
with open(fname) as source:
    roi = json.load(source)

tile_size = roi['tile_size']
encoding_resolution = roi['encoding_resolution']

# We need the dimension of the encoding.
encoder = MPPEncoder(
    region=[0, 0, tile_size, tile_size],
    resolution=encoding_resolution,
    center=True
)
geo_encoding_dim = len(encoder)
print('%d elements in encodings' % geo_encoding_dim)


400 elements in encodings


In [6]:
# Load labels
fname = "%s/labels.csv" % data_home
labels = pd.read_csv(fname)
labels.head(3)

Unnamed: 0,id,label
0,0,amenity : commercial
1,1,amenity : food and drink
2,2,amenity : parking lot


In [7]:
# Get a list of tiles.
fname = '%s/tile_info.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles' % len(tile_info))
tile_info.head(3)

15793 tiles


Unnamed: 0,fname,center_lon,center_lat
0,/content/drive/MyDrive/Projects/verge/data/new...,-73.488441,41.309272
1,/content/drive/MyDrive/Projects/verge/data/new...,-73.48881,41.318267
2,/content/drive/MyDrive/Projects/verge/data/new...,-73.489178,41.327261


## Read tile encodings

In [8]:
# Get a list of the tile file names.
import glob
globstring = '%s/tiles/*/*.pq' % roi_home
tile_fnames = glob.glob(globstring)
print('%d tile files' % len(tile_fnames))
tile_fnames[:3]

686 tile files


['../data/newengland/tiles/0709w-422n/225-104.pq',
 '../data/newengland/tiles/0709w-422n/229-104.pq',
 '../data/newengland/tiles/0709w-422n/227-105.pq']

## Load the trained model and get initial embeddings for tiles

In [12]:
run_id = '010'

# What type of device to train on.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('using device', device)

using device cpu


In [13]:
# Deterine which files to process using the "splits" file.
splits_fname = '%s/splits-%s.csv' % (roi_home, run_id)
splits = pd.read_csv(splits_fname)
val_fnames = splits[splits['split'] == 'val']['aoi'].tolist()
# val_fnames = splits[splits['type'] == 'train']['fname'].tolist()
print('%d files with validation data' % len(val_fnames))

# Read some data.
val_tiles = []
for fname in val_fnames[:3]:
    print('reading', fname)
    with open(fname, 'rb') as source:
        val_tiles += pickle.load(source)

print('%d validation tiles' % len(val_tiles))

FileNotFoundError: [Errno 2] No such file or directory: '../data/newengland/splits-010.csv'