Install and import the required packages. 

In [4]:

!pip install geoopt==0.3.1
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
!pip3 install geomstats==2.2
os.environ['GEOMSTATS_BACKEND'] = 'pytorch'

You should consider upgrading via the '/home/noga/NeuroSEED/neuroseed/bin/python3 -m pip install --upgrade pip' command.[0m
Looking in links: https://download.pytorch.org/whl/torch_stable.html
You should consider upgrading via the '/home/noga/NeuroSEED/neuroseed/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/noga/NeuroSEED/neuroseed/bin/python3 -m pip install --upgrade pip' command.[0m


In [5]:
import torch
import os 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from geomstats.geometry.poincare_ball import PoincareBall

from edit_distance.train import load_edit_distance_dataset
from util.data_handling.data_loader import get_dataloaders
from util.ml_and_math.loss_functions import AverageMeter


In this notebook, we only show the code to run a simple linear layer on the sequence which, in the hyperbolic space, already gives particularly good results. Later we will also report results for more complex models whose implementation can be found in the [NeuroSEED repository](https://github.com/gcorso/NeuroSEED).

In [6]:
class LinearEncoder(nn.Module):
    """  Linear model which simply flattens the sequence and applies a linear transformation. """

    def __init__(self, len_sequence, embedding_size, alphabet_size=4):
        super(LinearEncoder, self).__init__()
        self.encoder = nn.Linear(in_features=alphabet_size * len_sequence, 
                                 out_features=embedding_size)

    def forward(self, sequence):
        # flatten sequence and apply layer
        B = sequence.shape[0]
        sequence = sequence.reshape(B, -1)
        emb = self.encoder(sequence)
        return emb


class PairEmbeddingDistance(nn.Module):
    """ Wrapper model for a general encoder, computes pairwise distances and applies projections """

    def hyperbolic_metric(self, enc_sequence):
        distance=PoincareBall(self.embedding_size).metric.dist(enc_sequence[:, 0], enc_sequence[:, 1])
        distance = distance * self.scaling
        return distance

    def euclidean_metric(self, enc_sequence):
        distance=torch.norm(enc_sequence[:, 0]-enc_sequence[:, 1], dim=-1)
        return distance

    def __init__(self, embedding_model, embedding_size, scaling=False,hyperbolic=True):
        super(PairEmbeddingDistance, self).__init__()

        self.embedding_size=embedding_size

        if hyperbolic:
            self.metric = self.hyperbolic_metric
            self.radius = nn.Parameter(torch.Tensor([1e-2]), requires_grad=True)
            self.scaling = nn.Parameter(torch.Tensor([1.]), requires_grad=True)
        else:
            self.metric=self.euclidean_metric

        self.embedding_model = embedding_model



    def normalize_embeddings(self, embeddings):
        """ Project embeddings to an hypersphere of a certain radius """
        if self.hyperbolic_metric==True:
            min_scale = 1e-7
            max_scale = 1 - 1e-3
            return F.normalize(embeddings, p=2, dim=1) * self.radius.clamp_min(min_scale).clamp_max(max_scale)
        else:
            return embeddings

    def encode(self, sequence):
        """ Use embedding model and normalization to encode some sequences. """
        enc_sequence = self.embedding_model(sequence)
        enc_sequence = self.normalize_embeddings(enc_sequence)
        return enc_sequence

    def forward(self, sequence):
        # flatten couples
        (B, _, N, _) = sequence.shape
        sequence = sequence.reshape(2 * B, N, -1)

        # encode sequences
        enc_sequence = self.encode(sequence)

        # compute distances
        enc_sequence = enc_sequence.reshape(B, 2, -1)

        distance = self.metric(enc_sequence)


        return distance

General training and evaluation routines used to train the models:

In [7]:

def train(model, loader, optimizer, loss, device):
    avg_loss = AverageMeter()
    model.train()

    for sequences, labels in loader:
        # move examples to right device
        sequences, labels = sequences.to(device), labels.to(device)

        # forward propagation
        optimizer.zero_grad()
        output = model(sequences)

        # loss and backpropagation
        loss_train = loss(output, labels)
        loss_train.backward()
        optimizer.step()

        # keep track of average loss
        avg_loss.update(loss_train.data.item(), sequences.shape[0])

    return avg_loss.avg


def test(model, loader, loss, device):
    avg_loss = AverageMeter()
    model.eval()

    for sequences, labels in loader:
        # move examples to right device
        sequences, labels = sequences.to(device), labels.to(device)

        # forward propagation and loss computation
        output = model(sequences)
        loss_val = loss(output, labels).data.item()
        avg_loss.update(loss_val, sequences.shape[0])

    return avg_loss.avg

The linear model is trained on 7000 sequences (+700 of validation) and tested on 1500 different sequences: 

In [8]:
# create subset of qiita to better understand mechanisms
import pickle

with open('./datasets/string_subset.pkl', 'rb') as f:
        sequences, distances = pickle.load(f)

print(distances.keys())
slices={'train':7000, 'test':700, 'val':1500}
smaller_distances = {key: distances[key][:slices[key],:slices[key]] for key in distances.keys()}
smaller_sequences = {key: sequences[key][:slices[key]] for key in sequences.keys()}

pickle.dump((smaller_sequences, smaller_distances),open('./datasets/string_subsubset' + ".pkl", "wb"))



dict_keys(['train', 'val', 'test'])


In [12]:
from edit_distance.train import load_edit_distance_dataset

EMBEDDING_SIZE = 128

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(2021)
if device == 'cuda':
    torch.cuda.manual_seed(2021)

# load data
datasets = load_edit_distance_dataset('./datasets/string_for_test.pkl')
loaders = get_dataloaders(datasets, batch_size=128, workers=5)

# model, optimizer and loss
model=optimizer=loss=loss_train=loss_val=avg_loss={}

encoder = LinearEncoder(153, EMBEDDING_SIZE)
model['hyperbolic'] = PairEmbeddingDistance(embedding_model=encoder, embedding_size=EMBEDDING_SIZE)
model['hyperbolic'].to(device)

optimizer = optim.Adam(model['hyperbolic'].parameters(), lr=0.001)
loss = nn.MSELoss()

# training
for epoch in range(0, 20):
    t = time.time()
    loss_train = train(model['hyperbolic'], loaders['train'], optimizer, loss, device)
    loss_val['hyperbolic'] = test(model['hyperbolic'], loaders['val'], loss, device)

    # print progress
    if epoch % 5 == 0:
        print('Epoch: {:02d}'.format(epoch),
              'loss_train: {:.6f}'.format(loss_train),
              'loss_val: {:.6f}'.format(loss_val['hyperbolic']),
              'time: {:.4f}s'.format(time.time() - t))
      
# testing
for dset in loaders.keys():
    avg_loss = test(model['hyperbolic'], loaders[dset], loss, device)
    print('Final results {}: loss = {:.6f}'.format(dset, avg_loss))


Epoch: 00 loss_train: 924.024585 loss_val: 836.552754 time: 1.6643s


AttributeError: 'float' object has no attribute 'train'

In [25]:
from edit_distance.train import load_edit_distance_dataset_calculate

# model, optimizer and loss
model['euclidean'] = PairEmbeddingDistance(embedding_model=encoder, embedding_size=EMBEDDING_SIZE, hyperbolic=False)
model['euclidean'].to(device)

optimizer = optim.Adam(model['euclidean'].parameters(), lr=0.001)
loss = nn.MSELoss()

# training
for epoch in range(0, 5):
    t = time.time()
    loss_train = train(model['euclidean'], loaders['train'], optimizer, loss, device)
    loss_val = test(model['euclidean'], loaders['val'], loss, device)

    # print progress
    if epoch % 1 == 0:
        print('Epoch: {:02d}'.format(epoch),
              'loss_train: {:.6f}'.format(loss_train),
              'loss_val: {:.6f}'.format(loss_val),
              'time: {:.4f}s'.format(time.time() - t))
      
# testing
for dset in loaders.keys():
    avg_loss = test(model['euclidean'], loaders[dset], loss, device)
    print('Final results {}: loss = {:.6f}'.format(dset, avg_loss))


Epoch: 00 loss_train: 0.103979 loss_val: 0.001626 time: 414.7923s
Epoch: 01 loss_train: 0.001360 loss_val: 0.001742 time: 415.0668s
Epoch: 02 loss_train: 0.001722 loss_val: 0.002383 time: 414.3412s
Epoch: 03 loss_train: 0.002209 loss_val: 0.002347 time: 414.1662s
Epoch: 04 loss_train: 0.002588 loss_val: 0.002653 time: 413.1137s
Final results train: loss = 0.002938
Final results val: loss = 0.002653
Final results test: loss = 0.002803
