## Siamese Neural Networks 
### for Supervised Clustering of High Dimensional Spaces

In [1]:
# Show the Python version and the interpreter path seen by the shell.
!python -V
!which python

Python 3.8.5
/Users/seanmacrae/mobius/.venv/bin/python


Load one of the simplest datasets possible: penguins!

In [2]:
import pandas as pd
from palmerpenguins import load_penguins

# Load the raw penguins frame, then expose the target column `species`
# under the name `label` (appended as the last column).
penguins_raw = load_penguins()
df = penguins_raw.assign(label=penguins_raw["species"]).drop(columns="species")

print(df.shape)
df.head()

(344, 8)


Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,label
0,Torgersen,39.1,18.7,181.0,3750.0,male,2007,Adelie
1,Torgersen,39.5,17.4,186.0,3800.0,female,2007,Adelie
2,Torgersen,40.3,18.0,195.0,3250.0,female,2007,Adelie
3,Torgersen,,,,,,2007,Adelie
4,Torgersen,36.7,19.3,193.0,3450.0,female,2007,Adelie


In [5]:
# Count the number of rows per value of the `label` column.
df["label"].value_counts()

Adelie       152
Gentoo       124
Chinstrap     68
Name: label, dtype: int64

The penguins dataset has 344 observations and 8 columns: 7 features plus the label. The label of interest is the `species` of penguin, stored in the column `label`.

## Tabular Learner

The Tabular Siamese Network starts from the fastai `TabularModel` model definition. I first instantiate a new `TabularModel` but I don't fit it. Once the model is fully defined, I remove the head (the final linear classification layer) and repurpose the body as the Siamese encoder. By using the `ContrastiveLoss` function defined in `mobius` with the `TabularModel` body, I create a metric learner that discriminates between classes by learning an embedding space in which observations of the same class are mapped to nearby regions.

### Create Dataloaders

Defining a fastai `TabularModel` requires a `DataLoader`, which I create below.

In [6]:
from fastai.tabular.all import CategoryBlock
                                
# Target column name(s), kept as a list, and the fastai block used for the target.
# Both are passed to `TabularPandas` further down.
y_names = ["label"]
y_block = CategoryBlock()

In [7]:
# Columns filtered out of the categorical and continuous feature lists built below.
exclude_vars = ["label"]

In [8]:
from mobius.utils import emb_sz_rule

# Categorical features: every non-numeric column that is neither a target
# nor explicitly excluded.
# `y_names` is a list, so membership (`not in`) is the correct test; the
# earlier `x != y_names` compared a string with a list and was always True.
cat_names = [
    x
    for x in df.select_dtypes(exclude=['int', 'float']).columns
    if x not in y_names and x not in exclude_vars
]

# calc embedding sizes for each categorical feature
emb_szs = {k: emb_sz_rule(len(df[k].unique())) for k in cat_names}
emb_szs

{'island': 3, 'sex': 3}

In [9]:
import numpy as np

# Continuous features: every numeric column that is neither a target nor
# explicitly excluded.
# `y_names` is a list, so membership (`not in`) is the correct test; the
# earlier `x != y_names` compared a string with a list and was always True.
cont_names = [
    x
    for x in df.select_dtypes([np.number]).columns
    if x not in y_names and x not in exclude_vars
]
cont_names

['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']

Define pre-processing steps of the data loaders.

In [10]:
from fastai.data.core import range_of
from fastai.tabular.all import RandomSplitter
from fastai.tabular.all import (Categorify, CategoryBlock, FillMissing, FillStrategy,
                                Normalize, TabDataLoader, TabularPandas,
                                tabular_config, tabular_learner)

# pre-processing
procs = [FillMissing, Categorify, Normalize]

# train/test split
# A fixed seed keeps the 90/10 split (and therefore the normalisation
# statistics and every recorded output below) identical across
# "Restart & Run All" executions.
splits = RandomSplitter(valid_pct=0.10, seed=42)(range_of(df))

In [11]:
# Bundle the frame with its pre-processing steps, the feature/target column
# roles and the train/validation row indices.
tabular_pandas = TabularPandas(
    df,
    splits=splits,
    procs=procs,
    y_block=y_block,
    y_names=y_names,
    cont_names=cont_names,
    cat_names=cat_names,
    device="cpu",
)

In [12]:
# Settings shared by the training and validation loaders.
loader_kwargs = dict(bs=8, num_workers=1)

# Training loader: shuffled, and a trailing partial batch is dropped.
trn_dl = TabDataLoader(
    tabular_pandas.train, shuffle=True, drop_last=True, **loader_kwargs
)

# Validation loader: only the shared settings are passed.
val_dl = TabDataLoader(tabular_pandas.valid, **loader_kwargs)

Let's take a look at a batch of penguins!

In [13]:
from fastai.data.core import DataLoaders

# Pair the training and validation loaders in a single `DataLoaders` object.
dls = DataLoaders(trn_dl, val_dl)

print("Sample batch:")
# The batch is shown as the cell output.
dls.one_batch()

Sample batch:


(tensor([[3, 2, 1, 1, 1, 1],
         [1, 0, 2, 2, 2, 2],
         [1, 2, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1],
         [3, 2, 1, 1, 1, 1],
         [2, 1, 1, 1, 1, 1],
         [2, 1, 1, 1, 1, 1],
         [1, 2, 1, 1, 1, 1]]),
 tensor([[-1.7222,  1.9682, -0.1988,  0.2744, -1.2746],
         [ 0.1622,  0.1512, -0.2703, -0.2293,  1.1872],
         [ 1.3332, -1.5143,  1.2326,  1.4079,  1.1872],
         [ 0.6379, -1.1105,  1.2326,  0.9671,  1.1872],
         [-0.5330,  0.7064, -0.8429, -1.0794,  1.1872],
         [ 0.2171, -0.3030, -0.6997, -1.1739,  1.1872],
         [-1.3014,  0.7064, -0.8429, -0.8590, -0.0437],
         [-0.5330,  0.9588, -0.9145, -0.1034, -0.0437]]),
 tensor([[0],
         [2],
         [2],
         [2],
         [0],
         [1],
         [0],
         [0]], dtype=torch.int8))

Each batch is a 3-tuple containing a tensor of categorical features, a tensor of continuous features, and a tensor of labels.

In [14]:
from fastai.metrics import F1Score, Precision, Recall, accuracy

# Layer sizes handed to the tabular learner.
layers = [128, 32]

# tabular learner configuration
config = tabular_config(ps=[0.03, 0.03], embed_p=0.0)

# Accuracy plus macro-averaged precision, recall and F1.
learner_metrics = [
    accuracy,
    Precision(average='macro'),
    Recall(average='macro'),
    F1Score(average='macro'),
]

# load the tabular_pandas data through the tabular_learner
learn = tabular_learner(
    dls,
    layers=layers,
    emb_szs=emb_szs,
    config=config,
    train_bn=False,
    metrics=learner_metrics,
)

In [15]:
# learn.fit_one_cycle(n_epoch=3)

In [16]:
# learn.export("tabular_learn.pkl")

In [17]:
# from mobius.calibration import ModelWithTemperature

# scaled_model = ModelWithTemperature(learn.model)
# scaled_model.set_temperature(val_dl)
# learn.model = scaled_model.model

In [18]:
# # true species labels
# y_true=learn.dls.valid.items["label"]

# # model scores and species predictions
# y_scores, *_ = learn.get_preds(dl=val_dl)
# preds = np.argmax(y_scores, 1).numpy()

In [19]:
# print("First 10 species labels and predictions")
# list(zip(y_true, preds))[:10]

In [20]:
# (y_true == preds).sum() / len(y_true)

## Siamese Net (à la DrLIM)

To init a new `TabularSiameseDataset` object, we only need a `tabular_pandas` object from the fast.ai library.

In [21]:
from pathlib import Path

from mobius.datasets import write_jsonl

# Make sure the output directory exists so this cell also runs on a fresh
# checkout; `to_csv` does not create missing parent directories.
Path("data").mkdir(parents=True, exist_ok=True)

# write SNN training data to `data/`
write_jsonl(tabular_pandas.train.to.items[0].items, "data/train_data.jsonl")
write_jsonl(tabular_pandas.valid.to.items[0].items, "data/valid_data.jsonl")

# write SNN training labels to `data/`
tabular_pandas.train.y.to_csv("data/train_labels.csv", index=True)
tabular_pandas.valid.y.to_csv("data/valid_labels.csv", index=True)

In [22]:
from mobius.datasets import TabularSiameseDataset

# Build one Siamese pair dataset per split from the files written to `data/`;
# both receive the same tabular learner.
train_ds = TabularSiameseDataset(
    tabular_learner=learn,
    jsonl_file="data/train_data.jsonl",
    csv_file="data/train_labels.csv",
)

valid_ds = TabularSiameseDataset(
    tabular_learner=learn,
    jsonl_file="data/valid_data.jsonl",
    csv_file="data/valid_labels.csv",
)

In [23]:
# Size of the training pair dataset and one example item (index 1).
len(train_ds), train_ds[1]

(310,
 (((tensor([1, 2, 1, 1, 1, 1]),
    tensor([-0.4415,  0.4036, -0.6282, -0.2923, -0.0437])),
   (tensor([1, 2, 1, 1, 1, 1]),
    tensor([-0.4964,  1.9682, -0.4135,  0.2744, -0.0437]))),
  tensor(0.)))

In [24]:
# Size of the validation pair dataset and its first item.
len(valid_ds), valid_ds[0]

(34,
 (((tensor([1, 1, 1, 1, 1, 1]),
    tensor([-1.6490,  0.3531, -0.7713, -0.9220, -0.0437])),
   (tensor([1, 1, 1, 1, 1, 1]),
    tensor([-1.6490,  0.3531, -0.7713, -0.9220, -0.0437]))),
  tensor(0.)))

In [25]:
# Wrap the Siamese pair datasets in CPU data loaders (batches of 2 pairs).
dls = DataLoaders.from_dsets(
    train_ds,
    valid_ds,
    bs=2,
    drop_last=True,
    num_workers=2,
    device='cpu',
)

The Siamese net encoder is the body of the Tabular net defined above (which, as noted, has not been fitted).

In [26]:
# Inspect the `c` attribute of the underlying dataset
# (presumably the number of classes — TODO confirm against `mobius.datasets`).
dls.dataset.c

3

In [27]:
import copy

# `copy.copy(learn)` is shallow: the copied learner would still share the very
# same model object as `learn`. Deep-copy the model before dropping its head so
# that `learn.model` is left untouched and re-running this cell does not strip
# one more layer each time.
encoder = copy.copy(learn)
encoder.model = copy.deepcopy(learn.model)

# Drop the last entry of `layers` (the head); what remains is the encoder body.
encoder.model.layers = encoder.model.layers[:-1]
encoder_model = encoder.model

In [28]:
from mobius.models import TabularSiameseModel

# Wrap the head-less tabular encoder in the Siamese model from `mobius`.
model = TabularSiameseModel(encoder_model)

In [29]:
from fastai.torch_basics import params
from mobius.losses import ContrastiveLoss

def contrastive_loss_func(out, targ):
    """Contrastive loss (margin 2.0) between model output and integer-cast targets.

    Args:
        out: model output for a batch of pairs.
        targ: pair targets; cast with `.long()` before being passed to the loss.

    Returns:
        The value returned by `ContrastiveLoss(margin=2.0)`.
    """
    criterion = ContrastiveLoss(margin=2.0)
    integer_targets = targ.long()
    return criterion(out, integer_targets)

In [30]:
from fastai.learner import Learner
from mobius.callbacks import TSNECallback
from fastai.callback.tracker import SaveModelCallback
from fastai.callback.training import ShortEpochCallback

# TODO: add callback for best validation
# Learner for the contrastive Siamese model; models are saved relative to ".".
siamese_learner = Learner(
    dls,
    model,
    loss_func=contrastive_loss_func,
    cbs=[TSNECallback],
    model_dir=".",
)

In [31]:
# Print the layer-by-layer summary of the Siamese model.
siamese_learner.summary()

TabularSiameseModel (Input shape: ['["[\'2 x 6\', \'2 x 5\']", "[\'2 x 6\', \'2 x 5\']"]'])
Layer (type)         Output Shape         Param #    Trainable 
                     2 x 3               
Embedding                                 12         True      
Embedding                                 9          True      
Embedding                                 9          True      
Embedding                                 9          True      
Embedding                                 9          True      
Embedding                                 9          True      
Dropout                                                        
BatchNorm1d                               10         True      
BatchNorm1d                               46         True      
Dropout                                                        
____________________________________________________________________________
                     2 x 128             
Linear                                    2

In [33]:
# Run the learning-rate finder; the suggested rates are shown as the cell output.
siamese_learner.lr_find()

SuggestedLRs(lr_min=7.585775847473997e-08, lr_steep=1.0964781722577754e-06)

In [35]:
# Train the contrastive Siamese learner for 3 epochs.
# NOTE(review): the learning rate is hard-coded rather than taken from the
# `lr_find` result of the previous cell — confirm the value is still intended.
siamese_learner.fit(n_epoch=3, lr=6.309573450380412e-07)

epoch,train_loss,valid_loss,time
0,3.740033,2.504922,00:24
1,3.104853,1.775285,00:23
2,2.906327,1.921575,00:23


## Siamese Net (w/ Binary CrossEntropy)

In [31]:
from fastai.layers import LinBnDrop

# Head placed on top of the encoder: its input width is twice the last
# encoder layer size (`layers[-1] * 2`), with batch norm and no activation.
head = LinBnDrop(
    n_in=layers[-1] * 2,
    n_out=16,  # size of output space
    act=None,
    bn=True,
)

In [32]:
from mobius.models import TabularSiameseModelBinaryCrossEntropy

# Combine the shared encoder with the head defined above.
# Note: this rebinds `model`, replacing the contrastive model built earlier.
model = TabularSiameseModelBinaryCrossEntropy(encoder_model, head)

In [33]:
from fastai.losses import CrossEntropyLossFlat

def loss_func(out, targ):
    """Flattened cross-entropy between model output and integer-cast targets.

    Args:
        out: model output for a batch of pairs.
        targ: pair targets; cast with `.long()` before being passed to the loss.

    Returns:
        The value returned by `CrossEntropyLossFlat()`.
    """
    criterion = CrossEntropyLossFlat()
    integer_targets = targ.long()
    return criterion(out, integer_targets)

In [34]:
from fastai.learner import Learner

# Learner for the cross-entropy Siamese model; models are saved relative to ".".
siamese_learner = Learner(
    dls,
    model,
    loss_func=loss_func,
    model_dir=".",
)

In [35]:
# Run the learning-rate finder and keep both suggestions; the pair is shown
# as the cell output.
lr_min, lr_steep = siamese_learner.lr_find()
lr_min, lr_steep

(0.02089296132326126, 0.43651583790779114)

In [36]:
# Train for 3 epochs at the `lr_min` suggestion from the finder run above.
siamese_learner.fit(n_epoch=3, lr=lr_min)

epoch,train_loss,valid_loss,time
0,0.976632,6.699764,00:04
1,0.862619,0.769325,00:04
2,0.826566,0.826116,00:04


In [37]:
# Re-run the learning-rate finder after the first round of training;
# this overwrites `lr_min` and `lr_steep`.
lr_min, lr_steep = siamese_learner.lr_find()
lr_min, lr_steep

(6.309573450380412e-08, 0.3019951581954956)

In [38]:
# Continue training for 10 epochs at the newly suggested `lr_min`.
# NOTE(review): in the recorded run `lr_min` is ~6.3e-08 and both losses stay
# around 0.7 for all 10 epochs — confirm this learning rate is intended.
siamese_learner.fit(n_epoch=10, lr=lr_min)

epoch,train_loss,valid_loss,time
0,0.708342,0.658743,00:04
1,0.706507,0.787999,00:04
2,0.729647,0.769968,00:04
3,0.728697,0.777857,00:04
4,0.751755,0.742473,00:04
5,0.71031,0.701429,00:04
6,0.733236,0.660383,00:04
7,0.71795,0.701322,00:04
8,0.718412,0.72315,00:04
9,0.732776,0.74694,00:04


In [40]:
# Show the first item of the validation pair dataset.
valid_ds[0]

(((tensor([2, 2, 1, 1, 1, 1]),
   tensor([ 1.3481,  0.5106, -0.2549, -0.5495, -1.2341])),
  (tensor([1, 2, 1, 1, 1, 1]),
   tensor([ 0.7678, -1.0729,  0.6168,  1.6649,  1.1796]))),
 tensor(1.))

In [108]:
# Fetch one validation pair (the target is discarded) and add a leading
# batch dimension to the categorical and continuous tensors of each side.
pair, _ = valid_ds[0]

p1, p2 = [(cats.unsqueeze(0), conts.unsqueeze(0)) for cats, conts in pair]
p1, p2

((tensor([[2, 2, 1, 1, 1, 1]]),
  tensor([[ 1.3481,  0.5106, -0.2549, -0.5495, -1.2341]])),
 (tensor([[1, 1, 1, 1, 1, 1]]),
  tensor([[ 0.6590, -1.1239,  1.2705,  0.9689,  1.1796]])))

In [36]:
# siamese_learner.predict((p1, p2))