## Siamese Neural Networks 
### for Supervised Clustering of High Dimensional Spaces

In [None]:
# Print the version and path of the `python` that the shell resolves to.
# NOTE(review): this is the shell's interpreter, which may differ from the
# interpreter the notebook kernel itself is running on.
!python -V
!which python

In [None]:
import pandas as pd

from loaderbot.big_query import query_table_and_cache
from google.cloud import bigquery

# SQL passed to `query_table_and_cache` to build the labelled dataset.
# It tags matched accounts with a coarse `label` ("lux" / "alt"), keeps only
# labelled matches with confidence > 0.90, and joins them to audience features.
# NOTE: the exact text of this string is MD5-hashed in a later cell to name
# the local CSV cache, so any edit (even whitespace) yields a new cache file.
sql = """WITH matches AS (
    SELECT DISTINCT
        account_id,
        windfall_id ,
        candidate_id,
        confidence,
        CASE 
         -- luxury
         WHEN account_id = 81 THEN "1stDibs"
         WHEN account_id = 614 THEN "TamaraMellon"
         WHEN account_id = 585 THEN "Tonal"
         -- WHEN account_id = 385 THEN "WheelsUp"
         -- WHEN account_id = 208 THEN "Inspirato"
         -- WHEN account_id = 1577 THEN "OneFlight"
         -- alternative investment
         -- WHEN account_id = 501 THEN "Cadre"
         -- WHEN account_id = 679 THEN "Crowdstreet"
         -- WHEN account_id = 1047 THEN "Equaim"
         -- WHEN account_id = 1218 THEN "EquityEstates"
         -- WHEN account_id = 1246 THEN "EquityMultiple"
         WHEN account_id = 1050 THEN "MasterWorks"
         WHEN account_id = 753 THEN "Microventures"
         -- WHEN account_id = 1473 THEN "Portfolia"        
         -- insurance
         -- WHEN account_id = 514 THEN "HealthIQ"
         -- WHEN account_id = 1344 THEN "PureInsurance"
         -- finance
         -- WHEN account_id = 1219 THEN "SmartBiz"
         -- health
         -- WHEN account_id = 220 THEN "GrandViewHealth"
         -- WHEN account_id = 352 THEN "NewEnglandBaptistHospital"
         -- WHEN account_id = 1216 THEN "NuvanceHealth"
         -- WHEN account_id = 654 THEN "ProvidenceHealth"
         -- WHEN account_id = 1197 THEN "StCharles"
         END AS account_name,
         CASE 
         -- luxury
         WHEN account_id = 81 THEN "lux"
         WHEN account_id = 614 THEN "lux"
         WHEN account_id = 585 THEN "lux"
         -- WHEN account_id = 385 THEN "lux"
         -- WHEN account_id = 208 THEN "lux"
         -- WHEN account_id = 1577 THEN "lux"
         -- alternative investment
         -- WHEN account_id = 501 THEN "alt"
         -- WHEN account_id = 679 THEN "alt"
         -- WHEN account_id = 1047 THEN "alt"
         -- WHEN account_id = 1218 THEN "alt"
         -- WHEN account_id = 1246 THEN "alt"
         WHEN account_id = 1050 THEN "alt"
         WHEN account_id = 753 THEN "alt"
         -- WHEN account_id = 1473 THEN "alt"
         -- insurance
         -- WHEN account_id = 514 THEN "insurance"
         -- WHEN account_id = 1344 THEN "insurance"
         -- finance
         -- WHEN account_id = 1219 THEN "finance"
         -- health
         -- WHEN account_id = 220 THEN "health-donor"
         -- WHEN account_id = 352 THEN "health-donor"
         -- WHEN account_id = 1216 THEN "health-donor"
         -- WHEN account_id = 654 THEN "health-donor"
         -- WHEN account_id = 1197 THEN "health-donor"
         END AS label,
    FROM `portal.match`
    )

SELECT
    m.label,
    audience.*,
    latest.city,
    latest.state,
    latest.zipcode,
    latest.county,
    latest.metroName,
    realEstateInvestor,
    personalInvestor,
    FROM
    `tranquil-garage-139216.people.audience_latest` audience
    LEFT JOIN `tranquil-garage-139216.people.audience_dbusa_features` dbusa using(id)
    LEFT JOIN `tranquil-garage-139216.people.latest` latest ON latest.id = audience.id
    LEFT JOIN matches m ON audience.id = m.windfall_id
    WHERE m.label IS NOT NULL
    AND m.confidence > 0.90
    """

In [None]:
import os
import hashlib

# The cache file is keyed by the MD5 of the exact SQL text, so editing the
# query automatically triggers a fresh pull instead of reusing stale data.
raw_data_name = hashlib.md5(sql.encode('utf-8')).hexdigest()
raw_data_path = os.path.join("data", f"{raw_data_name}.csv")

if os.path.exists(raw_data_path):
    raw_data = pd.read_csv(raw_data_path)
else:
    raw_data = query_table_and_cache(sql=sql)
    # Make sure the cache directory exists before the first write.
    os.makedirs("data", exist_ok=True)
    # index=False: otherwise the row index is written as an unnamed first
    # column and comes back from read_csv as a spurious "Unnamed: 0" numeric
    # column, so cached runs and freshly-queried runs would see different
    # features. The read path never restored the index, so nothing is lost.
    # Cache files written before this change still carry that extra column;
    # delete them to regenerate.
    raw_data.to_csv(raw_data_path, index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Keep a 2% subsample of the raw data as the working frame `df`, stratified
# on `label` so class proportions are preserved; the other 98% is discarded.
# A fixed random_state makes the subsample (and everything derived from it)
# identical after Restart & Run All.
df, _ = train_test_split(
    raw_data,
    test_size=0.980,
    stratify=raw_data["label"],
    random_state=42)

df.shape

## Tabular Learner

Before we train the Tabular Siamese Learner, we will train a baseline Tabular Learner that classifies the `label` column. (Why do we do this, exactly? Can we just instantiate a Tabular Siamese Learner without a baseline Tabular Learner?)

Ah yes: to init a new `TabularSiameseModel` we need to provide an `encoder` and a `head`. The trained Tabular Learner acts as the `encoder` that we init the `TabularSiameseModel` with.

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the working frame. A fixed random_state keeps the
# two frames identical across kernel restarts.
# NOTE(review): `df_train` / `df_val` are not referenced by the later visible
# cells; the learner's own split comes from RandomSplitter instead.
df_train, df_val = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"],
    random_state=42)

df_train.shape, df_val.shape

In [None]:
from fastai.tabular.all import CategoryBlock

# Target definition: a single categorical column, `label`.
y_block = CategoryBlock()
y_names = ["label"]

In [None]:
# Columns kept out of both the categorical and the continuous feature lists.
exclude_vars = [
    "label",
    "id",
    "investorId",
    "createdAt",
    "investorId_1",
    "investorId_2",
    "investorLevel",
    "investorLevel_1",
    "status",
    "windfall_id",
    "windfall_id_1",
    "candidate_id",
    "minInvestmentDate",
    "maxInvestmentDate",
    "confidence",
    "closed",
    "countInvestmentDate",
    "amount",
    "sumAmount",
]

In [None]:
from mobius.utils import emb_sz_rule

# Categorical features: every column whose dtype is not 'int'/'float', minus
# the target column(s) and the explicitly excluded columns.
# The target filter uses `not in`: comparing a column name to the *list*
# `y_names` with `!=` is always True, so it never filtered anything and the
# target was only kept out because "label" also appears in `exclude_vars`.
cat_names = [
    x for x in df.select_dtypes(exclude=['int', 'float']).columns
    if x not in y_names and x not in exclude_vars
]

# calc embedding sizes for each categorical feature (from its distinct-value count)
emb_szs = {k: emb_sz_rule(len(df[k].unique())) for k in cat_names}
emb_szs

In [None]:
import numpy as np

# Continuous features: every numeric column, minus the target column(s) and
# the explicitly excluded columns.
# The target filter uses `not in`: comparing a column name to the *list*
# `y_names` with `!=` is always True, so it never filtered anything.
cont_names = [
    x for x in df.select_dtypes([np.number]).columns
    if x not in y_names and x not in exclude_vars
]
cont_names

In [None]:
from fastai.tabular.all import (Categorify, CategoryBlock, FillMissing, FillStrategy,
                                Normalize, TabDataLoader, TabularPandas,
                                tabular_config, tabular_learner)
# from collections import defaultdict
# from dataclasses import dataclass, field

# @dataclass
# class MyFillMissing(FillMissing):
#     fill_strategy:FillStrategy=FillStrategy.constant
#     add_col:bool=False
#     fill_vals:float=field(default_factory=dict)

# procs = [MyFillMissing, Categorify, Normalize]
# Preprocessing steps handed to TabularPandas via `procs=procs`.
# The commented-out MyFillMissing variant above is a disabled experiment.
procs = [FillMissing, Categorify, Normalize]

In [None]:
from fastai.data.core import range_of
from fastai.tabular.all import RandomSplitter

# train/validation split: hold out a random 10% of the rows of `df`.
# A fixed seed keeps the split identical across kernel restarts, so metrics
# and the exported Siamese training files are reproducible.
splits = RandomSplitter(valid_pct=0.10, seed=42)(range_of(df))

In [None]:
# Wrap the working frame with its feature lists, target, preprocessing steps
# and train/validation indices; everything is kept on the CPU.
tabular_pandas = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names, cont_names=cont_names,
    y_names=y_names, y_block=y_block,
    splits=splits,
    device="cpu",
)

In [None]:
# Training loader: shuffled, and the final incomplete batch is dropped.
trn_dl = TabDataLoader(tabular_pandas.train, bs=128, shuffle=True, drop_last=True, num_workers=4)

# Validation loader: same batch size and worker count, no shuffle/drop options set.
val_dl = TabDataLoader(tabular_pandas.valid, bs=128, num_workers=4)

In [None]:
from fastai.data.core import DataLoaders

# Bundle the training and validation loaders for the tabular learner.
dls = DataLoaders(trn_dl, val_dl)

# NOTE(review): only the header is printed; the batch display itself is
# commented out below.
print("Sample batch:")
# dls.one_batch()

In [None]:
from fastai.metrics import F1Score, Precision, Recall, accuracy

# tabular learner configuration (dropout values for the layers and the embeddings)
config = tabular_config(ps=[0.03, 0.03, 0.0], embed_p=0.03)

# Hidden-layer widths; the last entry is reused later to size the Siamese head.
layers = [2048, 1024, 128]

# Baseline classifier over the tabular DataLoaders, reporting accuracy plus
# macro-averaged precision / recall / F1.
learn = tabular_learner(
    dls,
    layers=layers, emb_szs=emb_szs, config=config,
    metrics=[
        accuracy,
        Precision(average='macro'),
        Recall(average='macro'),
        F1Score(average='macro'),
    ],
)

In [None]:
# Train the baseline classifier for a single epoch.
learn.fit_one_cycle(n_epoch=1)

In [None]:
# learn.export("tabular_learn.pkl")

In [None]:
# from mobius.calibration import ModelWithTemperature

# scaled_model = ModelWithTemperature(learn.model)
# scaled_model.set_temperature(val_dl)
# learn.model = scaled_model.model

In [None]:
# true labels of the validation rows, read from the processed validation items
# NOTE(review): presumably these are the integer-encoded categories, so they
# are comparable with the argmax indices below — confirm.
y_true=learn.dls.valid.items["label"]

# model scores for the validation loader, and the predicted class index
# (argmax over axis 1) for each row
y_scores, *_ = learn.get_preds(dl=val_dl)
preds = np.argmax(y_scores, 1).numpy()

In [None]:
# Show the first 10 (true label, predicted class) pairs. The header used to
# say "First 20" while the slice displayed 10; the text now matches the slice.
print("First 10 investor labels and predictions")
list(zip(y_true, preds))[:10]

In [None]:
# Validation accuracy: fraction of rows whose predicted class equals the true label.
(y_true == preds).mean()

## Siamese Net

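To init a new `TabularSiameseDataset` object, we first export the rows and labels held by the fast.ai `tabular_pandas` object to files under `data/`, then pass those file paths together with the trained tabular learner.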

In [None]:
from mobius.datasets import write_jsonl

# write SNN training data to `data/`
# NOTE(review): `.to.items[0].items` is used as-is; presumably it yields the
# feature rows of each split — confirm against mobius / fastai.
# Assumes the `data/` directory already exists.
write_jsonl(tabular_pandas.train.to.items[0].items, "data/train_data.jsonl")
write_jsonl(tabular_pandas.valid.to.items[0].items, "data/valid_data.jsonl")

# write SNN training labels to `data/`
# index=True keeps the row index in the CSV alongside each label.
tabular_pandas.train.y.to_csv("data/train_labels.csv", index=True)
tabular_pandas.valid.y.to_csv("data/valid_labels.csv", index=True)

In [None]:
from mobius.datasets import TabularSiameseDataset

# Build the train and validation Siamese datasets (in that order) from the
# label CSV and row JSONL written for each split, sharing the trained learner.
train_ds, valid_ds = (
    TabularSiameseDataset(
        csv_file=f"data/{split}_labels.csv",
        jsonl_file=f"data/{split}_data.jsonl",
        tabular_learner=learn)
    for split in ("train", "valid")
)

In [None]:
# Size of the training dataset and the item at index 1.
len(train_ds), train_ds[1]

In [None]:
# Size of the validation dataset and its first item.
len(valid_ds), valid_ds[0]

In [None]:
# DataLoaders for the Siamese datasets.
# NOTE: this rebinds `dls`, which until now held the tabular DataLoaders;
# re-running the earlier `tabular_learner(dls, ...)` cell after this one would
# pick up the Siamese loaders instead.
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=128, device='cpu', num_workers=3)

The Siamese net encoder is the body of the Tabular net we just trained, i.e. all of its layers except the final one.

In [None]:
# Inspect the `c` attribute of the training dataset
# (presumably the class / output count — confirm in TabularSiameseDataset).
dls.dataset.c

In [None]:
import copy

# Build the encoder from an independent deep copy of the trained model.
# copy.copy(learn) is shallow: the copy shares the very same `model` object,
# so truncating `.layers` on it also truncated `learn.model`, and re-running
# this cell stripped one more layer each time. Deep-copying the model first
# leaves the baseline learner intact and makes the cell idempotent.
# NOTE(review): the encoder's weights are no longer shared with `learn`;
# confirm nothing downstream relied on that sharing.
encoder = copy.copy(learn)
encoder.model = copy.deepcopy(learn.model)

# Drop the final layer so the encoder outputs the last hidden representation.
encoder.model.layers = encoder.model.layers[:-1]
encoder_model = encoder.model

In [None]:
from fastai.layers import LinBnDrop

# Projection head: input is twice the encoder's last hidden width
# (presumably the two encoded items side by side — confirm in
# TabularSiameseModel); output is a 16-dimensional space, with batch norm
# and no activation.
head = LinBnDrop(n_in=2 * layers[-1], n_out=16, bn=True, act=None)

In [None]:
from mobius.models import TabularSiameseModel

# Combine the truncated tabular encoder with the projection head.
model = TabularSiameseModel(encoder_model, head)

In [None]:
from fastai.torch_basics import params
from mobius.losses import ContrastiveLoss

def siamese_splitter(model):
    """Split the model's parameters into two groups: encoder first, then head."""
    return [params(part) for part in (model.encoder, model.head)]

def contrastive_loss_func(out, targ):
    """Contrastive loss (margin 0.50) of `out` against `targ` cast to long."""
    criterion = ContrastiveLoss(margin=0.50)
    return criterion(out, targ.long())

In [None]:
from fastai.learner import Learner
from mobius.callbacks import TSNECallback
from fastai.callback.tracker import SaveModelCallback

# TODO: add callback for best validation
# NOTE(review): SaveModelCallback may already cover this — confirm its defaults.
# Learner over the Siamese DataLoaders: contrastive loss, separate parameter
# groups for encoder and head, checkpoints written to the current directory.
siamese_learner = Learner(
    dls,
    model,
    loss_func=contrastive_loss_func,
    splitter=siamese_splitter,
    model_dir=".",
    cbs=[TSNECallback, SaveModelCallback],
)

In [None]:
# siamese_learner.summary()

In [None]:
# Stage 1: unfreeze every parameter group, then 5 epochs at lr = 1e-3.
siamese_learner.unfreeze()
siamese_learner.fit(n_epoch=5, lr=1e-3)

In [None]:
# Stage 2: 10 epochs at lr = 1e-4.
siamese_learner.fit(n_epoch=10, lr=1e-4)

In [None]:
# Stage 3: 5 epochs at lr = 1e-5.
siamese_learner.fit(n_epoch=5, lr=1e-5)

In [None]:
# Stage 4: 10 epochs at lr = 1e-6.
siamese_learner.fit(n_epoch=10, lr=1e-6)

In [None]:
# siamese_learner.unfreeze()
# siamese_learner.fit(n_epoch=3, lr=10e-4)

In [None]:
# tsne = np.load("tsne_1625777058_0.npy")