In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r 'drive/MyDrive/rosbank' '.'

In [1]:
!pip install transformers
!pip install pytorch-lightning
!pip install spacecutter
!pip install skorch

Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
     ---------------------------------------- 6.3/6.3 MB 1.5 MB/s eta 0:00:00
Collecting regex!=2019.12.17
  Downloading regex-2022.10.31-cp39-cp39-win_amd64.whl (267 kB)
     -------------------------------------- 267.8/267.8 kB 1.5 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
     -------------------------------------- 190.3/190.3 kB 1.9 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-win_amd64.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 1.6 MB/s eta 0:00:00
Installing collected packages: tokenizers, regex, filelock, huggingface-hub, transformers
Successfully installed filelock-3.9.0 huggingface-hub-0.12.1 regex-2022.10.31 tokenizers-0.13.2 transformers-4.26.1
Collecting sp

In [None]:
import math

import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import roc_auc_score

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics.functional import accuracy, auroc
from argparse import Namespace

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from spacecutter.models import LogisticCumulativeLink
from spacecutter.callbacks import AscensionCallback
from spacecutter.losses import CumulativeLinkLoss

In [None]:
def split_data(data, train_size=0.8, use_train_ratio=1.0, val_size=0.5, seed=42):
  np.random.seed(seed)

  train_ids = np.random.choice(len(data), int(len(data) * train_size), replace=False)
  other_ids = np.setdiff1d(np.arange(len(data)), train_ids)
  train_ids = np.random.choice(train_ids, int(len(train_ids) * use_train_ratio), replace=False)
  train_data = data.iloc[train_ids]
  other_data = data.iloc[other_ids]

  val_size = 0.5
  val_ids = np.random.choice(len(other_data), int(len(other_data) * val_size))
  test_ids = np.setdiff1d(np.arange(len(other_data)), val_ids)

  val_data = other_data.iloc[val_ids]
  test_data = other_data.iloc[test_ids]
  

  return train_data, val_data, test_data

In [None]:
transactions = pd.read_csv('rosbank/train.csv')
transactions = transactions.sort_values(by=['TRDATETIME'])
transactions = transactions.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})

In [None]:
mcc2id = dict(zip(transactions.small_group.unique(), 
                 np.arange(transactions.small_group.nunique()) + 1))
class_proportions = transactions.small_group.value_counts() / len(transactions)
_id2mcc = {v:int(k) for k, v in mcc2id.items()}
class_proportions = [class_proportions[_id2mcc[id]] for id in range(1, len(mcc2id) + 1)]

In [None]:
sequences = transactions.groupby('client_id').agg({'small_group': lambda x: x.tolist(), 'amount_rur': lambda x: x.tolist()})
train_sequences, val_sequences, test_sequences = split_data(sequences)

# Transaction2Vec

In [None]:
class T2VDataset(Dataset):
  def __init__(self, mcc_sequences, amnt_sequences, window_size, subsample=False, class_proportions=None):
    if subsample:
      assert class_proportions is not None
      class_proportions = np.array(class_proportions)
      keep_probs = (np.sqrt(class_proportions / 0.2) + 1) * 0.2 / class_proportions
    else:
      keep_probs = np.ones(len(mcc2id))

    self.id2seq_id = []
    self.id2offset = []
    self.window_size = window_size

    mcc_sequences = [seq for seq in mcc_sequences if len(seq) > 1]
    lens = [len(seq) for seq in mcc_sequences]
    for seq_id, l in enumerate(lens):
      self.id2seq_id += [seq_id] * l
      self.id2offset += list(range(l))
    amnt_sequences = [seq for seq in amnt_sequences if len(seq) > 1]

    self.mcc_seqs = mcc_sequences
    self.amnt_seqs = amnt_sequences
  
  def __getitem__(self, id):
    seq_id, offset = self.id2seq_id[id], self.id2offset[id]
    mcc_seq, amnt_seq = self.mcc_seqs[seq_id], self.amnt_seqs[seq_id]
    center_mcc, center_amnt = mcc_seq[offset], amnt_seq[offset]
    left, right = max(offset - self.window_size, 0), min(offset + self.window_size, len(mcc_seq))
    ctx_mcc = torch.cat([mcc_seq[left:offset], mcc_seq[offset + 1:right]])
    ctx_amnt = torch.cat([amnt_seq[left:offset], amnt_seq[offset + 1:right]])
    ctx_length = len(ctx_mcc)
    return ctx_mcc, ctx_amnt, center_mcc, center_amnt, ctx_length

  def __len__(self):
    return len(self.id2seq_id)

def tr2vec_collate(batch):
  ctx_mccs, ctx_amnts, center_mccs, center_amnts, ctx_lengths = zip(*batch)
  ctx_mccs = pad_sequence(ctx_mccs, batch_first=True, padding_value=0)
  ctx_amnts = pad_sequence(ctx_amnts, batch_first=True, padding_value=0)
  ctx_lengths = torch.LongTensor(ctx_lengths)
  center_mccs = torch.LongTensor(center_mccs)
  center_amnts = torch.LongTensor(center_amnts)
  return ctx_mccs, ctx_amnts, center_mccs, center_amnts, ctx_lengths

In [None]:
tr2vec_hparams = Namespace(**{
  'window_size': 10,
  'mcc_vocab_size': 344,
  'mcc_emb_size': 16,
  'amnt_bins': 50,
  'amnt_emb_size': 8,
  'emb_size': 16,
  'amnt_loss': 'ordinal',
  'lr': 0.83,
  'batch_size': 2000,
  'epochs': 50,
  #'class_proportions': class_proportions 
})

discretizer = KBinsDiscretizer(n_bins=tr2vec_hparams.amnt_bins, encode='ordinal')
all_amounts = []
for i in range(len(train_sequences)):
    all_amounts += train_sequences.iloc[i].amount_rur
discretizer.fit(np.array(all_amounts).reshape(-1, 1))

KBinsDiscretizer(encode='ordinal', n_bins=50)

In [None]:
class T2VDataModule(pl.LightningDataModule):
  def __init__(self, config, train_sequences, val_sequences, mcc2id, discretizer):
    super().__init__()
    self.window_size = config.window_size
    self.batch_size  = config.batch_size
    for ds_name, sequences in zip(['train_ds', 'val_ds'], [train_sequences, val_sequences]):
      mcc_codes, amnts = sequences
      mcc_seqs =  [torch.LongTensor([mcc2id[code] for code in sequence]) for sequence in mcc_codes]
      amnt_seqs = [torch.LongTensor(discretizer.transform(np.array(sequence).reshape(-1, 1))).view(-1) + 1 for sequence in amnts]
      setattr(self, ds_name, T2VDataset(mcc_seqs, amnt_seqs, self.window_size))
        
  def train_dataloader(self):
    return torch.utils.data.DataLoader(self.train_ds, 
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=tr2vec_collate)
  
  def val_dataloader(self):
    return torch.utils.data.DataLoader(self.val_ds, 
                                       batch_size=self.batch_size,
                                       collate_fn=tr2vec_collate)

datamodule = T2VDataModule(tr2vec_hparams,
                           (train_sequences.small_group, train_sequences.amount_rur), 
                           (val_sequences.small_group, val_sequences.amount_rur),
                           mcc2id,
                           discretizer)

In [None]:
class Transaction2VecJoint(pl.LightningModule):
  def __init__(self, hparams):
    super(Transaction2VecJoint, self).__init__()
    self.save_hyperparameters(hparams)
    if isinstance(hparams, Namespace):
      hparams = vars(hparams)
    assert hparams['amnt_loss'] in ['ordinal', 'ce']
    self.mcc_input_embeddings  = nn.Embedding(hparams['mcc_vocab_size'] + 1, 
                                              hparams['mcc_emb_size'], 
                                              padding_idx=0)
    self.amnt_input_embeddings = nn.Embedding(hparams['amnt_bins'] + 1,
                                              hparams['amnt_emb_size'],
                                              padding_idx=0)
    self.hidden_linear = nn.Linear(hparams['mcc_emb_size'] + hparams['amnt_emb_size'], 
                                   hparams['emb_size'], bias=False)
    self.mcc_output = nn.Linear(hparams['emb_size'], 
                                hparams['mcc_vocab_size'],
                                bias=False)
    amnt_loss = hparams['amnt_loss']
    self.amnt_output = nn.Linear(hparams['emb_size'],
                                 1 if amnt_loss == 'ordinal' else hparams['amnt_bins'],
                                 bias=True if amnt_loss == 'ordinal' else False)

    self.lr = hparams['lr']
    self.mcc_criterion = nn.CrossEntropyLoss()

    if amnt_loss == 'ordinal':
      self.amnt_output = nn.Sequential(self.amnt_output, 
                                       LogisticCumulativeLink(hparams['amnt_bins']))
      self.amnt_criterion = CumulativeLinkLoss()
    else:
      self.amnt_criterion = nn.CrossEntropyLoss()

      
  def forward(self, ctx_mccs, ctx_amnts, ctx_lengths):
    mcc_hidden = self.mcc_input_embeddings(ctx_mccs) / ctx_lengths.view(-1, 1, 1)
    amnt_hidden = self.amnt_input_embeddings(ctx_amnts) / ctx_lengths.view(-1, 1, 1)
    hidden = self.hidden_linear(torch.cat([mcc_hidden, amnt_hidden], -1)).sum(1)
    mcc_logits = self.mcc_output(hidden)
    amnt_logits = self.amnt_output(hidden)
    return mcc_logits, amnt_logits

  def configure_optimizers(self):
    optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
    return {'optimizer': optimizer}
  
  def training_step(self, batch, batch_idx):
    ctx_mccs, ctx_amnts, center_mccs, center_amnts, ctx_lengths = batch
    mcc_logits, amnt_logits = self(ctx_mccs, ctx_amnts, ctx_lengths)
    if self.hparams['amnt_loss'] == 'ordinal':
      center_amnts = center_amnts.view(-1, 1)
    loss = self.mcc_criterion(mcc_logits, center_mccs - 1) + self.amnt_criterion(amnt_logits, center_amnts - 1)
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_idx):
    ctx_mccs, ctx_amnts, center_mccs, center_amnts, ctx_lengths = batch
    mcc_logits, amnt_logits = self(ctx_mccs, ctx_amnts, ctx_lengths)
    if self.hparams['amnt_loss'] == 'ordinal':
      center_amnts = center_amnts.view(-1, 1)
    loss = self.mcc_criterion(mcc_logits, center_mccs - 1) + self.amnt_criterion(amnt_logits, center_amnts - 1)
    self.log('val_loss', loss, prog_bar=True)

In [None]:
model = Transaction2VecJoint(tr2vec_hparams)

early_stop_callback = EarlyStopping(
   monitor='val_loss',
   min_delta=1e-3,
   patience=5,
   verbose=False,
   mode='min'
)
checkpoint = ModelCheckpoint(
    monitor='val_loss',
    mode='min'
)

trainer = pl.Trainer(accumulate_grad_batches=5,
                     gpus=1, 
                     default_root_dir='transaction2vec', 
                     deterministic=True,
                     callbacks=[early_stop_callback, checkpoint],
                     max_epochs=tr2vec_hparams.epochs,
                     auto_lr_find=True)

#trainer.tune(model, datamodule, lr_find_kwargs={'min_lr': 1e-3, 'max_lr': 10})
trainer.fit(model, datamodule)

model = Transaction2VecJoint.load_from_checkpoint(checkpoint.best_model_path)
torch.save({'mccs': model.mcc_input_embeddings.weight.data,
            'amnts': model.amnt_input_embeddings.weight.data,
            'hidden': model.hidden_linear.weight.data,
            'mcc2id': mcc2id,
            'discretizer': discretizer}, 
           f'tr2vec_mcc={tr2vec_hparams.mcc_emb_size}_amnt={tr2vec_hparams.amnt_emb_size}_emb={tr2vec_hparams.emb_size}_window={tr2vec_hparams.window_size}_loss={tr2vec_hparams.amnt_loss}.pth')


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                  | Type               | Params
-------------------------------------------------------------
0 | mcc_input_embeddings  | Embedding          | 5.5 K 
1 | amnt_input_embeddings | Embedding          | 408   
2 | hidden_linear         | Linear             | 384   
3 | mcc_output            | Linear             | 5.5 K 
4 | amnt_output           | Sequential         | 66    
5 | mcc_criterion         | CrossEntropyLoss   | 0     
6 | amnt_criterion        | CumulativeLinkLoss | 0     
----------

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
!cp 'tr2vec_mcc=16_amnt=8_emb=16_window=10_loss=ordinal.pth' 'drive/MyDrive/tr2vec'

# Supervised learning

### Baseline: RNN

In [None]:
transactions = pd.read_csv('rosbank/train.csv')
transactions = transactions.sort_values(by=['TRDATETIME'])
transactions = transactions.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})

sequences = transactions.groupby('client_id').agg({'small_group': lambda x: x.tolist(), 'amount_rur': lambda x: x.tolist(), 'target_flag': lambda x: x.tolist()[0]})
train_sequences, val_sequences, test_sequences = split_data(sequences)

In [None]:
class TransactionLabelDataset(torch.utils.data.Dataset):
    def __init__(self, mcc_seqs, amnt_seqs, labels):
        self.mcc_seqs = mcc_seqs
        self.amnt_seqs = amnt_seqs
        self.labels = labels

    def __getitem__(self, id):
        return self.mcc_seqs[id], self.amnt_seqs[id], self.labels[id]

    def __len__(self):
        return len(self.labels)

def rnn_collate(batch):
  mcc_seqs, amnt_seqs, labels = zip(*batch)
  lengths = torch.LongTensor([len(seq) for seq in mcc_seqs])
  mcc_seqs = pad_sequence(mcc_seqs, batch_first=True)
  amnt_seqs = pad_sequence(amnt_seqs, batch_first=True)
  labels = torch.LongTensor(labels)
  return mcc_seqs, amnt_seqs, labels, lengths

In [None]:
mcc_seqs, amnt_seqs, labels = train_sequences.small_group, train_sequences.amount_rur, train_sequences.target_flag.tolist()
mcc_seqs  = [torch.LongTensor([mcc2id[code] for code in seq]) for seq in mcc_seqs]
amnt_seqs = [torch.LongTensor(discretizer.transform(np.array(seq).reshape(-1, 1))).view(-1) + 1 for seq in amnt_seqs]

In [None]:
rnn_hparams = Namespace(**{
  'batch_size': 100,
  'lr': 1e-3,
  'epochs': 40,
  'emb_type': 'concat',
  'mcc_vocab_size': 344,
  'mcc_emb_size': 16,
  'amnt_bins': 50,
  'amnt_emb_size': 8,
  'emb_size': 24,
  'layers': 2,
  'hidden_dim': 64,
  'dropout': 0.1,
  'permutation': True
})


In [None]:
class TransactionRNNDataModule(pl.LightningDataModule):
  def __init__(self,
               config,
               train_sequences,
               val_sequences,
               test_sequences,
               mcc2id,
               discretizer):
    super().__init__()
    self.batch_size = config.batch_size
    for ds_name, sequences in zip(['train_ds', 'val_ds', 'test_ds'], [train_sequences, val_sequences, test_sequences]):
      mcc_seqs, amnt_seqs, labels = sequences.small_group, sequences.amount_rur, sequences.target_flag.tolist()
      mcc_seqs  = [torch.LongTensor([mcc2id[code] for code in seq]) for seq in mcc_seqs]
      amnt_seqs = [torch.LongTensor(discretizer.transform(np.array(seq).reshape(-1, 1))).view(-1) + 1 for seq in amnt_seqs]
      setattr(self, ds_name, TransactionLabelDataset(mcc_seqs, amnt_seqs, labels))
        
  def train_dataloader(self):
    return torch.utils.data.DataLoader(self.train_ds, 
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=rnn_collate)
  
  def val_dataloader(self):
    return torch.utils.data.DataLoader(self.val_ds, 
                                       batch_size=self.batch_size,
                                       collate_fn=rnn_collate)

  def test_dataloader(self):
    return torch.utils.data.DataLoader(self.test_ds, 
                                       batch_size=self.batch_size,
                                       collate_fn=rnn_collate)

In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = .1, max_len: int = 5000) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [None]:
class TransactionGRU(pl.LightningModule):
  def __init__(self, hparams):
    super(TransactionGRU, self).__init__()
    self.save_hyperparameters(hparams)
    if isinstance(hparams, Namespace):
      hparams = vars(hparams)
    assert hparams['emb_type'] in ['concat', 'tr2vec']
    self.is_perm = hparams['permutation']
    if hparams['emb_type'] == 'concat':
      assert hparams['mcc_emb_size'] + hparams['amnt_emb_size'] == hparams['emb_size']
    self.lr = hparams['lr']

    self.mcc_embeddings = nn.Embedding(hparams['mcc_vocab_size'] + 1, 
                                       hparams['mcc_emb_size'], 
                                       padding_idx=0)
    self.amnt_embeddings = nn.Embedding(hparams['amnt_bins'] + 1, 
                                        hparams['amnt_emb_size'], 
                                        padding_idx=0)
    
    if hparams['emb_type'] == 'concat':
      self.emb_linear = nn.Identity()
    else:
      self.emb_linear = nn.Linear(hparams['mcc_emb_size'] + hparams['amnt_emb_size'],
                                  hparams['emb_size'],
                                  bias=False)
    self.pos_enc = PositionalEncoding(hparams['emb_size'])

    self.rnn = nn.GRU(hparams['emb_size'], 
                      hparams['hidden_dim'], 
                      hparams['layers'],
                      bidirectional=True,
                      batch_first=True,
                      dropout=hparams['dropout'])
    self.predictor = nn.Linear(2 * hparams['hidden_dim'], 1)

  def set_embeddings(self, mcc_weights, amnt_weights, emb_linear_weights=None):
    with torch.no_grad():
      self.mcc_embeddings.weight.data  = mcc_weights
      self.amnt_embeddings.weight.data = amnt_weights
      if emb_linear_weights is not None:
        self.emb_linear.weight.data = emb_linear_weights
  
  def forward(self, mcc_seqs, amnt_seqs, lengths):
    mcc_embs = self.mcc_embeddings(mcc_seqs)
    amnt_embs = self.amnt_embeddings(amnt_seqs)
    embs = torch.cat([mcc_embs, amnt_embs], -1)
    embs = self.emb_linear(embs)
    embs = self.pos_enc(embs)
    
    if self.is_perm:
        perm = torch.randperm(embs.size(1))
        embs = embs[:, perm, :]

    packed_embs = pack_padded_sequence(embs, 
                                       lengths.cpu(), 
                                       batch_first=True, 
                                       enforce_sorted=False)
    hidden, _ = self.rnn(packed_embs)
    hidden, _ = pad_packed_sequence(hidden, batch_first=True)
    features = self._mean_pooling(hidden, lengths)

    logits = self.predictor(features).squeeze()
    return logits

  def _mean_pooling(self, outputs, lengths):
    max_length = outputs.size(1)
    mask = torch.vstack([torch.cat([torch.zeros(length), 
                                    torch.ones(max_length - length)]) for length in lengths])
    mask = mask.bool().to(outputs.device).unsqueeze(-1)
    outputs.masked_fill_(mask, 0)
    feature_vector = outputs.sum(1) / lengths.unsqueeze(-1)
    return feature_vector

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    return {'optimizer': optimizer}
  
  def training_step(self, batch, batch_idx):
    mcc_seqs, amnt_seqs, labels, lengths = batch
    logits = self(mcc_seqs, amnt_seqs, lengths)
    loss = F.binary_cross_entropy_with_logits(logits, labels.float())
    self.log('train_loss', loss)
    return {'loss': loss, 'probs': torch.sigmoid(logits), 'labels': labels}

  def training_epoch_end(self, outputs):
    probs  = torch.cat([o['probs']  for o in outputs])
    labels = torch.cat([o['labels'] for o in outputs])
    self.log('train_auroc', auroc(probs, labels), prog_bar=True)

  def validation_step(self, batch, batch_idx):
    mcc_seqs, amnt_seqs, labels, lengths = batch
    logits = self(mcc_seqs, amnt_seqs, lengths)
    probs = torch.sigmoid(logits)
    return probs, labels

  def validation_epoch_end(self, outputs):
    probs, labels = zip(*outputs)
    probs, labels = torch.cat(probs), torch.cat(labels)
    self.log('val_auroc', auroc(probs, labels), prog_bar=True)

  def test_step(self, batch, batch_idx):
    mcc_seqs, amnt_seqs, labels, lengths = batch
    logits = self(mcc_seqs, amnt_seqs, lengths)
    probs = torch.sigmoid(logits)
    return probs, labels

  def test_epoch_end(self, outputs):
    probs, labels = zip(*outputs)
    probs, labels = torch.cat(probs), torch.cat(labels)
    self.log('test_auroc', auroc(probs, labels))

In [None]:
from pytorch_lightning import Callback

def auroc(probs, labels):
  return roc_auc_score(labels.detach().cpu().numpy(), probs.detach().cpu().numpy())

class FreezeEmbeddings(Callback):
  def on_sanity_check_start(self, trainer, pl_module):
    pl_module.mcc_embeddings.requires_grad_(False)
    pl_module.amnt_embeddings.requires_grad_(False)
    pl_module.emb_linear.requires_grad_(False)

class UnfreezeEmbeddings(Callback):
  def __init__(self, unfreeze_after_epoch=3):
    self.unfreeze_after_epoch = unfreeze_after_epoch
    self.n_epoch = 0
  
  def on_epoch_start(self, trainer, pl_module):
    if self.n_epoch == self.unfreeze_after_epoch:
      pl_module.embeddings.requires_grad_(True)
    self.n_epoch += 1

In [None]:
transactions = pd.read_csv('rosbank/train.csv')
transactions = transactions.sort_values(by=['TRDATETIME'])
transactions = transactions.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})

sequences = transactions.groupby('client_id').agg({'small_group': lambda x: x.tolist(), 'amount_rur': lambda x: x.tolist(), 'target_flag': lambda x: x.tolist()[0]})


In [None]:
!ls drive/MyDrive/tr2vec

'tr2vec_mcc=16_amnt=8_emb=16_window=10_loss=ordinal.pth'


In [None]:
results = {}
weights = torch.load('drive/MyDrive/tr2vec/tr2vec_mcc=16_amnt=8_emb=16_window=10_loss=ordinal.pth')

for train_ratio in [1.0]:
  train_sequences, val_sequences, test_sequences = split_data(sequences, use_train_ratio=train_ratio)
  results[train_ratio] = []

  for _ in range(5):
    model = TransactionGRU(rnn_hparams)
    model.set_embeddings(weights['mccs'], weights['amnts'])#, weights['hidden'])
    datamodule = TransactionRNNDataModule(
        rnn_hparams,
        train_sequences,
        val_sequences,
        test_sequences,
        mcc2id,
        discretizer
    )

    early_stop_callback = EarlyStopping(
      monitor='val_auroc',
      min_delta=1e-3,
      patience=4,
      verbose=False,
      mode='max'
    )
    checkpoint = ModelCheckpoint(monitor='val_auroc', mode='max')

    callbacks = [checkpoint, early_stop_callback, FreezeEmbeddings()]

    trainer = pl.Trainer(gpus=1, 
                        default_root_dir='transactionrnn', 
                        deterministic=True, 
                        callbacks=callbacks,
                        max_epochs=rnn_hparams.epochs,
                        auto_lr_find=True)
    trainer.fit(model, datamodule)
    model = TransactionGRU.load_from_checkpoint(checkpoint.best_model_path)
    res = trainer.test(model, dataloaders=datamodule.test_dataloader())[0]['test_auroc']
    results[train_ratio].append(res)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | mcc_embeddings  | Embedding          | 5.5 K 
1 | amnt_embeddings | Embedding          | 408   
2 | emb_linear      | Identity           | 0     
3 | pos_enc         | PositionalEncoding | 0     
4 | rnn             | GRU                | 109 K 
5 | predictor       | Linear             | 129   
-------------------------------------------------------
115 K     Trainable params
0         Non-trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.6822329229012485
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | mcc_embeddings  | Embedding          | 5.5 K 
1 | amnt_embeddings | Embedding          | 408   
2 | emb_linear      | Identity           | 0     
3 | pos_enc         | PositionalEncoding | 0     
4 | rnn             | GRU                | 109 K 
5 | predictor       | Linear             | 129   
-------------------------------------------------------
115 K     Trainable params
0         Non-trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.6962479397602042
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | mcc_embeddings  | Embedding          | 5.5 K 
1 | amnt_embeddings | Embedding          | 408   
2 | emb_linear      | Identity           | 0     
3 | pos_enc         | PositionalEncoding | 0     
4 | rnn             | GRU                | 109 K 
5 | predictor       | Linear             | 129   
-------------------------------------------------------
115 K     Trainable params
0         Non-trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc            0.677256029904448
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | mcc_embeddings  | Embedding          | 5.5 K 
1 | amnt_embeddings | Embedding          | 408   
2 | emb_linear      | Identity           | 0     
3 | pos_enc         | PositionalEncoding | 0     
4 | rnn             | GRU                | 109 K 
5 | predictor       | Linear             | 129   
-------------------------------------------------------
115 K     Trainable params
0         Non-trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.6403548460071745
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name            | Type               | Params
-------------------------------------------------------
0 | mcc_embeddings  | Embedding          | 5.5 K 
1 | amnt_embeddings | Embedding          | 408   
2 | emb_linear      | Identity           | 0     
3 | pos_enc         | PositionalEncoding | 0     
4 | rnn             | GRU                | 109 K 
5 | predictor       | Linear             | 129   
-------------------------------------------------------
115 K     Trainable params
0         Non-trainable params


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       test_auroc           0.6584418662271487
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [None]:
# Shuffle results
for k, v in results.items():
  print(k, np.mean(v), np.std(v))

1.0 0.6709067209600448 0.019491349934829958


In [None]:
# Original results
for k, v in results.items():
  print(k, np.mean(v), np.std(v))

1.0 0.7219942043973326 0.004405874343944151
