In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
%load_ext autoreload
%autoreload 2

# Model definition

In [3]:
import hydra
from omegaconf import OmegaConf
import torch

conf = OmegaConf.load('config/coles.yaml')
model = hydra.utils.instantiate(conf.pl_module)
model.load_state_dict(torch.load("models/coles.p"))

<All keys matched successfully>

# Finetune

In [4]:
from glob import glob
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.datasets import MemoryMapDataset
from tqdm.auto import tqdm

from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.data_load.utils import collate_feature_dict


from ptls.frames import PtlsDataModule

train_data = glob('data/train_transactions_clipped.parquet')
valid_data = glob('data/valid_transactions_clipped.parquet')

feature_cols = list(conf.pl_module.seq_encoder.trx_encoder.embeddings.keys()) + \
               list(conf.pl_module.seq_encoder.trx_encoder.numeric_values.keys())

target_cols = ['mcc', 'amnt', 'hour_diff']

dataset_conf = {
    'min_seq_len':25,
    }



class SeqToTargetMultiheadDataset(torch.utils.data.Dataset):
    def __init__(self,
                 data,
                 feature_cols,
                 target_cols,
                 target_dtype=None,
                 *args, **kwargs,
                 ):
        super().__init__(*args, **kwargs)

        self.data = data
        self.feature_cols = feature_cols
        self.target_cols = target_cols
        
        if type(target_dtype) is str:
            self.target_dtype = getattr(torch, target_dtype)
        else:
            self.target_dtype = target_dtype

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        feature_arrays = self.data[item]
        return feature_arrays

    def __iter__(self):
        for feature_arrays in self.data:
            yield feature_arrays


    def collate_fn(self, batch):
        
        targets = []
        values = []
        
        for target_col in target_cols:
            targets.append(torch.tensor([rec[target_col][-1] for rec in batch]).to(self.target_dtype[target_col]))
        
        for rec in batch:
            values.append({k: v[:-1] for k, v in rec.items() if k in feature_cols})
    
        return padded_collate_wo_target(values), targets

process = IterableChain(
            SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
            ToTorch()
            )
    
def get_dataset(data):
    ds = MemoryMapDataset(ParquetDataset(data, post_processing=process))
    return SeqToTargetMultiheadDataset(ds, feature_cols, target_cols, target_dtype = {'mcc': torch.long, 'amnt': torch.float, 'hour_diff': torch.float})

train_ds = get_dataset(train_data)
valid_ds = get_dataset(valid_data)

dm = PtlsDataModule(
    train_data=train_ds,
    valid_data=valid_ds,
    train_num_workers=4,
    train_batch_size=64)



In [5]:
import logging
from copy import deepcopy
from typing import List

import pandas as pd
import pytorch_lightning as pl
import torch
import torchmetrics
from omegaconf import DictConfig

from ptls.data_load.padded_batch import PaddedBatch

logger = logging.getLogger(__name__)


class SequenceToTargetMultihead(pl.LightningModule):


    def __init__(self,
                 seq_encoder: torch.nn.Module,
                 heads: List[torch.nn.Module],
                 losses: List[torch.nn.Module],
                 metric_list: torchmetrics.Metric=None,
                 optimizer_partial=None,
                 lr_scheduler_partial=None,
                 pretrained_lr=None,
                 train_update_n_steps=None,
                 ):
        super().__init__()

        self.save_hyperparameters(ignore=[
            'seq_encoder', 'heads', 'losses', 'metric_list', 'optimizer_partial', 'lr_scheduler_partial'])

        self.seq_encoder = seq_encoder
        self.heads = heads
        self.losses = losses
        self.n_heads = len(heads)

        self.optimizer_partial = optimizer_partial
        self.lr_scheduler_partial = lr_scheduler_partial

    def forward(self, x):
        x = self.seq_encoder(x)
        xs = [head(x) for head in self.heads]
        return xs

    def training_step(self, batch, _):
        x, y = batch
        y_hs = self(x)
        loss = sum([loss(y_hs[i], y[i]) for i, loss in enumerate(self.losses)])
        self.log('loss', loss)
        return loss

    def validation_step(self, batch, _):
        x, y = batch
        y_hs = self(x)
        loss = sum([loss(y_hs[i], y[i]) for i, loss in enumerate(self.losses)])
        self.log('val_loss', loss)


    def configure_optimizers(self):
        if self.hparams.pretrained_lr is not None:
            if self.hparams.pretrained_lr == 'freeze':
                for p in self.seq_encoder.parameters():
                    p.requires_grad = False
                parameters = self.parameters()
            else:
                parameters = [
                    {'params': self.seq_encoder.parameters(), 'lr': self.hparams.pretrained_lr},
                ] + [{'params': head.parameters()} for head in self.heads]
        else:
            parameters = self.parameters()

        optimizer = self.optimizer_partial(parameters)
        scheduler = self.lr_scheduler_partial(optimizer)
        return [optimizer], [scheduler]



In [6]:
from functools import partial
import torch
import torchmetrics
from ptls.nn import Head


head_mcc = Head(input_size=model.seq_encoder.embedding_size, 
                use_batch_norm=True,
                hidden_layers_sizes=[128],
                objective='classification',
                num_classes=109).to('cuda:0')

head_amnt = Head(input_size=model.seq_encoder.embedding_size, 
                 use_batch_norm=True,
                 hidden_layers_sizes=[128],
                 objective='softplus').to('cuda:0')

head_hour_diff = Head(input_size=model.seq_encoder.embedding_size, 
                      use_batch_norm=True,
                      hidden_layers_sizes=[128],
                      objective='softplus').to('cuda:0')

model_multihead = SequenceToTargetMultihead(
    seq_encoder=model.seq_encoder,
    heads=[head_mcc, head_amnt, head_hour_diff],
    losses=[torch.nn.NLLLoss(), torch.nn.L1Loss(), torch.nn.L1Loss()],
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    pretrained_lr=False, # 0.00001,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001), # , weight_decay=1e-5
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=0.9),
)

In [7]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

trainer_params = conf.trainer
print(OmegaConf.to_yaml(trainer_params))



trainer_params = conf.trainer
trainer_params['max_epochs']  = 15
callbacks = [ModelCheckpoint(every_n_epochs=5, save_top_k=-1), LearningRateMonitor(logging_interval='step')]
logger = TensorBoardLogger(save_dir='lightning_logs', name=conf.get('logger_name'))
trainer = pl.Trainer(**trainer_params, callbacks=callbacks, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


gpus: 1
auto_select_gpus: false
max_epochs: 30
deterministic: true



In [8]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model_multihead, dm)
print(trainer.logged_metrics)

logger.version = 37


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type          | Params
----------------------------------------------
0 | seq_encoder | RnnSeqEncoder | 3.5 M 
----------------------------------------------
3.5 M     Trainable params
0         Non-trainable params
3.5 M     Total params
13.933    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

{'loss': tensor(111.0199), 'val_loss': tensor(114.0924)}
CPU times: user 44min 32s, sys: 2min 20s, total: 46min 53s
Wall time: 48min 48s


In [9]:
torch.save(model_multihead.state_dict(), "models/rnn-e2e-nep.pt")

# Infernece

In [10]:
model_multihead.load_state_dict(torch.load("models/rnn-e2e-nep.pt"))

<All keys matched successfully>

In [11]:
# %%time
import tqdm

def inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = batch[0]
            targets = [t.unsqueeze(dim=1).to(device) for t in batch[1]]
            x = model(features.to(device))
            mcc = torch.argmax(x[0], dim=1, keepdim=True)
            amnt = x[1].unsqueeze(dim=1)
            hour_diff = x[2].unsqueeze(dim=1)
            predicted = [mcc, amnt, hour_diff]
            X += [torch.cat(predicted + targets, dim=1)]
    return X


valid_dl = torch.utils.data.DataLoader(dataset=valid_ds, 
                                       collate_fn=valid_ds.collate_fn,
                                       num_workers=8,
                                       batch_size=128)

In [12]:
preds = torch.vstack(inference(model_multihead, valid_dl)).cpu().numpy()

100%|█████████████████████████████████████████| 203/203 [00:06<00:00, 29.21it/s]


In [13]:
import numpy as np

df_valid = pd.DataFrame(preds, columns = ['predicted_mcc', 'predicted_amnt', 'predicted_hour_diff', 'mcc', 'amnt', 'hour_diff'])
df_valid.head()

Unnamed: 0,predicted_mcc,predicted_amnt,predicted_hour_diff,mcc,amnt,hour_diff
0,2.0,0.316761,15.209979,1.0,0.230493,0.0
1,2.0,0.314062,27.670671,14.0,0.208383,41.0
2,1.0,0.296456,4.204549,1.0,0.249951,6.0
3,2.0,0.428839,22.073618,2.0,0.535713,20.0
4,2.0,0.321625,2.882242,35.0,0.410411,48.0


## MCC

In [14]:
from sklearn.metrics import accuracy_score

print("Accuracy:", {accuracy_score(df_valid['mcc'],  df_valid['predicted_mcc'])})

Accuracy: {0.45001733502831387}


# amnt

In [15]:
from sklearn.metrics import mean_absolute_error

print("Mae amnt:", {mean_absolute_error(df_valid['amnt'],  df_valid['predicted_amnt'])})

Mae amnt: {0.080119364}


# hour_diff

In [16]:
from sklearn.metrics import mean_absolute_error

print("Accuracy:", {mean_absolute_error(df_valid['hour_diff'],  df_valid['predicted_hour_diff'])})

Accuracy: {111.75027}
