## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
import os


# Logger for this notebook, named after the current module's `__name__`.
logger = logging.getLogger(__name__)

## Model

In [3]:
import hydra
from omegaconf import OmegaConf

# Build the module described under `pl_module` in the config, then restore its weights.
conf = OmegaConf.load('config/cpc.yaml')
model = hydra.utils.instantiate(conf.pl_module)

# map_location="cpu" lets the checkpoint deserialize on any host, including one
# without the GPU it was saved from; the model is moved to its target device later.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load("models/cpc.p", map_location="cpu"))

<All keys matched successfully>

In [4]:
# Input features: every key configured under the trx_encoder's `embeddings`
# section followed by every key under its `numeric_values` section.
feature_cols = [
    *conf.pl_module.seq_encoder.trx_encoder.embeddings.keys(),
    *conf.pl_module.seq_encoder.trx_encoder.numeric_values.keys(),
]
# Columns whose last element per record becomes the prediction target
# (see `nep_collate_fn`).
target_cols = ['mcc', 'amnt', 'hour_diff']

## Inference

In [5]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from tqdm.auto import tqdm

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices

# File lists for the two splits (glob returns a list of matching paths).
train_data = glob('data/train_transactions_clipped.parquet')
valid_data = glob('data/valid_transactions_clipped.parquet')

# glob() returns [] instead of raising when nothing matches, so check here
# rather than let an empty file list flow into the datasets built below.
assert train_data, "no files matched 'data/train_transactions_clipped.parquet'"
assert valid_data, "no files matched 'data/valid_transactions_clipped.parquet'"

# Dataset options; `min_seq_len` is passed to SeqLenFilter when the
# post-processing chain is built.
dataset_conf = {
    'min_seq_len':25,
    }


def nep_collate_fn(feature_cols, target_cols):
    """Build a collate function for next-event prediction.

    The returned function takes a batch (an iterable of dict-like records) and
    produces a pair:

    * the result of ``padded_collate_wo_target`` applied to the records
      restricted to ``feature_cols``, each value with its last element cut off;
    * a numpy array with one row per record holding the last element of every
      column in ``target_cols`` (in that order).
    """

    def fn(batch):
        # History: every feature sequence without its final element.
        history = [
            {name: seq[:-1] for name, seq in rec.items() if name in feature_cols}
            for rec in batch
        ]
        # Target: the final element of each target column.
        next_event = [[rec[col][-1] for col in target_cols] for rec in batch]
        return padded_collate_wo_target(history), np.array(next_event)

    return fn


# Post-processing shared by both splits: a sequence-length filter configured
# with `min_seq_len`, followed by ToTorch.
# NOTE(review): presumably drops records shorter than `min_seq_len` and converts
# arrays to torch tensors; confirm against the ptls documentation.
process = IterableChain(
    SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
    ToTorch(),
)

train_ds, valid_ds = (
    ParquetDataset(files, post_processing=process)
    for files in (train_data, valid_data)
)

# One loader per split, both collating with the next-event collate function.
train_dl, valid_dl = (
    torch.utils.data.DataLoader(
        dataset=ds,
        collate_fn=nep_collate_fn(feature_cols, target_cols),
        num_workers=8,
        batch_size=64,
    )
    for ds in (train_ds, valid_ds)
)

In [6]:
from tqdm import tqdm

def embedding_inference(dl, model, device='cuda:1', target_names=None):
    """Run ``model`` over every batch of ``dl`` and collect targets plus embeddings.

    Parameters
    ----------
    dl : iterable
        Yields batches indexed as ``batch[0]`` (model input, must support
        ``.to(device)``) and ``batch[1]`` (2-D array-like of targets, one row
        per sample).
    model : torch.nn.Module
        Called as ``model(batch[0])``; must return a 2-D tensor
        ``(n_samples, n_features)``. It is moved to ``device`` in place.
    device : str
        Device used for inference.
    target_names : list of str, optional
        Column names for the target part of the result. Defaults to the
        notebook-level ``target_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per sample: the target columns followed by ``embed_0`` ...
        ``embed_{n_features - 1}``. Empty (target columns only) when ``dl``
        yields no batches.
    """
    if target_names is None:
        # Backward-compatible default: the notebook-level list of target columns.
        target_names = target_cols

    was_training = model.training
    model.to(device)
    # Inference must run in eval mode, otherwise dropout / batch-norm layers
    # behave as in training and the embeddings are not deterministic.
    model.eval()

    chunks = []
    n_embed = 0
    try:
        with torch.no_grad():
            for batch in tqdm(dl):
                targets = np.asarray(batch[1])
                features = model(batch[0].to(device)).cpu().numpy()
                n_embed = features.shape[1]
                # Keep per-batch arrays and concatenate once at the end instead
                # of converting every batch to nested Python lists.
                chunks.append(np.concatenate([targets, features], axis=1))
    finally:
        # Leave the module in the mode the caller had it in.
        model.train(was_training)

    cols = list(target_names) + ["embed_" + str(i) for i in range(n_embed)]
    if not chunks:
        # An empty loader yields an empty frame rather than a NameError.
        return pd.DataFrame(columns=cols)
    return pd.DataFrame(np.concatenate(chunks, axis=0), columns=cols)

In [7]:
import pandas as pd

# Switch the sequence encoder's `is_reduce_sequence` flag on before inference.
# NOTE(review): presumably this makes it return one vector per sequence; confirm.
setattr(model.seq_encoder, 'is_reduce_sequence', True)

# Embed the train split first, then the validation split.
df_train, df_val = (
    embedding_inference(loader, model.seq_encoder)
    for loader in (train_dl, valid_dl)
)

3666it [01:30, 40.38it/s]
406it [00:10, 38.18it/s]


In [8]:
# Preview the first rows: the target columns followed by the `embed_*` columns.
df_train.head()

Unnamed: 0,mcc,amnt,hour_diff,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,2.0,0.537682,93.0,-0.135425,-0.025563,-0.832383,-0.443871,-0.256772,-0.096166,-0.367555,...,0.220581,0.383107,-0.344861,-0.046685,-0.062376,-0.214278,0.429598,0.316703,-0.823607,-0.068331
1,2.0,0.49091,3.0,0.568995,-0.427223,-0.198055,-0.030762,0.061307,-0.037302,-0.296958,...,0.080086,-0.094486,0.014594,-0.077216,0.330096,-0.141233,-0.101689,0.244265,-0.716257,-0.181191
2,2.0,0.426544,145.0,-0.204826,0.03057,-0.698429,-0.544108,0.011031,-0.191963,-0.088573,...,-0.027709,-0.13175,-0.410465,-0.042604,0.13106,-0.284322,-0.43645,0.272733,-0.776181,-0.237437
3,2.0,0.544932,0.0,-0.118677,0.695231,-0.087829,0.36493,0.174214,0.145642,-0.295656,...,0.136849,-0.758047,0.949156,-0.026383,0.102808,-0.282843,-0.195086,-0.007509,0.031246,0.377631
4,2.0,0.516834,96.0,-0.158809,0.424362,-0.189834,0.237304,0.108413,0.195587,-0.162792,...,0.374915,0.038658,0.274064,-0.06511,0.103757,-0.035104,0.397811,0.332082,-0.391122,0.046227


# Predict next MCC

In [9]:
# Distinct `mcc` values seen in the training frame, in ascending order;
# passed to the classifier as its class labels.
class_names = df_train['mcc'].unique().tolist()
class_names.sort()

In [10]:
from catboost import CatBoostClassifier, metrics

# Inputs are the embedding columns; the label is the `mcc` target column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['mcc']
X_val, y_val = df_val[feats].values, df_val['mcc']

# Multi-class classifier evaluated on accuracy, with early stopping on the
# validation set.
cb_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    class_names=class_names,
    learning_rate=1e-1,
    iterations=1000,
    depth=3,
    early_stopping_rounds=100,
    metric_period=100,
    verbose=100,
    task_type='GPU',
)
cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

0:	learn: 0.3954138	test: 0.3932355	best: 0.3932355 (0)	total: 180ms	remaining: 2m 59s
100:	learn: 0.4427509	test: 0.4379599	best: 0.4379599 (100)	total: 15.9s	remaining: 2m 21s
200:	learn: 0.4591309	test: 0.4518279	best: 0.4520590 (195)	total: 32.4s	remaining: 2m 8s
300:	learn: 0.4677024	test: 0.4591086	best: 0.4591086 (300)	total: 46.7s	remaining: 1m 48s
400:	learn: 0.4728001	test: 0.4645017	best: 0.4650025 (397)	total: 1m	remaining: 1m 30s
500:	learn: 0.4774801	test: 0.4683925	best: 0.4687392 (496)	total: 1m 16s	remaining: 1m 16s
600:	learn: 0.4808218	test: 0.4704341	best: 0.4708194 (576)	total: 1m 32s	remaining: 1m 1s
700:	learn: 0.4841080	test: 0.4716669	best: 0.4723603 (691)	total: 1m 48s	remaining: 46.2s
800:	learn: 0.4874198	test: 0.4736315	best: 0.4736315 (800)	total: 2m	remaining: 29.9s
900:	learn: 0.4902841	test: 0.4744019	best: 0.4747872 (893)	total: 2m 10s	remaining: 14.4s
999:	learn: 0.4933018	test: 0.4757887	best: 0.4760199 (984)	total: 2m 21s	remaining: 0us
bestTest = 0

In [11]:
from sklearn.metrics import accuracy_score

# Share of validation rows whose predicted `mcc` equals the true one.
# An f-string is used so the value is printed as a number, not as a one-element set.
print(f"Accuracy: {accuracy_score(df_val['mcc'], pred)}")

Accuracy: {0.47601987749913327}


# Predict next amnt

In [12]:
from catboost import CatBoostRegressor, metrics

# Inputs are the embedding columns; the regression target is the `amnt` column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['amnt']
X_val, y_val = df_val[feats].values, df_val['amnt']

# MAE-optimised regressor with early stopping on the validation set.
cb_model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    early_stopping_rounds=100,
    verbose=100,
    task_type='GPU',
)
cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.0877386	test: 0.0889455	best: 0.0889455 (0)	total: 26.6ms	remaining: 53.2s
100:	learn: 0.0750736	test: 0.0828301	best: 0.0828191 (97)	total: 2.28s	remaining: 42.9s
200:	learn: 0.0718490	test: 0.0830877	best: 0.0827478 (129)	total: 4.5s	remaining: 40.3s
bestTest = 0.0827478435
bestIteration = 129
Shrink model to first 130 iterations.


In [13]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the `amnt` predictions on the validation frame.
# An f-string is used so the value is printed as a number, not as a one-element set.
print(f"Mae amnt: {mean_absolute_error(df_val['amnt'], pred)}")

Mae amnt: {0.08274784249201608}


# Predict next hour_diff

In [14]:
from catboost import CatBoostRegressor, metrics

# Inputs are the embedding columns; the regression target is the `hour_diff` column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['hour_diff']
X_val, y_val = df_val[feats].values, df_val['hour_diff']

# MAE-optimised regressor with early stopping on the validation set.
cb_model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    early_stopping_rounds=100,
    verbose=100,
    task_type='GPU',
)
cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 114.6514417	test: 115.3142552	best: 115.3142552 (0)	total: 24.6ms	remaining: 49.2s
100:	learn: 114.3812032	test: 115.0677992	best: 115.0677992 (100)	total: 2.33s	remaining: 43.8s
200:	learn: 114.1476035	test: 114.8578046	best: 114.8578046 (200)	total: 4.61s	remaining: 41.3s
300:	learn: 113.9541717	test: 114.6885473	best: 114.6885473 (300)	total: 7.06s	remaining: 39.8s
400:	learn: 113.7856147	test: 114.5385319	best: 114.5385319 (400)	total: 9.5s	remaining: 37.9s
500:	learn: 113.6369883	test: 114.4059093	best: 114.4059093 (500)	total: 12s	remaining: 35.8s
600:	learn: 113.4998870	test: 114.2847953	best: 114.2847953 (600)	total: 14.5s	remaining: 33.8s
700:	learn: 113.3706796	test: 114.1686795	best: 114.1686795 (700)	total: 17.6s	remaining: 32.6s
800:	learn: 113.2480191	test: 114.0581494	best: 114.0581494 (800)	total: 20.9s	remaining: 31.2s
900:	learn: 113.1303625	test: 113.9528295	best: 113.9528295 (900)	total: 24.3s	remaining: 29.7s
1000:	learn: 113.0171217	test: 113.8509573	bes

In [15]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the `hour_diff` predictions on the validation frame.
# The value is an MAE, not an accuracy, so it is labelled as such; the f-string
# prints it as a number instead of a one-element set.
print(f"Mae hour_diff: {mean_absolute_error(df_val['hour_diff'], pred)}")

Accuracy: {112.98572079209072}
