## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
import os


logger = logging.getLogger(__name__)

## Model

In [3]:
import hydra
from omegaconf import OmegaConf

conf = OmegaConf.load('config/coles.yaml')

feature_cols = list(conf.pl_module.seq_encoder.trx_encoder.embeddings.keys()) + \
               list(conf.pl_module.seq_encoder.trx_encoder.numeric_values.keys())
target_cols = ['mcc', 'amnt', 'hour_diff']

## Inference

In [4]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from tqdm.auto import tqdm

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices

train_data = glob('data/train_transactions_clipped.parquet')
valid_data = glob('data/valid_transactions_clipped.parquet')

dataset_conf = {
    'min_seq_len':25,
    }


def nep_collate_fn(feature_cols, target_cols):

    def fn(batch):
        targets = []
        values = []
        for rec in batch:
            values.append([rec[target_col][-2] for target_col in target_cols])
            targets.append([rec[target_col][-1] for target_col in target_cols])
        return  np.array(values), np.array(targets)
    return fn


process = IterableChain(
            SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
            ToTorch()
            )
    

train_ds = ParquetDataset(train_data, post_processing=process)
valid_ds = ParquetDataset(valid_data, post_processing=process)


train_dl = torch.utils.data.DataLoader(
                        dataset=train_ds,
                        collate_fn=nep_collate_fn(feature_cols, target_cols),
                        num_workers=8,
                        batch_size=64)

valid_dl = torch.utils.data.DataLoader(
                        dataset=valid_ds,
                        collate_fn=nep_collate_fn(feature_cols, target_cols),
                        num_workers=8,
                        batch_size=64)

In [5]:
batch = next(iter(train_dl))

In [6]:
from tqdm import tqdm

def embedding_inference(dl):

    
    df = []
    
    for batch in tqdm(dl):
        with torch.no_grad():
            targets = batch[1]
            features = batch[0]
            df += np.concatenate([targets, features], axis=1).tolist()


    cols = target_cols + ["f_" + t  for t in target_cols]
    df = pd.DataFrame(df, columns=cols)
    return df

In [7]:
import pandas as pd

df_train = embedding_inference(train_dl)  
df_val = embedding_inference(valid_dl)  

3666it [00:24, 147.65it/s]
406it [00:02, 162.17it/s]


In [8]:
df_train.head(5)

Unnamed: 0,mcc,amnt,hour_diff,f_mcc,f_amnt,f_hour_diff
0,2.0,0.537682,93.0,2.0,0.449287,525.0
1,2.0,0.49091,3.0,1.0,0.385651,23.0
2,2.0,0.426544,145.0,9.0,0.483799,237.0
3,2.0,0.544932,0.0,2.0,0.59462,594.0
4,2.0,0.516834,96.0,2.0,0.552848,211.0


In [9]:
df_train.hour_diff.value_counts()

0.0       45647
1.0       11542
2.0        5804
3.0        4457
24.0       4359
          ...  
5300.0        1
1973.0        1
1551.0        1
1290.0        1
2150.0        1
Name: hour_diff, Length: 1947, dtype: int64

In [10]:
df_train.f_hour_diff.value_counts()

0.0       45077
1.0       12392
2.0        6250
3.0        4673
24.0       4420
          ...  
4589.0        1
1401.0        1
2288.0        1
1752.0        1
1988.0        1
Name: f_hour_diff, Length: 1788, dtype: int64

# Predict next MCC

In [23]:
class_names=sorted(df_train['mcc'].unique().tolist())

In [24]:
from catboost import CatBoostClassifier, metrics


cb_model = CatBoostClassifier(
    class_names=class_names,
    metric_period=100,
    learning_rate=1e-1,
    iterations=1000,
    depth=3,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)


feats = list(filter(lambda c: c.startswith('f_'), df_train.columns))
X_train = df_train[feats].values
X_val = df_val[feats].values
y_train = df_train['mcc'].astype(int)
y_val= df_val['mcc'].astype(int)


cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

0:	learn: 0.4025275	test: 0.4036365	best: 0.4036365 (0)	total: 44.5ms	remaining: 44.5s
100:	learn: 0.4253905	test: 0.4258253	best: 0.4259794 (98)	total: 3.45s	remaining: 30.7s
200:	learn: 0.4323253	test: 0.4306021	best: 0.4306021 (200)	total: 6.81s	remaining: 27.1s
300:	learn: 0.4349253	test: 0.4337224	best: 0.4338380 (294)	total: 10.2s	remaining: 23.6s
400:	learn: 0.4366089	test: 0.4355715	best: 0.4356100 (398)	total: 13.6s	remaining: 20.2s
500:	learn: 0.4372227	test: 0.4351477	best: 0.4357256 (439)	total: 16.9s	remaining: 16.9s
bestTest = 0.4357255672
bestIteration = 439
Shrink model to first 440 iterations.


In [25]:
from sklearn.metrics import accuracy_score

print("Accuracy:", {accuracy_score(df_val['mcc'],  pred)})

Accuracy: {0.4357255672406487}


# Predict next amnt

In [14]:
from catboost import CatBoostRegressor, metrics

cb_model = CatBoostRegressor(
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100
)

feats = list(filter(lambda c: c.startswith('f_'), df_train.columns))
X_train = df_train[feats].values
X_val = df_val[feats].values
y_train = df_train['amnt']
y_val= df_val['amnt']


cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.0869309	test: 0.0869883	best: 0.0869883 (0)	total: 5.23ms	remaining: 10.5s
100:	learn: 0.0802053	test: 0.0805799	best: 0.0805625 (48)	total: 348ms	remaining: 6.55s
bestTest = 0.08056248443
bestIteration = 48
Shrink model to first 49 iterations.


In [15]:
from sklearn.metrics import mean_absolute_error

print("Mae amnt:", {mean_absolute_error(df_val['amnt'],  pred)})

Mae amnt: {0.08056248993401939}


# Predict next hour_diff

In [16]:
from catboost import CatBoostRegressor, metrics

cb_model = CatBoostRegressor(
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100
)

feats = list(filter(lambda c: c.startswith('f_'), df_train.columns))
X_train = df_train[feats].values
X_val = df_val[feats].values
y_train = df_train['hour_diff']
y_val= df_val['hour_diff']


cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 114.6522686	test: 115.3146500	best: 115.3146500 (0)	total: 4.83ms	remaining: 9.66s
100:	learn: 114.4715129	test: 115.1335375	best: 115.1335375 (100)	total: 348ms	remaining: 6.54s
200:	learn: 114.3080877	test: 114.9702608	best: 114.9702608 (200)	total: 695ms	remaining: 6.22s
300:	learn: 114.1643203	test: 114.8268809	best: 114.8268809 (300)	total: 1.04s	remaining: 5.86s
400:	learn: 114.0380112	test: 114.7002774	best: 114.7002774 (400)	total: 1.38s	remaining: 5.52s
500:	learn: 113.9270209	test: 114.5892369	best: 114.5892369 (500)	total: 1.72s	remaining: 5.16s
600:	learn: 113.8263282	test: 114.4889248	best: 114.4889248 (600)	total: 2.06s	remaining: 4.79s
700:	learn: 113.7317904	test: 114.3944778	best: 114.3944778 (700)	total: 2.4s	remaining: 4.45s
800:	learn: 113.6440722	test: 114.3073693	best: 114.3073693 (800)	total: 2.73s	remaining: 4.09s
900:	learn: 113.5613750	test: 114.2254517	best: 114.2254517 (900)	total: 3.07s	remaining: 3.75s
1000:	learn: 113.4841165	test: 114.1490235	b

In [17]:
from sklearn.metrics import mean_absolute_error

print("Accuracy:", {mean_absolute_error(df_val['hour_diff'],  pred)})

Accuracy: {113.60708132056196}
