## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

# Keep the notebook output readable: ignore every Python warning and let the
# "pytorch_lightning" logger emit ERROR-level (and above) records only.
warnings.filterwarnings(action='ignore')
lightning_logger = logging.getLogger("pytorch_lightning")
lightning_logger.setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
import os


# Notebook-level logger, named after the current module (`__name__`).
logger = logging.getLogger(__name__)

## Model

In [3]:
import hydra
from omegaconf import OmegaConf

# Instantiate the module described under `pl_module` in config/coles.yaml and
# restore its weights from the saved state dict.
conf = OmegaConf.load('config/coles.yaml')
model = hydra.utils.instantiate(conf.pl_module)
# map_location='cpu' keeps the checkpoint loadable on machines that do not have
# the device the tensors were saved from; the encoder is moved to the inference
# device later, inside `embedding_inference`.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load("models/coles.p", map_location='cpu'))

<All keys matched successfully>

In [4]:
# Model inputs are the keys of the transaction encoder's `embeddings` and
# `numeric_values` config sections, in that order.
trx_encoder_conf = conf.pl_module.seq_encoder.trx_encoder
feature_cols = [
    *trx_encoder_conf.embeddings.keys(),
    *trx_encoder_conf.numeric_values.keys(),
]
# Columns whose last value in each sequence is predicted downstream.
target_cols = ['mcc', 'amnt', 'hour_diff']

## Inference

In [5]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from tqdm.auto import tqdm

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices

# Lists of parquet files (as matched by glob) for the two splits.
train_data, valid_data = (
    glob('data/train_transactions_clipped.parquet'),
    glob('data/valid_transactions_clipped.parquet'),
)

# Dataset options; `min_seq_len` is passed to SeqLenFilter below.
dataset_conf = dict(min_seq_len=25)


def nep_collate_fn(feature_cols, target_cols):
    """Build a collate function for next-event prediction.

    For every record of a batch the returned function keeps, for each key
    listed in `feature_cols`, all but the last element of the sequence, and
    takes the last element of each `target_cols` sequence as the label.

    Returns:
        A callable `fn(batch)` producing the pair
        `(padded_collate_wo_target(truncated_records), np.array(labels))`.
    """

    def _split(record):
        # History without the final event, plus that final event's target values.
        history = {name: seq[:-1] for name, seq in record.items() if name in feature_cols}
        labels = [record[name][-1] for name in target_cols]
        return history, labels

    def fn(batch):
        pairs = [_split(record) for record in batch]
        histories = [history for history, _ in pairs]
        labels = [label for _, label in pairs]
        return padded_collate_wo_target(histories), np.array(labels)

    return fn


# Post-processing chain shared by both datasets: SeqLenFilter with the
# configured `min_seq_len`, followed by ToTorch.
process = IterableChain(
    SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
    ToTorch(),
)

train_ds, valid_ds = (
    ParquetDataset(files, post_processing=process)
    for files in (train_data, valid_data)
)

# Both loaders share the same settings; each one gets its own collate closure.
train_dl, valid_dl = (
    torch.utils.data.DataLoader(
        dataset=split_ds,
        collate_fn=nep_collate_fn(feature_cols, target_cols),
        num_workers=8,
        batch_size=64,
    )
    for split_ds in (train_ds, valid_ds)
)

In [6]:
from tqdm import tqdm

def embedding_inference(dl, model, device='cuda:1'):
    """Run `model` over every batch of `dl` and tabulate targets next to embeddings.

    Each batch is indexed as `(features, targets)`: `features` must support
    `.to(device)`, and `targets` must be a 2-D array-like that can be
    concatenated column-wise with the model output (one column per entry of
    the module-level `target_cols`).

    Args:
        dl: iterable of batches (e.g. a DataLoader).
        model: torch module mapping a feature batch to a 2-D tensor.
        device: device the model and the feature batches are moved to.

    Returns:
        pandas.DataFrame with the `target_cols` columns followed by
        `embed_0 ... embed_{d-1}`. When `dl` yields no batches the frame is
        empty and contains the target columns only.
    """
    # Imported locally so the function does not depend on the later notebook
    # cell that imports pandas having been executed first.
    import pandas as pd

    model.to(device)

    # Inference must run in eval mode: torch.no_grad() only disables autograd,
    # it does not switch off dropout or batch-norm statistic updates.
    # The caller's mode is restored once the pass is finished.
    was_training = model.training
    model.eval()

    rows = []
    embed_dim = 0  # stays 0 when the loader is empty

    try:
        with torch.no_grad():
            for batch in tqdm(dl):
                targets = batch[1]
                features = model(batch[0].to(device)).cpu().numpy()
                embed_dim = features.shape[1]
                rows.extend(np.concatenate([targets, features], axis=1).tolist())
    finally:
        model.train(was_training)

    cols = target_cols + ["embed_" + str(i) for i in range(embed_dim)]
    return pd.DataFrame(rows, columns=cols)

In [7]:
import pandas as pd

# Embed both splits with the model's `seq_encoder` submodule, using
# embedding_inference's default device.
seq_encoder = model.seq_encoder
df_train, df_val = (
    embedding_inference(loader, seq_encoder) for loader in (train_dl, valid_dl)
)

3666it [01:27, 41.69it/s]
406it [00:10, 40.59it/s]


In [8]:
# Preview the first rows: target columns followed by the `embed_*` columns.
df_train.head()

Unnamed: 0,mcc,amnt,hour_diff,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,2.0,0.537682,93.0,0.206564,-0.135329,-0.069773,0.037959,0.744756,-0.060816,-0.023761,...,0.024613,0.553877,0.088246,0.224332,0.067838,-0.190348,-0.205112,0.0116,0.04601,0.110382
1,2.0,0.49091,3.0,0.221395,-0.288682,0.222773,-0.031157,0.792092,0.051108,-0.034407,...,0.02139,-0.097828,0.073278,0.215796,0.036889,-0.264213,-0.297667,-0.022489,0.054961,0.21964
2,2.0,0.426544,145.0,0.227916,-0.127958,0.078684,0.031956,0.758466,-0.019711,-0.039038,...,0.026989,-0.08147,0.104947,0.215453,0.049085,-0.219179,-0.581297,0.010498,0.0235,0.169956
3,2.0,0.544932,0.0,0.215687,-0.089508,-0.135199,0.012565,0.700087,-0.015262,-0.0042,...,0.02295,-0.300145,0.080627,0.20787,0.039538,-0.379681,-0.29735,0.035988,-0.017468,0.036234
4,2.0,0.516834,96.0,0.192429,-0.02608,-0.125103,0.090209,0.728842,-0.024083,-0.034507,...,0.049875,-0.128565,0.074193,0.227088,0.013846,-0.305255,-0.320588,-0.197538,0.05053,0.179575


# Predict next MCC

In [9]:
# Sorted list of the distinct `mcc` values in the training frame; passed to
# CatBoostClassifier as `class_names` further down.
# NOTE(review): this runs in the cell before `df_train['mcc']` is cast to int,
# so the class names keep the column's pre-cast dtype - confirm that is intended.
class_names=sorted(df_train['mcc'].unique().tolist())

In [10]:
# Cast the training labels to integers (overwrites the column in df_train).
# NOTE(review): no matching cast of `df_val['mcc']` is visible in this notebook -
# confirm the validation labels are meant to keep their original dtype.
df_train['mcc'] =df_train['mcc'].astype(int)

In [17]:
from catboost import CatBoostClassifier, metrics

# Multiclass CatBoost classifier trained on the embedding columns; the
# validation split is supplied as `eval_set`.
cb_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    class_names=class_names,
    iterations=1000,
    learning_rate=1e-1,
    depth=3,
    early_stopping_rounds=100,
    metric_period=100,
    verbose=100,
    task_type='GPU',
)

# Inputs are the `embed_*` columns; the label is the `mcc` column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['mcc']
X_val, y_val = df_val[feats].values, df_val['mcc']

cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)



0:	learn: 0.3973318	test: 0.3933896	best: 0.3933896 (0)	total: 146ms	remaining: 2m 26s
100:	learn: 0.4311873	test: 0.4225895	best: 0.4225895 (100)	total: 10.8s	remaining: 1m 36s
200:	learn: 0.4405047	test: 0.4337995	best: 0.4337995 (200)	total: 21.4s	remaining: 1m 25s
300:	learn: 0.4452571	test: 0.4361493	best: 0.4361493 (289)	total: 32s	remaining: 1m 14s
400:	learn: 0.4480745	test: 0.4379599	best: 0.4379984 (392)	total: 42.4s	remaining: 1m 3s
500:	learn: 0.4506404	test: 0.4390000	best: 0.4394237 (489)	total: 52.8s	remaining: 52.6s
600:	learn: 0.4528525	test: 0.4394237	best: 0.4397319 (591)	total: 1m 3s	remaining: 42.1s
700:	learn: 0.4552181	test: 0.4392696	best: 0.4402712 (627)	total: 1m 13s	remaining: 31.5s
bestTest = 0.4402711969
bestIteration = 627
Shrink model to first 628 iterations.


In [18]:
from sklearn.metrics import accuracy_score

# Print the score itself; the former `{...}` wrapper built a one-element set,
# so the value was displayed inside curly braces.
print("Accuracy:", accuracy_score(df_val['mcc'], pred))

Accuracy: {0.44027119688739935}


# Predict next amnt

In [13]:
from catboost import CatBoostRegressor, metrics

# CatBoost regressor for the `amnt` target, trained and evaluated with MAE.
amnt_params = {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'iterations': 2000,
    'learning_rate': 1e-1,
    'depth': 7,
    'early_stopping_rounds': 100,
    'verbose': 100,
    'task_type': 'GPU',
}
cb_model = CatBoostRegressor(**amnt_params)

# Inputs are the `embed_*` columns of both frames.
feats = [name for name in df_train.columns if name.startswith('embed_')]
X_train = df_train[feats].values
y_train = df_train['amnt']
X_val = df_val[feats].values
y_val = df_val['amnt']

cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.0884766	test: 0.0897347	best: 0.0897347 (0)	total: 41ms	remaining: 1m 21s
100:	learn: 0.0785862	test: 0.0848277	best: 0.0846854 (59)	total: 3.33s	remaining: 1m 2s
bestTest = 0.08468538466
bestIteration = 59
Shrink model to first 60 iterations.


In [14]:
from sklearn.metrics import mean_absolute_error

# Print the score itself; the former `{...}` wrapper built a one-element set,
# so the value was displayed inside curly braces.
print("Mae amnt:", mean_absolute_error(df_val['amnt'], pred))

Mae amnt: {0.08468538623678998}


# Predict next hour_diff

In [15]:
from catboost import CatBoostRegressor, metrics

# CatBoost regressor for the `hour_diff` target, trained and evaluated with MAE.
cb_model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    iterations=2000,
    learning_rate=1e-1,
    depth=7,
    early_stopping_rounds=100,
    verbose=100,
    task_type='GPU',
)

# Feature matrix: every column whose name starts with `embed_`.
feats = [column for column in df_train.columns if column.startswith('embed_')]
X_train, X_val = df_train[feats].values, df_val[feats].values
y_train, y_val = df_train['hour_diff'], df_val['hour_diff']

cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

pred = cb_model.predict(X_val)

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 114.6528483	test: 115.3157575	best: 115.3157575 (0)	total: 30ms	remaining: 59.9s
100:	learn: 114.5249025	test: 115.2442313	best: 115.2442313 (100)	total: 2.58s	remaining: 48.5s
200:	learn: 114.4067856	test: 115.1766247	best: 115.1766247 (200)	total: 5.11s	remaining: 45.7s
300:	learn: 114.3024274	test: 115.1147964	best: 115.1147964 (300)	total: 7.61s	remaining: 43s
400:	learn: 114.2107538	test: 115.0571671	best: 115.0571671 (400)	total: 10.1s	remaining: 40.3s
500:	learn: 114.1282953	test: 115.0005971	best: 115.0005971 (500)	total: 12.6s	remaining: 37.7s
600:	learn: 114.0529122	test: 114.9454717	best: 114.9454717 (600)	total: 15.2s	remaining: 35.3s
700:	learn: 113.9832065	test: 114.8940445	best: 114.8940445 (700)	total: 17.9s	remaining: 33.2s
800:	learn: 113.9176438	test: 114.8445048	best: 114.8445048 (800)	total: 21.7s	remaining: 32.5s
900:	learn: 113.8558063	test: 114.7976424	best: 114.7976424 (900)	total: 25.8s	remaining: 31.4s
1000:	learn: 113.7974725	test: 114.7524751	best

In [16]:
from sklearn.metrics import mean_absolute_error

# The value is a mean absolute error, not an accuracy, so it is labelled as
# such; the former `{...}` wrapper (a one-element set) is dropped as well.
print("Mae hour_diff:", mean_absolute_error(df_val['hour_diff'], pred))

Accuracy: {114.43801889374788}
