## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
import os


# Logger named after the current module (``__name__``); none of the cells
# visible in this notebook emit messages through it.
logger = logging.getLogger(__name__)

## Model

In [3]:
import hydra
from omegaconf import OmegaConf

# Build the module described by the ``pl_module`` section of the Hydra config
# and restore its weights from disk.
conf = OmegaConf.load('config/coles.yaml')
model = hydra.utils.instantiate(conf.pl_module)

# map_location='cpu' lets the checkpoint load even when its tensors were saved
# from a device that is not available here; load_state_dict copies the values
# into the model's own parameters, and embedding_inference later moves the
# model to the requested device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
state_dict = torch.load("models/coles.p", map_location='cpu')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [4]:
# Model input columns are taken from the config: the keys of the transaction
# encoder's ``embeddings`` section followed by the keys of ``numeric_values``.
feature_cols = [
    *conf.pl_module.seq_encoder.trx_encoder.embeddings.keys(),
    *conf.pl_module.seq_encoder.trx_encoder.numeric_values.keys(),
]
# Record fields that are returned as targets by the collate function.
target_cols = ['flag']

## Inference

In [5]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from tqdm.auto import tqdm

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices

# Settings shared by the dataset pipeline below.
dataset_conf = dict(min_seq_len=25)

# glob() returns a list of matching paths (empty when nothing matches).
train_data = glob('data/train_transactions_clipped.parquet')
valid_data = glob('data/valid_transactions_clipped.parquet')


def target_collate_fn(feature_cols, target_cols):
    """Build a ``collate_fn`` that separates model features from targets.

    Args:
        feature_cols: names of the record fields kept as model inputs.
        target_cols: names of the record fields returned as targets, in order.

    Returns:
        A function that maps a batch (an iterable of dict records) to the pair
        ``(padded_collate_wo_target(features), targets)``, where ``features``
        is a list of per-record dicts restricted to ``feature_cols`` and
        ``targets`` is ``np.array`` built from one list per record.
    """

    def collate(batch):
        # Single pass over the batch: for each record build the filtered
        # feature dict first, then the list of target values.
        split_records = [
            (
                {name: value for name, value in record.items() if name in feature_cols},
                [record[name] for name in target_cols],
            )
            for record in batch
        ]
        feature_dicts = [features for features, _ in split_records]
        target_rows = [targets for _, targets in split_records]
        return padded_collate_wo_target(feature_dicts), np.array(target_rows)

    return collate


# Post-processing chain applied by both datasets: SeqLenFilter with the
# configured min_seq_len, followed by ToTorch.
process = IterableChain(
    SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
    ToTorch(),
)

train_ds = ParquetDataset(train_data, post_processing=process)
valid_ds = ParquetDataset(valid_data, post_processing=process)

# Both loaders are configured identically (train first, then valid); each one
# gets its own collate function instance.
train_dl, valid_dl = (
    torch.utils.data.DataLoader(
        dataset=split_ds,
        collate_fn=target_collate_fn(feature_cols, target_cols),
        num_workers=8,
        batch_size=64,
    )
    for split_ds in (train_ds, valid_ds)
)

In [6]:
from tqdm import tqdm

def embedding_inference(dl, model, device='cuda:0'):
    """Run ``model`` over every batch of ``dl`` and tabulate targets + outputs.

    Relies on the notebook-level names ``target_cols``, ``np``, ``pd``,
    ``torch`` and ``tqdm`` being defined before the call.

    Args:
        dl: iterable of ``(features, targets)`` batches. ``features`` must
            support ``.to(device)``; ``targets`` must be something
            ``np.concatenate`` can join with the 2-D model output on axis 1.
        model: ``torch.nn.Module`` applied to each feature batch; its output
            is moved to CPU and converted to a NumPy array.
        device: device the model and the feature batches are moved to.

    Returns:
        ``pandas.DataFrame`` with one row per record: the ``target_cols``
        columns followed by ``embed_0 .. embed_{d-1}``. When ``dl`` yields no
        batches, an empty frame holding only the target columns is returned.
    """
    model.to(device)

    # Inference has to run in eval mode: layers such as dropout or batch norm
    # (if the encoder contains any) behave differently while training. The
    # previous mode is restored afterwards so the caller's model is unchanged.
    was_training = model.training
    model.eval()

    chunks = []
    try:
        with torch.no_grad():
            for batch in tqdm(dl):
                targets = batch[1]
                features = model(batch[0].to(device)).cpu().numpy()
                # Keep each batch as an array; joining once at the end avoids
                # materialising every value as a Python float.
                chunks.append(np.concatenate([targets, features], axis=1))
    finally:
        model.train(was_training)

    if not chunks:
        # Nothing was produced, so the embedding width is unknown.
        return pd.DataFrame(columns=list(target_cols))

    cols = list(target_cols) + ["embed_" + str(i) for i in range(features.shape[1])]
    return pd.DataFrame(np.concatenate(chunks, axis=0), columns=cols)

In [7]:
import pandas as pd

# Embed both splits with the ``seq_encoder`` sub-module of the loaded model
# (default device of embedding_inference is used).
df_train = embedding_inference(dl=train_dl, model=model.seq_encoder)
df_val = embedding_inference(dl=valid_dl, model=model.seq_encoder)

3666it [01:47, 34.11it/s]
406it [00:10, 40.50it/s]


In [8]:
# Preview the first rows of the training frame (target column + embed_* columns).
df_train.head()

Unnamed: 0,flag,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0.0,0.207238,-0.138093,-0.070465,0.039588,0.74996,-0.061186,-0.023569,0.009563,-0.658574,...,0.024751,0.552349,0.088058,0.225189,0.068943,-0.188097,-0.206763,0.010289,0.04716,0.110729
1,0.0,0.222079,-0.256347,0.213891,-0.032479,0.786849,0.05003,-0.034273,0.256404,-0.654217,...,0.021662,-0.0946,0.073257,0.216162,0.037922,-0.26422,-0.2994,-0.022494,0.054382,0.21902
2,0.0,0.22843,-0.131115,0.084865,0.03183,0.762495,-0.020348,-0.038988,0.095485,-0.696967,...,0.027184,-0.075829,0.10477,0.21614,0.049658,-0.216921,-0.579872,0.009912,0.023958,0.170221
3,0.0,0.217732,-0.090584,-0.137838,0.011127,0.707037,-0.015044,-0.00424,0.217954,-0.689606,...,0.023633,-0.316959,0.080522,0.20864,0.040056,-0.379121,-0.298674,0.032936,-0.018247,0.035927
4,0.0,0.194463,-0.026915,-0.127303,0.086252,0.72656,-0.023898,-0.033572,0.038396,-0.661517,...,0.050396,-0.127763,0.074329,0.228102,0.014529,-0.305612,-0.321049,-0.175992,0.053392,0.179872


## Predict pd — CatBoost classifier on the CoLES embeddings

In [11]:
from catboost import CatBoostClassifier, metrics


# Hyper-parameters of the downstream CatBoost classifier.
cb_params = dict(
    iterations=1000,
    learning_rate=1e-1,
    depth=3,
    loss_function='Logloss',
    eval_metric='AUC',
    early_stopping_rounds=100,
    metric_period=100,
    verbose=100,
    task_type='GPU',
)
cb_model = CatBoostClassifier(**cb_params)

# Inputs are the embed_* columns; the label is the 'flag' column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['flag']
X_val, y_val = df_val[feats].values, df_val['flag']

# The validation split is passed as eval_set, so the AUC reported during
# training and the early-stopping decision are both computed on it.
cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

# predict_proba output for the validation inputs; the scoring cell reads
# column 1 of it.
pred = cb_model.predict_proba(X_val)

0:	test: 0.5743463	best: 0.5743463 (0)	total: 10.1ms	remaining: 10.1s
100:	test: 0.7519980	best: 0.7520486 (97)	total: 838ms	remaining: 7.46s
200:	test: 0.7579563	best: 0.7580324 (192)	total: 1.67s	remaining: 6.63s
300:	test: 0.7600921	best: 0.7602437 (295)	total: 2.47s	remaining: 5.73s
400:	test: 0.7608144	best: 0.7612427 (342)	total: 3.27s	remaining: 4.88s
bestTest = 0.7612426579
bestIteration = 342
Shrink model to first 343 iterations.


In [12]:
from sklearn.metrics import roc_auc_score

# roc_auc_score is the area under the ROC curve, not accuracy, so label it
# accordingly. pred[:, 1] is the second predict_proba column (presumably the
# positive-class probability — confirm against cb_model.classes_).
print(f"ROC AUC: {roc_auc_score(df_val['flag'], pred[:, 1])}")

Accuracy: {0.7612427465727699}
