## Setup

In [1]:
%load_ext autoreload
%autoreload 2

import logging
import torch
import pytorch_lightning as pl
import warnings

warnings.filterwarnings('ignore')
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [2]:
from glob import glob
import numpy as np
import logging
import pytorch_lightning as pl
import torch
from tqdm.auto import tqdm
import os


# Logger named after the current module.
logger = logging.getLogger(__name__)

## Model

In [3]:
import hydra
from omegaconf import OmegaConf

# Build the module described under `pl_module` in the config, then restore its
# trained weights from disk.
conf = OmegaConf.load('config/cpc.yaml')
model = hydra.utils.instantiate(conf.pl_module)

# map_location='cpu' keeps the load working on machines without a GPU even if
# the checkpoint was saved from one; the model is moved to the inference device
# later, inside embedding_inference.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(torch.load("models/cpc.p", map_location='cpu'))

<All keys matched successfully>

In [4]:
# Model inputs are the fields declared for the transaction encoder in the
# config: the `embeddings` keys first, then the `numeric_values` keys.
feature_cols = [
    *conf.pl_module.seq_encoder.trx_encoder.embeddings.keys(),
    *conf.pl_module.seq_encoder.trx_encoder.numeric_values.keys(),
]
# Field(s) returned as the target by the collate function.
target_cols = ['flag']

## Inference

In [5]:
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.iterable_processing.target_move import TargetMove
from ptls.data_load.iterable_processing.target_empty_filter import TargetEmptyFilter
from ptls.data_load import padded_collate, padded_collate_wo_target
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from tqdm.auto import tqdm

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices

# Parquet inputs for the train / validation splits.
train_data = glob('data/train_transactions_clipped.parquet')
valid_data = glob('data/valid_transactions_clipped.parquet')

# glob() returns an empty list instead of raising when nothing matches, so a
# missing file would otherwise only show up later as an obscure downstream error.
assert train_data, "no files matched 'data/train_transactions_clipped.parquet'"
assert valid_data, "no files matched 'data/valid_transactions_clipped.parquet'"

dataset_conf = {
    # Passed to SeqLenFilter when the processing chain is built below.
    'min_seq_len': 25,
}


def target_collate_fn(feature_cols, target_cols):
    """Build a DataLoader ``collate_fn`` that separates features from targets.

    Args:
        feature_cols: names of the record fields forwarded to the model.
        target_cols: names of the record fields returned as targets.

    Returns:
        A function mapping a batch (a sequence of dict-like records) to the
        pair ``(padded_collate_wo_target(features), targets)``. ``targets`` is
        ``np.array`` applied to one list per record, holding that record's
        values for ``target_cols`` in order.
    """

    def fn(batch):
        # Keep only the model inputs of each record; every other field is dropped.
        features = [
            {name: value for name, value in record.items() if name in feature_cols}
            for record in batch
        ]
        labels = [[record[name] for name in target_cols] for record in batch]
        return padded_collate_wo_target(features), np.array(labels)

    return fn


# Per-record processing shared by both splits: SeqLenFilter with the configured
# minimum sequence length, followed by ToTorch.
process = IterableChain(
    SeqLenFilter(min_seq_len=dataset_conf['min_seq_len']),
    ToTorch(),
)

train_ds = ParquetDataset(train_data, post_processing=process)
valid_ds = ParquetDataset(valid_data, post_processing=process)


def _make_loader(dataset):
    """Wrap ``dataset`` in a DataLoader using the settings shared by both splits."""
    return torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=target_collate_fn(feature_cols, target_cols),
        num_workers=8,
        batch_size=64,
    )


train_dl = _make_loader(train_ds)
valid_dl = _make_loader(valid_ds)

In [6]:
from tqdm import tqdm

def embedding_inference(dl, model, device='cuda:0', target_columns=None):
    """Run ``model`` over ``dl`` and collect targets and embeddings in a DataFrame.

    Args:
        dl: iterable of batches where ``batch[0]`` is the model input (must
            support ``.to(device)``) and ``batch[1]`` is a 2-D array-like of
            targets with one row per sample.
        model: ``torch.nn.Module`` whose output for a batch is a 2-D tensor
            with one row per sample.
        device: device the model and the inputs are moved to.
        target_columns: names of the target columns in the result. Defaults to
            the notebook-level ``target_cols``.

    Returns:
        pandas.DataFrame with the target columns followed by ``embed_0`` ..
        ``embed_{d-1}``, one row per sample. When ``dl`` yields no batches the
        frame is empty and contains only the target columns.
    """
    # Local import: pandas is only imported in a later cell of this notebook.
    import pandas as pd

    if target_columns is None:
        target_columns = target_cols
    target_columns = list(target_columns)

    model.to(device)

    # Inference must run in eval mode so dropout / batch-norm layers are
    # deterministic; the caller's mode is restored afterwards.
    was_training = model.training
    model.eval()

    rows = []
    n_features = 0  # stays 0 when the loader is empty
    try:
        with torch.no_grad():
            for batch in tqdm(dl):
                targets = batch[1]
                features = model(batch[0].to(device)).cpu().numpy()
                n_features = features.shape[1]
                rows.extend(np.concatenate([targets, features], axis=1).tolist())
    finally:
        model.train(was_training)

    cols = target_columns + ["embed_" + str(i) for i in range(n_features)]
    return pd.DataFrame(rows, columns=cols)

In [7]:
import pandas as pd

# Only the sequence encoder of the trained module is used for inference.
# NOTE(review): presumably is_reduce_sequence=True makes the encoder return one
# vector per sequence, which embedding_inference needs because it concatenates
# the output with the targets along axis 1 -- confirm against ptls.
encoder = model.seq_encoder
encoder.is_reduce_sequence = True

df_train = embedding_inference(train_dl, encoder)
df_val = embedding_inference(valid_dl, encoder)

3666it [01:22, 44.46it/s]
406it [00:09, 41.46it/s]


In [8]:
# Preview the first rows of the training table (target column + embeddings).
df_train.head(n=5)

Unnamed: 0,flag,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1014,embed_1015,embed_1016,embed_1017,embed_1018,embed_1019,embed_1020,embed_1021,embed_1022,embed_1023
0,0.0,-0.001884,0.029678,-0.368001,-0.438923,-0.256236,-0.092869,-0.368203,-0.056535,0.017936,...,0.219115,0.384228,-0.389103,-0.054365,-0.061914,-0.217882,0.302824,0.317627,-0.843474,-0.357644
1,0.0,0.49749,-0.251693,-0.02607,-0.032538,0.072265,-0.027482,-0.295669,0.203094,-0.640301,...,0.075718,-0.096116,0.007134,-0.078473,0.314798,-0.154614,0.037853,0.377936,-0.70288,-0.178977
2,0.0,-0.116681,0.035342,-0.29279,-0.541468,0.01909,-0.186942,-0.098093,0.404233,0.073702,...,-0.030164,-0.132847,-0.688073,-0.044164,0.139248,-0.246922,-0.711427,0.418092,0.688879,-0.218136
3,0.0,-0.104357,0.709393,-0.07224,0.36655,0.144971,0.143783,-0.292065,-0.114839,-0.178328,...,0.13889,-0.752829,0.750009,-0.011598,0.100241,-0.284383,-0.479511,-0.029866,0.029147,0.08127
4,0.0,-0.160716,0.469429,-0.143408,0.242216,0.111258,0.187987,-0.163859,0.156713,-0.286034,...,0.374161,0.040415,0.410244,-0.065304,0.102565,-0.06441,0.321401,0.351916,-0.274337,0.056422


# Predict pd

In [14]:
from catboost import CatBoostClassifier, metrics

# Classifier trained on the embeddings computed above. Only the settings
# listed here are overridden; everything else is left at CatBoost's defaults.
cb_model = CatBoostClassifier(
    task_type='GPU',
    eval_metric='AUC',
    early_stopping_rounds=100,
    verbose=100,
)

# Inputs are the embedding columns; the label is the 'flag' column.
feats = [col for col in df_train.columns if col.startswith('embed_')]
X_train, y_train = df_train[feats].values, df_train['flag']
X_val, y_val = df_val[feats].values, df_val['flag']

# The validation split doubles as the early-stopping evaluation set.
cb_model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=False)

# Predicted probabilities for the validation split.
pred = cb_model.predict_proba(X_val)



Learning rate set to 0.044776


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5955602	best: 0.5955602 (0)	total: 47ms	remaining: 47s
100:	test: 0.7403083	best: 0.7403083 (100)	total: 4.38s	remaining: 39s
200:	test: 0.7449971	best: 0.7451489 (198)	total: 8.9s	remaining: 35.4s
300:	test: 0.7463885	best: 0.7464175 (297)	total: 13.3s	remaining: 30.8s
400:	test: 0.7462545	best: 0.7466450 (371)	total: 17.8s	remaining: 26.6s
500:	test: 0.7470408	best: 0.7471985 (465)	total: 22.2s	remaining: 22.1s
bestTest = 0.7471984923
bestIteration = 465
Shrink model to first 466 iterations.


In [15]:
from sklearn.metrics import roc_auc_score

# Validation ROC AUC, computed from the second column of predict_proba.
print(f"ROC AUC: {roc_auc_score(df_val['flag'], pred[:, 1]):.4f}")

Accuracy: {0.7471982468674518}
