In [17]:
import torch
import ptls
import pandas as pd
import numpy as np
import pytorch_lightning as pl

import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from ptls.frames import PtlsDataModule, coles
from ptls.frames.coles import split_strategy
from ptls.data_load import datasets
from ptls.data_load.datasets import ParquetFiles
from ptls.data_load.datasets import MemoryMapDataset, parquet_file_scan, ParquetDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.frames.coles import CoLESModule
from ptls.nn.trx_encoder import TrxEncoder
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.nn.head import Head
from ptls.frames.coles.losses import ContrastiveLoss
from ptls.frames.coles import sampling_strategies

import sklearn
from sklearn.metrics import mean_absolute_error, accuracy_score

from functools import partial

# <font size="7">Data Module</font>

In [3]:
from ptls.frames import coles
from ptls.data_load import datasets
from ptls.frames import PtlsDataModule
from ptls.frames.coles import losses, sampling_strategies
from ptls.frames.coles import split_strategy

data_module = PtlsDataModule(
    train_data=ptls.frames.coles.ColesDataset(
        splitter=split_strategy.SampleSlices(split_count=5, cnt_min=20, cnt_max=60),
        data=ptls.data_load.datasets.AugmentationDataset(
            data=MemoryMapDataset(
                data=ParquetDataset(
                    i_filters=[SeqLenFilter(min_seq_len=30), FeatureFilter()],
                    data_files=parquet_file_scan(file_path='train_df_agg.parquet',
                                                 valid_rate=0,
                                                 return_part='train')
                )
            ),
            f_augmentations=[ptls.data_load.augmentations.DropoutTrx(trx_dropout=0.01)]
        )
    ),
train_batch_size=256,
train_num_workers=8,
)

# <font size="7">Model</font>

In [4]:
model = CoLESModule(
      validation_metric=ptls.frames.coles.metric.BatchRecallTopK(K=4,
                                                                 metric="cosine"),
      seq_encoder=RnnSeqEncoder(
            trx_encoder=TrxEncoder(
            use_batch_norm_with_lens=True,
            norm_embeddings=False,
            embeddings_noise=0.003,
            
            embeddings={
                "mcc": {"in": 110, "out": 32},
            },

            numeric_values={
                'amnt': 'identity',
                'hour_diff': 'identity',
            },
            ),
            type="lstm",
            hidden_size=1024,
            bidir=False,
            trainable_starter="static",
      ),
     
      head=Head(
            use_norm_encoder=False,
            input_size=1024,
      ),

      loss=ContrastiveLoss(
            margin=0.5,
            sampling_strategy=sampling_strategies.HardNegativePairSelector(neg_count=5),
      ),
    
      optimizer_partial=partial(
            torch.optim.Adam, 
            lr=0.001,
            weight_decay=0.0
      ),
    
      lr_scheduler_partial=partial(
            torch.optim.lr_scheduler.StepLR,
            step_size=1,
            gamma=0.8,
      ),
)

# <font size="7">Fitting</font>

In [None]:
trainer = pl.Trainer(
    num_sanity_val_steps=0,
    gpus=1,
    auto_select_gpus=False,
    max_epochs=30,
    enable_checkpointing=False,
    deterministic=True
)

trainer.fit(model, data_module)
print(trainer.logged_metrics)

# <font size="5">Load trained model</font>

In [5]:
PATH = 'path/model.pth'
model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

# <font size="7">Inference</font>

In [15]:
train_df = pd.read_parquet('data/train.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])

valid_df = pd.read_parquet('data/valid.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])
last_transactions_train = train_df.groupby('app_id').tail(1)

first_transactions_valid = valid_df.groupby('app_id').head(1)
first_transactions_valid = first_transactions_valid.dropna()

In [8]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
iterable_inference_dataset = ParquetDataset(
    data_files=ParquetFiles(['train_agg_inference.parquet'],                                                                   
                            ).data_files
)

next(iter(iterable_inference_dataset))

inference_dl = torch.utils.data.DataLoader(
    dataset=iterable_inference_dataset,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

next(iter(inference_dl)).payload

mod = InferenceModule(model, pandas_output=True, model_out_name='emb')

pred = pl.Trainer(gpus=1).predict(mod, inference_dl)

GPU available: True, used: True
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [9]:
train_embeddings = pd.concat(pred, axis=0)
train_embeddings

Unnamed: 0,app_id,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,0,0.003914,0.009090,-0.014253,-0.000246,0.012538,0.000039,0.000037,-0.004606,0.002772,...,-0.000033,-0.000383,-0.240523,0.172152,0.013610,-0.029905,-0.000650,-0.007254,-0.922810,0.000551
1,1,0.004588,0.008645,-0.015699,-0.000383,0.012169,0.000302,0.000100,-0.006467,0.002386,...,-0.000029,-0.000440,-0.208373,0.165709,0.014849,-0.025442,-0.001140,-0.008975,-0.922521,0.000288
2,2,0.003725,0.009859,-0.013657,-0.000366,0.013092,0.000565,0.000033,-0.005264,0.002802,...,-0.000011,-0.000551,-0.253237,0.166377,0.014257,-0.053596,0.001399,-0.009352,-0.926340,0.000173
3,4,0.003516,0.008183,-0.010952,-0.000286,0.013063,0.000955,0.000108,-0.011730,0.002057,...,-0.000061,-0.000149,-0.228056,0.163205,0.017917,-0.038170,-0.001332,-0.006900,-0.925197,0.000135
4,6,0.003416,0.012006,-0.015211,-0.000311,0.014764,0.000380,0.000117,-0.004704,0.003123,...,-0.000035,-0.000401,-0.234176,0.166675,0.025411,-0.043626,-0.001973,-0.008154,-0.922646,0.000166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1003041,0.003916,0.007977,-0.011157,-0.000312,0.010694,-0.000223,0.000299,-0.007234,0.002428,...,-0.000009,-0.000503,-0.229039,0.165724,0.018091,-0.052470,-0.000024,-0.009233,-0.929722,0.000126
97,1003044,0.003665,0.010082,-0.012899,-0.000328,0.012460,0.000326,0.000084,-0.007317,0.003034,...,-0.000016,-0.000322,-0.204652,0.158848,0.020121,-0.045413,0.000224,-0.008040,-0.928399,0.000227
98,1003047,0.002183,0.012965,-0.016791,-0.000241,0.015602,0.000425,0.000013,0.021656,0.002620,...,-0.000019,-0.000109,-0.234429,0.175132,0.031866,-0.067642,0.000083,-0.009457,-0.931182,0.000038
99,1003048,0.002896,0.007967,-0.011837,-0.000284,0.014049,0.000088,0.000073,-0.011703,0.002389,...,-0.000007,-0.000069,-0.241328,0.162514,0.033378,-0.061186,-0.012767,-0.008983,-0.930808,0.000540


In [11]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
iterable_inference_dataset = ParquetDataset(
    data_files=ParquetFiles(['train_agg_with_last.parquet'],                                                                   
                            ).data_files
)

next(iter(iterable_inference_dataset))

inference_dl = torch.utils.data.DataLoader(
    dataset=iterable_inference_dataset,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

next(iter(inference_dl)).payload

mod = InferenceModule(model, pandas_output=True, model_out_name='emb')

pred = pl.Trainer(gpus=1).predict(mod, inference_dl)

GPU available: True, used: True
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [12]:
train_embeddings_with_last = pd.concat(pred, axis=0)
train_embeddings_with_last

Unnamed: 0,app_id,__index_level_0__,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,0,0,0.003954,0.009111,-0.014293,-0.000325,0.012551,0.000029,0.000038,-0.004674,...,-0.000020,-0.000435,-0.240582,0.172266,0.013527,-0.030062,-6.748151e-04,-0.007260,-0.922807,0.000460
1,1,1,0.004714,0.008807,-0.015979,-0.000310,0.012286,0.000031,0.000120,-0.005672,...,-0.000265,-0.000468,-0.208511,0.165530,0.014747,-0.025615,-8.217460e-04,-0.008998,-0.922640,0.000153
2,2,2,0.003796,0.009738,-0.013814,-0.000293,0.013076,0.000626,0.000028,-0.005176,...,-0.000014,-0.000696,-0.253382,0.166213,0.014296,-0.053099,1.477676e-03,-0.009166,-0.926000,0.000461
3,4,4,0.003309,0.008386,-0.011009,-0.000180,0.013555,0.001056,0.000056,-0.011767,...,-0.000471,-0.000106,-0.231477,0.162135,0.018395,-0.038361,-1.473200e-03,-0.007030,-0.925887,0.000223
4,6,6,0.003716,0.011732,-0.015138,-0.000223,0.014705,0.000446,0.000113,-0.005414,...,-0.000041,-0.000651,-0.233494,0.166833,0.025795,-0.044710,-1.900411e-03,-0.008051,-0.923124,0.000108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1003041,1003041,0.003914,0.007778,-0.011141,-0.000428,0.010702,-0.000054,0.000274,-0.006615,...,-0.000008,-0.000311,-0.228219,0.165549,0.017762,-0.052359,4.598223e-05,-0.009202,-0.929624,0.000142
11,1003044,1003044,0.003618,0.010039,-0.012860,-0.000389,0.012328,0.000290,0.000188,-0.007098,...,-0.000020,-0.000225,-0.203613,0.158468,0.020052,-0.045447,2.339469e-04,-0.008030,-0.928267,0.000137
12,1003047,1003047,0.002288,0.012747,-0.016875,-0.000248,0.015567,0.000398,0.000014,0.020858,...,-0.000008,-0.000119,-0.234061,0.175686,0.031994,-0.067409,1.178113e-03,-0.009536,-0.930796,0.000089
13,1003048,1003048,0.002844,0.008093,-0.012014,-0.000162,0.014526,-0.000348,0.000033,-0.011217,...,-0.000162,-0.000124,-0.241808,0.162941,0.033595,-0.061391,-1.190810e-02,-0.008797,-0.930358,0.000499


# <font size="5">hour diff predicting</font>

In [14]:
regressor_time = CatBoostRegressor(
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100
)

regressor_time.fit(train_embeddings,
              last_transactions_train['hour_diff'])

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 32.3118480	total: 25.9ms	remaining: 51.9s
100:	learn: 32.1403991	total: 2.32s	remaining: 43.7s
200:	learn: 31.9894634	total: 4.59s	remaining: 41.1s
300:	learn: 31.8592012	total: 6.92s	remaining: 39s
400:	learn: 31.7512392	total: 9.28s	remaining: 37s
500:	learn: 31.6641739	total: 11.5s	remaining: 34.5s
600:	learn: 31.5934224	total: 13.7s	remaining: 32s
700:	learn: 31.5360450	total: 15.9s	remaining: 29.5s
800:	learn: 31.4888162	total: 18s	remaining: 27s
900:	learn: 31.4492260	total: 20.2s	remaining: 24.6s
1000:	learn: 31.4163264	total: 22.2s	remaining: 22.2s
1100:	learn: 31.3884116	total: 24.3s	remaining: 19.8s
1200:	learn: 31.3646775	total: 26.3s	remaining: 17.5s
1300:	learn: 31.3435229	total: 28.4s	remaining: 15.2s
1400:	learn: 31.3253127	total: 30.4s	remaining: 13s
1500:	learn: 31.3091011	total: 32.4s	remaining: 10.8s
1600:	learn: 31.2939562	total: 34.4s	remaining: 8.58s
1700:	learn: 31.2803544	total: 36.5s	remaining: 6.41s
1800:	learn: 31.2682770	total: 38.5s	remaining: 4.2

<catboost.core.CatBoostRegressor at 0x7f3422511dd0>

# <font size="5">mcc predicting</font>

In [38]:
classifier_mcc = CatBoostClassifier(
    learning_rate=1e-1,
    iterations=500,
    depth=3,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)

classifier_mcc.fit(train_embeddings,
              last_transactions_train['mcc'])

0:	learn: 0.3071072	total: 341ms	remaining: 2m 49s
100:	learn: 0.3460211	total: 31.7s	remaining: 2m 5s
200:	learn: 0.3548380	total: 1m 2s	remaining: 1m 33s
300:	learn: 0.3580264	total: 1m 33s	remaining: 1m 1s
400:	learn: 0.3605223	total: 2m 3s	remaining: 30.5s
499:	learn: 0.3623350	total: 2m 33s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f30ebc06610>

# <font size="5">amount predicting</font>

In [46]:
classifier_amnt = CatBoostClassifier(
    learning_rate=1e-1,
    iterations=500,
    depth=5,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)

classifier_amnt.fit(train_embeddings,
              last_transactions_train['amnt_bins'])

0:	learn: 0.5714057	total: 31.1ms	remaining: 15.5s
100:	learn: 0.5836016	total: 2.06s	remaining: 8.13s
200:	learn: 0.5853422	total: 4.04s	remaining: 6.01s
300:	learn: 0.5865576	total: 5.97s	remaining: 3.95s
400:	learn: 0.5881192	total: 7.9s	remaining: 1.95s
499:	learn: 0.5892161	total: 9.78s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f30ebc06dd0>

# <font size="6">Metrics</font>

In [47]:
preds = regressor_time.predict(train_embeddings_with_last.drop(columns='app_id'))

print('Time MAE:', mean_absolute_error(first_transactions_valid['hour_diff'], preds))

preds = classifier_mcc.predict(train_embeddings_with_last)

print('Mcc Accuracy', accuracy_score(first_transactions_valid['mcc'], preds))

preds = classifier_amnt.predict(train_embeddings_with_last)

print('Amnt Accuracy', accuracy_score(first_transactions_valid['amnt_bins'], preds))

Time MAE: 30.59788261215277
Mcc Accuracy 0.3567886323598884
Amnt Accuracy 0.5833059893742538
