# <font size="7">Libraries</font>

In [1]:
import torch
import ptls
import pandas as pd
import numpy as np

import pytorch_lightning as pl

import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from ptls.frames import PtlsDataModule, coles
from ptls.frames.coles import split_strategy
from ptls.data_load import datasets
from ptls.data_load.datasets import ParquetFiles
from ptls.data_load.datasets import MemoryMapDataset, parquet_file_scan, ParquetDataset
from ptls.data_load.iterable_processing import SeqLenFilter, FeatureFilter
from ptls.frames.coles import CoLESModule
from ptls.nn.trx_encoder import TrxEncoder
from ptls.nn.seq_encoder import RnnSeqEncoder
from ptls.nn.head import Head
from ptls.frames.coles.losses import ContrastiveLoss
from ptls.frames.coles import sampling_strategies

from pytorch_lightning.loggers import TensorBoardLogger

import sklearn
from sklearn.metrics import mean_absolute_error, accuracy_score

from functools import partial

# <font size="7">Data Module</font>

In [50]:
train_df = pd.read_parquet('data_2/train.parquet').drop(columns=['reversed_transaction', 
                                                                 'transaction_max',
                                                                 'time',
                                                                 'amnt_bins',
                                                                 'amnt_mcc_bins'])
last_transactions_train = train_df.groupby('app_id').tail(1)
train_df = train_df.drop(last_transactions_train.index)
train_df

Unnamed: 0,app_id,amnt,mcc,hour_diff,transaction_number
0,0,0.465425,2,-1,1
1,0,0.000000,2,0,2
2,0,0.521152,2,0,3
3,0,0.356078,10,52,4
4,0,0.000000,2,280,5
...,...,...,...,...,...
95046143,1003049,0.285137,4,7,54
95046144,1003049,0.227264,108,0,55
95046145,1003049,0.387565,57,0,56
95046146,1003049,0.273395,1,1,57


In [61]:
train_df.to_parquet('train_without_last.parquet')

In [51]:
train_df = train_df.rename(columns={'transaction_number': 'event_time'})
train_df = train_df.groupby('app_id').agg(list).reset_index()

In [52]:
train_df.to_parquet('train_agg_without_last.parquet')

In [3]:
df = pd.read_parquet('train_agg_without_last.parquet')

In [4]:
df

Unnamed: 0,app_id,amnt,mcc,hour_diff,event_time
0,0,"[0.4654254330729043, 0.0, 0.5211518246354306, ...","[2, 2, 2, 10, 2, 2, 2, 2, 10, 10, 2, 10, 2, 2,...","[-1, 0, 0, 52, 280, 0, 75, 169, 18, 0, 146, 71...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,1,"[0.3240743706369933, 0.42105610477663846, 0.48...","[65, 1, 2, 2, 1, 3, 58, 2, 1, 2, 1, 1, 2, 25, ...","[-1, 24, 19, 74, 517, 34, 1, 92, 0, 22, 6, 19,...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,2,"[0.35252243820939444, 0.37467714070712754, 0.3...","[3, 1, 7, 1, 28, 28, 1, 2, 28, 11, 7, 7, 1, 1,...","[-1, 10, 14, 21, 2, 0, 25, 875, 57, 64, 0, 0, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
3,4,"[0.0, 0.3488379638169017, 0.3796773516219741, ...","[2, 2, 1, 1, 9, 9, 2, 9, 9, 2, 9, 9, 9, 2, 9, ...","[-1, 0, 0, 5, 1, 0, 260, 27, 0, 163, 315, 0, 1...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,6,"[0.2709803491165601, 0.30031358969211763, 0.31...","[1, 4, 14, 14, 14, 14, 4, 25, 1, 1, 4, 4, 1, 1...","[-1, 18, 5, 0, 23, 0, 20, 0, 0, 0, 0, 0, 22, 0...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
...,...,...,...,...,...
430304,1003041,"[0.4654254330729043, 0.305395240024642, 0.4881...","[2, 12, 2, 2, 11, 4, 2, 11, 9, 36, 11, 1, 12, ...","[-1, 334, 168, 30, 815, 166, 72, 457, 0, 234, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
430305,1003044,"[0.2324874148808143, 0.23919555856223199, 0.26...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 5, ...","[-1, 22, 4, 67, 5, 1, 19, 3, 46, 3, 14, 80, 3,...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
430306,1003047,"[0.4295051742972632, 0.3875553425009394, 0.423...","[4, 1, 10, 10, 4, 1, 10, 10, 105, 5, 5, 36, 1,...","[-1, 91, 20, 0, 7, 22, 145, 0, 2, 2, 7, 175, 6...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
430307,1003048,"[0.552364469166195, 0.2297496088743847, 0.2811...","[53, 18, 18, 35, 18, 9, 18, 2, 2, 12, 18, 18, ...","[-1, 7, 4, 42, 52, 40, 37, 12, 0, 5, 148, 50, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


In [None]:
df['event_time'] = df['len'].apply(lambda x: list(range(1, x + 1)))

In [None]:
df

In [8]:
from ptls.frames import coles
from ptls.data_load import datasets
from ptls.frames import PtlsDataModule
from ptls.frames.coles import losses, sampling_strategies
from ptls.frames.coles import split_strategy

data_module = PtlsDataModule(
    train_data=ptls.frames.coles.ColesDataset(
        splitter=split_strategy.SampleSlices(split_count=5, cnt_min=20, cnt_max=60),
        data=ptls.data_load.datasets.AugmentationDataset(
            data=MemoryMapDataset(
                data=ParquetDataset(
                    i_filters=[SeqLenFilter(min_seq_len=30), FeatureFilter()],
                    data_files=parquet_file_scan(file_path='train_agg_without_last.parquet',
                                                 valid_rate=0,
                                                 return_part='train')
                )
            ),
            f_augmentations=[ptls.data_load.augmentations.DropoutTrx(trx_dropout=0.01)]
        )
    ),
train_batch_size=256,
train_num_workers=8,
)

# <font size="7">Model</font>

In [54]:
model = CoLESModule(
      validation_metric=ptls.frames.coles.metric.BatchRecallTopK(K=4,
                                                                 metric="cosine"),
      seq_encoder=RnnSeqEncoder(
            trx_encoder=TrxEncoder(
            use_batch_norm_with_lens=True,
            norm_embeddings=False,
            embeddings_noise=0.003,
            
            embeddings={
                "mcc": {"in": 110, "out": 32},
            },

            numeric_values={
                'amnt': 'identity',
                'hour_diff': 'log',
            },
            ),
            type="gru",
            hidden_size=1024,
            bidir=False,
            trainable_starter="static",
      ),
     
      head=ptls.nn.L2NormEncoder(),

      loss=ContrastiveLoss(
            margin=0.5,
            sampling_strategy=sampling_strategies.HardNegativePairSelector(neg_count=5),
      ),
    
      optimizer_partial=partial(
            torch.optim.Adam, 
            lr=0.001,
            weight_decay=0.0
      ),
    
      lr_scheduler_partial=partial(
            torch.optim.lr_scheduler.StepLR,
            step_size=1,
            gamma=0.8,
      ),
)

# <font size="7">Fitting</font>

In [None]:
logger = TensorBoardLogger('src/ptls-experiments/scenario_alpha_battle/lightning_logs',
                           name='coles-baseline-alpha-battle')

trainer = pl.Trainer(
    logger=logger,
    num_sanity_val_steps=0,
    gpus=1,
    auto_select_gpus=False,
    max_epochs=30,
    enable_checkpointing=False,
    deterministic=True
)

trainer.fit(model, data_module)
print(trainer.logged_metrics)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 3.3 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | L2NormEncoder   | 0     
-------------------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.043    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [57]:
PATH = 'models_alpha_battle/coles_pretrain_without_last.pth'
torch.save(model.state_dict(), PATH)

# <font size="5">Load trained model</font>

In [58]:
PATH = 'models_alpha_battle/coles_pretrain_without_last.pth'
model.load_state_dict(torch.load(PATH))

<All keys matched successfully>

# <font size="7">Inference</font>

In [59]:
train_df = pd.read_parquet('data_2/train.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])

valid_df = pd.read_parquet('data_2/valid.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])
last_transactions_train = train_df.groupby('app_id').tail(1)

first_transactions_valid = valid_df.groupby('app_id').head(1)
first_transactions_valid = first_transactions_valid.dropna()

In [60]:
df = pd.read_parquet('train_agg_inference.parquet')
df

Unnamed: 0_level_0,mcc,amnt,hour_diff
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[2, 2, 2, 10, 2, 2, 2, 2, 10, 10, 2, 10, 2, 2,...","[0.4654254330729043, 0.0, 0.5211518246354306, ...","[-1, 0, 0, 52, 280, 0, 75, 169, 18, 0, 146, 71..."
1,"[65, 1, 2, 2, 1, 3, 58, 2, 1, 2, 1, 1, 2, 25, ...","[0.3240743706369933, 0.42105610477663846, 0.48...","[-1, 24, 19, 74, 517, 34, 1, 92, 0, 22, 6, 19,..."
2,"[3, 1, 7, 1, 28, 28, 1, 2, 28, 11, 7, 7, 1, 1,...","[0.35252243820939444, 0.37467714070712754, 0.3...","[-1, 10, 14, 21, 2, 0, 25, 875, 57, 64, 0, 0, ..."
4,"[2, 2, 1, 1, 9, 9, 2, 9, 9, 2, 9, 9, 9, 2, 9, ...","[0.0, 0.3488379638169017, 0.3796773516219741, ...","[-1, 0, 0, 5, 1, 0, 260, 27, 0, 163, 315, 0, 1..."
6,"[1, 4, 14, 14, 14, 14, 4, 25, 1, 1, 4, 4, 1, 1...","[0.2709803491165601, 0.30031358969211763, 0.31...","[-1, 18, 5, 0, 23, 0, 20, 0, 0, 0, 0, 0, 22, 0..."
...,...,...,...
1003041,"[2, 12, 2, 2, 11, 4, 2, 11, 9, 36, 11, 1, 12, ...","[0.4654254330729043, 0.305395240024642, 0.4881...","[-1, 334, 168, 30, 815, 166, 72, 457, 0, 234, ..."
1003044,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 5, ...","[0.2324874148808143, 0.23919555856223199, 0.26...","[-1, 22, 4, 67, 5, 1, 19, 3, 46, 3, 14, 80, 3,..."
1003047,"[4, 1, 10, 10, 4, 1, 10, 10, 105, 5, 5, 36, 1,...","[0.4295051742972632, 0.3875553425009394, 0.423...","[-1, 91, 20, 0, 7, 22, 145, 0, 2, 2, 7, 175, 6..."
1003048,"[53, 18, 18, 35, 18, 9, 18, 2, 2, 12, 18, 18, ...","[0.552364469166195, 0.2297496088743847, 0.2811...","[-1, 7, 4, 42, 52, 40, 37, 12, 0, 5, 148, 50, ..."


In [59]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
iterable_inference_dataset = ParquetDataset(
    data_files=ParquetFiles(['train_agg_inference.parquet'],                                                                   
                            ).data_files
)

next(iter(iterable_inference_dataset))

inference_dl = torch.utils.data.DataLoader(
    dataset=iterable_inference_dataset,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

next(iter(inference_dl)).payload

mod = InferenceModule(model, pandas_output=True, model_out_name='emb')

pred = pl.Trainer(gpus=1).predict(mod, inference_dl)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [60]:
train_embeddings = pd.concat(pred, axis=0)
train_embeddings

Unnamed: 0,app_id,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,0,-0.316629,0.025152,0.051943,-0.022990,0.303620,0.021611,-0.065949,-0.141283,0.827946,...,-0.014034,0.278382,0.131639,0.849937,-0.839612,0.048949,0.001131,0.490939,0.740864,-0.287271
1,1,-0.398580,-0.001639,0.025460,-0.070137,0.162887,0.001425,0.538548,-0.087556,0.904141,...,0.008754,0.253184,0.607370,0.883655,-0.813249,0.074149,-0.006525,0.068844,0.918982,-0.449623
2,2,-0.330140,0.017646,-0.013427,-0.019901,0.339298,0.069889,-0.101123,0.099768,0.980735,...,0.004604,0.310660,0.736503,0.941086,-0.828155,0.055566,-0.043767,-0.523815,0.686656,-0.560095
3,4,-0.288756,0.015313,0.056001,-0.011769,0.197203,0.031872,-0.807192,-0.313843,0.748681,...,0.048099,0.409773,0.615628,0.789758,-0.873179,-0.029296,0.027632,-0.038188,0.899733,-0.579338
4,6,-0.349117,-0.001130,0.187017,-0.022511,0.167354,0.024811,-0.403078,-0.326330,0.862271,...,0.024182,0.583474,0.364204,0.912099,-0.946702,0.041188,-0.023246,-0.115333,0.863460,-0.647641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1003041,-0.357882,-0.011741,0.030384,0.013011,0.338792,0.012342,-0.256378,-0.117322,0.867672,...,-0.003890,0.344830,0.633863,0.873154,-0.747370,0.054529,0.020541,-0.525656,0.784151,-0.736420
97,1003044,-0.349139,0.017130,0.148808,0.003040,-0.228662,0.003123,0.026315,0.160060,0.920867,...,-0.002798,0.444159,0.760130,0.926194,-0.725451,-0.022200,-0.035788,-0.164855,0.977543,-0.697238
98,1003047,-0.303130,-0.014288,0.078365,-0.002301,-0.844416,0.022167,-0.316547,-0.419302,0.970306,...,0.001971,0.224621,0.788323,0.917276,-0.978816,-0.017444,-0.007421,-0.331279,0.604149,-0.726768
99,1003048,-0.312240,-0.016289,0.061667,-0.014670,-0.011140,0.042189,-0.466087,-0.243516,0.902887,...,-0.005784,0.249101,0.630472,0.878682,-0.838068,-0.012457,0.015132,-0.199373,0.857246,-0.641172


In [61]:
last_transactions_train

Unnamed: 0,app_id,amnt,mcc,hour_diff,amnt_bins,amnt_mcc_bins
130,0,0.348838,2,42,3,8
436,1,0.297589,20,4,0,77
615,2,0.402386,2,31,3,8
682,4,0.621777,9,26,3,36
892,6,0.333474,3,3,2,11
...,...,...,...,...,...,...
95045500,1003041,0.299564,4,0,0,13
95045710,1003044,0.349617,4,0,3,16
95045920,1003047,0.229750,18,5,0,69
95046089,1003048,0.662458,2,41,3,8


In [62]:
(train_embeddings['app_id'].values == last_transactions_train['app_id'].values).all()

True

In [63]:
train_embeddings = pd.concat(pred, axis=0)
train_embeddings = train_embeddings.drop(columns='app_id')
train_embeddings

Unnamed: 0,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,emb_0009,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,-0.316629,0.025152,0.051943,-0.022990,0.303620,0.021611,-0.065949,-0.141283,0.827946,0.875323,...,-0.014034,0.278382,0.131639,0.849937,-0.839612,0.048949,0.001131,0.490939,0.740864,-0.287271
1,-0.398580,-0.001639,0.025460,-0.070137,0.162887,0.001425,0.538548,-0.087556,0.904141,0.789844,...,0.008754,0.253184,0.607370,0.883655,-0.813249,0.074149,-0.006525,0.068844,0.918982,-0.449623
2,-0.330140,0.017646,-0.013427,-0.019901,0.339298,0.069889,-0.101123,0.099768,0.980735,0.574954,...,0.004604,0.310660,0.736503,0.941086,-0.828155,0.055566,-0.043767,-0.523815,0.686656,-0.560095
3,-0.288756,0.015313,0.056001,-0.011769,0.197203,0.031872,-0.807192,-0.313843,0.748681,0.670622,...,0.048099,0.409773,0.615628,0.789758,-0.873179,-0.029296,0.027632,-0.038188,0.899733,-0.579338
4,-0.349117,-0.001130,0.187017,-0.022511,0.167354,0.024811,-0.403078,-0.326330,0.862271,0.775335,...,0.024182,0.583474,0.364204,0.912099,-0.946702,0.041188,-0.023246,-0.115333,0.863460,-0.647641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,-0.357882,-0.011741,0.030384,0.013011,0.338792,0.012342,-0.256378,-0.117322,0.867672,0.601549,...,-0.003890,0.344830,0.633863,0.873154,-0.747370,0.054529,0.020541,-0.525656,0.784151,-0.736420
97,-0.349139,0.017130,0.148808,0.003040,-0.228662,0.003123,0.026315,0.160060,0.920867,0.697938,...,-0.002798,0.444159,0.760130,0.926194,-0.725451,-0.022200,-0.035788,-0.164855,0.977543,-0.697238
98,-0.303130,-0.014288,0.078365,-0.002301,-0.844416,0.022167,-0.316547,-0.419302,0.970306,0.730352,...,0.001971,0.224621,0.788323,0.917276,-0.978816,-0.017444,-0.007421,-0.331279,0.604149,-0.726768
99,-0.312240,-0.016289,0.061667,-0.014670,-0.011140,0.042189,-0.466087,-0.243516,0.902887,0.757606,...,-0.005784,0.249101,0.630472,0.878682,-0.838068,-0.012457,0.015132,-0.199373,0.857246,-0.641172


In [64]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
iterable_inference_dataset = ParquetDataset(
    data_files=ParquetFiles(['train_agg_with_last.parquet'],                                                                   
                            ).data_files
)

next(iter(iterable_inference_dataset))

inference_dl = torch.utils.data.DataLoader(
    dataset=iterable_inference_dataset,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=128,
    num_workers=0,
)

next(iter(inference_dl)).payload

mod = InferenceModule(model, pandas_output=True, model_out_name='emb')

pred = pl.Trainer(gpus=1).predict(mod, inference_dl)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [65]:
train_embeddings_with_last = pd.concat(pred, axis=0)
train_embeddings_with_last

Unnamed: 0,app_id,__index_level_0__,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,0,0,-0.316980,0.025995,0.051694,-0.022918,0.304001,0.021705,-0.066005,-0.135683,...,-0.015702,0.278569,0.125089,0.850319,-0.837820,0.048258,0.001190,0.501307,0.741916,-0.291395
1,1,1,-0.398810,-0.002413,0.024534,-0.069622,0.165798,0.000968,0.516665,-0.081431,...,0.008554,0.252639,0.615146,0.885491,-0.819181,0.074009,-0.006507,0.073547,0.921240,-0.457101
2,2,2,-0.330391,0.018255,-0.013493,-0.019971,0.338649,0.069292,-0.101207,0.103945,...,0.004710,0.310705,0.733820,0.939074,-0.811655,0.055371,-0.042550,-0.474677,0.687498,-0.563601
3,4,4,-0.289337,0.015754,0.057224,-0.011762,0.194523,0.031676,-0.807320,-0.307805,...,0.049118,0.409714,0.637316,0.789406,-0.882010,-0.034032,0.027687,-0.029135,0.901520,-0.585236
4,6,6,-0.349459,-0.001568,0.185978,-0.023623,0.169640,0.024005,-0.403352,-0.319444,...,0.023684,0.583504,0.385276,0.912945,-0.940413,0.040791,-0.024398,-0.123591,0.864648,-0.648501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1003041,1003041,-0.358053,-0.012055,0.031007,0.012737,0.319451,0.012161,-0.256160,-0.128977,...,-0.003457,0.344874,0.635831,0.873587,-0.751018,0.053117,0.019422,-0.538231,0.785731,-0.736615
11,1003044,1003044,-0.349291,0.016366,0.148585,0.003093,-0.265090,0.003083,0.025936,0.144861,...,-0.002451,0.444156,0.758758,0.925738,-0.728925,-0.023700,-0.035471,-0.177315,0.977779,-0.697803
12,1003047,1003047,-0.303361,-0.014633,0.078522,-0.001761,-0.843175,0.023693,-0.315199,-0.414781,...,0.000027,0.225928,0.787283,0.924124,-0.980206,-0.019374,-0.006404,-0.305931,0.606995,-0.727586
13,1003048,1003048,-0.312421,-0.016163,0.061432,-0.014014,-0.011633,0.042073,-0.466151,-0.239149,...,-0.005659,0.249086,0.622677,0.878224,-0.848937,-0.010583,0.015017,-0.173265,0.857943,-0.651206


# <font size="5">hour diff predicting</font>

In [66]:
regressor_time = CatBoostRegressor(
    learning_rate=1e-1,
    iterations=2000,
    depth=7,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100
)

regressor_time.fit(train_embeddings,
              last_transactions_train['hour_diff'])

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 32.3122105	total: 24.5ms	remaining: 49s
100:	learn: 32.1752090	total: 2.35s	remaining: 44.2s
200:	learn: 32.0526993	total: 4.65s	remaining: 41.6s
300:	learn: 31.9487345	total: 6.92s	remaining: 39.1s
400:	learn: 31.8618133	total: 9.19s	remaining: 36.6s
500:	learn: 31.7890098	total: 11.4s	remaining: 34.2s
600:	learn: 31.7284928	total: 13.7s	remaining: 31.9s
700:	learn: 31.6783474	total: 16s	remaining: 29.7s
800:	learn: 31.6366727	total: 18.3s	remaining: 27.5s
900:	learn: 31.6019419	total: 20.6s	remaining: 25.1s
1000:	learn: 31.5726350	total: 22.9s	remaining: 22.8s
1100:	learn: 31.5479946	total: 25.1s	remaining: 20.5s
1200:	learn: 31.5265030	total: 27.3s	remaining: 18.2s
1300:	learn: 31.5079977	total: 29.5s	remaining: 15.8s
1400:	learn: 31.4917792	total: 31.6s	remaining: 13.5s
1500:	learn: 31.4771618	total: 33.7s	remaining: 11.2s
1600:	learn: 31.4639387	total: 35.8s	remaining: 8.93s
1700:	learn: 31.4520333	total: 38s	remaining: 6.67s
1800:	learn: 31.4410389	total: 40.1s	remainin

<catboost.core.CatBoostRegressor at 0x7fd2ebffd100>

# <font size="5">mcc predicting</font>

In [67]:
classifier_mcc = CatBoostClassifier(
    learning_rate=1e-1,
    iterations=500,
    depth=3,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)

classifier_mcc.fit(train_embeddings,
              last_transactions_train['mcc'])

0:	learn: 0.3047577	total: 358ms	remaining: 2m 58s
100:	learn: 0.3388890	total: 31.4s	remaining: 2m 4s
200:	learn: 0.3461559	total: 1m 2s	remaining: 1m 33s
300:	learn: 0.3497626	total: 1m 34s	remaining: 1m 2s
400:	learn: 0.3520029	total: 2m 5s	remaining: 31.1s
499:	learn: 0.3538248	total: 2m 36s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fd2ebffdc70>

# <font size="5">amount predicting</font>

In [68]:
classifier_amnt = CatBoostClassifier(
    learning_rate=1e-1,
    iterations=500,
    depth=5,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)

classifier_amnt.fit(train_embeddings,
              last_transactions_train['amnt_bins'])

0:	learn: 0.5613222	total: 31.7ms	remaining: 15.8s
100:	learn: 0.5776523	total: 2.16s	remaining: 8.52s
200:	learn: 0.5801366	total: 4.18s	remaining: 6.22s
300:	learn: 0.5820166	total: 6.19s	remaining: 4.09s
400:	learn: 0.5833901	total: 8.2s	remaining: 2.02s
499:	learn: 0.5847031	total: 10.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fd3dd5770a0>

# <font size="5">type predicting</font>

In [69]:
regressor_type = CatBoostClassifier(
    learning_rate=1e-1,
    iterations=500,
    depth=5,
    verbose=100,
    task_type='GPU',
    loss_function='MultiClass',
    eval_metric='Accuracy',
    early_stopping_rounds=100
)

regressor_type.fit(train_embeddings,
              last_transactions_train['amnt_mcc_bins'])

0:	learn: 0.1987200	total: 2.96s	remaining: 24m 39s
100:	learn: 0.1708423	total: 5m 3s	remaining: 19m 58s
200:	learn: 0.2025544	total: 9m 50s	remaining: 14m 38s
300:	learn: 0.2062378	total: 14m 36s	remaining: 9m 39s
400:	learn: 0.2098980	total: 19m 27s	remaining: 4m 48s
499:	learn: 0.2114736	total: 24m 9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fd3dd5773d0>

# <font size="6">Metrics</font>

In [70]:
train_embeddings_with_last

Unnamed: 0,app_id,__index_level_0__,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,0,0,-0.316980,0.025995,0.051694,-0.022918,0.304001,0.021705,-0.066005,-0.135683,...,-0.015702,0.278569,0.125089,0.850319,-0.837820,0.048258,0.001190,0.501307,0.741916,-0.291395
1,1,1,-0.398810,-0.002413,0.024534,-0.069622,0.165798,0.000968,0.516665,-0.081431,...,0.008554,0.252639,0.615146,0.885491,-0.819181,0.074009,-0.006507,0.073547,0.921240,-0.457101
2,2,2,-0.330391,0.018255,-0.013493,-0.019971,0.338649,0.069292,-0.101207,0.103945,...,0.004710,0.310705,0.733820,0.939074,-0.811655,0.055371,-0.042550,-0.474677,0.687498,-0.563601
3,4,4,-0.289337,0.015754,0.057224,-0.011762,0.194523,0.031676,-0.807320,-0.307805,...,0.049118,0.409714,0.637316,0.789406,-0.882010,-0.034032,0.027687,-0.029135,0.901520,-0.585236
4,6,6,-0.349459,-0.001568,0.185978,-0.023623,0.169640,0.024005,-0.403352,-0.319444,...,0.023684,0.583504,0.385276,0.912945,-0.940413,0.040791,-0.024398,-0.123591,0.864648,-0.648501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,1003041,1003041,-0.358053,-0.012055,0.031007,0.012737,0.319451,0.012161,-0.256160,-0.128977,...,-0.003457,0.344874,0.635831,0.873587,-0.751018,0.053117,0.019422,-0.538231,0.785731,-0.736615
11,1003044,1003044,-0.349291,0.016366,0.148585,0.003093,-0.265090,0.003083,0.025936,0.144861,...,-0.002451,0.444156,0.758758,0.925738,-0.728925,-0.023700,-0.035471,-0.177315,0.977779,-0.697803
12,1003047,1003047,-0.303361,-0.014633,0.078522,-0.001761,-0.843175,0.023693,-0.315199,-0.414781,...,0.000027,0.225928,0.787283,0.924124,-0.980206,-0.019374,-0.006404,-0.305931,0.606995,-0.727586
13,1003048,1003048,-0.312421,-0.016163,0.061432,-0.014014,-0.011633,0.042073,-0.466151,-0.239149,...,-0.005659,0.249086,0.622677,0.878224,-0.848937,-0.010583,0.015017,-0.173265,0.857943,-0.651206


In [71]:
preds = regressor_time.predict(train_embeddings_with_last.drop(columns=['app_id']))

print('Time MAE:', mean_absolute_error(first_transactions_valid['hour_diff'], preds))

preds = classifier_mcc.predict(train_embeddings_with_last.drop(columns='app_id'))

print('Mcc Accuracy', accuracy_score(first_transactions_valid['mcc'], preds))

preds = classifier_amnt.predict(train_embeddings_with_last.drop(columns='app_id'))

print('Amnt Accuracy', accuracy_score(first_transactions_valid['amnt_bins'], preds))

preds = regressor_type.predict(train_embeddings_with_last.drop(columns='app_id'))

print('Type Accuracy', accuracy_score(first_transactions_valid['amnt_mcc_bins'], preds))

Time MAE: 30.783769659471176
Mcc Accuracy 0.3488623749450212
Amnt Accuracy 0.5782095408309306
Type Accuracy 0.20677850927716535
