# <font size="6">Libraries</font>

In [2]:
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import os, sys
from tqdm import tqdm

# Expose the parent directory to the import system, appending it only when
# it is not already listed (presumably for project-local packages -- confirm).
parent_dir = os.path.abspath('..')
parent_dir_missing = parent_dir not in sys.path
if parent_dir_missing:
    sys.path.append(parent_dir)

# Report the installed PyTorch build.
print('torch version:', torch.__version__)

# import ptls
# from ptls.data_load import IterableChain
# from ptls.data_load.iterable_processing import SeqLenFilter
# from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
# from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
# from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles

# import pytorch_lightning as pl

from glob import glob

import sklearn
from sklearn.model_selection import train_test_split

import os, sys

# from COTIC.src import utils

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# parent_dir = os.path.abspath('..')
# if parent_dir not in sys.path:
#     sys.path.append(parent_dir)

# log = utils.get_logger(__name__)

%load_ext autoreload
%autoreload 2

torch version: 1.9.1+cu111


# <font size="5">Dataset</font>

In [3]:
# Read the training and validation transaction tables from parquet,
# in that order.
train_df, valid_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/train.parquet', 'data/valid.parquet')
)

In [4]:
# Keep the final row of every app_id group, preserving train_df's row order
# (presumably each user's most recent transaction -- confirm that train_df
# is sorted chronologically within a user).
transactions_by_user = train_df.groupby('app_id')
last_transactions = transactions_by_user.tail(1)

In [74]:
# Preview the training table (pandas elides the middle rows of a large frame).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours -- re-run the notebook top to bottom to refresh the saved output.
train_df

Unnamed: 0,app_id,amnt,mcc,hour_diff,transaction_number,time,amnt_bins,amnt_mcc_bins,transaction_max,reversed_transaction,amnt_mcc_bins_shift
0,0,0.465425,2,-1,1,0,3,8,181,180,5.0
1,0,0.000000,2,0,2,0,0,5,181,179,8.0
2,0,0.521152,2,0,3,0,3,8,181,178,40.0
3,0,0.356078,10,52,4,52,3,40,181,177,5.0
4,0,0.000000,2,280,5,332,0,5,181,176,8.0
...,...,...,...,...,...,...,...,...,...,...,...
95046144,1003049,0.227264,108,0,55,3802,0,429,109,54,228.0
95046145,1003049,0.387565,57,0,56,3802,3,228,109,53,1.0
95046146,1003049,0.273395,1,1,57,3803,0,1,109,52,5.0
95046147,1003049,0.258972,2,19,58,3822,0,5,109,51,8.0


# <font size="6">Markov Chain Model</font>

In [83]:
import catboost
from catboost import CatBoostRegressor

# Columns fed to the regressor; the target is the `hour_diff` column.
time_feature_columns = ['amnt_mcc_bins', 'amnt', 'mcc', 'transaction_number']

# Shallow CatBoost model, optimised and reported on MAE, trained on GPU.
# NOTE(review): fit() below receives no eval_set -- confirm that
# early_stopping_rounds has any effect in this configuration.
catboost_params = dict(
    iterations=500,
    depth=3,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100,
)
regressor = CatBoostRegressor(**catboost_params)

regressor.fit(train_df[time_feature_columns], train_df['hour_diff'])

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 25.8221319	total: 253ms	remaining: 2m 6s
100:	learn: 25.8094324	total: 25.1s	remaining: 1m 39s
200:	learn: 25.7963800	total: 49.9s	remaining: 1m 14s
300:	learn: 25.7837451	total: 1m 14s	remaining: 49.4s
400:	learn: 25.7712072	total: 1m 39s	remaining: 24.6s
499:	learn: 25.7587555	total: 2m 4s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f70f29c7f90>

In [84]:
class MarkovChainModel():
    """Deterministic first-order Markov chain over `amnt_mcc_bins` states.

    For every state the model remembers the single most frequent state that
    followed it within the same user's sequence; `forward` simply looks that
    successor up.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain `user_id_column`, `amnt_mcc_bins` and `hour_diff`.
        Rows of one user are assumed to be in sequence order
        (NOTE(review): confirm the frame is sorted chronologically per user).
        The frame is stored on the instance but never modified.
    regressor : object
        Stored as `self.regressor`; not used by any method of this class.
    user_id_column : str, default 'app_id'
        Column that identifies the user each row belongs to.

    Attributes
    ----------
    transition_matrix : dict
        Maps a state to its most frequent successor. States that were never
        followed by another row of the same user are absent from the dict.
    default_type :
        Most frequent successor over all observed transitions, or None when
        the dataset contains no transition at all. Returned by `forward` for
        states missing from `transition_matrix`.
    predicted_time_avg : dict
        Mean `hour_diff` per user.
    """

    def __init__(self, dataset, regressor, user_id_column='app_id'):
        self.dataset = dataset
        self.user_id_column = user_id_column
        self.regressor = regressor

        # Overwritten by build_transition_matrix(); defined here so that the
        # attribute always exists.
        self.default_type = None
        self.transition_matrix = self.build_transition_matrix()
        self.predicted_time_avg = self.dataset.groupby(user_id_column)['hour_diff'].mean().to_dict()

    def forward(self, _type, user_id):
        """Return the predicted next state as a one-element list.

        Parameters
        ----------
        _type :
            Current state (an `amnt_mcc_bins` value).
        user_id :
            Accepted for interface compatibility; the prediction does not
            depend on it.

        Returns
        -------
        list
            `[next_state]`. When `_type` has no recorded successor (unseen
            state, a state only ever observed last in a sequence, or NaN),
            `self.default_type` is returned instead of raising `KeyError`,
            so a multi-step roll-out can always continue.
        """
        return [self.transition_matrix.get(_type, self.default_type)]

    def build_transition_matrix(self):
        """Build the state -> most frequent successor lookup.

        Also sets `self.default_type`. The caller's DataFrame is left
        untouched: the shifted successor column lives only in a local
        Series.

        Returns
        -------
        dict
            Most frequent successor per state; ties resolve to the smallest
            successor value (first entry of the sorted `mode()` result).
            Successors may come back as floats even for integer states,
            because shifting inserts missing values at the end of each
            user's sequence.
        """
        states = self.dataset['amnt_mcc_bins']
        next_states = self.dataset.groupby(self.user_id_column)['amnt_mcc_bins'].shift(-1)

        # The last row of every user has no successor; drop those pairs so
        # every remaining group has a well-defined mode.
        has_successor = next_states.notna()
        observed_next = next_states[has_successor]

        if observed_next.empty:
            self.default_type = None
            return {}

        self.default_type = observed_next.mode().iloc[0]

        # Group by the raw state values (positional, so duplicate index
        # labels in the dataset cannot cause misalignment).
        transition_matrix = (
            observed_next
            .groupby(states[has_successor].to_numpy())
            .agg(lambda successors: successors.mode().iloc[0])
            .to_dict()
        )

        return transition_matrix
    
# Fit the chain on the training transactions; the regressor is only stored
# on the model instance.
markov_model = MarkovChainModel(dataset=train_df, regressor=regressor)

In [100]:
# Score the validation rows with the fitted regressor. Selecting four columns
# already yields an (n_rows, 4) array, so no further reshaping is required.
valid_time_features = valid_df[['amnt_mcc_bins', 'amnt', 'mcc', 'transaction_number']].to_numpy()
predicted_times = regressor.predict(valid_time_features)
predicted_times

array([7.09925657, 7.09925657, 7.09925657, ..., 7.09925657, 7.09925657,
       7.09925657])

In [101]:
%%time

# Roll each user's Markov chain forward by one step per validation row.
#
# The list is produced in valid_df's own row order, so assigning it to
# valid_df by position is correct even if the validation rows are not
# grouped by user or the users are ordered differently than in train_df.
# Within a user, rows are assumed to be in sequence order
# (NOTE(review): confirm valid_df is sorted chronologically per user).

# Starting state of every user: the type of their last training transaction.
last_type_by_user = dict(
    zip(last_transactions['app_id'], last_transactions['amnt_mcc_bins'])
)

# Most recently predicted state per user; absent until the user's first
# validation row has been processed.
current_type_by_user = {}
predicted_types = []

for user_id in valid_df['app_id']:
    if user_id in current_type_by_user:
        previous_type = current_type_by_user[user_id]
    elif user_id in last_type_by_user:
        previous_type = last_type_by_user[user_id]
    else:
        # Without history there is no state to start the chain from; fail
        # loudly here rather than with a length mismatch further down.
        raise KeyError(
            f"app_id {user_id!r} appears in valid_df but has no transactions in train_df"
        )

    next_type = markov_model.forward(previous_type, user_id)[0]
    predicted_types.append(next_type)
    current_type_by_user[user_id] = next_type

CPU times: user 35.1 s, sys: 257 ms, total: 35.3 s
Wall time: 35.3 s


In [102]:
# Width of the amount-bin axis inside the combined type code. Presumably
# amnt_mcc_bins = (mcc - 1) * 4 + amnt_bins + 1, which is consistent with the
# rows shown in the train_df preview -- confirm against the preprocessing.
N_AMNT_BINS = 4

# Attach the raw predictions (positional assignment, row for row).
valid_df['predicted_type'] = predicted_types
valid_df['times_pred'] = predicted_times

# Split the combined type code back into its MCC and amount-bin parts.
zero_based_type = valid_df['predicted_type'] - 1
valid_df['pred_mcc'] = zero_based_type // N_AMNT_BINS + 1
valid_df['pred_amnt'] = zero_based_type % N_AMNT_BINS

# <font size="6">Metrics</font>

In [104]:
from sklearn.metrics import mean_absolute_error, accuracy_score

# Regression quality of the predicted time gaps.
time_mae = mean_absolute_error(valid_df['hour_diff'], valid_df['times_pred'])
print('Time MAE:', time_mae)

# Classification quality: (label, ground-truth column, predicted column).
accuracy_reports = [
    ('Type accuracy:', 'amnt_mcc_bins', 'predicted_type'),
    ('Amnt accuracy:', 'amnt_bins', 'pred_amnt'),
    ('MCC accuracy:', 'mcc', 'pred_mcc'),
]
for label, true_column, predicted_column in accuracy_reports:
    print(label, accuracy_score(valid_df[true_column], valid_df[predicted_column]))

Time MAE: 22.905798208210882
Type accuracy: 0.14486412069822252
Amnt accuracy: 0.4834462810589852
MCC accuracy: 0.25024221606755204
