# <font size="6">Libraries</font>

In [1]:
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import os, sys
from tqdm import tqdm

parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

print('torch version:', torch.__version__)

import ptls
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles

import pytorch_lightning as pl

from glob import glob

import sklearn
from sklearn.model_selection import train_test_split

import os, sys

from COTIC.src import utils

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

log = utils.get_logger(__name__)

%load_ext autoreload
%autoreload 2

torch version: 1.12.1+cu102


# <font size="5">Dataset</font>

In [2]:
# Read the train and validation splits, in that order, from their parquet files.
train_df, valid_df = map(pd.read_parquet, ('data/train.parquet', 'data/valid.parquet'))

In [3]:
# Keep the final row of every user's training history; it is the starting state
# for the rollout below.
# NOTE(review): `tail(1)` returns the last row in *frame order*, so this assumes
# train_df is already sorted chronologically within each app_id — confirm.
last_transactions = train_df.groupby('app_id').tail(1)

# <font size="6">Markov Chain Model</font>

In [4]:
class MarkovChainModel():
    """First-order Markov-chain baseline for next-transaction prediction.

    The model is fitted in the constructor from a transactions frame that
    must contain the columns ``amnt_mcc_bins`` (transaction type),
    ``hour_diff`` (time gap) and ``user_id_column``.

    Attributes:
        dataset: the frame passed to the constructor (never modified).
        user_id_column: name of the column identifying a user.
        transition_matrix: dict mapping a transaction type to its most
            frequent successor type. Types that never have a successor in
            the data are absent from the dict.
        predicted_time_avg: dict mapping a user id to that user's mean
            ``hour_diff``.
        global_time_avg: mean ``hour_diff`` over the whole dataset, used as
            a fallback for users not present in ``predicted_time_avg``.
    """

    def __init__(self, dataset, user_id_column='app_id'):
        """Fit the transition table and the per-user mean time gap.

        Args:
            dataset: pandas DataFrame of transactions.
            user_id_column: column that identifies a user.
        """
        self.dataset = dataset
        self.user_id_column = user_id_column

        self.transition_matrix = self.build_transition_matrix()
        self.predicted_time_avg = self.dataset.groupby(user_id_column)['hour_diff'].mean().to_dict()
        # Fallback for users that were not seen during fitting.
        self.global_time_avg = self.dataset['hour_diff'].mean()

    def forward(self, _type, user_id):
        """Predict the next transaction for ``user_id`` given the current type.

        Args:
            _type: current transaction type.
            user_id: id of the user the prediction is made for.

        Returns:
            A two-element list ``[next_type, predicted_time]``.
            ``next_type`` is the most frequent successor of ``_type``; if
            ``_type`` has no known successor it is returned unchanged, so a
            multi-step rollout can continue instead of failing on a missing
            key. ``predicted_time`` is the user's mean ``hour_diff``, or the
            dataset-wide mean for an unknown user.
        """
        next_type = self.transition_matrix.get(_type, _type)
        predicted_time = self.predicted_time_avg.get(user_id, self.global_time_avg)

        return [next_type, predicted_time]

    def build_transition_matrix(self):
        """Build the ``type -> most frequent next type`` lookup table.

        The successor of a row is the next row of the same user, so rows are
        assumed to be in chronological order within each user. The work is
        done on a two-column copy so the caller's DataFrame is left untouched.

        Returns:
            dict mapping each type that has at least one observed successor
            to the modal successor (the smallest value wins a tie, because
            ``Series.mode`` returns its result sorted).
        """
        transitions = self.dataset[[self.user_id_column, 'amnt_mcc_bins']].copy()
        transitions['next_type'] = transitions.groupby(self.user_id_column)['amnt_mcc_bins'].shift(-1)

        # The last transaction of every user has no successor; dropping those
        # rows first guarantees every remaining group has a non-empty mode.
        transitions = transitions.dropna(subset=['next_type'])

        transition_matrix = (
            transitions.groupby('amnt_mcc_bins')['next_type']
            .agg(lambda successors: successors.mode().iloc[0])
            .to_dict()
        )

        return transition_matrix
    
# Fit the Markov-chain baseline on the training split.
markov_model = MarkovChainModel(dataset=train_df)

In [5]:
%%time

predicted_transactions = pd.DataFrame()
valid_counts = valid_df['app_id'].value_counts().to_dict()

# Positional row numbers of every user's rows in valid_df. Writing each
# prediction at these positions keeps predictions aligned with valid_df even
# when its users are not stored in the same order as in last_transactions.
valid_row_positions = valid_df.groupby('app_id', sort=False).indices

# The rollout needs a last known type to start from; fail early and clearly
# instead of producing lists whose length does not match valid_df.
users_without_history = set(valid_row_positions) - set(last_transactions['app_id'])
if users_without_history:
    raise ValueError(
        f"{len(users_without_history)} user(s) in valid_df have no transactions in train_df; "
        "the Markov baseline needs a last known type to start from."
    )

predicted_types = [np.nan] * len(valid_df)
predicted_times = [np.nan] * len(valid_df)

# Iterate over the two needed columns directly: `iterrows()` builds a Series
# per row (slow) and upcasts mixed dtypes, which can turn integer ids into floats.
for user_id, last_type in zip(last_transactions['app_id'], last_transactions['amnt_mcc_bins']):
    # Roll the chain forward one step per validation row of this user,
    # feeding each predicted type back in as the next starting state.
    for row_position in valid_row_positions.get(user_id, ()):
        next_type, next_time = markov_model.forward(last_type, user_id)

        predicted_types[row_position] = next_type
        predicted_times[row_position] = next_time

        last_type = next_type

CPU times: user 35.3 s, sys: 219 ms, total: 35.6 s
Wall time: 35.6 s


In [6]:
# Attach the raw predictions to the validation frame (positional assignment).
valid_df['predicted_type'] = predicted_types
valid_df['times_pred'] = predicted_times

# Split the combined type code into its two parts using a zero-based index:
# the quotient by 4 (plus one) gives the MCC, the remainder the amount bin.
# NOTE(review): presumably 4 is the number of amount bins per MCC — confirm.
zero_based_type = valid_df['predicted_type'] - 1
valid_df['pred_mcc'] = zero_based_type // 4 + 1
valid_df['pred_amnt'] = zero_based_type % 4

# <font size="6">Metrics</font>

In [7]:
from sklearn.metrics import mean_absolute_error, accuracy_score

# (label, metric function, ground-truth column, prediction column)
metric_specs = [
    ('Time MAE:', mean_absolute_error, 'hour_diff', 'times_pred'),
    ('Type accuracy:', accuracy_score, 'amnt_mcc_bins', 'predicted_type'),
    ('Amnt accuracy:', accuracy_score, 'amnt_bins', 'pred_amnt'),
    ('MCC accuracy:', accuracy_score, 'mcc', 'pred_mcc'),
]

# Print every metric in the order listed above, one per line.
for label, metric_fn, true_column, pred_column in metric_specs:
    print(label, metric_fn(valid_df[true_column], valid_df[pred_column]))

Time MAE: 31.77129087252648
Type accuracy: 0.14486412069822252
Amnt accuracy: 0.4834462810589852
MCC accuracy: 0.25024221606755204
