# <font size="6">Libraries</font>

In [11]:
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import os, sys
from tqdm import tqdm

# Put the parent of the working directory on sys.path; this runs before the
# `ptls` / `COTIC` imports below (presumably so those packages resolve from
# the repository checkout - TODO confirm).
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Record the torch build in the cell output for reproducibility.
print('torch version:', torch.__version__)

import ptls
from ptls.data_load import IterableChain
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.datasets.parquet_dataset import ParquetDataset, ParquetFiles

import pytorch_lightning as pl

from glob import glob

import sklearn
from sklearn.model_selection import train_test_split

import os, sys

from COTIC.src import utils

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# `parent_dir` is already computed and appended to sys.path at the top of this
# cell, before the project imports, so it is not repeated here.

# Module-level logger obtained from the COTIC utilities.
log = utils.get_logger(__name__)

%load_ext autoreload
%autoreload 2

torch version: 1.12.1+cu102
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# <font size="5">dataset</font>

In [12]:
# Read the train and validation splits from parquet (paths are relative to the
# notebook's working directory).
train_df, valid_df = map(
    pd.read_parquet,
    ('data/train.parquet', 'data/valid.parquet'),
)

In [13]:
# One row per app_id: the final row of each user's group in train_df.
# NOTE(review): "last" means last by row position, so this is the most recent
# transaction only if train_df is stored in chronological order - confirm.
train_by_user = train_df.groupby('app_id')
last_transactions = train_by_user.tail(1)

# <font size="6">Random model</font>

# <font size="3">В качестве предсказания модель возвращает одну случайную транзакцию данного пользователя из train_df</font>

In [14]:
class RandomModel():
    """Baseline that "predicts" a user's next transaction by replaying a random past one.

    The dataset is grouped by user once at construction time; each call to
    ``forward`` then draws a single row uniformly at random from the history
    of the user the input transaction belongs to.

    Args:
        dataset: DataFrame of historical transactions; must contain
            ``user_id_column``.
        user_id_column: Name of the column that identifies the user.
        random_state: Optional seed (or ``numpy.random.RandomState``) that makes
            the sequence of draws reproducible. With the default ``None`` the
            global NumPy random state is used, exactly as before.
    """

    def __init__(self, dataset, user_id_column='app_id', random_state=None):
        self.user_id_column = user_id_column
        # Pre-split the history per user so forward() is a dict lookup
        # instead of a filter over the whole frame.
        self.user_transactions = {user_id: group for user_id, group in dataset.groupby(user_id_column)}

        # Keep ONE generator object for the lifetime of the model: handing a
        # bare integer seed to DataFrame.sample on every call would return the
        # same row each time.
        if random_state is None or isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

    def forward(self, transaction):
        """Return one randomly chosen historical transaction of the same user.

        Args:
            transaction: Mapping-like object (dict or pandas Series) from which
                only ``user_id_column`` is read.

        Returns:
            pandas.Series: a row sampled from that user's history.

        Raises:
            KeyError: if the user has no rows in the dataset given to ``__init__``.
        """
        user_id = transaction[self.user_id_column]
        try:
            history = self.user_transactions[user_id]
        except KeyError:
            raise KeyError(
                f'no transactions for {self.user_id_column}={user_id!r} in the fitting dataset'
            ) from None

        return history.sample(n=1, random_state=self._rng).iloc[0]
    
# Build the baseline on the training split, grouping transactions by app_id.
random_model = RandomModel(train_df, user_id_column='app_id')

# <font size="6">Predicting</font>

In [25]:
# How many rows each app_id has in the validation split; this is the number of
# predictions to generate for that user.
valid_rows_per_user = valid_df['app_id'].value_counts()
valid_counts = valid_rows_per_user.to_dict()

# Accumulator for the predicted rows.
predicted_transactions = list()

In [26]:
%%time

# Roll the baseline forward for every user present in training: starting from
# the user's last training transaction, generate as many predictions as that
# user has rows in valid_df.
#
# The rows are collected in a list local to this cell. Appending to a
# `predicted_transactions` list created in another cell made this cell
# non-re-runnable: after the first run that name is a DataFrame, so a second
# run would either fail or silently drop the new rows.
predicted_rows = []

for _, last_transaction in last_transactions.iterrows():
    user_id = last_transaction['app_id']
    # Users that do not occur in valid_df get no predictions.
    num_transactions = valid_counts.get(user_id, 0)

    for _step in range(num_transactions):
        # Feed each prediction back in as the input for the next step.
        next_transaction = random_model.forward(last_transaction)
        last_transaction = next_transaction

        predicted_rows.append(next_transaction)

# NOTE(review): rows come out user by user, in the order of last_transactions.
# Anything compared row-by-row against valid_df needs valid_df in the same
# user order, and every valid_df user present in train_df - confirm.
predicted_transactions = pd.DataFrame(predicted_rows).reset_index(drop=True)

transaction number: 10000 / 430309
transaction number: 20000 / 430309
transaction number: 30000 / 430309
transaction number: 40000 / 430309
transaction number: 50000 / 430309
transaction number: 60000 / 430309
transaction number: 70000 / 430309
transaction number: 80000 / 430309
transaction number: 90000 / 430309
transaction number: 100000 / 430309
transaction number: 110000 / 430309
transaction number: 120000 / 430309
transaction number: 130000 / 430309
transaction number: 140000 / 430309
transaction number: 150000 / 430309
transaction number: 160000 / 430309
transaction number: 170000 / 430309
transaction number: 180000 / 430309
transaction number: 190000 / 430309
transaction number: 200000 / 430309
transaction number: 210000 / 430309
transaction number: 220000 / 430309
transaction number: 230000 / 430309
transaction number: 240000 / 430309
transaction number: 250000 / 430309
transaction number: 260000 / 430309
transaction number: 270000 / 430309
transaction number: 280000 / 430309
t

# <font size="6">Metrics</font>

In [27]:
from sklearn.metrics import mean_absolute_error, accuracy_score

# (label, metric function, column) for each reported score.
# NOTE(review): the metrics presumably pair row i of valid_df with row i of
# predicted_transactions (by position, not by app_id) - confirm both frames
# have the same length and user ordering.
metric_specs = [
    ('Time MAE:', mean_absolute_error, 'hour_diff'),
    ('Type accuracy:', accuracy_score, 'amnt_mcc_bins'),
    ('Amnt accuracy:', accuracy_score, 'amnt_bins'),
    ('MCC accuracy:', accuracy_score, 'mcc'),
]

for label, metric_fn, column in metric_specs:
    print(label, metric_fn(valid_df[column], predicted_transactions[column]))

Time MAE: 41.91192786821157
Type accuracy: 0.09903581238889023
Amnt accuracy: 0.42221236020433156
MCC accuracy: 0.18485706725903156
