# <font size="6">Libraries</font>

In [14]:
import pandas as pd
import numpy as np

# <font size="6">dataset</font>

In [2]:
train_df = pd.read_parquet('data/train.parquet')
valid_df = pd.read_parquet('data/valid.parquet')

train_df = train_df[['app_id', 'amnt_mcc_bins', 'hour_diff']]

In [3]:
last_transactions = train_df.groupby('app_id').tail(1)

In [4]:
last_transactions

Unnamed: 0,app_id,amnt_mcc_bins,hour_diff
130,0,8,42
436,1,77,4
615,2,8,31
682,4,36,26
892,6,11,3
...,...,...,...
95045500,1003041,13,0
95045710,1003044,16,0
95045920,1003047,69,5
95046089,1003048,8,41


# <font size="6">Model</font>

In [25]:
last_transactions = last_transactions.fillna(0)
last_transactions['num_transactions'] = last_transactions['num_transactions'].astype(int)
np.unique(last_transactions['num_transactions'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [26]:
last_transactions

Unnamed: 0,app_id,amnt_mcc_bins,hour_diff,num_transactions
130,0,8,42,19
436,1,77,4,30
615,2,8,31,15
682,4,36,26,11
892,6,11,3,25
...,...,...,...,...
95045500,1003041,13,0,36
95045710,1003044,16,0,35
95045920,1003047,69,5,17
95046089,1003048,8,41,30


In [28]:
%%time

predicted_transactions = pd.DataFrame()
valid_counts = valid_df['app_id'].value_counts().to_dict()

last_transactions['num_transactions'] = last_transactions['app_id'].map(valid_counts)
last_transactions = last_transactions.fillna(0)
last_transactions['num_transactions'] = last_transactions['num_transactions'].astype(int)
   
repeated_df = last_transactions.loc[np.repeat(last_transactions.index.values,
                                              last_transactions['num_transactions'])].reset_index(drop=True)

repeated_df = repeated_df.drop(columns=['num_transactions'])

CPU times: user 1.89 s, sys: 219 ms, total: 2.11 s
Wall time: 2.11 s


In [30]:
valid_df['predicted_type'] = repeated_df['amnt_mcc_bins']

valid_df['pred_mcc'] = (valid_df['predicted_type'] - 1) // 4 + 1
valid_df['pred_amnt'] = (valid_df['predicted_type'] - 1) % 4

# <font size="6">Metrics</font>

In [33]:
from sklearn.metrics import mean_absolute_error, accuracy_score

print('Mae score:', accuracy_score(valid_df['hour_diff'], repeated_df['hour_diff']))
print('Type accuracy:', accuracy_score(valid_df['amnt_mcc_bins'], valid_df['predicted_type']))
print('Amnt accuracy:', accuracy_score(valid_df['amnt_bins'], valid_df['pred_amnt']))
print('MCC accuracy:', accuracy_score(valid_df['mcc'], valid_df['pred_mcc']))

Mae score: 0.0796393017130969
Type accuracy: 0.113121140582608
Amnt accuracy: 0.42970842774208484
MCC accuracy: 0.2064992685007695
