# <font size="6">Libraries</font>

In [1]:
import pandas as pd
import numpy as np

# <font size="5">dataset</font>

In [2]:
train_df = pd.read_parquet('data/train.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])

valid_df = pd.read_parquet('data/valid.parquet').drop(columns=['transaction_number',
                                                               'reversed_transaction', 
                                                               'transaction_max',
                                                               'time'])

In [None]:
valid_df['hour_diff'].mean()

In [3]:
train_df

Unnamed: 0,app_id,amnt,mcc,hour_diff,amnt_bins,amnt_mcc_bins
0,0,0.465425,2,-1,3,8
1,0,0.000000,2,0,0,5
2,0,0.521152,2,0,3,8
3,0,0.356078,10,52,3,40
4,0,0.000000,2,280,0,5
...,...,...,...,...,...,...
95046144,1003049,0.227264,108,0,0,429
95046145,1003049,0.387565,57,0,3,228
95046146,1003049,0.273395,1,1,0,1
95046147,1003049,0.258972,2,19,0,5


In [4]:
train_df['mcc_next'] = train_df.groupby(['app_id'])['mcc'].shift(-1)
train_df['amnt_next'] = train_df.groupby(['app_id'])['amnt'].shift(-1)
train_df['hour_diff_next'] = train_df.groupby(['app_id'])['hour_diff'].shift(-1)
train_df['amnt_bins_next'] = train_df.groupby(['app_id'])['amnt_bins'].shift(-1)
train_df['amnt_mcc_bins_next'] = train_df.groupby(['app_id'])['amnt_mcc_bins'].shift(-1)

valid_df['mcc_next'] = valid_df.groupby(['app_id'])['mcc'].shift(-1)
valid_df['amnt_next'] = valid_df.groupby(['app_id'])['amnt'].shift(-1)
valid_df['hour_diff_next'] = valid_df.groupby(['app_id'])['hour_diff'].shift(-1)
valid_df['amnt_bins_next'] = valid_df.groupby(['app_id'])['amnt_bins'].shift(-1)
valid_df['amnt_mcc_bins_next'] = valid_df.groupby(['app_id'])['amnt_mcc_bins'].shift(-1)

In [5]:
last_transactions_train = train_df.groupby('app_id').tail(1)

train_df = train_df.drop(last_transactions_train.index)

train_df = train_df.dropna()

In [6]:
first_transactions_valid = valid_df.groupby('app_id').head(1)
first_transactions_valid = first_transactions_valid.dropna()

# <font size="6">Random model</font>

In [7]:
class RandomModel():
    def __init__(self, dataset, user_id_column='app_id'):
        self.user_id_column = user_id_column
        self.user_transactions = {user_id: group for user_id, group in dataset.groupby(user_id_column)}
        
    def forward(self, transaction):
        user_id = transaction[self.user_id_column]
        
        return self.user_transactions[user_id].sample(n=1).iloc[0]
    
random_model = RandomModel(train_df, user_id_column='app_id')

# <font size="6">Predicting</font>

In [8]:
%%time
    
predicted_transactions = []
    
for _, last_transaction in last_transactions_train.iterrows():
    user_id = last_transaction['app_id']
    
    next_transaction = random_model.forward(last_transaction)
    
    predicted_transactions.append(next_transaction)
    
predicted_transactions = pd.DataFrame(predicted_transactions).reset_index(drop=True)

CPU times: user 3min 23s, sys: 3.54 s, total: 3min 26s
Wall time: 3min 26s


In [10]:
predicted_transactions

Unnamed: 0,app_id,amnt,mcc,hour_diff,amnt_bins,amnt_mcc_bins,mcc_next,amnt_next,hour_diff_next,amnt_bins_next,amnt_mcc_bins_next
0,0.0,0.393020,2.0,42.0,3.0,8.0,2.0,0.367687,0.0,3.0,8.0
1,1.0,0.449287,2.0,20.0,3.0,8.0,1.0,0.341807,4.0,2.0,3.0
2,2.0,0.258972,28.0,5.0,0.0,109.0,28.0,0.214840,108.0,0.0,109.0
3,4.0,0.468641,9.0,0.0,3.0,36.0,2.0,0.468163,1.0,3.0,8.0
4,6.0,0.647580,2.0,52.0,3.0,8.0,2.0,0.674203,380.0,3.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...
430304,1003041.0,0.420635,98.0,21.0,3.0,392.0,50.0,0.358298,20.0,3.0,200.0
430305,1003044.0,0.333914,3.0,10.0,2.0,11.0,1.0,0.320248,20.0,1.0,2.0
430306,1003047.0,0.471045,4.0,0.0,3.0,16.0,4.0,0.415455,303.0,3.0,16.0
430307,1003048.0,0.264270,13.0,12.0,0.0,49.0,3.0,0.270281,5.0,0.0,9.0


In [12]:
predicted_transactions = predicted_transactions.rename(columns={'amnt': 'amnt_pred',
                                                        'mcc': 'mcc_pred',
                                                        'hour_diff': 'hour_diff_pred',
                                                        'amnt_mcc_bins': 'amnt_mcc_bins_pred',
                                                        'amnt_bins': 'amnt_bins_pred'}).drop(columns=['hour_diff_next',
                                                                                                              'amnt_next',
                                                                                                              'amnt_mcc_bins_next',
                                                                                                              'mcc_next',
                                                                                                               'amnt_bins_next'])

In [13]:
predicted_transactions

Unnamed: 0,app_id,amnt_pred,mcc_pred,hour_diff_pred,amnt_bins_pred,amnt_mcc_bins_pred
0,0.0,0.393020,2.0,42.0,3.0,8.0
1,1.0,0.449287,2.0,20.0,3.0,8.0
2,2.0,0.258972,28.0,5.0,0.0,109.0
3,4.0,0.468641,9.0,0.0,3.0,36.0
4,6.0,0.647580,2.0,52.0,3.0,8.0
...,...,...,...,...,...,...
430304,1003041.0,0.420635,98.0,21.0,3.0,392.0
430305,1003044.0,0.333914,3.0,10.0,2.0,11.0
430306,1003047.0,0.471045,4.0,0.0,3.0,16.0
430307,1003048.0,0.264270,13.0,12.0,0.0,49.0


In [None]:
predicted_transactions

In [14]:
dataframe = predicted_transactions.merge(first_transactions_valid, on='app_id', how='inner')

In [15]:
dataframe

Unnamed: 0,app_id,amnt_pred,mcc_pred,hour_diff_pred,amnt_bins_pred,amnt_mcc_bins_pred,amnt,mcc,hour_diff,amnt_bins,amnt_mcc_bins,hour_summed,transaction_min,mcc_next,amnt_next,hour_diff_next,amnt_bins_next,amnt_mcc_bins_next
0,0.0,0.393020,2.0,42.0,3.0,8.0,0.336345,2,0,2,7,0,132,20.0,0.359050,74.0,3.0,80.0
1,1.0,0.449287,2.0,20.0,3.0,8.0,0.439060,2,37,3,8,37,307,1.0,0.323313,0.0,1.0,2.0
2,2.0,0.258972,28.0,5.0,0.0,109.0,0.320805,28,156,1,110,156,180,2.0,0.485295,10.0,3.0,8.0
3,4.0,0.468641,9.0,0.0,3.0,36.0,0.387677,9,73,3,36,73,68,9.0,0.393020,0.0,3.0,36.0
4,6.0,0.647580,2.0,52.0,3.0,8.0,0.411448,58,24,3,232,24,211,8.0,0.384802,1.0,3.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428331,1003041.0,0.420635,98.0,21.0,3.0,392.0,0.281539,3,17,0,9,17,274,1.0,0.216285,109.0,0.0,1.0
428332,1003044.0,0.333914,3.0,10.0,2.0,11.0,0.334992,4,0,2,15,0,211,3.0,0.333474,21.0,2.0,11.0
428333,1003047.0,0.471045,4.0,0.0,3.0,16.0,0.516270,15,42,3,60,42,211,15.0,0.498168,0.0,3.0,60.0
428334,1003048.0,0.264270,13.0,12.0,0.0,49.0,0.589482,12,143,3,48,143,170,2.0,0.583667,0.0,3.0,8.0


# <font size="6">Metrics</font>

In [16]:
from sklearn.metrics import mean_absolute_error, accuracy_score

print('Time MAE:', mean_absolute_error(dataframe['hour_diff_next'], dataframe['hour_diff_pred']))
print('Type accuracy:', accuracy_score(dataframe['amnt_mcc_bins_next'], dataframe['amnt_mcc_bins_pred']))
print('Amnt accuracy:', accuracy_score(dataframe['amnt_bins_next'], dataframe['amnt_bins']))
print('MCC accuracy:', accuracy_score(dataframe['mcc_next'],dataframe['mcc_pred']))

Time MAE: 50.72273635650517
Type accuracy: 0.10781956221284225
Amnt accuracy: 0.48465223562810505
MCC accuracy: 0.192500747077061
