# <font size="6">Libraries</font>

In [1]:
import pandas as pd

# <font size="6">dataset</font>

In [34]:
train_df = pd.read_parquet('data/train.parquet')
valid_df = pd.read_parquet('data/valid.parquet')

In [35]:
last_transactions = train_df.groupby('app_id').tail(1)

# <font size="6">Model</font>

In [37]:
import catboost
from catboost import CatBoostRegressor

regressor = CatBoostRegressor(
    iterations=500,
    depth=3,
    verbose=100,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    early_stopping_rounds=100
)

regressor.fit(train_df[['mcc', 'transaction_number', 'hour_diff']],
              train_df['amnt'])

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.0715132	total: 269ms	remaining: 2m 14s
100:	learn: 0.0575328	total: 27.7s	remaining: 1m 49s
200:	learn: 0.0564707	total: 55.9s	remaining: 1m 23s
300:	learn: 0.0560213	total: 1m 24s	remaining: 55.9s
400:	learn: 0.0558414	total: 1m 53s	remaining: 28s
499:	learn: 0.0558018	total: 2m 22s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f31ec03dc50>

In [39]:
class MostCommonMCCPredictor:
    def __init__(self):
        self.most_common_mcc = {}

    def fit(self, data):
        self.most_common_mcc = data.groupby('app_id')['mcc'].apply(lambda x: x.mode().iloc[0]).to_dict()

    def forward(self, app_id):
        
        return self.most_common_mcc.get(app_id, None)

    
model = MostCommonMCCPredictor()

model.fit(train_df[['app_id', 'mcc']])

In [40]:
%%time

predicted_transactions = pd.DataFrame()
valid_counts = valid_df['app_id'].value_counts().to_dict()
predicted_types = []

for _, last_transaction in last_transactions.iterrows():
    user_id = last_transaction['app_id']
    last_type = last_transaction['mcc']
    num_transactions = valid_counts.get(user_id, 0)
    
    for _ in range(num_transactions):
        predicted_type = model.forward(user_id)
        
        predicted_types.append(predicted_type)

CPU times: user 37.5 s, sys: 305 ms, total: 37.8 s
Wall time: 37.8 s


In [44]:
valid_df['pred_mcc'] = predicted_types
predicted_amount = regressor.predict(valid_df[['mcc', 'transaction_number', 'hour_diff']].values.reshape(-1, 3))

valid_df['pred_amnt'] = predicted_amount

valid_df['pred_amnt'] = pd.qcut(valid_df['pred_amnt'], q=[0, .33, .66, 1.], labels=False)

# <font size="6">Metrics</font>

In [54]:
from sklearn.metrics import mean_absolute_error, accuracy_score

print('Amnt accuracy:', accuracy_score(valid_df['amnt_bins'], valid_df['pred_amnt']))
print('MCC accuracy:', accuracy_score(valid_df['mcc'], valid_df['pred_mcc']))

Amnt accuracy: 0.2218262809820841
MCC accuracy: 0.3167607617034735
