In [2]:
from xgboost import XGBRegressor

import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'svg'

from tqdm import tqdm

from statsmodels.tsa.tsatools import lagmat

import matplotlib.pylab as plt
%matplotlib inline

In [20]:
transactions = pd.read_csv('transactions.csv')
transactions['day'] = transactions.tr_datetime.apply(lambda dt: dt.split()[0]).astype(int)

# transactions.amount = transactions.amount.astype(np.float128)
transactions['log_pos_amount'] = transactions.amount.apply(lambda x: 0 if x<0 else np.log(x + 1))
transactions['neg_amount'] = transactions.amount.apply(lambda x: 0 if x>0 else -x)

transactions.drop(['amount', 'term_id', 'tr_datetime'], 1, inplace=True)

test_transactions = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                                 index=np.arange(1, 31) + transactions.day.max())
test_transactions = test_transactions.unstack().reset_index().dropna(axis=1)
test_transactions.columns = ['mcc_code', 'day']


train_grid = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                          index=transactions.day.unique())
train_grid = train_grid.unstack().reset_index().dropna(axis=1)
train_grid.columns = ['mcc_code', 'day']

for tr_table in tqdm([transactions, test_transactions, train_grid]):
    tr_table['week_num'] = (tr_table['day'] - 2) // 7
    tr_table['week_day'] = (tr_table['day'] - 2) % 7
    tr_table['month_num'] = tr_table['day'] // 28
    tr_table['month_day'] = tr_table['day'] % 28
    
merge_col_names = ['day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code']

train_transactions = pd.merge(
    train_grid,
    transactions.groupby(merge_col_names)[['neg_amount', 'log_pos_amount']].sum().reset_index(),
    how='left').fillna(0)

train_transactions = pd.merge(
    train_transactions,
    transactions.groupby(merge_col_names)[['customer_id']].count().reset_index(),
    how='left').fillna(0).astype(np.int32)
train_transactions.columns = np.hstack([train_transactions.columns[:-1], ['n_transactions']])

train_transactions['log_neg_amount'] = train_transactions.neg_amount.apply(lambda x: np.log(x + 1))
train_transactions.head()

100%|██████████| 3/3 [00:00<00:00,  2.10it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007
2,4814,2,0,0,0,2,6777480,0,1524,15.729116
3,4814,3,0,1,0,3,9277943,0,1937,16.043151
4,4814,4,0,2,0,4,9999757,0,1943,16.118071


In [21]:
train = train_transactions.copy()

helper = train.copy().set_index(['mcc_code', 'week_num'])
helper['week_mean'] = train_transactions.groupby(['mcc_code', 'week_num']).mean()['log_neg_amount']
train = helper.reset_index().copy()

# helper = train.copy().set_index(['mcc_code', 'week_num'])
# helper['week_median'] = train_transactions.groupby(['mcc_code', 'week_num']).median()['neg_amount']
# train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'week_num'])
helper['week_std'] = train_transactions.groupby(['mcc_code', 'week_num']).std()['log_neg_amount']
train = helper.reset_index().copy()
train.head()

Unnamed: 0,mcc_code,week_num,day,week_day,month_num,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount,week_mean,week_std
0,4814,-1,0,5,0,0,11098744,0,2365,16.222343,16.051206,0.242023
1,4814,-1,1,6,0,1,7881825,0,1697,15.88007,16.051206,0.242023
2,4814,0,2,0,0,2,6777480,0,1524,15.729116,15.985827,0.141873
3,4814,0,3,1,0,3,9277943,0,1937,16.043151,15.985827,0.141873
4,4814,0,4,2,0,4,9999757,0,1943,16.118071,15.985827,0.141873


In [22]:
helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_mean'] = train_transactions.groupby(['mcc_code', 'month_num']).mean()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_std'] = train_transactions.groupby(['mcc_code', 'month_num']).std()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_median'] = train_transactions.groupby(['mcc_code', 'month_num']).median()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_min'] = train_transactions.groupby(['mcc_code', 'month_num']).min()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_max'] = train_transactions.groupby(['mcc_code', 'month_num']).max()['log_neg_amount']
train = helper.reset_index().copy()

TIMESTAT_COLS = train.columns.difference(train_transactions.columns)

train.head()

Unnamed: 0,mcc_code,month_num,week_num,day,week_day,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount,week_mean,week_std,month_mean,month_std,month_median,month_min,month_max
0,4814,0,-1,0,5,0,11098744,0,2365,16.222343,16.051206,0.242023,16.024313,0.164409,16.082561,15.694662,16.227089
1,4814,0,-1,1,6,1,7881825,0,1697,15.88007,16.051206,0.242023,16.024313,0.164409,16.082561,15.694662,16.227089
2,4814,0,0,2,0,2,6777480,0,1524,15.729116,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089
3,4814,0,0,3,1,3,9277943,0,1937,16.043151,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089
4,4814,0,0,4,2,4,9999757,0,1943,16.118071,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089


In [23]:
week_lag_max = 25
for week_shift in tqdm(np.arange(5, week_lag_max)):
    train_shift = train.copy()
    train_shift['week_num'] += week_shift
    train_shift['prev_week_{}_neg'.format(week_shift)] = train_shift.log_neg_amount
    train_shift['prev_week_{}_mean_neg'.format(week_shift)] = train_shift.week_mean
    train_shift['prev_week_{}_std_neg'.format(week_shift)] = train_shift.week_std
    train_shift = train_shift[[
        'week_num', 'week_day', 'mcc_code',
        'prev_week_{}_neg'.format(week_shift),
        'prev_week_{}_mean_neg'.format(week_shift),
        'prev_week_{}_std_neg'.format(week_shift)
    ]]
    train_transactions = pd.merge(
        train_transactions, train_shift, 
        on=['week_num', 'week_day', 'mcc_code'],
        how='left').fillna(0)
    test_transactions = pd.merge(
        test_transactions, train_shift, 
        on=['week_num', 'week_day', 'mcc_code'],
        how='left').fillna(0)
train_transactions.head()

100%|██████████| 20/20 [00:01<00:00,  8.88it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount,prev_week_5_neg,prev_week_5_mean_neg,prev_week_5_std_neg,prev_week_6_neg,prev_week_6_mean_neg,prev_week_6_std_neg,prev_week_7_neg,prev_week_7_mean_neg,prev_week_7_std_neg,prev_week_8_neg,prev_week_8_mean_neg,prev_week_8_std_neg,prev_week_9_neg,prev_week_9_mean_neg,prev_week_9_std_neg,prev_week_10_neg,prev_week_10_mean_neg,prev_week_10_std_neg,prev_week_11_neg,prev_week_11_mean_neg,prev_week_11_std_neg,prev_week_12_neg,prev_week_12_mean_neg,prev_week_12_std_neg,prev_week_13_neg,prev_week_13_mean_neg,prev_week_13_std_neg,prev_week_14_neg,prev_week_14_mean_neg,prev_week_14_std_neg,prev_week_15_neg,prev_week_15_mean_neg,prev_week_15_std_neg,prev_week_16_neg,prev_week_16_mean_neg,prev_week_16_std_neg,prev_week_17_neg,prev_week_17_mean_neg,prev_week_17_std_neg,prev_week_18_neg,prev_week_18_mean_neg,prev_week_18_std_neg,prev_week_19_neg,prev_week_19_mean_neg,prev_week_19_std_neg,prev_week_20_neg,prev_week_20_mean_neg,prev_week_20_std_neg,prev_week_21_neg,prev_week_21_mean_neg,prev_week_21_std_neg,prev_week_22_neg,prev_week_22_mean_neg,prev_week_22_std_neg,prev_week_23_neg,prev_week_23_mean_neg,prev_week_23_std_neg,prev_week_24_neg,prev_week_24_mean_neg,prev_week_24_std_neg
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,0,0,2,6777480,0,1524,15.729116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,1,0,3,9277943,0,1937,16.043151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,2,0,4,9999757,0,1943,16.118071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
month_lag_max = 8
for month_shift in tqdm(np.arange(2, month_lag_max)):
    train_shift = train.copy()
    train_shift['month_num'] += month_shift
    train_shift['prev_month_{}_mean_neg'.format(month_shift)] = train_shift.month_mean
    train_shift['prev_month_{}_std_neg'.format(month_shift)] = train_shift.month_std
    train_shift['prev_month_{}_median_neg'.format(month_shift)] = train_shift.month_median
    train_shift['prev_month_{}_max_neg'.format(month_shift)] = train_shift.month_max
    train_shift['prev_month_{}_min_neg'.format(month_shift)] = train_shift.month_min
    train_shift = train_shift[[
        'month_num', 'day', 'mcc_code',
        'prev_month_{}_mean_neg'.format(month_shift),
        'prev_month_{}_std_neg'.format(month_shift),
        'prev_month_{}_median_neg'.format(month_shift),
        'prev_month_{}_max_neg'.format(month_shift),
        'prev_month_{}_min_neg'.format(month_shift),
    ]]
    train_transactions = pd.merge(
        train_transactions, train_shift, 
        on=['month_num', 'day', 'mcc_code'],
        how='left').fillna(0)
    test_transactions = pd.merge(
        test_transactions, train_shift, 
        on=['month_num', 'day', 'mcc_code'],
        how='left').fillna(0)
train_transactions.head()

100%|██████████| 6/6 [00:00<00:00,  6.26it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount,prev_week_5_neg,prev_week_5_mean_neg,prev_week_5_std_neg,prev_week_6_neg,prev_week_6_mean_neg,prev_week_6_std_neg,prev_week_7_neg,prev_week_7_mean_neg,prev_week_7_std_neg,prev_week_8_neg,prev_week_8_mean_neg,prev_week_8_std_neg,prev_week_9_neg,prev_week_9_mean_neg,prev_week_9_std_neg,prev_week_10_neg,prev_week_10_mean_neg,prev_week_10_std_neg,prev_week_11_neg,prev_week_11_mean_neg,prev_week_11_std_neg,prev_week_12_neg,prev_week_12_mean_neg,prev_week_12_std_neg,prev_week_13_neg,prev_week_13_mean_neg,prev_week_13_std_neg,prev_week_14_neg,prev_week_14_mean_neg,prev_week_14_std_neg,prev_week_15_neg,prev_week_15_mean_neg,prev_week_15_std_neg,prev_week_16_neg,prev_week_16_mean_neg,prev_week_16_std_neg,prev_week_17_neg,prev_week_17_mean_neg,prev_week_17_std_neg,prev_week_18_neg,prev_week_18_mean_neg,prev_week_18_std_neg,prev_week_19_neg,prev_week_19_mean_neg,prev_week_19_std_neg,prev_week_20_neg,prev_week_20_mean_neg,prev_week_20_std_neg,prev_week_21_neg,prev_week_21_mean_neg,prev_week_21_std_neg,prev_week_22_neg,prev_week_22_mean_neg,prev_week_22_std_neg,prev_week_23_neg,prev_week_23_mean_neg,prev_week_23_std_neg,prev_week_24_neg,prev_week_24_mean_neg,prev_week_24_std_neg,prev_month_2_mean_neg,prev_month_2_std_neg,prev_month_2_median_neg,prev_month_2_max_neg,prev_month_2_min_neg,prev_month_3_mean_neg,prev_month_3_std_neg,prev_month_3_median_neg,prev_month_3_max_neg,prev_month_3_min_neg,prev_month_4_mean_neg,prev_month_4_std_neg,prev_month_4_median_neg,prev_month_4_max_neg,prev_month_4_min_neg,prev_month_5_mean_neg,prev_month_5_std_neg,prev_month_5_median_neg,prev_month_5_max_neg,prev_month_5_min_neg,prev_month_6_mean_neg,prev_month_6_std_neg,prev_month_6_median_neg,prev_month_6_max_neg,prev_month_6_min_neg,prev_month_7_mean_neg,prev_month_7_std_neg,prev_month_7_median_neg,prev_month_7_max_neg,prev_month_7_min_neg
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,0,0,2,6777480,0,1524,15.729116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,1,0,3,9277943,0,1937,16.043151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,2,0,4,9999757,0,1943,16.118071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
train_transactions.columns.difference(test_transactions.columns)

Index(['log_neg_amount', 'log_pos_amount', 'n_transactions', 'neg_amount'], dtype='object')

In [25]:
dummy_train = pd.get_dummies(train_transactions, columns=['mcc_code'])
dummy_test = pd.get_dummies(test_transactions, columns=['mcc_code'])
dummy_train.head()

Unnamed: 0,day,week_num,week_day,month_num,month_day,neg_amount,log_pos_amount,n_transactions,log_neg_amount,prev_week_5_neg,prev_week_5_mean_neg,prev_week_5_std_neg,prev_week_6_neg,prev_week_6_mean_neg,prev_week_6_std_neg,prev_week_7_neg,prev_week_7_mean_neg,prev_week_7_std_neg,prev_week_8_neg,prev_week_8_mean_neg,prev_week_8_std_neg,prev_week_9_neg,prev_week_9_mean_neg,prev_week_9_std_neg,prev_week_10_neg,prev_week_10_mean_neg,prev_week_10_std_neg,prev_week_11_neg,prev_week_11_mean_neg,prev_week_11_std_neg,prev_week_12_neg,prev_week_12_mean_neg,prev_week_12_std_neg,prev_week_13_neg,prev_week_13_mean_neg,prev_week_13_std_neg,prev_week_14_neg,prev_week_14_mean_neg,prev_week_14_std_neg,prev_week_15_neg,prev_week_15_mean_neg,prev_week_15_std_neg,prev_week_16_neg,prev_week_16_mean_neg,prev_week_16_std_neg,prev_week_17_neg,prev_week_17_mean_neg,prev_week_17_std_neg,prev_week_18_neg,prev_week_18_mean_neg,prev_week_18_std_neg,prev_week_19_neg,prev_week_19_mean_neg,prev_week_19_std_neg,prev_week_20_neg,prev_week_20_mean_neg,prev_week_20_std_neg,prev_week_21_neg,prev_week_21_mean_neg,prev_week_21_std_neg,prev_week_22_neg,prev_week_22_mean_neg,prev_week_22_std_neg,prev_week_23_neg,prev_week_23_mean_neg,prev_week_23_std_neg,prev_week_24_neg,prev_week_24_mean_neg,prev_week_24_std_neg,prev_month_2_mean_neg,prev_month_2_std_neg,prev_month_2_median_neg,prev_month_2_max_neg,prev_month_2_min_neg,prev_month_3_mean_neg,prev_month_3_std_neg,prev_month_3_median_neg,prev_month_3_max_neg,prev_month_3_min_neg,prev_month_4_mean_neg,prev_month_4_std_neg,prev_month_4_median_neg,prev_month_4_max_neg,prev_month_4_min_neg,prev_month_5_mean_neg,prev_month_5_std_neg,prev_month_5_median_neg,prev_month_5_max_neg,prev_month_5_min_neg,prev_month_6_mean_neg,prev_month_6_std_neg,prev_month_6_median_neg,prev_month_6_max_neg,prev_month_6_min_neg,prev_month_7_mean_neg,prev_month_7_std_neg,prev_month_7_median_neg,prev_month_7_max_neg,prev_month_7_min_neg,mcc_code_742,...,mcc_code_5722,mcc_code_5732,mcc_code_5733,mcc_code_5734,mcc_code_5735,mcc_code_5811,mcc_code_5812,mcc_code_5813,mcc_code_5814,mcc_code_5816,mcc_code_5912,mcc_code_5921,mcc_code_5931,mcc_code_5940,mcc_code_5941,mcc_code_5942,mcc_code_5943,mcc_code_5944,mcc_code_5945,mcc_code_5946,mcc_code_5947,mcc_code_5948,mcc_code_5949,mcc_code_5950,mcc_code_5964,mcc_code_5965,mcc_code_5967,mcc_code_5968,mcc_code_5969,mcc_code_5970,mcc_code_5971,mcc_code_5976,mcc_code_5977,mcc_code_5983,mcc_code_5992,mcc_code_5993,mcc_code_5994,mcc_code_5995,mcc_code_5999,mcc_code_6010,mcc_code_6011,mcc_code_6012,mcc_code_6051,mcc_code_6211,mcc_code_6300,mcc_code_6513,mcc_code_6536,mcc_code_7011,mcc_code_7210,mcc_code_7216,mcc_code_7221,mcc_code_7230,mcc_code_7273,mcc_code_7278,mcc_code_7298,mcc_code_7299,mcc_code_7311,mcc_code_7338,mcc_code_7372,mcc_code_7375,mcc_code_7395,mcc_code_7399,mcc_code_7512,mcc_code_7523,mcc_code_7531,mcc_code_7538,mcc_code_7542,mcc_code_7629,mcc_code_7699,mcc_code_7829,mcc_code_7832,mcc_code_7841,mcc_code_7922,mcc_code_7932,mcc_code_7933,mcc_code_7991,mcc_code_7993,mcc_code_7994,mcc_code_7995,mcc_code_7996,mcc_code_7997,mcc_code_7999,mcc_code_8011,mcc_code_8021,mcc_code_8043,mcc_code_8062,mcc_code_8071,mcc_code_8099,mcc_code_8220,mcc_code_8244,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
0,0,-1,5,0,0,11098744,0,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,-1,6,0,1,7881825,0,1697,15.88007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,2,6777480,0,1524,15.729116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,3,9277943,0,1937,16.043151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,2,0,4,9999757,0,1943,16.118071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [130]:
train_transactions.to_csv('train_week.csv', index=None)
test_transactions.to_csv('test_week.csv', index=None)

In [136]:
dummy_train.columns.difference(dummy_test.columns)

Index(['log_neg_amount', 'log_pos_amount', 'n_transactions', 'neg_amount'], dtype='object')

In [26]:
def rmsle(predicted, actual):
    assert(len(predicted) == len(actual))
    p = np.log(np.array(predicted) + 1)
    a = np.log(np.array(actual) + 1)
    return (((p - a)**2).sum() / len(predicted))**0.5

def rmsle_by_logs(predicted, actual):
    assert(len(predicted) == len(actual))
    return (((predicted - actual)**2).sum() / len(predicted))**0.5

def eval_model_1(labeled_data, target_col_name, clf, day_shifts=np.arange(90, 0, -15)):
    max_day = labeled_data.day.max()
    c = labeled_data.columns.difference([target_col_name])
    metric_by_shift = {}
    for day_shift in tqdm(day_shifts): 
        train_sample = labeled_data[labeled_data.day <= max_day - day_shift]
        test_sample = labeled_data[labeled_data.day > max_day - day_shift]
        clf.fit(train_sample[c], train_sample[target_col_name])
        predicted_volume = clf.predict(test_sample[c])
        metric_by_shift['RMSLE with {} days'.format(day_shift)] = rmsle_by_logs(
            predicted_volume, test_sample[target_col_name])
    return metric_by_shift

def eval_model(labeled_data, target_col_name, clf, day_shifts=np.arange(90, 0, -15)):
    max_day = labeled_data.day.max()
    c = labeled_data.columns.difference([target_col_name])
    c = [col for col in c if col not in {'day', 'month_num', 'week_num', 'week_day', 'month_day'}]
    metric_by_shift = {}
    for day_shift in tqdm(day_shifts): 
        train_sample = labeled_data[labeled_data.day <= max_day - day_shift]
        test_sample = labeled_data[labeled_data.day > max_day - day_shift]
#         train_sample.drop(['day', 'week_num', 'week_day', 'month_day'])
        clf.fit(train_sample[c], train_sample[target_col_name])
        predicted_volume = clf.predict(test_sample[c])
        metric_by_shift['RMSLE with {} days'.format(day_shift)] = rmsle_by_logs(
            predicted_volume, test_sample[target_col_name])
    return metric_by_shift

In [27]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression(n_jobs=-1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf)
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 6/6 [00:05<00:00,  1.10it/s]

RMSLE with 90 days: 3.5207078153271523
RMSLE with 75 days: 3.516770867321615
RMSLE with 60 days: 3.4843509934663595
RMSLE with 45 days: 3.4774858356040386
RMSLE with 30 days: 3.45820751415074
RMSLE with 15 days: 3.468239657507576





In [28]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression(n_jobs=-1)
metric_by_shift = eval_model_1(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf)
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 6/6 [00:05<00:00,  1.07it/s]

RMSLE with 90 days: 3.489230584488066
RMSLE with 75 days: 3.4845317790507027
RMSLE with 60 days: 3.456781148576693
RMSLE with 45 days: 3.452714065426008
RMSLE with 30 days: 3.442532123713018
RMSLE with 15 days: 3.4494150506511354





In [12]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model_1(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:48<00:00, 52.53s/it]

RMSLE with 60 days: 3.403776385467548
RMSLE with 30 days: 3.389731916996042





In [32]:
def get_prediction(X_train, y_train, X_test, clf, test_form):
    drop_cols = X_train.columns.difference(X_test.columns)
    clf.fit(X_train.drop(drop_cols, 1), y_train)
    predicted_log_volume = clf.predict(X_test)
    
    xgb_test = test_form.copy()
    xgb_test['id'] = xgb_test[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
    xgb_test['volume'] = np.expm1(predicted_log_volume)
    
    return xgb_test[['id', 'volume']]    

clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)
submit_table = get_prediction(dummy_train, dummy_train.log_neg_amount, 
                              dummy_test.drop(
                                  ['day', 'week_num', 'week_day', 'month_day', 'month_num'], 1),
                              clf, test_transactions)
submit_table.to_csv('xgb_by_weeks-2.csv', index=False)
submit_table.head()

Unnamed: 0,id,volume
0,4814-457,9704982.0
1,4814-458,10997186.0
2,4814-459,11535596.0
3,4814-460,12231984.0
4,4814-461,12105814.0


In [29]:
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)

metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 1/1 [00:44<00:00, 44.84s/it]

RMSLE with 30 days: 3.400830308052614





In [31]:
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)
submit_table = get_prediction(dummy_train, dummy_train.log_neg_amount, 
                              dummy_test, clf, test_transactions)
submit_table.to_csv('xgb_by_weeks-3.csv', index=False)
submit_table.head()

Unnamed: 0,id,volume
0,4814-457,9368861.0
1,4814-458,10703086.0
2,4814-459,11704657.0
3,4814-460,12909008.0
4,4814-461,12050157.0


In [30]:
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)

metric_by_shift = eval_model_1(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 1/1 [00:45<00:00, 45.68s/it]

RMSLE with 30 days: 3.3977324347805244





In [None]:
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)

metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

In [24]:

def eval_model(labeled_data, target_col_name, clf, day_shifts=np.arange(90, 0, -15)):
    max_day = labeled_data.day.max()
    c = labeled_data.columns.difference([target_col_name])
    c = [col for col in c if col not in {'day', 'week_num', 'week_day', 'month_day'}]
    metric_by_shift = {}
    for day_shift in tqdm(day_shifts): 
        train_sample = labeled_data[labeled_data.day <= max_day - day_shift]
        test_sample = labeled_data[labeled_data.day > max_day - day_shift]
#         train_sample.drop(['day', 'week_num', 'week_day', 'month_day'])
        clf.fit(train_sample[c], train_sample[target_col_name])
        predicted_volume = clf.predict(test_sample[c])
        metric_by_shift['RMSLE with {} days'.format(day_shift)] = rmsle_by_logs(
            predicted_volume, test_sample[target_col_name])
    return metric_by_shift

clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)

metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 1/1 [00:54<00:00, 54.29s/it]

RMSLE with 30 days: 3.3941316858439743





In [26]:
clf = XGBRegressor(learning_rate=0.05, max_depth=10, n_estimators=400, nthread=-1, reg_alpha=1, reg_lambda=1)

metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 1/1 [01:24<00:00, 84.69s/it]

RMSLE with 30 days: 3.506346924994657



