In [2]:
from xgboost import XGBRegressor

import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'svg'

from tqdm import tqdm

from statsmodels.tsa.tsatools import lagmat

import matplotlib.pylab as plt
%matplotlib inline

import os
os.chdir('..')

In [3]:
transactions = pd.read_csv('transactions.csv')
transactions['day'] = transactions.tr_datetime.apply(lambda dt: dt.split()[0]).astype(int)

# transactions.amount = transactions.amount.astype(np.float128)
transactions['pos_amount'] = transactions.amount.apply(lambda x: 0 if x<0 else np.log(x + 1))
transactions['neg_amount'] = transactions.amount.apply(lambda x: 0 if x>0 else -x)

transactions.drop(['amount', 'term_id', 'tr_datetime'], 1, inplace=True)

test_transactions = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                                 index=np.arange(1, 31) + transactions.day.max())
test_transactions = test_transactions.unstack().reset_index().dropna(axis=1)
test_transactions.columns = ['mcc_code', 'day']


train_grid = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                          index=transactions.day.unique())
train_grid = train_grid.unstack().reset_index().dropna(axis=1)
train_grid.columns = ['mcc_code', 'day']

for tr_table in tqdm([transactions, test_transactions, train_grid]):
    tr_table['week_num'] = (tr_table['day'] - 2) // 7
    tr_table['week_day'] = (tr_table['day'] - 2) % 7
    tr_table['month_num'] = tr_table['day'] // 28
    tr_table['month_day'] = tr_table['day'] % 28
    
merge_col_names = ['day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code']

train_transactions = pd.merge(
    train_grid,
    transactions.groupby(merge_col_names)[['neg_amount', 'pos_amount']].sum().reset_index(),
    how='left').fillna(0)

train_transactions = pd.merge(
    train_transactions,
    transactions.groupby(merge_col_names)[['customer_id']].count().reset_index(),
    how='left').fillna(0).astype(np.int32)
train_transactions.columns = np.hstack([train_transactions.columns[:-1], ['n_transactions']])

train_transactions['log_neg_amount'] = train_transactions.neg_amount.apply(lambda x: np.log(x + 1))
train_transactions['log_pos_amount'] = train_transactions.pos_amount.apply(lambda x: np.log(x + 1))
train_transactions.head()

100%|██████████| 3/3 [00:00<00:00,  1.93it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,pos_amount,n_transactions,log_neg_amount,log_pos_amount
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343,0.0
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007,0.0
2,4814,2,0,0,0,2,6777480,0,1524,15.729116,0.0
3,4814,3,0,1,0,3,9277943,0,1937,16.043151,0.0
4,4814,4,0,2,0,4,9999757,0,1943,16.118071,0.0


In [4]:
train = train_transactions.copy()

helper = train.copy().set_index(['mcc_code', 'week_num'])
helper['week_mean'] = train_transactions.groupby(['mcc_code', 'week_num']).mean()['log_neg_amount']
train = helper.reset_index().copy()

# helper = train.copy().set_index(['mcc_code', 'week_num'])
# helper['week_median'] = train_transactions.groupby(['mcc_code', 'week_num']).median()['neg_amount']
# train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'week_num'])
helper['week_std'] = train_transactions.groupby(['mcc_code', 'week_num']).std()['log_neg_amount']
train = helper.reset_index().copy()
train.head()

#POSITIVE
# helper = train.copy().set_index(['mcc_code', 'week_num'])
# helper['week_pos_mean'] = train_transactions.groupby(['mcc_code', 'week_num']).mean()['log_pos_amount']
# train = helper.reset_index().copy()

# helper = train.copy().set_index(['mcc_code', 'week_num'])
# helper['week_pos_std'] = train_transactions.groupby(['mcc_code', 'week_num']).std()['log_pos_amount']
# train = helper.reset_index().copy()
# train.head()

Unnamed: 0,mcc_code,week_num,day,week_day,month_num,month_day,neg_amount,pos_amount,n_transactions,log_neg_amount,log_pos_amount,week_mean,week_std
0,4814,-1,0,5,0,0,11098744,0,2365,16.222343,0.0,16.051206,0.242023
1,4814,-1,1,6,0,1,7881825,0,1697,15.88007,0.0,16.051206,0.242023
2,4814,0,2,0,0,2,6777480,0,1524,15.729116,0.0,15.985827,0.141873
3,4814,0,3,1,0,3,9277943,0,1937,16.043151,0.0,15.985827,0.141873
4,4814,0,4,2,0,4,9999757,0,1943,16.118071,0.0,15.985827,0.141873


In [5]:
helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_mean'] = train_transactions.groupby(['mcc_code', 'month_num']).mean()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_std'] = train_transactions.groupby(['mcc_code', 'month_num']).std()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_median'] = train_transactions.groupby(['mcc_code', 'month_num']).median()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_min'] = train_transactions.groupby(['mcc_code', 'month_num']).min()['log_neg_amount']
train = helper.reset_index().copy()

helper = train.copy().set_index(['mcc_code', 'month_num'])
helper['month_max'] = train_transactions.groupby(['mcc_code', 'month_num']).max()['log_neg_amount']
train = helper.reset_index().copy()

# Positive

# helper = train.copy().set_index(['mcc_code', 'month_num'])
# helper['month_pos_mean'] = train_transactions.groupby(['mcc_code', 'month_num']).mean()['log_pos_amount']
# train = helper.reset_index().copy()

# helper = train.copy().set_index(['mcc_code', 'month_num'])
# helper['month_pos_std'] = train_transactions.groupby(['mcc_code', 'month_num']).std()['log_pos_amount']
# train = helper.reset_index().copy()

# helper = train.copy().set_index(['mcc_code', 'month_num'])
# helper['month_pos_median'] = train_transactions.groupby(['mcc_code', 'month_num']).median()['log_pos_amount']
# train = helper.reset_index().copy()

TIMESTAT_COLS = train.columns.difference(train_transactions.columns)

train.head()

Unnamed: 0,mcc_code,month_num,week_num,day,week_day,month_day,neg_amount,pos_amount,n_transactions,log_neg_amount,log_pos_amount,week_mean,week_std,month_mean,month_std,month_median,month_min,month_max
0,4814,0,-1,0,5,0,11098744,0,2365,16.222343,0.0,16.051206,0.242023,16.024313,0.164409,16.082561,15.694662,16.227089
1,4814,0,-1,1,6,1,7881825,0,1697,15.88007,0.0,16.051206,0.242023,16.024313,0.164409,16.082561,15.694662,16.227089
2,4814,0,0,2,0,2,6777480,0,1524,15.729116,0.0,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089
3,4814,0,0,3,1,3,9277943,0,1937,16.043151,0.0,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089
4,4814,0,0,4,2,4,9999757,0,1943,16.118071,0.0,15.985827,0.141873,16.024313,0.164409,16.082561,15.694662,16.227089


In [6]:
week_lag_max = 35
# week_pos_lag_max = 20
for week_shift in tqdm(np.arange(1, week_lag_max)):
    train_shift = train.copy()
    train_shift['week_num'] += week_shift
    train_shift['prev_week_{}_neg'.format(week_shift)] = train_shift.log_neg_amount
    train_shift['prev_week_{}_mean_neg'.format(week_shift)] = train_shift.week_mean
    train_shift['prev_week_{}_std_neg'.format(week_shift)] = train_shift.week_std
    
    #week
#     train_shift['prev_week_{}_mean_pos'.format(week_shift)] = train_shift.week_pos_mean
#     train_shift['prev_week_{}_std_pos'.format(week_shift)] = train_shift.week_pos_std
    
    train_shift = train_shift[[
        'week_num', 'week_day', 'mcc_code',
        'prev_week_{}_neg'.format(week_shift),
        'prev_week_{}_mean_neg'.format(week_shift),
        'prev_week_{}_std_neg'.format(week_shift),
        #added
#         'prev_week_{}_mean_pos'.format(week_shift),
#         'prev_week_{}_std_pos'.format(week_shift)
    ]]
    train_transactions = pd.merge(
        train_transactions, train_shift, 
        on=['week_num', 'week_day', 'mcc_code'],
        how='left').fillna(0)
    test_transactions = pd.merge(
        test_transactions, train_shift, 
        on=['week_num', 'week_day', 'mcc_code'],
        how='left').fillna(0)
train_transactions.head()

100%|██████████| 34/34 [00:04<00:00,  5.35it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,pos_amount,n_transactions,log_neg_amount,log_pos_amount,prev_week_1_neg,prev_week_1_mean_neg,prev_week_1_std_neg,prev_week_2_neg,prev_week_2_mean_neg,prev_week_2_std_neg,prev_week_3_neg,prev_week_3_mean_neg,prev_week_3_std_neg,prev_week_4_neg,prev_week_4_mean_neg,prev_week_4_std_neg,prev_week_5_neg,prev_week_5_mean_neg,prev_week_5_std_neg,prev_week_6_neg,prev_week_6_mean_neg,prev_week_6_std_neg,prev_week_7_neg,prev_week_7_mean_neg,prev_week_7_std_neg,prev_week_8_neg,prev_week_8_mean_neg,prev_week_8_std_neg,prev_week_9_neg,prev_week_9_mean_neg,prev_week_9_std_neg,prev_week_10_neg,prev_week_10_mean_neg,prev_week_10_std_neg,prev_week_11_neg,prev_week_11_mean_neg,prev_week_11_std_neg,prev_week_12_neg,prev_week_12_mean_neg,prev_week_12_std_neg,prev_week_13_neg,prev_week_13_mean_neg,prev_week_13_std_neg,prev_week_14_neg,prev_week_14_mean_neg,prev_week_14_std_neg,prev_week_15_neg,prev_week_15_mean_neg,prev_week_15_std_neg,prev_week_16_neg,prev_week_16_mean_neg,prev_week_16_std_neg,prev_week_17_neg,prev_week_17_mean_neg,prev_week_17_std_neg,prev_week_18_neg,prev_week_18_mean_neg,prev_week_18_std_neg,prev_week_19_neg,prev_week_19_mean_neg,prev_week_19_std_neg,prev_week_20_neg,prev_week_20_mean_neg,prev_week_20_std_neg,prev_week_21_neg,prev_week_21_mean_neg,prev_week_21_std_neg,prev_week_22_neg,prev_week_22_mean_neg,prev_week_22_std_neg,prev_week_23_neg,prev_week_23_mean_neg,prev_week_23_std_neg,prev_week_24_neg,prev_week_24_mean_neg,prev_week_24_std_neg,prev_week_25_neg,prev_week_25_mean_neg,prev_week_25_std_neg,prev_week_26_neg,prev_week_26_mean_neg,prev_week_26_std_neg,prev_week_27_neg,prev_week_27_mean_neg,prev_week_27_std_neg,prev_week_28_neg,prev_week_28_mean_neg,prev_week_28_std_neg,prev_week_29_neg,prev_week_29_mean_neg,prev_week_29_std_neg,prev_week_30_neg,prev_week_30_mean_neg,prev_week_30_std_neg,prev_week_31_neg,prev_week_31_mean_neg,prev_week_31_std_neg,prev_week_32_neg,prev_week_32_mean_neg,prev_week_32_std_neg,prev_week_33_neg,prev_week_33_mean_neg,prev_week_33_std_neg,prev_week_34_neg,prev_week_34_mean_neg,prev_week_34_std_neg
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,0,0,2,6777480,0,1524,15.729116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,1,0,3,9277943,0,1937,16.043151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,2,0,4,9999757,0,1943,16.118071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
month_lag_max = 8
for month_shift in tqdm(np.arange(1, month_lag_max)):
    train_shift = train.copy()
    train_shift['month_num'] += month_shift
    train_shift['prev_month_{}_mean_neg'.format(month_shift)] = train_shift.month_mean
    train_shift['prev_month_{}_std_neg'.format(month_shift)] = train_shift.month_std
    train_shift['prev_month_{}_median_neg'.format(month_shift)] = train_shift.month_median
    train_shift['prev_month_{}_max_neg'.format(month_shift)] = train_shift.month_max
    train_shift['prev_month_{}_min_neg'.format(month_shift)] = train_shift.month_min
    #added
#     train_shift['prev_month_{}_mean_pos'.format(month_shift)] = train_shift.month_pos_mean
#     train_shift['prev_month_{}_std_pos'.format(month_shift)] = train_shift.month_pos_std
#     train_shift['prev_month_{}_median_pos'.format(month_shift)] = train_shift.month_pos_median
    
    train_shift = train_shift[[
        'month_num', 'day', 'mcc_code',
        'prev_month_{}_mean_neg'.format(month_shift),
        'prev_month_{}_std_neg'.format(month_shift),
        'prev_month_{}_median_neg'.format(month_shift),
        'prev_month_{}_max_neg'.format(month_shift),
        'prev_month_{}_min_neg'.format(month_shift),
#         'prev_month_{}_mean_pos'.format(month_shift),
#         'prev_month_{}_std_pos'.format(month_shift),
#         'prev_month_{}_median_pos'.format(month_shift),
    ]]
    train_transactions = pd.merge(
        train_transactions, train_shift, 
        on=['month_num', 'day', 'mcc_code'],
        how='left').fillna(0)
    test_transactions = pd.merge(
        test_transactions, train_shift, 
        on=['month_num', 'day', 'mcc_code'],
        how='left').fillna(0)
train_transactions.head()

100%|██████████| 7/7 [00:01<00:00,  4.05it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,pos_amount,n_transactions,log_neg_amount,log_pos_amount,prev_week_1_neg,prev_week_1_mean_neg,prev_week_1_std_neg,prev_week_2_neg,prev_week_2_mean_neg,prev_week_2_std_neg,prev_week_3_neg,prev_week_3_mean_neg,prev_week_3_std_neg,prev_week_4_neg,prev_week_4_mean_neg,prev_week_4_std_neg,prev_week_5_neg,prev_week_5_mean_neg,prev_week_5_std_neg,prev_week_6_neg,prev_week_6_mean_neg,prev_week_6_std_neg,prev_week_7_neg,prev_week_7_mean_neg,prev_week_7_std_neg,prev_week_8_neg,prev_week_8_mean_neg,prev_week_8_std_neg,prev_week_9_neg,prev_week_9_mean_neg,prev_week_9_std_neg,prev_week_10_neg,prev_week_10_mean_neg,prev_week_10_std_neg,prev_week_11_neg,prev_week_11_mean_neg,prev_week_11_std_neg,prev_week_12_neg,prev_week_12_mean_neg,prev_week_12_std_neg,prev_week_13_neg,prev_week_13_mean_neg,prev_week_13_std_neg,prev_week_14_neg,prev_week_14_mean_neg,prev_week_14_std_neg,prev_week_15_neg,prev_week_15_mean_neg,prev_week_15_std_neg,prev_week_16_neg,prev_week_16_mean_neg,prev_week_16_std_neg,prev_week_17_neg,prev_week_17_mean_neg,prev_week_17_std_neg,prev_week_18_neg,prev_week_18_mean_neg,prev_week_18_std_neg,prev_week_19_neg,prev_week_19_mean_neg,prev_week_19_std_neg,prev_week_20_neg,prev_week_20_mean_neg,prev_week_20_std_neg,prev_week_21_neg,prev_week_21_mean_neg,prev_week_21_std_neg,prev_week_22_neg,prev_week_22_mean_neg,prev_week_22_std_neg,prev_week_23_neg,prev_week_23_mean_neg,prev_week_23_std_neg,prev_week_24_neg,prev_week_24_mean_neg,prev_week_24_std_neg,prev_week_25_neg,prev_week_25_mean_neg,prev_week_25_std_neg,prev_week_26_neg,prev_week_26_mean_neg,prev_week_26_std_neg,prev_week_27_neg,prev_week_27_mean_neg,prev_week_27_std_neg,prev_week_28_neg,prev_week_28_mean_neg,prev_week_28_std_neg,prev_week_29_neg,prev_week_29_mean_neg,prev_week_29_std_neg,prev_week_30_neg,prev_week_30_mean_neg,prev_week_30_std_neg,prev_week_31_neg,prev_week_31_mean_neg,prev_week_31_std_neg,prev_week_32_neg,prev_week_32_mean_neg,prev_week_32_std_neg,prev_week_33_neg,prev_week_33_mean_neg,prev_week_33_std_neg,prev_week_34_neg,prev_week_34_mean_neg,prev_week_34_std_neg,prev_month_1_mean_neg,prev_month_1_std_neg,prev_month_1_median_neg,prev_month_1_max_neg,prev_month_1_min_neg,prev_month_2_mean_neg,prev_month_2_std_neg,prev_month_2_median_neg,prev_month_2_max_neg,prev_month_2_min_neg,prev_month_3_mean_neg,prev_month_3_std_neg,prev_month_3_median_neg,prev_month_3_max_neg,prev_month_3_min_neg,prev_month_4_mean_neg,prev_month_4_std_neg,prev_month_4_median_neg,prev_month_4_max_neg,prev_month_4_min_neg,prev_month_5_mean_neg,prev_month_5_std_neg,prev_month_5_median_neg,prev_month_5_max_neg,prev_month_5_min_neg,prev_month_6_mean_neg,prev_month_6_std_neg,prev_month_6_median_neg,prev_month_6_max_neg,prev_month_6_min_neg,prev_month_7_mean_neg,prev_month_7_std_neg,prev_month_7_median_neg,prev_month_7_max_neg,prev_month_7_min_neg
0,4814,0,-1,5,0,0,11098744,0,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,-1,6,0,1,7881825,0,1697,15.88007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,0,0,2,6777480,0,1524,15.729116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,1,0,3,9277943,0,1937,16.043151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,2,0,4,9999757,0,1943,16.118071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train_transactions.columns.difference(test_transactions.columns)

Index(['log_neg_amount', 'log_pos_amount', 'n_transactions', 'neg_amount',
       'pos_amount'],
      dtype='object')

In [8]:
dummy_train = pd.get_dummies(train_transactions, columns=['mcc_code'])
dummy_test = pd.get_dummies(test_transactions, columns=['mcc_code'])

In [9]:
def rmsle(predicted, actual):
    assert(len(predicted) == len(actual))
    p = np.log(np.array(predicted) + 1)
    a = np.log(np.array(actual) + 1)
    return (((p - a)**2).sum() / len(predicted))**0.5

def rmsle_by_logs(predicted, actual):
    assert(len(predicted) == len(actual))
    return (((predicted - actual)**2).sum() / len(predicted))**0.5

def eval_model(labeled_data, target_col_name, clf, day_shifts=np.arange(90, 0, -15)):
    max_day = labeled_data.day.max()
    c = labeled_data.columns.difference([target_col_name])
    metric_by_shift = {}
    for day_shift in tqdm(day_shifts): 
        train_sample = labeled_data[labeled_data.day <= max_day - day_shift]
        test_sample = labeled_data[labeled_data.day > max_day - day_shift]
        clf.fit(train_sample[c], train_sample[target_col_name])
        predicted_volume = clf.predict(test_sample[c])
        metric_by_shift['RMSLE with {} days'.format(day_shift)] = rmsle_by_logs(
            predicted_volume, test_sample[target_col_name])
    return metric_by_shift

In [10]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression(n_jobs=-1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf)
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 6/6 [00:07<00:00,  1.18s/it]

RMSLE with 90 days: 3.4491445079570875
RMSLE with 75 days: 3.4393354156626583
RMSLE with 60 days: 3.407517429619658
RMSLE with 45 days: 3.398785683137353
RMSLE with 30 days: 3.3771007964983926
RMSLE with 15 days: 3.3888890794517343





In [19]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [02:23<00:00, 70.11s/it]

RMSLE with 60 days: 3.3513376138071163
RMSLE with 30 days: 3.3280629742555843





In [11]:
dummy_train.columns.difference(dummy_test.columns)

Index(['log_neg_amount', 'log_pos_amount', 'n_transactions', 'neg_amount',
       'pos_amount'],
      dtype='object')

In [14]:
def test_weeks_preparation(X_test):
    X_test_by_weeks = []
    current_test_week = X_test[X_test.week_num == 65].copy()
    X_test_by_weeks.append(current_test_week)
    removing_month_cols = ['prev_month_1_mean_neg', 'prev_month_1_std_neg', 'prev_month_1_median_neg', 
                           'prev_month_1_max_neg', 'prev_month_1_min_neg',
#                            'prev_month_1_mean_pos', 'prev_month_1_std_pos', 'prev_month_1_median_pos'
                          ]
    removing_week_cols = []
    for prev_week_index in range(1, 5):
        current_test_week = X_test[X_test.week_num == 65 + prev_week_index].copy()
        current_test_week.drop(removing_month_cols, 1, inplace=True)
        removing_week_cols += [
            'prev_week_{}_neg'.format(prev_week_index),
            'prev_week_{}_mean_neg'.format(prev_week_index),
            'prev_week_{}_std_neg'.format(prev_week_index),
#             'prev_week_{}_mean_pos'.format(prev_week_index),
#             'prev_week_{}_std_pos'.format(prev_week_index)
        ]
        current_test_week.drop(removing_week_cols, 1, inplace=True)
        X_test_by_weeks.append(current_test_week)
    return X_test_by_weeks
        
def get_agile_prediction(X_train, y_train, X_test, clf, test_form):
    X_test_by_weeks = test_weeks_preparation(X_test)
    out_form = pd.DataFrame()
    for prev_week_index in tqdm(range(5)):
        current_test = X_test_by_weeks[prev_week_index] 
        drop_cols = X_train.columns.difference(current_test.columns)
        current_train = X_train.drop(drop_cols, 1)
        clf.fit(current_train, y_train)
        predicted_log_volume = clf.predict(current_test)
        current_out_form = test_form[test_form.week_num == 65 + prev_week_index].copy()
        current_out_form['id'] = current_out_form[['mcc_code', 'day']].apply(
            lambda x: '-'.join(map(str, x)), axis=1)
        current_out_form['volume'] = np.expm1(predicted_log_volume)
        out_form = out_form.append(current_out_form[['id', 'volume']])
    return out_form


clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)
submit_table = get_agile_prediction(dummy_train, dummy_train.log_neg_amount, dummy_test, clf, test_transactions)
submit_table.volume = submit_table.volume.apply(lambda x: 0 if x<0 else x)
submit_table.to_csv('agile_xgb_neg.csv', index=False)
submit_table.head()

100%|██████████| 5/5 [06:05<00:00, 72.34s/it]


Unnamed: 0,id,volume
0,4814-457,8398457.0
1,4814-458,10859408.0
2,4814-459,15518697.0
3,4814-460,11134035.0
4,4814-461,11134035.0


In [15]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:45<00:00, 52.23s/it]

RMSLE with 60 days: 3.356940934148469
RMSLE with 30 days: 3.337162818770206





In [17]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=200, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:25<00:00, 42.01s/it]

RMSLE with 60 days: 3.3518261747596427
RMSLE with 30 days: 3.331754810078325





In [20]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=90, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [00:44<00:00, 21.66s/it]

RMSLE with 60 days: 3.3407178672568225
RMSLE with 30 days: 3.323414890842121





In [23]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.03, max_depth=8, n_estimators=120, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [00:55<00:00, 27.44s/it]

RMSLE with 60 days: 3.3438463351843817
RMSLE with 30 days: 3.3220452359510695





In [26]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=8, n_estimators=140, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:04<00:00, 31.01s/it]

RMSLE with 60 days: 3.388755789845918
RMSLE with 30 days: 3.367923894462559





In [27]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=8, n_estimators=200, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:28<00:00, 43.28s/it]

RMSLE with 60 days: 3.3384691257039925
RMSLE with 30 days: 3.3212799307574343





In [28]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=200, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:05<00:00, 31.85s/it]

RMSLE with 60 days: 3.3341505265049167
RMSLE with 30 days: 3.311292520519646





In [32]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=90, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [00:56<00:00, 28.16s/it]

RMSLE with 60 days: 3.3398628109227895
RMSLE with 30 days: 3.331166103135162





In [39]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=200, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:06<00:00, 32.93s/it]

RMSLE with 60 days: 3.336694081971902
RMSLE with 30 days: 3.3204042141858685





In [43]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:19<00:00, 38.28s/it]

RMSLE with 60 days: 3.3322260184622037
RMSLE with 30 days: 3.316948550088747





In [44]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=350, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:56<00:00, 58.62s/it]

RMSLE with 60 days: 3.336665311589705
RMSLE with 30 days: 3.3163455174313117





In [46]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=300, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:26<00:00, 43.75s/it]

RMSLE with 60 days: 3.3320967760734237
RMSLE with 30 days: 3.3168435262607767





# Лучший на 6 глубине с 350 деревьями

In [56]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=350, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:40<00:00, 51.12s/it]

RMSLE with 60 days: 3.336665311589705
RMSLE with 30 days: 3.3163455174313117





In [48]:
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=350, nthread=-1, reg_alpha=1, reg_lambda=1)

drop_cols = [
    #'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
    
submit_table = get_agile_prediction(
    dummy_train, 
    dummy_train.log_neg_amount, 
    dummy_test.drop(drop_cols, 1), 
    clf, test_transactions)
submit_table.volume = submit_table.volume.apply(lambda x: 0 if x<0 else x)
submit_table.to_csv('agile_xgb_neg.csv', index=False)
submit_table.head()

100%|██████████| 5/5 [04:32<00:00, 53.60s/it]


Unnamed: 0,id,volume
0,4814-457,8012014.5
1,4814-458,10241280.0
2,4814-459,14833032.0
3,4814-460,10427905.0
4,4814-461,10480826.0


In [55]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6,  n_estimators=350, min_child_weight=3,
                   nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:38<00:00, 48.74s/it]

RMSLE with 60 days: 3.3391240260969823
RMSLE with 30 days: 3.3152213437112303





In [58]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, subsample=0.3, n_estimators=350, min_child_weight=3,
                   nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [01:19<00:00, 39.24s/it]

RMSLE with 60 days: 3.326555144164365
RMSLE with 30 days: 3.3183181702384603





In [59]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.01, max_depth=6, subsample=0.3, n_estimators=750, min_child_weight=3,
                   nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [02:57<00:00, 88.26s/it]

RMSLE with 60 days: 3.3242878294765994
RMSLE with 30 days: 3.3104467482310156





In [62]:
clf = XGBRegressor(learning_rate=0.01, max_depth=6, subsample=0.3, n_estimators=750, min_child_weight=3,
                   nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    #'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
    
submit_table = get_agile_prediction(
    dummy_train, 
    dummy_train.log_neg_amount, 
    dummy_test.drop(drop_cols, 1), 
    clf, test_transactions)
submit_table.volume = submit_table.volume.apply(lambda x: 0 if x<0 else x)
submit_table.to_csv('agile_xgb_with_subsample_minchild.csv', index=False)
submit_table.head()

100%|██████████| 5/5 [08:33<00:00, 101.34s/it]


Unnamed: 0,id,volume
0,4814-457,7369921.0
1,4814-458,9932136.0
2,4814-459,13910074.0
3,4814-460,10166351.0
4,4814-461,10189277.0


In [61]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.01, max_depth=6, subsample=0.3, n_estimators=750, 
                   nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'
]
for prev_week_index in range(26,35):
    drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
                'prev_week_{}_mean_neg'.format(prev_week_index),
                'prev_week_{}_std_neg'.format(prev_week_index)]

for prev_month_index in []:
    drop_cols += ['prev_month_1_mean_neg'.format(prev_month_index), 
                  'prev_month_1_std_neg'.format(prev_month_index),
                  'prev_month_1_median_neg'.format(prev_month_index),
                  'prev_month_1_max_neg'.format(prev_month_index),
                  'prev_month_1_min_neg'.format(prev_month_index),
                 ]
metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [02:58<00:00, 86.78s/it]

RMSLE with 60 days: 3.3254445952418896
RMSLE with 30 days: 3.312567456908929





In [37]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=90, nthread=-1, reg_alpha=1, reg_lambda=1)
metric_by_shift = eval_model(
    dummy_train.drop(['log_pos_amount', 'n_transactions', 'neg_amount', 'pos_amount'], 1),
    'log_neg_amount', clf, [60, 30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 2/2 [00:43<00:00, 21.60s/it]

RMSLE with 60 days: 3.344558302950989
RMSLE with 30 days: 3.3319391522136916



