In [2]:
from xgboost import XGBRegressor

import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 100)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'svg'

from tqdm import tqdm

from statsmodels.tsa.tsatools import lagmat

import matplotlib.pylab as plt
%matplotlib inline

import os
os.chdir('..')

In [3]:
transactions = pd.read_csv('transactions.csv')
transactions['day'] = transactions.tr_datetime.apply(lambda dt: dt.split()[0]).astype(int)

# transactions['pos_amount'] = transactions.amount.apply(lambda x: 0 if x<0 else np.log(x + 1))
transactions['neg_amount'] = transactions.amount.apply(lambda x: 0 if x>0 else -x)

transactions.drop(['amount', 'term_id', 'tr_datetime'], 1, inplace=True)

test_transactions = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                                 index=np.arange(1, 31) + transactions.day.max())
test_transactions = test_transactions.unstack().reset_index().dropna(axis=1)
test_transactions.columns = ['mcc_code', 'day']


train_grid = pd.DataFrame(columns=transactions.mcc_code.unique(), 
                          index=transactions.day.unique())
train_grid = train_grid.unstack().reset_index().dropna(axis=1)
train_grid.columns = ['mcc_code', 'day']

for tr_table in tqdm([transactions, test_transactions, train_grid]):
    tr_table['week_num'] = (tr_table['day']) // 7
    tr_table['week_day'] = (tr_table['day']) % 7
    tr_table['month_num'] = tr_table['day'] // 28
    tr_table['month_day'] = tr_table['day'] % 28
    
merge_col_names = ['day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code']

train_transactions = pd.merge(
    train_grid,
    transactions.groupby(merge_col_names)[['neg_amount']].sum().reset_index(),
    how='left').fillna(0)

train_transactions = pd.merge(
    train_transactions,
    transactions.groupby(merge_col_names)[['customer_id']].count().reset_index(),
    how='left').fillna(0).astype(np.int32)
train_transactions.columns = np.hstack([train_transactions.columns[:-1], ['n_transactions']])

train_transactions['log_neg_amount'] = train_transactions.neg_amount.apply(lambda x: np.log(x + 1))
# train_transactions['log_pos_amount'] = train_transactions.pos_amount.apply(lambda x: np.log(x + 1))
train_transactions.head()

100%|██████████| 3/3 [00:00<00:00,  2.11it/s]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,n_transactions,log_neg_amount
0,4814,0,0,0,0,0,11098744,2365,16.222343
1,4814,1,0,1,0,1,7881825,1697,15.88007
2,4814,2,0,2,0,2,6777480,1524,15.729116
3,4814,3,0,3,0,3,9277943,1937,16.043151
4,4814,4,0,4,0,4,9999757,1943,16.118071


In [5]:
train = train_transactions.copy()

day_lag_max = 190

for day_shift in tqdm(np.arange(1, day_lag_max)):
    train_shift = train.copy()
    train_shift['day'] += day_shift
    train_shift['neg_day_{}'.format(day_shift)] = train_shift.log_neg_amount  
    train_shift['ntrans_day_{}'.format(day_shift)] = train_shift.n_transactions
    train_shift = train_shift[[
        'day', 'mcc_code', 'neg_day_{}'.format(day_shift),
        'ntrans_day_{}'.format(day_shift)]
    ]
    
    train_transactions = pd.merge(
        train_transactions, train_shift, 
        on=['day', 'mcc_code'],
        how='left').fillna(0)
    test_transactions = pd.merge(
        test_transactions, train_shift, 
        on=['day', 'mcc_code'],
        how='left').fillna(0)
    
train_transactions.head()

100%|██████████| 189/189 [02:18<00:00,  1.72s/it]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,neg_amount,n_transactions,log_neg_amount,neg_day_1,ntrans_day_1,neg_day_2,ntrans_day_2,neg_day_3,ntrans_day_3,neg_day_4,ntrans_day_4,neg_day_5,ntrans_day_5,neg_day_6,ntrans_day_6,neg_day_7,ntrans_day_7,neg_day_8,ntrans_day_8,neg_day_9,ntrans_day_9,neg_day_10,ntrans_day_10,neg_day_11,ntrans_day_11,neg_day_12,ntrans_day_12,neg_day_13,ntrans_day_13,neg_day_14,ntrans_day_14,neg_day_15,ntrans_day_15,neg_day_16,ntrans_day_16,neg_day_17,ntrans_day_17,neg_day_18,ntrans_day_18,neg_day_19,ntrans_day_19,neg_day_20,ntrans_day_20,neg_day_21,ntrans_day_21,neg_day_22,ntrans_day_22,neg_day_23,ntrans_day_23,neg_day_24,ntrans_day_24,neg_day_25,ntrans_day_25,neg_day_26,ntrans_day_26,neg_day_27,ntrans_day_27,neg_day_28,ntrans_day_28,neg_day_29,ntrans_day_29,neg_day_30,ntrans_day_30,neg_day_31,ntrans_day_31,neg_day_32,ntrans_day_32,neg_day_33,ntrans_day_33,neg_day_34,ntrans_day_34,neg_day_35,ntrans_day_35,neg_day_36,ntrans_day_36,neg_day_37,ntrans_day_37,neg_day_38,ntrans_day_38,neg_day_39,ntrans_day_39,neg_day_40,ntrans_day_40,neg_day_41,ntrans_day_41,neg_day_42,ntrans_day_42,neg_day_43,ntrans_day_43,neg_day_44,ntrans_day_44,neg_day_45,ntrans_day_45,neg_day_46,...,neg_day_140,ntrans_day_140,neg_day_141,ntrans_day_141,neg_day_142,ntrans_day_142,neg_day_143,ntrans_day_143,neg_day_144,ntrans_day_144,neg_day_145,ntrans_day_145,neg_day_146,ntrans_day_146,neg_day_147,ntrans_day_147,neg_day_148,ntrans_day_148,neg_day_149,ntrans_day_149,neg_day_150,ntrans_day_150,neg_day_151,ntrans_day_151,neg_day_152,ntrans_day_152,neg_day_153,ntrans_day_153,neg_day_154,ntrans_day_154,neg_day_155,ntrans_day_155,neg_day_156,ntrans_day_156,neg_day_157,ntrans_day_157,neg_day_158,ntrans_day_158,neg_day_159,ntrans_day_159,neg_day_160,ntrans_day_160,neg_day_161,ntrans_day_161,neg_day_162,ntrans_day_162,neg_day_163,ntrans_day_163,neg_day_164,ntrans_day_164,neg_day_165,ntrans_day_165,neg_day_166,ntrans_day_166,neg_day_167,ntrans_day_167,neg_day_168,ntrans_day_168,neg_day_169,ntrans_day_169,neg_day_170,ntrans_day_170,neg_day_171,ntrans_day_171,neg_day_172,ntrans_day_172,neg_day_173,ntrans_day_173,neg_day_174,ntrans_day_174,neg_day_175,ntrans_day_175,neg_day_176,ntrans_day_176,neg_day_177,ntrans_day_177,neg_day_178,ntrans_day_178,neg_day_179,ntrans_day_179,neg_day_180,ntrans_day_180,neg_day_181,ntrans_day_181,neg_day_182,ntrans_day_182,neg_day_183,ntrans_day_183,neg_day_184,ntrans_day_184,neg_day_185,ntrans_day_185,neg_day_186,ntrans_day_186,neg_day_187,ntrans_day_187,neg_day_188,ntrans_day_188,neg_day_189,ntrans_day_189
0,4814,0,0,0,0,0,11098744,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,0,1,0,1,7881825,1697,15.88007,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,2,0,2,6777480,1524,15.729116,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,3,0,3,9277943,1937,16.043151,15.729116,1524.0,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,4,0,4,9999757,1943,16.118071,16.043151,1937.0,15.729116,1524.0,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
dummy_train = pd.get_dummies(train_transactions, columns=['week_day', 'mcc_code'])
dummy_test = pd.get_dummies(test_transactions, columns=['week_day', 'mcc_code'])

def rmsle(predicted, actual):
    assert(len(predicted) == len(actual))
    p = np.log(np.array(predicted) + 1)
    a = np.log(np.array(actual) + 1)
    return (((p - a)**2).sum() / len(predicted))**0.5

def rmsle_by_logs(predicted, actual):
    assert(len(predicted) == len(actual))
    return (((predicted - actual)**2).sum() / len(predicted))**0.5

def eval_model(labeled_data, target_col_name, clf, day_shifts=np.arange(90, 0, -15)):
    max_day = labeled_data.day.max()
    c = labeled_data.columns.difference([target_col_name])
    metric_by_shift = {}
    for day_shift in tqdm(day_shifts): 
        train_sample = labeled_data[labeled_data.day <= max_day - day_shift]
        test_sample = labeled_data[labeled_data.day > max_day - day_shift]
        clf.fit(train_sample[c], train_sample[target_col_name])
        predicted_volume = clf.predict(test_sample[c])
        metric_by_shift['RMSLE with {} days'.format(day_shift)] = rmsle_by_logs(
            predicted_volume, test_sample[target_col_name])
    return metric_by_shift

def test_weeks_preparation(X_test):
    X_test_by_weeks = []
    current_test_week = X_test[X_test.week_num == 65].copy()
    X_test_by_weeks.append(current_test_week)
    removing_month_cols = ['prev_month_1_mean_neg', 'prev_month_1_std_neg', 'prev_month_1_median_neg', 
                           'prev_month_1_max_neg', 'prev_month_1_min_neg',
#                            'prev_month_1_mean_pos', 'prev_month_1_std_pos', 'prev_month_1_median_pos'
                          ]
    removing_week_cols = []
    for prev_week_index in range(1, 5):
        current_test_week = X_test[X_test.week_num == 65 + prev_week_index].copy()
        current_test_week.drop(removing_month_cols, 1, inplace=True)
        removing_week_cols += [
            'prev_week_{}_neg'.format(prev_week_index),
            'prev_week_{}_mean_neg'.format(prev_week_index),
            'prev_week_{}_std_neg'.format(prev_week_index),
            'prev_week_{}_n_trans'.format(prev_week_index),
            'prev_week_{}_mean_n_trans'.format(prev_week_index),
            'prev_week_{}_std_n_trans'.format(prev_week_index),
            
        ]
        current_test_week.drop(removing_week_cols, 1, inplace=True)
        X_test_by_weeks.append(current_test_week)
    return X_test_by_weeks

def eval_nestimators(X, y, days_for_test, clf, early_stop=100):
    max_day = X.day.max()
    X_train = X[X.day <= max_day - days_for_test]
    y_train = y[X.day <= max_day - days_for_test]
    X_test = X[X.day > max_day - days_for_test]
    y_test = y[X.day > max_day - days_for_test]
    
    eval_set  = [(X_train, y_train), (X_test, y_test)]
    fit = clf.fit(X_train, y_train, early_stopping_rounds=early_stop, eval_metric='rmse', eval_set=eval_set)
    return clf.best_iteration, fit

def get_agile_prediction(X_train, y_train, X_test, clf, test_form):
    X_test_by_weeks = test_weeks_preparation(X_test)
    out_form = pd.DataFrame()
    for prev_week_index in tqdm(range(5)):
        clf.n_estimators = 3000
        current_test = X_test_by_weeks[prev_week_index] 
        drop_cols = X_train.columns.difference(current_test.columns)
        current_train = X_train.drop(drop_cols, 1)
    
        clf.n_estimators = eval_nestimators(current_train, y_train, 30, clf)
        print(clf.n_estimators)
        clf.fit(current_train, y_train)
        predicted_log_volume = clf.predict(current_test)
        
        current_out_form = test_form[test_form.week_num == 65 + prev_week_index].copy()
        current_out_form['id'] = current_out_form[['mcc_code', 'day']].apply(
            lambda x: '-'.join(map(str, x)), axis=1)
        current_out_form['volume'] = np.expm1(predicted_log_volume)
        out_form = out_form.append(current_out_form[['id', 'volume']])
    return out_form

In [7]:
trans_cols = [col for col in dummy_train.columns if 'ntrans_day' in col]
neg_cols = [col for col in dummy_train.columns if 'neg_day' in col]
MAX_NEG_DAY = day_lag_max
mcc_cols = [col for col in dummy_train.columns if 'mcc_code_' in col]
weekday_cols = [col for col in dummy_train.columns if 'week_day_' in col]
other_cols = dummy_train.drop(trans_cols + neg_cols + mcc_cols + weekday_cols, 1).columns.values.tolist()

for week in tqdm(range(0, MAX_NEG_DAY // 7)):
    current_week = dummy_train[
        ['neg_day_{}'.format(7 * week + weekday) for weekday in range(1,8)]]
    dummy_train['neg_week_mean_{}'.format(week + 1)] = current_week.mean(axis=1)
    dummy_train['neg_week_std_{}'.format(week + 1)] = current_week.std(axis=1)
    dummy_train['neg_week_max_{}'.format(week + 1)] = current_week.max(axis=1)
    dummy_train['neg_week_min_{}'.format(week + 1)] = current_week.min(axis=1)
dummy_train.head()

week_stat_cols = dummy_train.drop(
    trans_cols + neg_cols + mcc_cols + weekday_cols + other_cols, 1).columns.values.tolist()
# print(week_stat_cols)

for month in tqdm(range(0, MAX_NEG_DAY // 28)):
    current_month = dummy_train[
        ['neg_day_{}'.format(28 * month + monthday) for monthday in range(1,29)]]
    dummy_train['neg_month_mean_{}'.format(month + 1)] = current_month.mean(axis=1)
    dummy_train['neg_month_std_{}'.format(month + 1)] = current_month.std(axis=1)
    dummy_train['neg_month_max_{}'.format(month + 1)] = current_month.max(axis=1)
    dummy_train['neg_month_min_{}'.format(month + 1)] = current_month.min(axis=1)
    
month_stat_cols = dummy_train.drop(
    trans_cols + neg_cols + mcc_cols + weekday_cols + other_cols + week_stat_cols, 1).columns.values.tolist()
# print(month_stat_cols)

100%|██████████| 27/27 [00:05<00:00,  4.15it/s]
100%|██████████| 6/6 [00:01<00:00,  3.72it/s]


In [34]:
dummy_train.shape

(84088, 708)

In [24]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount', 'week_num', 'month_num', 'month_day'
]
drop_cols.extend([neg_cols[i - 1] for i in range(1, 190) if i % 7])
drop_cols.extend([trans_cols[i - 1] for i in range(1, 190) if i % 7])
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80678	validation_1-rmse:10.1456
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.6357	validation_1-rmse:9.96497
[2]	validation_0-rmse:9.46847	validation_1-rmse:9.78844
[3]	validation_0-rmse:9.30502	validation_1-rmse:9.61579
[4]	validation_0-rmse:9.14528	validation_1-rmse:9.44698
[5]	validation_0-rmse:8.98916	validation_1-rmse:9.28168
[6]	validation_0-rmse:8.8366	validation_1-rmse:9.12044
[7]	validation_0-rmse:8.68751	validation_1-rmse:8.96286
[8]	validation_0-rmse:8.54184	validation_1-rmse:8.80871
[9]	validation_0-rmse:8.39959	validation_1-rmse:8.65798
[10]	validation_0-rmse:8.2606	validation_1-rmse:8.51087
[11]	validation_0-rmse:8.12486	validation_1-rmse:8.36718
[12]	validation_0-rmse:7.99238	validation_1-rmse:8.2266
[13]	validation_0-rmse:7.86294	validation_1-rmse:8.08935
[14]	validation_0-rmse:7.73657	validation_1-rmse:7.95529


[141]	validation_0-rmse:3.42795	validation_1-rmse:3.35241
[142]	validation_0-rmse:3.42582	validation_1-rmse:3.35041
[143]	validation_0-rmse:3.42376	validation_1-rmse:3.34864
[144]	validation_0-rmse:3.42175	validation_1-rmse:3.34688
[145]	validation_0-rmse:3.41978	validation_1-rmse:3.34515
[146]	validation_0-rmse:3.41782	validation_1-rmse:3.34349
[147]	validation_0-rmse:3.416	validation_1-rmse:3.34178
[148]	validation_0-rmse:3.41416	validation_1-rmse:3.34036
[149]	validation_0-rmse:3.41234	validation_1-rmse:3.33898
[150]	validation_0-rmse:3.41067	validation_1-rmse:3.33748
[151]	validation_0-rmse:3.40903	validation_1-rmse:3.33615
[152]	validation_0-rmse:3.40738	validation_1-rmse:3.33475
[153]	validation_0-rmse:3.40582	validation_1-rmse:3.33341
[154]	validation_0-rmse:3.40427	validation_1-rmse:3.33216
[155]	validation_0-rmse:3.40266	validation_1-rmse:3.33097
[156]	validation_0-rmse:3.40119	validation_1-rmse:3.32981
[157]	validation_0-rmse:3.39977	validation_1-rmse:3.32877
[158]	validation

[283]	validation_0-rmse:3.32389	validation_1-rmse:3.29854
[284]	validation_0-rmse:3.32366	validation_1-rmse:3.29853
[285]	validation_0-rmse:3.32332	validation_1-rmse:3.29851
[286]	validation_0-rmse:3.32315	validation_1-rmse:3.29844
[287]	validation_0-rmse:3.32277	validation_1-rmse:3.29835
[288]	validation_0-rmse:3.32247	validation_1-rmse:3.29807
[289]	validation_0-rmse:3.32223	validation_1-rmse:3.29801
[290]	validation_0-rmse:3.32191	validation_1-rmse:3.29791
[291]	validation_0-rmse:3.32171	validation_1-rmse:3.29791
[292]	validation_0-rmse:3.32111	validation_1-rmse:3.29782
[293]	validation_0-rmse:3.32079	validation_1-rmse:3.2978
[294]	validation_0-rmse:3.32029	validation_1-rmse:3.2976
[295]	validation_0-rmse:3.3201	validation_1-rmse:3.29754
[296]	validation_0-rmse:3.31999	validation_1-rmse:3.29754
[297]	validation_0-rmse:3.31967	validation_1-rmse:3.29754
[298]	validation_0-rmse:3.31925	validation_1-rmse:3.29754
[299]	validation_0-rmse:3.31894	validation_1-rmse:3.29756
[300]	validation_

In [33]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=650, nthread=-1, reg_alpha=1, reg_lambda=1)

drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount', 'week_num', 'month_num', 'month_day'
]
drop_cols.extend([neg_cols[i - 1] for i in range(31, 190) if i % 7])
drop_cols.extend(trans_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 50)

[0]	validation_0-rmse:9.80678	validation_1-rmse:10.1456
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:9.6357	validation_1-rmse:9.96497
[2]	validation_0-rmse:9.46848	validation_1-rmse:9.78844
[3]	validation_0-rmse:9.30504	validation_1-rmse:9.61579
[4]	validation_0-rmse:9.14532	validation_1-rmse:9.4469
[5]	validation_0-rmse:8.98921	validation_1-rmse:9.28157
[6]	validation_0-rmse:8.83668	validation_1-rmse:9.12022
[7]	validation_0-rmse:8.68759	validation_1-rmse:8.96266
[8]	validation_0-rmse:8.54192	validation_1-rmse:8.80866
[9]	validation_0-rmse:8.39966	validation_1-rmse:8.65783
[10]	validation_0-rmse:8.26066	validation_1-rmse:8.5106
[11]	validation_0-rmse:8.12495	validation_1-rmse:8.36685
[12]	validation_0-rmse:7.99247	validation_1-rmse:8.2263
[13]	validation_0-rmse:7.863	validation_1-rmse:8.08909
[14]	validation_0-rmse:7.7366	validation_1-rmse:7.95506
[15

[142]	validation_0-rmse:3.42389	validation_1-rmse:3.35653
[143]	validation_0-rmse:3.42168	validation_1-rmse:3.35464
[144]	validation_0-rmse:3.41961	validation_1-rmse:3.35285
[145]	validation_0-rmse:3.4176	validation_1-rmse:3.35122
[146]	validation_0-rmse:3.41567	validation_1-rmse:3.3495
[147]	validation_0-rmse:3.41368	validation_1-rmse:3.34789
[148]	validation_0-rmse:3.41188	validation_1-rmse:3.34642
[149]	validation_0-rmse:3.41001	validation_1-rmse:3.345
[150]	validation_0-rmse:3.40823	validation_1-rmse:3.34341
[151]	validation_0-rmse:3.40648	validation_1-rmse:3.34206
[152]	validation_0-rmse:3.40483	validation_1-rmse:3.34071
[153]	validation_0-rmse:3.40326	validation_1-rmse:3.33956
[154]	validation_0-rmse:3.40169	validation_1-rmse:3.33808
[155]	validation_0-rmse:3.40016	validation_1-rmse:3.33671
[156]	validation_0-rmse:3.39868	validation_1-rmse:3.33546
[157]	validation_0-rmse:3.39718	validation_1-rmse:3.33425
[158]	validation_0-rmse:3.39576	validation_1-rmse:3.33295
[159]	validation_0

[284]	validation_0-rmse:3.31576	validation_1-rmse:3.30227
[285]	validation_0-rmse:3.31541	validation_1-rmse:3.30221
[286]	validation_0-rmse:3.31525	validation_1-rmse:3.30216
[287]	validation_0-rmse:3.3147	validation_1-rmse:3.30217
[288]	validation_0-rmse:3.31452	validation_1-rmse:3.30211
[289]	validation_0-rmse:3.31413	validation_1-rmse:3.30192
[290]	validation_0-rmse:3.31396	validation_1-rmse:3.30197
[291]	validation_0-rmse:3.31341	validation_1-rmse:3.30194
[292]	validation_0-rmse:3.3131	validation_1-rmse:3.30187
[293]	validation_0-rmse:3.31298	validation_1-rmse:3.30182
[294]	validation_0-rmse:3.31257	validation_1-rmse:3.30169
[295]	validation_0-rmse:3.31226	validation_1-rmse:3.30147
[296]	validation_0-rmse:3.31213	validation_1-rmse:3.30148
[297]	validation_0-rmse:3.31194	validation_1-rmse:3.30153
[298]	validation_0-rmse:3.31146	validation_1-rmse:3.30126
[299]	validation_0-rmse:3.31121	validation_1-rmse:3.30117
[300]	validation_0-rmse:3.31096	validation_1-rmse:3.30112
[301]	validation

In [12]:
sorted(clf.booster().get_score(importance_type='weight').items(), key=lambda x: -x[1])

[('day', 655),
 ('neg_month_mean_1', 459),
 ('neg_week_mean_1', 413),
 ('neg_week_std_1', 271),
 ('ntrans_day_1', 225),
 ('neg_month_std_1', 214),
 ('neg_month_mean_2', 192),
 ('neg_day_7', 186),
 ('neg_day_1', 180),
 ('neg_week_mean_2', 160),
 ('neg_day_21', 141),
 ('neg_day_14', 138),
 ('week_day_2', 128),
 ('neg_day_42', 117),
 ('neg_month_max_1', 97),
 ('neg_month_mean_3', 94),
 ('neg_day_35', 92),
 ('ntrans_day_7', 89),
 ('neg_day_56', 86),
 ('neg_week_std_2', 85),
 ('neg_week_std_20', 80),
 ('neg_week_std_6', 79),
 ('neg_week_max_1', 79),
 ('neg_month_std_6', 79),
 ('neg_month_std_2', 78),
 ('neg_week_std_14', 78),
 ('neg_day_16', 77),
 ('neg_day_49', 76),
 ('ntrans_day_14', 74),
 ('neg_week_mean_3', 73),
 ('neg_day_70', 72),
 ('neg_day_28', 68),
 ('neg_week_max_2', 67),
 ('neg_week_std_10', 66),
 ('neg_week_std_5', 63),
 ('neg_week_std_19', 63),
 ('neg_day_2', 62),
 ('neg_month_std_3', 62),
 ('neg_week_std_3', 61),
 ('neg_week_max_3', 61),
 ('neg_week_std_4', 59),
 ('ntrans_day_

In [61]:
def get_train_cols(train):
    drop_cols = [
        'n_transactions', 'neg_amount', 'log_neg_amount', 'week_num', 'month_num', 'month_day'
    ]
    drop_cols.extend([neg_cols[i - 1] for i in range(31, 190) if i % 7])
    drop_cols.extend(trans_cols)
    return train.drop(drop_cols, 1)
  
train = get_train_cols(dummy_train)

def prepare_test_day_cols(test, current_day):
    test_sample = test[test.day == current_day]
    for week in range(0, MAX_NEG_DAY // 7):
        current_week = test_sample[
            ['neg_day_{}'.format(7 * week + weekday) for weekday in range(1,8)]]
        test_sample['neg_week_mean_{}'.format(week + 1)] = current_week.mean(axis=1)
        test_sample['neg_week_std_{}'.format(week + 1)] = current_week.std(axis=1)
        test_sample['neg_week_max_{}'.format(week + 1)] = current_week.max(axis=1)
        test_sample['neg_week_min_{}'.format(week + 1)] = current_week.min(axis=1)


    week_stat_cols = dummy_train.drop(
        trans_cols + neg_cols + mcc_cols + weekday_cols + other_cols, 1).columns.values.tolist()

    for month in range(0, MAX_NEG_DAY // 28):
        current_month = test_sample[
            ['neg_day_{}'.format(28 * month + monthday) for monthday in range(1,29)]]
        test_sample['neg_month_mean_{}'.format(month + 1)] = current_month.mean(axis=1)
        test_sample['neg_month_std_{}'.format(month + 1)] = current_month.std(axis=1)
        test_sample['neg_month_max_{}'.format(month + 1)] = current_month.max(axis=1)
        test_sample['neg_month_min_{}'.format(month + 1)] = current_month.min(axis=1)
        
        test_drop_cols = [
            'week_num', 'month_num', 'month_day'
        ]
        test_drop_cols.extend([neg_cols[i - 1] for i in range(31, 190) if i % 7])
        test_drop_cols.extend(trans_cols)
    return test_sample.drop(test_drop_cols, 1)

def predict_month(clf, test, test_form):
    edit_test = test.copy()
    prev_predicts = []
    for day_index, current_day in tqdm(enumerate(range(test.day.min(), test.day.max() + 1), 1)):
        test_sample = prepare_test_day_cols(edit_test, current_day)
        for i in range(1, day_index):
            test_sample['neg_day_{}'.format(i)] = prev_predicts[-i].volume
        current_out_form = test_form[test_form.day == current_day][['mcc_code', 'day']]
        current_out_form['volume'] = clf.predict(test_sample)
        prev_predicts.append(current_out_form)
    
    out_form = pd.DataFrame()
    for table in tqdm(prev_predicts):
        table['id'] = table[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
        table['volume'] = table.volume.apply(np.expm1)
        out_form = out_form.append(table[['id', 'volume']])
    return out_form
predict_month(clf, dummy_test, test_transactions)

30it [03:47,  9.06s/it]
100%|██████████| 30/30 [00:00<00:00, 50.77it/s]


Unnamed: 0,id,volume
0,4814-457,8.065794e+06
30,6011-457,1.971681e+08
60,4829-457,1.843008e+08
90,5499-457,5.581186e+06
120,5541-457,6.974720e+06
150,5200-457,1.189399e+06
180,5732-457,2.784418e+06
210,6010-457,1.346335e+05
240,5331-457,3.859958e+06
270,5912-457,3.461652e+06


In [63]:
prediction = _.sort_index()
prediction.head()

Unnamed: 0,id,volume
0,4814-457,8065794.0
1,4814-458,6448637.0
2,4814-459,2645912.0
3,4814-460,3220414.0
4,4814-461,149136.0


In [64]:
prediction.to_csv('day_by_day.csv', index=False)

In [65]:
dummy_test.head()

Unnamed: 0,day,week_num,month_num,month_day,neg_day_1,ntrans_day_1,neg_day_2,ntrans_day_2,neg_day_3,ntrans_day_3,neg_day_4,ntrans_day_4,neg_day_5,ntrans_day_5,neg_day_6,ntrans_day_6,neg_day_7,ntrans_day_7,neg_day_8,ntrans_day_8,neg_day_9,ntrans_day_9,neg_day_10,ntrans_day_10,neg_day_11,ntrans_day_11,neg_day_12,ntrans_day_12,neg_day_13,ntrans_day_13,neg_day_14,ntrans_day_14,neg_day_15,ntrans_day_15,neg_day_16,ntrans_day_16,neg_day_17,ntrans_day_17,neg_day_18,ntrans_day_18,neg_day_19,ntrans_day_19,neg_day_20,ntrans_day_20,neg_day_21,ntrans_day_21,neg_day_22,ntrans_day_22,neg_day_23,ntrans_day_23,neg_day_24,ntrans_day_24,neg_day_25,ntrans_day_25,neg_day_26,ntrans_day_26,neg_day_27,ntrans_day_27,neg_day_28,ntrans_day_28,neg_day_29,ntrans_day_29,neg_day_30,ntrans_day_30,neg_day_31,ntrans_day_31,neg_day_32,ntrans_day_32,neg_day_33,ntrans_day_33,neg_day_34,ntrans_day_34,neg_day_35,ntrans_day_35,neg_day_36,ntrans_day_36,neg_day_37,ntrans_day_37,neg_day_38,ntrans_day_38,neg_day_39,ntrans_day_39,neg_day_40,ntrans_day_40,neg_day_41,ntrans_day_41,neg_day_42,ntrans_day_42,neg_day_43,ntrans_day_43,neg_day_44,ntrans_day_44,neg_day_45,ntrans_day_45,neg_day_46,ntrans_day_46,neg_day_47,ntrans_day_47,neg_day_48,ntrans_day_48,...,mcc_code_5722,mcc_code_5732,mcc_code_5733,mcc_code_5734,mcc_code_5735,mcc_code_5811,mcc_code_5812,mcc_code_5813,mcc_code_5814,mcc_code_5816,mcc_code_5912,mcc_code_5921,mcc_code_5931,mcc_code_5940,mcc_code_5941,mcc_code_5942,mcc_code_5943,mcc_code_5944,mcc_code_5945,mcc_code_5946,mcc_code_5947,mcc_code_5948,mcc_code_5949,mcc_code_5950,mcc_code_5964,mcc_code_5965,mcc_code_5967,mcc_code_5968,mcc_code_5969,mcc_code_5970,mcc_code_5971,mcc_code_5976,mcc_code_5977,mcc_code_5983,mcc_code_5992,mcc_code_5993,mcc_code_5994,mcc_code_5995,mcc_code_5999,mcc_code_6010,mcc_code_6011,mcc_code_6012,mcc_code_6051,mcc_code_6211,mcc_code_6300,mcc_code_6513,mcc_code_6536,mcc_code_7011,mcc_code_7210,mcc_code_7216,mcc_code_7221,mcc_code_7230,mcc_code_7273,mcc_code_7278,mcc_code_7298,mcc_code_7299,mcc_code_7311,mcc_code_7338,mcc_code_7372,mcc_code_7375,mcc_code_7395,mcc_code_7399,mcc_code_7512,mcc_code_7523,mcc_code_7531,mcc_code_7538,mcc_code_7542,mcc_code_7629,mcc_code_7699,mcc_code_7829,mcc_code_7832,mcc_code_7841,mcc_code_7922,mcc_code_7932,mcc_code_7933,mcc_code_7991,mcc_code_7993,mcc_code_7994,mcc_code_7995,mcc_code_7996,mcc_code_7997,mcc_code_7999,mcc_code_8011,mcc_code_8021,mcc_code_8043,mcc_code_8062,mcc_code_8071,mcc_code_8099,mcc_code_8220,mcc_code_8244,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
0,457,65,16,9,16.267132,2236.0,16.210361,2380.0,16.240256,2414.0,16.297775,2295.0,16.878214,2470.0,16.275456,2467.0,15.989156,1953.0,16.088563,2068.0,16.281234,2536.0,16.205387,2422.0,16.225407,2455.0,16.371756,2386.0,16.193185,2374.0,15.995234,2009.0,16.0656,2051.0,16.29311,2610.0,16.268241,2444.0,16.15954,2329.0,16.271099,2442.0,16.303271,2437.0,15.935668,1872.0,15.958657,1926.0,16.230722,2452.0,16.257208,2327.0,16.301091,2200.0,16.244498,2304.0,16.217647,2324.0,15.909834,1841.0,16.000073,2015.0,16.394115,2563,16.513388,2901,16.522233,2696,16.397969,2509,16.273085,2533,16.132775,2061,15.90955,2001,16.251377,2462,16.226801,2421,16.315595,2423,16.330693,2446,16.294113,2566,15.858058,1940,16.013587,2060,16.272356,2453,16.247154,2546,16.130758,2401,16.271559,2504,16.235268,2432,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,458,65,16,10,0.0,0.0,16.267132,2236.0,16.210361,2380.0,16.240256,2414.0,16.297775,2295.0,16.878214,2470.0,16.275456,2467.0,15.989156,1953.0,16.088563,2068.0,16.281234,2536.0,16.205387,2422.0,16.225407,2455.0,16.371756,2386.0,16.193185,2374.0,15.995234,2009.0,16.0656,2051.0,16.29311,2610.0,16.268241,2444.0,16.15954,2329.0,16.271099,2442.0,16.303271,2437.0,15.935668,1872.0,15.958657,1926.0,16.230722,2452.0,16.257208,2327.0,16.301091,2200.0,16.244498,2304.0,16.217647,2324.0,15.909834,1841.0,16.000073,2015,16.394115,2563,16.513388,2901,16.522233,2696,16.397969,2509,16.273085,2533,16.132775,2061,15.90955,2001,16.251377,2462,16.226801,2421,16.315595,2423,16.330693,2446,16.294113,2566,15.858058,1940,16.013587,2060,16.272356,2453,16.247154,2546,16.130758,2401,16.271559,2504,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,459,65,16,11,0.0,0.0,0.0,0.0,16.267132,2236.0,16.210361,2380.0,16.240256,2414.0,16.297775,2295.0,16.878214,2470.0,16.275456,2467.0,15.989156,1953.0,16.088563,2068.0,16.281234,2536.0,16.205387,2422.0,16.225407,2455.0,16.371756,2386.0,16.193185,2374.0,15.995234,2009.0,16.0656,2051.0,16.29311,2610.0,16.268241,2444.0,16.15954,2329.0,16.271099,2442.0,16.303271,2437.0,15.935668,1872.0,15.958657,1926.0,16.230722,2452.0,16.257208,2327.0,16.301091,2200.0,16.244498,2304.0,16.217647,2324.0,15.909834,1841,16.000073,2015,16.394115,2563,16.513388,2901,16.522233,2696,16.397969,2509,16.273085,2533,16.132775,2061,15.90955,2001,16.251377,2462,16.226801,2421,16.315595,2423,16.330693,2446,16.294113,2566,15.858058,1940,16.013587,2060,16.272356,2453,16.247154,2546,16.130758,2401,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,460,65,16,12,0.0,0.0,0.0,0.0,0.0,0.0,16.267132,2236.0,16.210361,2380.0,16.240256,2414.0,16.297775,2295.0,16.878214,2470.0,16.275456,2467.0,15.989156,1953.0,16.088563,2068.0,16.281234,2536.0,16.205387,2422.0,16.225407,2455.0,16.371756,2386.0,16.193185,2374.0,15.995234,2009.0,16.0656,2051.0,16.29311,2610.0,16.268241,2444.0,16.15954,2329.0,16.271099,2442.0,16.303271,2437.0,15.935668,1872.0,15.958657,1926.0,16.230722,2452.0,16.257208,2327.0,16.301091,2200.0,16.244498,2304.0,16.217647,2324,15.909834,1841,16.000073,2015,16.394115,2563,16.513388,2901,16.522233,2696,16.397969,2509,16.273085,2533,16.132775,2061,15.90955,2001,16.251377,2462,16.226801,2421,16.315595,2423,16.330693,2446,16.294113,2566,15.858058,1940,16.013587,2060,16.272356,2453,16.247154,2546,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,461,65,16,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.267132,2236.0,16.210361,2380.0,16.240256,2414.0,16.297775,2295.0,16.878214,2470.0,16.275456,2467.0,15.989156,1953.0,16.088563,2068.0,16.281234,2536.0,16.205387,2422.0,16.225407,2455.0,16.371756,2386.0,16.193185,2374.0,15.995234,2009.0,16.0656,2051.0,16.29311,2610.0,16.268241,2444.0,16.15954,2329.0,16.271099,2442.0,16.303271,2437.0,15.935668,1872.0,15.958657,1926.0,16.230722,2452.0,16.257208,2327.0,16.301091,2200.0,16.244498,2304,16.217647,2324,15.909834,1841,16.000073,2015,16.394115,2563,16.513388,2901,16.522233,2696,16.397969,2509,16.273085,2533,16.132775,2061,15.90955,2001,16.251377,2462,16.226801,2421,16.315595,2423,16.330693,2446,16.294113,2566,15.858058,1940,16.013587,2060,16.272356,2453,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount', 'week_num', 'month_num', 'month_day'
]
drop_cols.extend([neg_cols[i - 1] for i in range(1, 190) if i % 7])
drop_cols.extend([trans_cols[i - 1] for i in range(1, 190) if i % 7])
drop_cols.extend(mcc_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80678	validation_1-rmse:10.1456
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.6357	validation_1-rmse:9.96497
[2]	validation_0-rmse:9.46847	validation_1-rmse:9.78844
[3]	validation_0-rmse:9.30502	validation_1-rmse:9.61579
[4]	validation_0-rmse:9.14528	validation_1-rmse:9.44698
[5]	validation_0-rmse:8.98916	validation_1-rmse:9.28168
[6]	validation_0-rmse:8.8366	validation_1-rmse:9.12044
[7]	validation_0-rmse:8.68751	validation_1-rmse:8.96286
[8]	validation_0-rmse:8.54184	validation_1-rmse:8.80871
[9]	validation_0-rmse:8.39959	validation_1-rmse:8.65798
[10]	validation_0-rmse:8.2606	validation_1-rmse:8.51087
[11]	validation_0-rmse:8.12486	validation_1-rmse:8.36718
[12]	validation_0-rmse:7.99239	validation_1-rmse:8.2266
[13]	validation_0-rmse:7.86295	validation_1-rmse:8.08935
[14]	validation_0-rmse:7.73658	validation_1-rmse:7.95529


[142]	validation_0-rmse:3.42639	validation_1-rmse:3.35043
[143]	validation_0-rmse:3.42433	validation_1-rmse:3.34844
[144]	validation_0-rmse:3.4223	validation_1-rmse:3.34637
[145]	validation_0-rmse:3.42034	validation_1-rmse:3.34469
[146]	validation_0-rmse:3.41836	validation_1-rmse:3.34316
[147]	validation_0-rmse:3.41648	validation_1-rmse:3.34165
[148]	validation_0-rmse:3.41465	validation_1-rmse:3.34018
[149]	validation_0-rmse:3.4129	validation_1-rmse:3.33869
[150]	validation_0-rmse:3.41108	validation_1-rmse:3.33724
[151]	validation_0-rmse:3.40943	validation_1-rmse:3.3359
[152]	validation_0-rmse:3.40778	validation_1-rmse:3.33455
[153]	validation_0-rmse:3.40617	validation_1-rmse:3.33324
[154]	validation_0-rmse:3.40452	validation_1-rmse:3.33191
[155]	validation_0-rmse:3.40302	validation_1-rmse:3.33065
[156]	validation_0-rmse:3.40159	validation_1-rmse:3.32965
[157]	validation_0-rmse:3.40023	validation_1-rmse:3.32855
[158]	validation_0-rmse:3.39882	validation_1-rmse:3.32751
[159]	validation_

[284]	validation_0-rmse:3.322	validation_1-rmse:3.29853
[285]	validation_0-rmse:3.32157	validation_1-rmse:3.29871
[286]	validation_0-rmse:3.32122	validation_1-rmse:3.29846
[287]	validation_0-rmse:3.32108	validation_1-rmse:3.29846
[288]	validation_0-rmse:3.32073	validation_1-rmse:3.29821
[289]	validation_0-rmse:3.32063	validation_1-rmse:3.29818
[290]	validation_0-rmse:3.3205	validation_1-rmse:3.29815
[291]	validation_0-rmse:3.32006	validation_1-rmse:3.29811
[292]	validation_0-rmse:3.31986	validation_1-rmse:3.29817
[293]	validation_0-rmse:3.31968	validation_1-rmse:3.29821
[294]	validation_0-rmse:3.31911	validation_1-rmse:3.29818
[295]	validation_0-rmse:3.31874	validation_1-rmse:3.29814
[296]	validation_0-rmse:3.31855	validation_1-rmse:3.2981
[297]	validation_0-rmse:3.31812	validation_1-rmse:3.29824
[298]	validation_0-rmse:3.31769	validation_1-rmse:3.29804
[299]	validation_0-rmse:3.31716	validation_1-rmse:3.29805
[300]	validation_0-rmse:3.31704	validation_1-rmse:3.29805
[301]	validation_0

In [30]:
', '.join(dummy_train.drop(drop_cols, 1).columns)

'day, neg_day_7, ntrans_day_7, neg_day_14, ntrans_day_14, neg_day_21, ntrans_day_21, neg_day_28, ntrans_day_28, neg_day_35, ntrans_day_35, neg_day_42, ntrans_day_42, neg_day_49, ntrans_day_49, neg_day_56, ntrans_day_56, neg_day_63, ntrans_day_63, neg_day_70, ntrans_day_70, neg_day_77, ntrans_day_77, neg_day_84, ntrans_day_84, neg_day_91, ntrans_day_91, neg_day_98, ntrans_day_98, neg_day_105, ntrans_day_105, neg_day_112, ntrans_day_112, neg_day_119, ntrans_day_119, neg_day_126, ntrans_day_126, neg_day_133, ntrans_day_133, neg_day_140, ntrans_day_140, neg_day_147, ntrans_day_147, neg_day_154, ntrans_day_154, neg_day_161, ntrans_day_161, neg_day_168, ntrans_day_168, neg_day_175, ntrans_day_175, neg_day_182, ntrans_day_182, neg_day_189, ntrans_day_189, week_day_0, week_day_1, week_day_2, week_day_3, week_day_4, week_day_5, week_day_6, neg_week_mean_1, neg_week_std_1, neg_week_max_1, neg_week_min_1, neg_week_mean_2, neg_week_std_2, neg_week_max_2, neg_week_min_2, neg_week_mean_3, neg_week_s

In [19]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
# drop_cols.extend(trans_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80684	validation_1-rmse:10.1457
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.63584	validation_1-rmse:9.96501
[2]	validation_0-rmse:9.46867	validation_1-rmse:9.78851
[3]	validation_0-rmse:9.30529	validation_1-rmse:9.61588
[4]	validation_0-rmse:9.14564	validation_1-rmse:9.44699
[5]	validation_0-rmse:8.98958	validation_1-rmse:9.28161
[6]	validation_0-rmse:8.83711	validation_1-rmse:9.1203
[7]	validation_0-rmse:8.68811	validation_1-rmse:8.96252
[8]	validation_0-rmse:8.5426	validation_1-rmse:8.80808
[9]	validation_0-rmse:8.4005	validation_1-rmse:8.65732
[10]	validation_0-rmse:8.26156	validation_1-rmse:8.51006
[11]	validation_0-rmse:8.12603	validation_1-rmse:8.36622
[12]	validation_0-rmse:7.99361	validation_1-rmse:8.22587
[13]	validation_0-rmse:7.86434	validation_1-rmse:8.08865
[14]	validation_0-rmse:7.7379	validation_1-rmse:7.95471


[141]	validation_0-rmse:3.41926	validation_1-rmse:3.3464
[142]	validation_0-rmse:3.41701	validation_1-rmse:3.34436
[143]	validation_0-rmse:3.41484	validation_1-rmse:3.34243
[144]	validation_0-rmse:3.41277	validation_1-rmse:3.34049
[145]	validation_0-rmse:3.41075	validation_1-rmse:3.33871
[146]	validation_0-rmse:3.40876	validation_1-rmse:3.33712
[147]	validation_0-rmse:3.40687	validation_1-rmse:3.3356
[148]	validation_0-rmse:3.40496	validation_1-rmse:3.33406
[149]	validation_0-rmse:3.40312	validation_1-rmse:3.33244
[150]	validation_0-rmse:3.40136	validation_1-rmse:3.33091
[151]	validation_0-rmse:3.39965	validation_1-rmse:3.3294
[152]	validation_0-rmse:3.39799	validation_1-rmse:3.32791
[153]	validation_0-rmse:3.39632	validation_1-rmse:3.32668
[154]	validation_0-rmse:3.39472	validation_1-rmse:3.3253
[155]	validation_0-rmse:3.39317	validation_1-rmse:3.32409
[156]	validation_0-rmse:3.39165	validation_1-rmse:3.32291
[157]	validation_0-rmse:3.3901	validation_1-rmse:3.32183
[158]	validation_0-

[283]	validation_0-rmse:3.30769	validation_1-rmse:3.29269
[284]	validation_0-rmse:3.30714	validation_1-rmse:3.29265
[285]	validation_0-rmse:3.30699	validation_1-rmse:3.29269
[286]	validation_0-rmse:3.30645	validation_1-rmse:3.29253
[287]	validation_0-rmse:3.30606	validation_1-rmse:3.2925
[288]	validation_0-rmse:3.30567	validation_1-rmse:3.29233
[289]	validation_0-rmse:3.30526	validation_1-rmse:3.29244
[290]	validation_0-rmse:3.30514	validation_1-rmse:3.2925
[291]	validation_0-rmse:3.30453	validation_1-rmse:3.29245
[292]	validation_0-rmse:3.30435	validation_1-rmse:3.29241
[293]	validation_0-rmse:3.30416	validation_1-rmse:3.29236
[294]	validation_0-rmse:3.30362	validation_1-rmse:3.29207
[295]	validation_0-rmse:3.30318	validation_1-rmse:3.292
[296]	validation_0-rmse:3.30305	validation_1-rmse:3.29201
[297]	validation_0-rmse:3.30268	validation_1-rmse:3.29194
[298]	validation_0-rmse:3.30214	validation_1-rmse:3.2919
[299]	validation_0-rmse:3.30188	validation_1-rmse:3.29199
[300]	validation_0-

In [23]:
dummy_train = pd.get_dummies(train_transactions, columns=['week_day'])
dummy_test = pd.get_dummies(test_transactions, columns=['week_day'])

In [24]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
# drop_cols.extend(trans_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80893	validation_1-rmse:10.1481
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.63995	validation_1-rmse:9.96981
[2]	validation_0-rmse:9.47481	validation_1-rmse:9.79513
[3]	validation_0-rmse:9.31338	validation_1-rmse:9.62483
[4]	validation_0-rmse:9.15557	validation_1-rmse:9.45813
[5]	validation_0-rmse:9.00137	validation_1-rmse:9.2951
[6]	validation_0-rmse:8.85053	validation_1-rmse:9.13591
[7]	validation_0-rmse:8.70325	validation_1-rmse:8.97998
[8]	validation_0-rmse:8.55922	validation_1-rmse:8.82762
[9]	validation_0-rmse:8.41854	validation_1-rmse:8.67865
[10]	validation_0-rmse:8.28088	validation_1-rmse:8.53363
[11]	validation_0-rmse:8.14645	validation_1-rmse:8.39158
[12]	validation_0-rmse:8.01527	validation_1-rmse:8.25227
[13]	validation_0-rmse:7.88707	validation_1-rmse:8.11674
[14]	validation_0-rmse:7.76187	validation_1-rmse:7.984

[141]	validation_0-rmse:3.41468	validation_1-rmse:3.35443
[142]	validation_0-rmse:3.41216	validation_1-rmse:3.35207
[143]	validation_0-rmse:3.40983	validation_1-rmse:3.34999
[144]	validation_0-rmse:3.40743	validation_1-rmse:3.34803
[145]	validation_0-rmse:3.40514	validation_1-rmse:3.34604
[146]	validation_0-rmse:3.403	validation_1-rmse:3.34421
[147]	validation_0-rmse:3.40093	validation_1-rmse:3.34251
[148]	validation_0-rmse:3.39883	validation_1-rmse:3.34057
[149]	validation_0-rmse:3.39675	validation_1-rmse:3.33891
[150]	validation_0-rmse:3.39478	validation_1-rmse:3.33718
[151]	validation_0-rmse:3.39288	validation_1-rmse:3.33554
[152]	validation_0-rmse:3.39106	validation_1-rmse:3.33401
[153]	validation_0-rmse:3.38926	validation_1-rmse:3.33249
[154]	validation_0-rmse:3.38749	validation_1-rmse:3.33116
[155]	validation_0-rmse:3.38588	validation_1-rmse:3.32974
[156]	validation_0-rmse:3.38423	validation_1-rmse:3.32818
[157]	validation_0-rmse:3.38261	validation_1-rmse:3.32684
[158]	validation

[283]	validation_0-rmse:3.29413	validation_1-rmse:3.28844
[284]	validation_0-rmse:3.29373	validation_1-rmse:3.28838
[285]	validation_0-rmse:3.29326	validation_1-rmse:3.28847
[286]	validation_0-rmse:3.29309	validation_1-rmse:3.28841
[287]	validation_0-rmse:3.29254	validation_1-rmse:3.28833
[288]	validation_0-rmse:3.29213	validation_1-rmse:3.2885
[289]	validation_0-rmse:3.29175	validation_1-rmse:3.28859
[290]	validation_0-rmse:3.29151	validation_1-rmse:3.28855
[291]	validation_0-rmse:3.29134	validation_1-rmse:3.28852
[292]	validation_0-rmse:3.29089	validation_1-rmse:3.28844
[293]	validation_0-rmse:3.29052	validation_1-rmse:3.2885
[294]	validation_0-rmse:3.29031	validation_1-rmse:3.28833
[295]	validation_0-rmse:3.28981	validation_1-rmse:3.28859
[296]	validation_0-rmse:3.28922	validation_1-rmse:3.28883
[297]	validation_0-rmse:3.2888	validation_1-rmse:3.28875
[298]	validation_0-rmse:3.28842	validation_1-rmse:3.28887
[299]	validation_0-rmse:3.28786	validation_1-rmse:3.28875
[300]	validation_

In [42]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
# drop_cols.extend(trans_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80893	validation_1-rmse:10.1481
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.63995	validation_1-rmse:9.96981
[2]	validation_0-rmse:9.47481	validation_1-rmse:9.79513
[3]	validation_0-rmse:9.31338	validation_1-rmse:9.62483
[4]	validation_0-rmse:9.15557	validation_1-rmse:9.45813
[5]	validation_0-rmse:9.00137	validation_1-rmse:9.2951
[6]	validation_0-rmse:8.85053	validation_1-rmse:9.13591
[7]	validation_0-rmse:8.70325	validation_1-rmse:8.97998
[8]	validation_0-rmse:8.55922	validation_1-rmse:8.82762
[9]	validation_0-rmse:8.41854	validation_1-rmse:8.67865
[10]	validation_0-rmse:8.28088	validation_1-rmse:8.53363
[11]	validation_0-rmse:8.14645	validation_1-rmse:8.39158
[12]	validation_0-rmse:8.01527	validation_1-rmse:8.25227
[13]	validation_0-rmse:7.88707	validation_1-rmse:8.11674
[14]	validation_0-rmse:7.76187	validation_1-rmse:7.984

[141]	validation_0-rmse:3.41476	validation_1-rmse:3.35525
[142]	validation_0-rmse:3.41229	validation_1-rmse:3.35316
[143]	validation_0-rmse:3.40999	validation_1-rmse:3.35105
[144]	validation_0-rmse:3.40766	validation_1-rmse:3.34899
[145]	validation_0-rmse:3.40532	validation_1-rmse:3.34697
[146]	validation_0-rmse:3.40316	validation_1-rmse:3.34516
[147]	validation_0-rmse:3.40097	validation_1-rmse:3.34336
[148]	validation_0-rmse:3.39885	validation_1-rmse:3.34159
[149]	validation_0-rmse:3.39683	validation_1-rmse:3.34014
[150]	validation_0-rmse:3.39496	validation_1-rmse:3.33844
[151]	validation_0-rmse:3.39304	validation_1-rmse:3.33686
[152]	validation_0-rmse:3.39125	validation_1-rmse:3.33528
[153]	validation_0-rmse:3.38942	validation_1-rmse:3.33382
[154]	validation_0-rmse:3.38765	validation_1-rmse:3.33245
[155]	validation_0-rmse:3.38591	validation_1-rmse:3.331
[156]	validation_0-rmse:3.38424	validation_1-rmse:3.32964
[157]	validation_0-rmse:3.38259	validation_1-rmse:3.32821
[158]	validation

[283]	validation_0-rmse:3.29905	validation_1-rmse:3.29217
[284]	validation_0-rmse:3.29888	validation_1-rmse:3.29215
[285]	validation_0-rmse:3.29857	validation_1-rmse:3.29192
[286]	validation_0-rmse:3.29829	validation_1-rmse:3.29201
[287]	validation_0-rmse:3.29801	validation_1-rmse:3.29211
[288]	validation_0-rmse:3.29786	validation_1-rmse:3.29217
[289]	validation_0-rmse:3.29738	validation_1-rmse:3.29205
[290]	validation_0-rmse:3.2971	validation_1-rmse:3.29204
[291]	validation_0-rmse:3.2969	validation_1-rmse:3.29201
[292]	validation_0-rmse:3.29645	validation_1-rmse:3.29189
[293]	validation_0-rmse:3.29607	validation_1-rmse:3.2921
[294]	validation_0-rmse:3.29584	validation_1-rmse:3.29205
[295]	validation_0-rmse:3.2957	validation_1-rmse:3.29212
[296]	validation_0-rmse:3.29545	validation_1-rmse:3.29204
[297]	validation_0-rmse:3.29529	validation_1-rmse:3.29205
[298]	validation_0-rmse:3.295	validation_1-rmse:3.2922
[299]	validation_0-rmse:3.29478	validation_1-rmse:3.29219
[300]	validation_0-rm

In [43]:
sorted(clf.booster().get_score(importance_type='weight').items(), key=lambda x: -x[1])

[('day', 762),
 ('neg_day_1', 386),
 ('neg_day_7', 273),
 ('ntrans_day_1', 212),
 ('neg_day_14', 176),
 ('neg_day_6', 162),
 ('neg_day_21', 159),
 ('neg_day_2', 147),
 ('neg_day_3', 141),
 ('neg_day_28', 138),
 ('neg_day_42', 131),
 ('neg_day_35', 131),
 ('neg_day_4', 121),
 ('neg_day_5', 120),
 ('week_day_0', 117),
 ('mcc_code_6536', 105),
 ('neg_day_49', 101),
 ('neg_day_13', 100),
 ('neg_day_9', 99),
 ('neg_day_56', 96),
 ('neg_day_8', 92),
 ('ntrans_day_7', 88),
 ('neg_day_20', 86),
 ('neg_day_15', 85),
 ('neg_day_70', 85),
 ('neg_day_12', 84),
 ('neg_day_63', 77),
 ('month_day', 77),
 ('ntrans_day_2', 76),
 ('ntrans_day_4', 73),
 ('neg_day_19', 69),
 ('neg_day_11', 67),
 ('neg_day_84', 67),
 ('neg_day_87', 64),
 ('ntrans_day_3', 63),
 ('neg_day_10', 57),
 ('ntrans_day_13', 56),
 ('neg_day_77', 56),
 ('neg_day_16', 55),
 ('neg_day_27', 55),
 ('neg_day_17', 55),
 ('neg_day_41', 54),
 ('ntrans_day_14', 53),
 ('ntrans_day_6', 49),
 ('ntrans_day_8', 49),
 ('neg_day_32', 49),
 ('neg_day

In [15]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
drop_cols.extend(trans_cols)
n, fit = eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.80678	validation_1-rmse:10.1456
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.6357	validation_1-rmse:9.96497
[2]	validation_0-rmse:9.46848	validation_1-rmse:9.78844
[3]	validation_0-rmse:9.30504	validation_1-rmse:9.61579
[4]	validation_0-rmse:9.14532	validation_1-rmse:9.44689
[5]	validation_0-rmse:8.98921	validation_1-rmse:9.28152
[6]	validation_0-rmse:8.83667	validation_1-rmse:9.12016
[7]	validation_0-rmse:8.68757	validation_1-rmse:8.9626
[8]	validation_0-rmse:8.54191	validation_1-rmse:8.80847
[9]	validation_0-rmse:8.39964	validation_1-rmse:8.65768
[10]	validation_0-rmse:8.26065	validation_1-rmse:8.51058
[11]	validation_0-rmse:8.12492	validation_1-rmse:8.36687
[12]	validation_0-rmse:7.99244	validation_1-rmse:8.22638
[13]	validation_0-rmse:7.86297	validation_1-rmse:8.08924
[14]	validation_0-rmse:7.73657	validation_1-rmse:7.9551

[142]	validation_0-rmse:3.42239	validation_1-rmse:3.35726
[143]	validation_0-rmse:3.42019	validation_1-rmse:3.35544
[144]	validation_0-rmse:3.41806	validation_1-rmse:3.35374
[145]	validation_0-rmse:3.41601	validation_1-rmse:3.35202
[146]	validation_0-rmse:3.41405	validation_1-rmse:3.35028
[147]	validation_0-rmse:3.41215	validation_1-rmse:3.34883
[148]	validation_0-rmse:3.41019	validation_1-rmse:3.34709
[149]	validation_0-rmse:3.40839	validation_1-rmse:3.34566
[150]	validation_0-rmse:3.4066	validation_1-rmse:3.34398
[151]	validation_0-rmse:3.40484	validation_1-rmse:3.34274
[152]	validation_0-rmse:3.40316	validation_1-rmse:3.3412
[153]	validation_0-rmse:3.40149	validation_1-rmse:3.33991
[154]	validation_0-rmse:3.39991	validation_1-rmse:3.33839
[155]	validation_0-rmse:3.39833	validation_1-rmse:3.33709
[156]	validation_0-rmse:3.39694	validation_1-rmse:3.33594
[157]	validation_0-rmse:3.3955	validation_1-rmse:3.33476
[158]	validation_0-rmse:3.39398	validation_1-rmse:3.33363
[159]	validation_

[284]	validation_0-rmse:3.31204	validation_1-rmse:3.30543
[285]	validation_0-rmse:3.31152	validation_1-rmse:3.30563
[286]	validation_0-rmse:3.31136	validation_1-rmse:3.30558
[287]	validation_0-rmse:3.31091	validation_1-rmse:3.3056
[288]	validation_0-rmse:3.31036	validation_1-rmse:3.30553
[289]	validation_0-rmse:3.31014	validation_1-rmse:3.30554
[290]	validation_0-rmse:3.30955	validation_1-rmse:3.30553
[291]	validation_0-rmse:3.30919	validation_1-rmse:3.30557
[292]	validation_0-rmse:3.30865	validation_1-rmse:3.30555
[293]	validation_0-rmse:3.30853	validation_1-rmse:3.3055
[294]	validation_0-rmse:3.30812	validation_1-rmse:3.30554
[295]	validation_0-rmse:3.30802	validation_1-rmse:3.30557
[296]	validation_0-rmse:3.30782	validation_1-rmse:3.30553
[297]	validation_0-rmse:3.30745	validation_1-rmse:3.30535
[298]	validation_0-rmse:3.30724	validation_1-rmse:3.30546
[299]	validation_0-rmse:3.30713	validation_1-rmse:3.30541
[300]	validation_0-rmse:3.30663	validation_1-rmse:3.30538
[301]	validation

[426]	validation_0-rmse:3.26131	validation_1-rmse:3.3025
[427]	validation_0-rmse:3.261	validation_1-rmse:3.30248
[428]	validation_0-rmse:3.26089	validation_1-rmse:3.30251
[429]	validation_0-rmse:3.26072	validation_1-rmse:3.30247
[430]	validation_0-rmse:3.26041	validation_1-rmse:3.30237
[431]	validation_0-rmse:3.26007	validation_1-rmse:3.30238
[432]	validation_0-rmse:3.25963	validation_1-rmse:3.30236
[433]	validation_0-rmse:3.25945	validation_1-rmse:3.30229
[434]	validation_0-rmse:3.25918	validation_1-rmse:3.30223
[435]	validation_0-rmse:3.25867	validation_1-rmse:3.30214
[436]	validation_0-rmse:3.25834	validation_1-rmse:3.30218
[437]	validation_0-rmse:3.25784	validation_1-rmse:3.30214
[438]	validation_0-rmse:3.25755	validation_1-rmse:3.3022
[439]	validation_0-rmse:3.25709	validation_1-rmse:3.30227
[440]	validation_0-rmse:3.25665	validation_1-rmse:3.30225
[441]	validation_0-rmse:3.2561	validation_1-rmse:3.30216
[442]	validation_0-rmse:3.25569	validation_1-rmse:3.30203
[443]	validation_0-

479

In [14]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.02, max_depth=6, n_estimators=350, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount'
]
# for prev_week_index in range(26,35):
#     drop_cols += ['prev_week_{}_neg'.format(prev_week_index),
#                 'prev_week_{}_mean_neg'.format(prev_week_index),
#                 'prev_week_{}_std_neg'.format(prev_week_index)]

metric_by_shift = eval_model(
    dummy_train.drop(drop_cols, 1),
    'log_neg_amount', clf, [30])
for elem in metric_by_shift:
    print('{0}: {1}'.format(elem, metric_by_shift[elem]))

100%|██████████| 1/1 [01:39<00:00, 99.50s/it]

RMSLE with 30 days: 3.2912642267132117





In [15]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.01, max_depth=6, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf)

[0]	validation_0-rmse:9.8949	validation_1-rmse:10.2389
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:9.80898	validation_1-rmse:10.1484
[2]	validation_0-rmse:9.72399	validation_1-rmse:10.0588
[3]	validation_0-rmse:9.63999	validation_1-rmse:9.9702
[4]	validation_0-rmse:9.55692	validation_1-rmse:9.88266
[5]	validation_0-rmse:9.47478	validation_1-rmse:9.79596
[6]	validation_0-rmse:9.39357	validation_1-rmse:9.71024
[7]	validation_0-rmse:9.31327	validation_1-rmse:9.62544
[8]	validation_0-rmse:9.23389	validation_1-rmse:9.54186
[9]	validation_0-rmse:9.1554	validation_1-rmse:9.459
[10]	validation_0-rmse:9.07775	validation_1-rmse:9.37708
[11]	validation_0-rmse:9.00105	validation_1-rmse:9.29608
[12]	validation_0-rmse:8.92515	validation_1-rmse:9.21602
[13]	validation_0-rmse:8.85013	validation_1-rmse:9.13682
[14]	validation_0-rmse:8.77594	validation_1-rmse:9.05835


[141]	validation_0-rmse:4.0865	validation_1-rmse:4.08183
[142]	validation_0-rmse:4.07342	validation_1-rmse:4.06797
[143]	validation_0-rmse:4.06052	validation_1-rmse:4.05444
[144]	validation_0-rmse:4.04786	validation_1-rmse:4.04105
[145]	validation_0-rmse:4.03535	validation_1-rmse:4.02762
[146]	validation_0-rmse:4.02312	validation_1-rmse:4.01472
[147]	validation_0-rmse:4.01103	validation_1-rmse:4.00184
[148]	validation_0-rmse:3.9991	validation_1-rmse:3.98917
[149]	validation_0-rmse:3.9874	validation_1-rmse:3.97674
[150]	validation_0-rmse:3.97578	validation_1-rmse:3.96445
[151]	validation_0-rmse:3.96446	validation_1-rmse:3.95249
[152]	validation_0-rmse:3.95323	validation_1-rmse:3.94073
[153]	validation_0-rmse:3.9422	validation_1-rmse:3.92914
[154]	validation_0-rmse:3.93144	validation_1-rmse:3.91774
[155]	validation_0-rmse:3.92076	validation_1-rmse:3.90648
[156]	validation_0-rmse:3.91038	validation_1-rmse:3.89557
[157]	validation_0-rmse:3.90005	validation_1-rmse:3.88476
[158]	validation_0

[283]	validation_0-rmse:3.35541	validation_1-rmse:3.34836
[284]	validation_0-rmse:3.35396	validation_1-rmse:3.34731
[285]	validation_0-rmse:3.35262	validation_1-rmse:3.34629
[286]	validation_0-rmse:3.35124	validation_1-rmse:3.34522
[287]	validation_0-rmse:3.34993	validation_1-rmse:3.34429
[288]	validation_0-rmse:3.3486	validation_1-rmse:3.34332
[289]	validation_0-rmse:3.34728	validation_1-rmse:3.34235
[290]	validation_0-rmse:3.34603	validation_1-rmse:3.34138
[291]	validation_0-rmse:3.34471	validation_1-rmse:3.34043
[292]	validation_0-rmse:3.34345	validation_1-rmse:3.33952
[293]	validation_0-rmse:3.34224	validation_1-rmse:3.33858
[294]	validation_0-rmse:3.34098	validation_1-rmse:3.33771
[295]	validation_0-rmse:3.3398	validation_1-rmse:3.33696
[296]	validation_0-rmse:3.33865	validation_1-rmse:3.33619
[297]	validation_0-rmse:3.33742	validation_1-rmse:3.33539
[298]	validation_0-rmse:3.33629	validation_1-rmse:3.3345
[299]	validation_0-rmse:3.33509	validation_1-rmse:3.33384
[300]	validation_

[425]	validation_0-rmse:3.25	validation_1-rmse:3.29622
[426]	validation_0-rmse:3.24959	validation_1-rmse:3.29608
[427]	validation_0-rmse:3.24914	validation_1-rmse:3.29594
[428]	validation_0-rmse:3.24869	validation_1-rmse:3.29581
[429]	validation_0-rmse:3.24828	validation_1-rmse:3.2958
[430]	validation_0-rmse:3.24794	validation_1-rmse:3.29576
[431]	validation_0-rmse:3.24746	validation_1-rmse:3.29561
[432]	validation_0-rmse:3.24712	validation_1-rmse:3.2955
[433]	validation_0-rmse:3.24671	validation_1-rmse:3.29544
[434]	validation_0-rmse:3.24635	validation_1-rmse:3.29529
[435]	validation_0-rmse:3.24596	validation_1-rmse:3.29527
[436]	validation_0-rmse:3.24553	validation_1-rmse:3.29522
[437]	validation_0-rmse:3.24517	validation_1-rmse:3.29508
[438]	validation_0-rmse:3.24484	validation_1-rmse:3.295
[439]	validation_0-rmse:3.24443	validation_1-rmse:3.29495
[440]	validation_0-rmse:3.244	validation_1-rmse:3.295
[441]	validation_0-rmse:3.24366	validation_1-rmse:3.29501
[442]	validation_0-rmse:3

[568]	validation_0-rmse:3.20561	validation_1-rmse:3.29219
[569]	validation_0-rmse:3.20543	validation_1-rmse:3.29214
[570]	validation_0-rmse:3.20525	validation_1-rmse:3.2921
[571]	validation_0-rmse:3.20513	validation_1-rmse:3.2921
[572]	validation_0-rmse:3.20495	validation_1-rmse:3.29207
[573]	validation_0-rmse:3.20476	validation_1-rmse:3.29202
[574]	validation_0-rmse:3.20466	validation_1-rmse:3.29202
[575]	validation_0-rmse:3.20445	validation_1-rmse:3.29203
[576]	validation_0-rmse:3.20428	validation_1-rmse:3.292
[577]	validation_0-rmse:3.20416	validation_1-rmse:3.29207
[578]	validation_0-rmse:3.20398	validation_1-rmse:3.29204
[579]	validation_0-rmse:3.20381	validation_1-rmse:3.29201
[580]	validation_0-rmse:3.20366	validation_1-rmse:3.29201
[581]	validation_0-rmse:3.20356	validation_1-rmse:3.29203
[582]	validation_0-rmse:3.20334	validation_1-rmse:3.29211
[583]	validation_0-rmse:3.2032	validation_1-rmse:3.29211
[584]	validation_0-rmse:3.20309	validation_1-rmse:3.29212
[585]	validation_0-

576

In [16]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=6, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf)

[0]	validation_0-rmse:9.54849	validation_1-rmse:9.87382
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:9.1394	validation_1-rmse:9.44206
[2]	validation_0-rmse:8.75324	validation_1-rmse:9.03473
[3]	validation_0-rmse:8.38906	validation_1-rmse:8.64986
[4]	validation_0-rmse:8.0456	validation_1-rmse:8.28583
[5]	validation_0-rmse:7.72182	validation_1-rmse:7.9446
[6]	validation_0-rmse:7.41707	validation_1-rmse:7.62269
[7]	validation_0-rmse:7.13065	validation_1-rmse:7.32016
[8]	validation_0-rmse:6.86107	validation_1-rmse:7.03506
[9]	validation_0-rmse:6.60786	validation_1-rmse:6.7664
[10]	validation_0-rmse:6.37035	validation_1-rmse:6.51391
[11]	validation_0-rmse:6.14768	validation_1-rmse:6.27646
[12]	validation_0-rmse:5.93899	validation_1-rmse:6.05464
[13]	validation_0-rmse:5.74373	validation_1-rmse:5.84731
[14]	validation_0-rmse:5.56124	validation_1-rmse:5.65298

[141]	validation_0-rmse:3.18342	validation_1-rmse:3.29957
[142]	validation_0-rmse:3.18281	validation_1-rmse:3.29977
[143]	validation_0-rmse:3.18141	validation_1-rmse:3.29933
[144]	validation_0-rmse:3.18074	validation_1-rmse:3.2994
[145]	validation_0-rmse:3.18054	validation_1-rmse:3.29941
[146]	validation_0-rmse:3.17913	validation_1-rmse:3.29925
[147]	validation_0-rmse:3.17813	validation_1-rmse:3.29913
[148]	validation_0-rmse:3.1774	validation_1-rmse:3.2993
[149]	validation_0-rmse:3.17667	validation_1-rmse:3.29957
[150]	validation_0-rmse:3.1759	validation_1-rmse:3.29916
[151]	validation_0-rmse:3.17432	validation_1-rmse:3.2995
[152]	validation_0-rmse:3.17368	validation_1-rmse:3.2999
[153]	validation_0-rmse:3.1731	validation_1-rmse:3.30017
[154]	validation_0-rmse:3.17243	validation_1-rmse:3.30061
[155]	validation_0-rmse:3.17199	validation_1-rmse:3.30071
[156]	validation_0-rmse:3.17076	validation_1-rmse:3.30065
[157]	validation_0-rmse:3.16997	validation_1-rmse:3.30067
[158]	validation_0-rm

115

In [19]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.55053	validation_1-rmse:9.87558
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.14364	validation_1-rmse:9.44529
[2]	validation_0-rmse:8.75954	validation_1-rmse:9.03927
[3]	validation_0-rmse:8.39719	validation_1-rmse:8.65593
[4]	validation_0-rmse:8.05524	validation_1-rmse:8.29569
[5]	validation_0-rmse:7.73352	validation_1-rmse:7.9554
[6]	validation_0-rmse:7.43038	validation_1-rmse:7.63526
[7]	validation_0-rmse:7.14535	validation_1-rmse:7.33271
[8]	validation_0-rmse:6.87738	validation_1-rmse:7.04732
[9]	validation_0-rmse:6.62589	validation_1-rmse:6.77938
[10]	validation_0-rmse:6.38991	validation_1-rmse:6.5281
[11]	validation_0-rmse:6.16879	validation_1-rmse:6.29263
[12]	validation_0-rmse:5.96173	validation_1-rmse:6.07138
[13]	validation_0-rmse:5.76814	validation_1-rmse:5.86379
[14]	validation_0-rmse:5.58703	validation_1-rmse:5.6703

[141]	validation_0-rmse:3.28506	validation_1-rmse:3.29779
[142]	validation_0-rmse:3.28393	validation_1-rmse:3.2984
[143]	validation_0-rmse:3.28324	validation_1-rmse:3.29845
[144]	validation_0-rmse:3.28269	validation_1-rmse:3.2983
[145]	validation_0-rmse:3.28217	validation_1-rmse:3.29841
[146]	validation_0-rmse:3.28183	validation_1-rmse:3.29839
[147]	validation_0-rmse:3.28125	validation_1-rmse:3.29816
[148]	validation_0-rmse:3.28082	validation_1-rmse:3.29817
[149]	validation_0-rmse:3.28025	validation_1-rmse:3.29821
[150]	validation_0-rmse:3.27922	validation_1-rmse:3.29717
[151]	validation_0-rmse:3.27859	validation_1-rmse:3.29748
[152]	validation_0-rmse:3.27806	validation_1-rmse:3.29766
[153]	validation_0-rmse:3.27775	validation_1-rmse:3.29759
[154]	validation_0-rmse:3.27648	validation_1-rmse:3.29743
[155]	validation_0-rmse:3.27607	validation_1-rmse:3.29795
[156]	validation_0-rmse:3.27581	validation_1-rmse:3.29786
[157]	validation_0-rmse:3.27526	validation_1-rmse:3.29797
[158]	validation

129

In [22]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.55199	validation_1-rmse:9.87683
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.1464	validation_1-rmse:9.44717
[2]	validation_0-rmse:8.76356	validation_1-rmse:9.04238
[3]	validation_0-rmse:8.40237	validation_1-rmse:8.66067
[4]	validation_0-rmse:8.06134	validation_1-rmse:8.30058
[5]	validation_0-rmse:7.74046	validation_1-rmse:7.95974
[6]	validation_0-rmse:7.43808	validation_1-rmse:7.63893
[7]	validation_0-rmse:7.15375	validation_1-rmse:7.33601
[8]	validation_0-rmse:6.88629	validation_1-rmse:7.05041
[9]	validation_0-rmse:6.63535	validation_1-rmse:6.78388
[10]	validation_0-rmse:6.39975	validation_1-rmse:6.53327
[11]	validation_0-rmse:6.17864	validation_1-rmse:6.29786
[12]	validation_0-rmse:5.97171	validation_1-rmse:6.07631
[13]	validation_0-rmse:5.77799	validation_1-rmse:5.86944
[14]	validation_0-rmse:5.59687	validation_1-rmse:5.676

[142]	validation_0-rmse:3.29131	validation_1-rmse:3.30051
[143]	validation_0-rmse:3.29097	validation_1-rmse:3.30046
[144]	validation_0-rmse:3.29067	validation_1-rmse:3.30043
[145]	validation_0-rmse:3.28982	validation_1-rmse:3.30011
[146]	validation_0-rmse:3.28885	validation_1-rmse:3.30103
[147]	validation_0-rmse:3.28847	validation_1-rmse:3.3009
[148]	validation_0-rmse:3.28829	validation_1-rmse:3.30091
[149]	validation_0-rmse:3.28794	validation_1-rmse:3.30115
[150]	validation_0-rmse:3.28755	validation_1-rmse:3.30118
[151]	validation_0-rmse:3.28678	validation_1-rmse:3.3015
[152]	validation_0-rmse:3.28634	validation_1-rmse:3.30143
[153]	validation_0-rmse:3.28596	validation_1-rmse:3.30148
[154]	validation_0-rmse:3.2854	validation_1-rmse:3.30173
[155]	validation_0-rmse:3.28502	validation_1-rmse:3.30193
[156]	validation_0-rmse:3.28458	validation_1-rmse:3.30211
[157]	validation_0-rmse:3.28387	validation_1-rmse:3.30197
Stopping. Best iteration:
[117]	validation_0-rmse:3.30523	validation_1-rmse

117

In [39]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
drop_cols.extend(trans_cols)
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)


[0]	validation_0-rmse:9.54606	validation_1-rmse:9.87127
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.13505	validation_1-rmse:9.43803
[2]	validation_0-rmse:8.7471	validation_1-rmse:9.02867
[3]	validation_0-rmse:8.38164	validation_1-rmse:8.64184
[4]	validation_0-rmse:8.03695	validation_1-rmse:8.27779
[5]	validation_0-rmse:7.71259	validation_1-rmse:7.93376
[6]	validation_0-rmse:7.40736	validation_1-rmse:7.6105
[7]	validation_0-rmse:7.12066	validation_1-rmse:7.306
[8]	validation_0-rmse:6.85132	validation_1-rmse:7.01973
[9]	validation_0-rmse:6.5987	validation_1-rmse:6.75061
[10]	validation_0-rmse:6.36169	validation_1-rmse:6.49822
[11]	validation_0-rmse:6.13967	validation_1-rmse:6.26155
[12]	validation_0-rmse:5.93214	validation_1-rmse:6.03907
[13]	validation_0-rmse:5.7382	validation_1-rmse:5.83261
[14]	validation_0-rmse:5.55706	validation_1-rmse:5.63929
[1

[141]	validation_0-rmse:3.29057	validation_1-rmse:3.3041
[142]	validation_0-rmse:3.28988	validation_1-rmse:3.30397
[143]	validation_0-rmse:3.28897	validation_1-rmse:3.30356
[144]	validation_0-rmse:3.28836	validation_1-rmse:3.30329
[145]	validation_0-rmse:3.28794	validation_1-rmse:3.30342
[146]	validation_0-rmse:3.28755	validation_1-rmse:3.30339
[147]	validation_0-rmse:3.28714	validation_1-rmse:3.30353
[148]	validation_0-rmse:3.28669	validation_1-rmse:3.3034
[149]	validation_0-rmse:3.28609	validation_1-rmse:3.3032
[150]	validation_0-rmse:3.28494	validation_1-rmse:3.303
[151]	validation_0-rmse:3.28422	validation_1-rmse:3.30299
[152]	validation_0-rmse:3.28332	validation_1-rmse:3.30254
[153]	validation_0-rmse:3.28254	validation_1-rmse:3.30262
[154]	validation_0-rmse:3.2821	validation_1-rmse:3.30258
[155]	validation_0-rmse:3.28142	validation_1-rmse:3.30277
[156]	validation_0-rmse:3.28027	validation_1-rmse:3.30174
[157]	validation_0-rmse:3.27981	validation_1-rmse:3.30166
[158]	validation_0-r

181

100%|██████████| 5/5 [00:00<00:00,  5.14it/s]


Unnamed: 0,day,week_num,week_day,month_num,month_day,neg_amount,n_transactions,log_neg_amount,neg_day_1,ntrans_day_1,neg_day_2,ntrans_day_2,neg_day_3,ntrans_day_3,neg_day_4,ntrans_day_4,neg_day_5,ntrans_day_5,neg_day_6,ntrans_day_6,neg_day_7,ntrans_day_7,neg_day_8,ntrans_day_8,neg_day_9,ntrans_day_9,neg_day_10,ntrans_day_10,neg_day_11,ntrans_day_11,neg_day_12,ntrans_day_12,neg_day_13,ntrans_day_13,neg_day_14,ntrans_day_14,neg_day_15,ntrans_day_15,neg_day_16,ntrans_day_16,neg_day_17,ntrans_day_17,neg_day_18,ntrans_day_18,neg_day_19,ntrans_day_19,neg_day_20,ntrans_day_20,neg_day_21,ntrans_day_21,neg_day_22,ntrans_day_22,neg_day_23,ntrans_day_23,neg_day_24,ntrans_day_24,neg_day_25,ntrans_day_25,neg_day_26,ntrans_day_26,neg_day_27,ntrans_day_27,neg_day_28,ntrans_day_28,neg_day_29,ntrans_day_29,neg_day_30,ntrans_day_30,neg_day_31,ntrans_day_31,neg_day_32,ntrans_day_32,neg_day_33,ntrans_day_33,neg_day_34,ntrans_day_34,neg_day_35,ntrans_day_35,neg_day_36,ntrans_day_36,neg_day_37,ntrans_day_37,neg_day_38,ntrans_day_38,neg_day_39,ntrans_day_39,neg_day_40,ntrans_day_40,neg_day_41,ntrans_day_41,neg_day_42,ntrans_day_42,neg_day_43,ntrans_day_43,neg_day_44,ntrans_day_44,neg_day_45,ntrans_day_45,neg_day_46,ntrans_day_46,...,mcc_code_5947,mcc_code_5948,mcc_code_5949,mcc_code_5950,mcc_code_5964,mcc_code_5965,mcc_code_5967,mcc_code_5968,mcc_code_5969,mcc_code_5970,mcc_code_5971,mcc_code_5976,mcc_code_5977,mcc_code_5983,mcc_code_5992,mcc_code_5993,mcc_code_5994,mcc_code_5995,mcc_code_5999,mcc_code_6010,mcc_code_6011,mcc_code_6012,mcc_code_6051,mcc_code_6211,mcc_code_6300,mcc_code_6513,mcc_code_6536,mcc_code_7011,mcc_code_7210,mcc_code_7216,mcc_code_7221,mcc_code_7230,mcc_code_7273,mcc_code_7278,mcc_code_7298,mcc_code_7299,mcc_code_7311,mcc_code_7338,mcc_code_7372,mcc_code_7375,mcc_code_7395,mcc_code_7399,mcc_code_7512,mcc_code_7523,mcc_code_7531,mcc_code_7538,mcc_code_7542,mcc_code_7629,mcc_code_7699,mcc_code_7829,mcc_code_7832,mcc_code_7841,mcc_code_7922,mcc_code_7932,mcc_code_7933,mcc_code_7991,mcc_code_7993,mcc_code_7994,mcc_code_7995,mcc_code_7996,mcc_code_7997,mcc_code_7999,mcc_code_8011,mcc_code_8021,mcc_code_8043,mcc_code_8062,mcc_code_8071,mcc_code_8099,mcc_code_8220,mcc_code_8244,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402,neg_month_mean_1,neg_month_std_1,neg_month_max_1,neg_month_min_1,neg_month_mean_2,neg_month_std_2,neg_month_max_2,neg_month_min_2,neg_month_mean_3,neg_month_std_3,neg_month_max_3,neg_month_min_3,neg_month_mean_4,neg_month_std_4,neg_month_max_4,neg_month_min_4,neg_month_mean_5,neg_month_std_5,neg_month_max_5,neg_month_min_5
0,0,-1,5,0,0,11098744,2365,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-1,6,0,1,7881825,1697,15.88007,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.579369,3.065735,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,0,2,6777480,1524,15.729116,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.146515,4.209932,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,1,0,3,9277943,1937,16.043151,15.729116,1524.0,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.708269,5.02231,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0,2,0,4,9999757,1943,16.118071,16.043151,1937.0,15.729116,1524.0,15.88007,1697.0,16.222343,2365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.281239,5.690848,16.222343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
drop_cols.extend(trans_cols)
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)

[0]	validation_0-rmse:9.54515	validation_1-rmse:9.86871
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.13312	validation_1-rmse:9.43313
[2]	validation_0-rmse:8.74452	validation_1-rmse:9.02103
[3]	validation_0-rmse:8.37811	validation_1-rmse:8.63302
[4]	validation_0-rmse:8.03303	validation_1-rmse:8.26778
[5]	validation_0-rmse:7.70817	validation_1-rmse:7.92302
[6]	validation_0-rmse:7.4026	validation_1-rmse:7.59879
[7]	validation_0-rmse:7.11553	validation_1-rmse:7.29396
[8]	validation_0-rmse:6.84586	validation_1-rmse:7.00648
[9]	validation_0-rmse:6.59288	validation_1-rmse:6.73768
[10]	validation_0-rmse:6.35567	validation_1-rmse:6.48507
[11]	validation_0-rmse:6.13376	validation_1-rmse:6.24814
[12]	validation_0-rmse:5.92614	validation_1-rmse:6.02538
[13]	validation_0-rmse:5.73208	validation_1-rmse:5.81805
[14]	validation_0-rmse:5.55112	validation_1-rmse:5.623

[142]	validation_0-rmse:3.30018	validation_1-rmse:3.29838
[143]	validation_0-rmse:3.29903	validation_1-rmse:3.29862
[144]	validation_0-rmse:3.29785	validation_1-rmse:3.2985
[145]	validation_0-rmse:3.29766	validation_1-rmse:3.29847
[146]	validation_0-rmse:3.29655	validation_1-rmse:3.29826
[147]	validation_0-rmse:3.29615	validation_1-rmse:3.29835
[148]	validation_0-rmse:3.29513	validation_1-rmse:3.2985
[149]	validation_0-rmse:3.29418	validation_1-rmse:3.29906
[150]	validation_0-rmse:3.29336	validation_1-rmse:3.29878
[151]	validation_0-rmse:3.29229	validation_1-rmse:3.29912
[152]	validation_0-rmse:3.29174	validation_1-rmse:3.299
[153]	validation_0-rmse:3.29112	validation_1-rmse:3.29912
[154]	validation_0-rmse:3.28994	validation_1-rmse:3.29908
[155]	validation_0-rmse:3.28934	validation_1-rmse:3.29963
[156]	validation_0-rmse:3.28878	validation_1-rmse:3.29984
[157]	validation_0-rmse:3.28797	validation_1-rmse:3.29974
[158]	validation_0-rmse:3.28726	validation_1-rmse:3.29985
[159]	validation_0

138

In [38]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=5, n_estimators=3500, nthread=-1, reg_alpha=1, reg_lambda=1)
drop_cols = [
    'n_transactions', 'neg_amount', 'log_neg_amount'
]
drop_cols.extend(trans_cols)
eval_nestimators(dummy_train.drop(drop_cols, 1), dummy_train['log_neg_amount'], 30, clf, 40)


[0]	validation_0-rmse:9.54606	validation_1-rmse:9.87127
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:9.13505	validation_1-rmse:9.43803
[2]	validation_0-rmse:8.7471	validation_1-rmse:9.02867
[3]	validation_0-rmse:8.38164	validation_1-rmse:8.64184
[4]	validation_0-rmse:8.03695	validation_1-rmse:8.27779
[5]	validation_0-rmse:7.71259	validation_1-rmse:7.93376
[6]	validation_0-rmse:7.40736	validation_1-rmse:7.6105
[7]	validation_0-rmse:7.12066	validation_1-rmse:7.306
[8]	validation_0-rmse:6.85132	validation_1-rmse:7.01973
[9]	validation_0-rmse:6.5987	validation_1-rmse:6.75061
[10]	validation_0-rmse:6.36169	validation_1-rmse:6.49822
[11]	validation_0-rmse:6.13967	validation_1-rmse:6.26155
[12]	validation_0-rmse:5.93214	validation_1-rmse:6.03907
[13]	validation_0-rmse:5.7382	validation_1-rmse:5.83261
[14]	validation_0-rmse:5.55706	validation_1-rmse:5.63929
[1

[141]	validation_0-rmse:3.29057	validation_1-rmse:3.3041
[142]	validation_0-rmse:3.28988	validation_1-rmse:3.30397
[143]	validation_0-rmse:3.28897	validation_1-rmse:3.30356
[144]	validation_0-rmse:3.28836	validation_1-rmse:3.30329
[145]	validation_0-rmse:3.28794	validation_1-rmse:3.30342
[146]	validation_0-rmse:3.28755	validation_1-rmse:3.30339
[147]	validation_0-rmse:3.28714	validation_1-rmse:3.30353
[148]	validation_0-rmse:3.28669	validation_1-rmse:3.3034
[149]	validation_0-rmse:3.28609	validation_1-rmse:3.3032
[150]	validation_0-rmse:3.28494	validation_1-rmse:3.303
[151]	validation_0-rmse:3.28422	validation_1-rmse:3.30299
[152]	validation_0-rmse:3.28332	validation_1-rmse:3.30254
[153]	validation_0-rmse:3.28254	validation_1-rmse:3.30262
[154]	validation_0-rmse:3.2821	validation_1-rmse:3.30258
[155]	validation_0-rmse:3.28142	validation_1-rmse:3.30277
[156]	validation_0-rmse:3.28027	validation_1-rmse:3.30174
[157]	validation_0-rmse:3.27981	validation_1-rmse:3.30166
[158]	validation_0-r

181