In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn import model_selection, metrics
import matplotlib.pyplot as pl
import itertools

%matplotlib inline



In [7]:
# Training data. Despite the .tsv extension the file is parsed with the
# default comma separator, which is spelled out here.
train = pd.read_csv("train.tsv", sep=",")

In [8]:
# Sanity check: compare each of f1..f30 with its counterpart f31..f60 element
# by element and print one line for every pair of values that differs.
for feature_no in range(1, 31):
    first_half = train['f{}'.format(feature_no)]
    second_half = train['f{}'.format(feature_no + 30)]
    for left_value, right_value in zip(first_half, second_half):
        if left_value != right_value:
            print("different!")

In [9]:
# Remove columns f31..f60 (the cell above compares them against f1..f30).
# The drop is in place, so running this cell a second time raises KeyError.
duplicated_features = ["f{}".format(idx) for idx in range(31, 61)]
train.drop(duplicated_features, axis=1, inplace=True)

In [10]:
# One-hot encode `item_id modulo n_buckets` into the indicator columns
# id_hash_0 .. id_hash_{n_buckets - 1}.
n_buckets = 100  # alternative that was tried: int(train['item_id'].nunique() / 10)
id_hash = train['item_id'].values % n_buckets
for i in range(n_buckets):
    # Vectorised comparison instead of a Python-level pass over every row for
    # every bucket; yields the same 0/1 integer column.
    train['id_hash_{}'.format(i)] = (id_hash == i).astype(np.int64)

In [12]:
def SMAPE(regressor, X, y):
    """Symmetric mean absolute percentage error of ``regressor`` on ``(X, y)``.

    Computes ``200 / n * sum(|pred - true| / (|pred| + |true|))``, so the
    result lies in ``[0, 200]`` and lower is better.  The signature matches
    the ``scorer(estimator, X, y)`` calling convention.

    Parameters
    ----------
    regressor : object
        Fitted model exposing ``predict(X)``.
    X : array-like
        Feature matrix passed straight to ``regressor.predict``.
    y : array-like
        True target values; a 1-D sequence or a single-column 2-D array.

    Returns
    -------
    float
        The SMAPE score in percent.

    Raises
    ------
    ValueError
        If ``y`` is empty or its length differs from the number of predictions.
    """
    prediction = np.asarray(regressor.predict(X), dtype=float).ravel()
    actual = np.asarray(y, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("SMAPE is undefined for an empty target")
    if prediction.shape != actual.shape:
        raise ValueError(
            "prediction and target lengths differ: {} vs {}".format(
                prediction.size, actual.size))

    abs_error = np.abs(prediction - actual)
    scale = np.abs(actual) + np.abs(prediction)
    # When prediction and truth are both 0 the term is 0/0; that is a perfect
    # prediction, so it contributes 0 instead of turning the whole score NaN.
    terms = np.divide(abs_error, scale,
                      out=np.zeros_like(abs_error), where=scale != 0)
    return float(terms.sum() * 200 / actual.size)
    
def test_regressor(regressor, data, target, features):
    """Return the mean SMAPE of ``regressor`` over a 5-split TimeSeriesSplit.

    For every split the regressor is refitted on the training rows and scored
    with ``SMAPE`` on the held-out rows; the per-split scores are averaged.

    Parameters
    ----------
    regressor : estimator
        Object with ``fit`` and ``predict``; it is refitted in place per split.
    data : pandas.DataFrame
        Frame holding both the feature and the target columns.
    target, features : list of str
        Column selections used for ``y`` and ``X`` respectively.
    """
    feature_matrix = data[features].values
    target_values = data[target].values
    splitter = model_selection.TimeSeriesSplit(n_splits=5)

    fold_scores = []
    for fit_rows, eval_rows in splitter.split(data):
        regressor.fit(feature_matrix[fit_rows], target_values[fit_rows],
                      eval_metric='mae')
        fold_scores.append(
            SMAPE(regressor, feature_matrix[eval_rows], target_values[eval_rows])
        )

    return np.array(fold_scores).mean()

def print_params(params_arr, names_arr, params):
    """Print ``name = value`` for each parameter that is actually being searched.

    A parameter is shown only when its candidate list in ``params_arr`` has
    more than one entry.  ``int`` values are printed as-is, everything else
    with four decimals.  Entries are separated by two spaces and no newline
    is emitted.

    Parameters
    ----------
    params_arr : sequence of sequences
        Candidate values per parameter (same order as ``names_arr``).
    names_arr : sequence of str
        Parameter names.
    params : sequence
        The concrete values to print, one per parameter.
    """
    for position, (value, label) in enumerate(zip(params, names_arr)):
        if len(params_arr[position]) <= 1:
            continue
        template = "{} = {}" if isinstance(value, int) else "{} = {:.4f}"
        print(template.format(label, value), end="  ")

def get_best_params(params_arr, names_arr, train, target, features):
    """Grid-search XGBRegressor hyper-parameters by cross-validated SMAPE.

    Every combination in the Cartesian product of ``params_arr`` is turned
    into keyword arguments (named by ``names_arr``), scored with
    ``test_regressor`` and printed.  The combination with the lowest score is
    printed at the end and returned.

    Parameters
    ----------
    params_arr : sequence of sequences
        Candidate values, one inner sequence per entry of ``names_arr``.
    names_arr : sequence of str
        XGBRegressor keyword names, aligned with ``params_arr``.
    train, target, features
        Forwarded unchanged to ``test_regressor``.

    Returns
    -------
    dict
        ``{name: value}`` for the best combination, or an empty dict when no
        combination produced a comparable (non-NaN) score.  End the calling
        cell with ``;`` to keep the notebook from echoing it.
    """
    # Lower SMAPE is better.  Start from +inf rather than a magic 200 so that
    # a score equal to the metric's upper bound can still be selected.
    best_score = float("inf")
    best_params = []
    for params in itertools.product(*params_arr):
        param_dict = dict(zip(names_arr, params))
        print_params(params_arr, names_arr, params)
        result = test_regressor(XGBRegressor(**param_dict), train, target, features)
        # Label spelling is kept as-is so new output matches recorded runs.
        print("SMPAE = {:.4f}".format(result))
        if result < best_score:
            best_score = result
            best_params = params
    print("best parametrs:")
    print_params(params_arr, names_arr, best_params)
    return dict(zip(names_arr, best_params))

In [45]:
# Column(s) the model is trained to predict.
target = ['y']

# Every other column is a model input, except the ones excluded here.
non_feature_columns = ['y', 'item_id', 'Num', 'year', 'week', 'shift']
features = [column for column in train.columns
            if column not in non_feature_columns]

# XGBRegressor keyword names, in the positional order used by every
# parameter grid below.
param_names = [
    'learning_rate', 'n_estimators', 'max_depth', 'min_child_weight', 'gamma',
    'subsample', 'colsample_bytree', 'nthread', 'seed', 'reg_alpha',
]

In [46]:
# Stage 1: coarse sweep of n_estimators at learning_rate 0.1.
params_1 = [
    [0.1],               # learning_rate
    range(1, 130, 10),   # n_estimators
    [5],                 # max_depth
    [1],                 # min_child_weight
    [0],                 # gamma
    [0.8],               # subsample
    [0.8],               # colsample_bytree
    [4],                 # nthread
    [27],                # seed
    [0],                 # reg_alpha
]
get_best_params(params_1, param_names, train, target, features)

n_estimators = 1  SMPAE = 149.5773
n_estimators = 11  SMPAE = 52.8662
n_estimators = 21  SMPAE = 40.8768
n_estimators = 31  SMPAE = 40.8074
n_estimators = 41  SMPAE = 43.9632
n_estimators = 51  SMPAE = 46.8504
n_estimators = 61  SMPAE = 47.6556
n_estimators = 71  SMPAE = 47.7299
n_estimators = 81  SMPAE = 47.6907
n_estimators = 91  SMPAE = 47.4650
n_estimators = 101  SMPAE = 47.3913
n_estimators = 111  SMPAE = 47.3360
n_estimators = 121  SMPAE = 47.2482
best parametrs:
n_estimators = 31  

In [47]:
# Stage 1b: finer sweep of n_estimators over 22..40 in steps of 2.
params_1_b = [
    [0.1],              # learning_rate
    range(22, 41, 2),   # n_estimators
    [5],                # max_depth
    [1],                # min_child_weight
    [0],                # gamma
    [0.8],              # subsample
    [0.8],              # colsample_bytree
    [4],                # nthread
    [27],               # seed
    [0],                # reg_alpha
]
get_best_params(params_1_b, param_names, train, target, features)

n_estimators = 22  SMPAE = 40.4075
n_estimators = 24  SMPAE = 39.9762
n_estimators = 26  SMPAE = 40.0378
n_estimators = 28  SMPAE = 40.4735
n_estimators = 30  SMPAE = 40.7290
n_estimators = 32  SMPAE = 41.0926
n_estimators = 34  SMPAE = 41.5178
n_estimators = 36  SMPAE = 41.9667
n_estimators = 38  SMPAE = 42.8993
n_estimators = 40  SMPAE = 43.6588
best parametrs:
n_estimators = 24  

In [48]:
# Stage 2: joint sweep of max_depth and min_child_weight with n_estimators = 24.
params_2 = [
    [0.1],          # learning_rate
    [24],           # n_estimators
    range(3, 11),   # max_depth
    range(1, 6),    # min_child_weight
    [0],            # gamma
    [0.8],          # subsample
    [0.8],          # colsample_bytree
    [4],            # nthread
    [27],           # seed
    [0],            # reg_alpha
]
get_best_params(params_2, param_names, train, target, features)

max_depth = 3  min_child_weight = 1  SMPAE = 55.4193
max_depth = 3  min_child_weight = 2  SMPAE = 55.4193
max_depth = 3  min_child_weight = 3  SMPAE = 55.4192
max_depth = 3  min_child_weight = 4  SMPAE = 55.4195
max_depth = 3  min_child_weight = 5  SMPAE = 55.4206
max_depth = 4  min_child_weight = 1  SMPAE = 45.4621
max_depth = 4  min_child_weight = 2  SMPAE = 45.4617
max_depth = 4  min_child_weight = 3  SMPAE = 45.6498
max_depth = 4  min_child_weight = 4  SMPAE = 45.8119
max_depth = 4  min_child_weight = 5  SMPAE = 45.7016
max_depth = 5  min_child_weight = 1  SMPAE = 39.9762
max_depth = 5  min_child_weight = 2  SMPAE = 40.3371
max_depth = 5  min_child_weight = 3  SMPAE = 40.3514
max_depth = 5  min_child_weight = 4  SMPAE = 40.0614
max_depth = 5  min_child_weight = 5  SMPAE = 40.1059
max_depth = 6  min_child_weight = 1  SMPAE = 37.2978
max_depth = 6  min_child_weight = 2  SMPAE = 37.3284
max_depth = 6  min_child_weight = 3  SMPAE = 36.9981
max_depth = 6  min_child_weight = 4  SMPAE = 3

In [55]:
# Stage 3: sweep gamma with max_depth = 10 and min_child_weight = 5 held fixed.
params_3 = [
    [0.1],                     # learning_rate
    [24],                      # n_estimators
    [10],                      # max_depth
    [5],                       # min_child_weight
    np.linspace(0, 0.5, 5),    # gamma
    [0.8],                     # subsample
    [0.8],                     # colsample_bytree
    [4],                       # nthread
    [27],                      # seed
    [0],                       # reg_alpha
]
get_best_params(params_3, param_names, train, target, features)

gamma = 0.0000  SMPAE = 31.7087
gamma = 0.1250  SMPAE = 31.7087
gamma = 0.2500  SMPAE = 31.7087
gamma = 0.3750  SMPAE = 31.7087
gamma = 0.5000  SMPAE = 31.7087
best parametrs:
gamma = 0.0000  

In [56]:
# Stage 4: joint sweep of subsample and colsample_bytree over [0.6, 0.9].
params_4 = [
    [0.1],                       # learning_rate
    [24],                        # n_estimators
    [10],                        # max_depth
    [5],                         # min_child_weight
    [0],                         # gamma
    np.linspace(0.6, 0.9, 5),    # subsample
    np.linspace(0.6, 0.9, 5),    # colsample_bytree
    [4],                         # nthread
    [27],                        # seed
    [0],                         # reg_alpha
]
get_best_params(params_4, param_names, train, target, features)

subsample = 0.6000  colsample_bytree = 0.6000  SMPAE = 32.2377
subsample = 0.6000  colsample_bytree = 0.6750  SMPAE = 32.2689
subsample = 0.6000  colsample_bytree = 0.7500  SMPAE = 32.0564
subsample = 0.6000  colsample_bytree = 0.8250  SMPAE = 31.9782
subsample = 0.6000  colsample_bytree = 0.9000  SMPAE = 31.9620
subsample = 0.6750  colsample_bytree = 0.6000  SMPAE = 32.1754
subsample = 0.6750  colsample_bytree = 0.6750  SMPAE = 32.0014
subsample = 0.6750  colsample_bytree = 0.7500  SMPAE = 31.9735
subsample = 0.6750  colsample_bytree = 0.8250  SMPAE = 31.8435
subsample = 0.6750  colsample_bytree = 0.9000  SMPAE = 31.9281
subsample = 0.7500  colsample_bytree = 0.6000  SMPAE = 32.0154
subsample = 0.7500  colsample_bytree = 0.6750  SMPAE = 31.8555
subsample = 0.7500  colsample_bytree = 0.7500  SMPAE = 31.8459
subsample = 0.7500  colsample_bytree = 0.8250  SMPAE = 31.7773
subsample = 0.7500  colsample_bytree = 0.9000  SMPAE = 31.7234
subsample = 0.8250  colsample_bytree = 0.6000  SMPAE = 

In [57]:
# Stage 5: sweep reg_alpha over powers of ten from 1e4 down to 1e-1.
params_5 = [
    [0.1],                                                # learning_rate
    [24],                                                 # n_estimators
    [10],                                                 # max_depth
    [5],                                                  # min_child_weight
    [0],                                                  # gamma
    [0.9],                                                # subsample
    [0.825],                                              # colsample_bytree
    [4],                                                  # nthread
    [27],                                                 # seed
    [10 ** exponent for exponent in [4, 2, 1, 0, -1]],    # reg_alpha
]
get_best_params(params_5, param_names, train, target, features)

reg_alpha = 10000  SMPAE = 31.7576
reg_alpha = 100  SMPAE = 31.8022
reg_alpha = 10  SMPAE = 31.7610
reg_alpha = 1  SMPAE = 31.7610
reg_alpha = 0.1000  SMPAE = 31.6803
best parametrs:
reg_alpha = 0.1000  

In [58]:
# Stage 5b: sweep reg_alpha over the smaller powers of ten 1e-4, 1e-2, 1e-1.
params_5_b = [
    [0.1],                                           # learning_rate
    [24],                                            # n_estimators
    [10],                                            # max_depth
    [5],                                             # min_child_weight
    [0],                                             # gamma
    [0.9],                                           # subsample
    [0.825],                                         # colsample_bytree
    [4],                                             # nthread
    [27],                                            # seed
    [10 ** exponent for exponent in [-4, -2, -1]],   # reg_alpha
]
get_best_params(params_5_b, param_names, train, target, features)

reg_alpha = 0.0001  SMPAE = 31.6803
reg_alpha = 0.0100  SMPAE = 31.6803
reg_alpha = 0.1000  SMPAE = 31.6803
best parametrs:
reg_alpha = 0.1000  

In [61]:
# Stage 5c: sweep reg_alpha linearly over 0.1, 0.5, 0.9.
params_5_c = [
    [0.1],                       # learning_rate
    [24],                        # n_estimators
    [10],                        # max_depth
    [5],                         # min_child_weight
    [0],                         # gamma
    [0.9],                       # subsample
    [0.825],                     # colsample_bytree
    [4],                         # nthread
    [27],                        # seed
    np.linspace(0.1, 0.9, 3),    # reg_alpha
]
get_best_params(params_5_c, param_names, train, target, features)

reg_alpha = 0.1000  SMPAE = 31.6803
reg_alpha = 0.5000  SMPAE = 31.7610
reg_alpha = 0.9000  SMPAE = 31.7610
best parametrs:
reg_alpha = 0.1000  

In [62]:
# Stage 6: lower learning_rate to 0.01 and re-sweep n_estimators coarsely.
params_6 = [
    [0.01],                         # learning_rate
    [100, 200, 300, 400, 500],      # n_estimators
    [10],                           # max_depth
    [5],                            # min_child_weight
    [0],                            # gamma
    [0.9],                          # subsample
    [0.825],                        # colsample_bytree
    [4],                            # nthread
    [27],                           # seed
    [0.1],                          # reg_alpha
]
get_best_params(params_6, param_names, train, target, features)

n_estimators = 100  SMPAE = 50.1469
n_estimators = 200  SMPAE = 32.9527
n_estimators = 300  SMPAE = 31.1267
n_estimators = 400  SMPAE = 31.6020
n_estimators = 500  SMPAE = 33.2067
best parametrs:
n_estimators = 300  

In [63]:
# Stage 6b: finer sweep of n_estimators over 250..350 in steps of 20.
params_6_b = [
    [0.01],                 # learning_rate
    range(250, 360, 20),    # n_estimators
    [10],                   # max_depth
    [5],                    # min_child_weight
    [0],                    # gamma
    [0.9],                  # subsample
    [0.825],                # colsample_bytree
    [4],                    # nthread
    [27],                   # seed
    [0.1],                  # reg_alpha
]
get_best_params(params_6_b, param_names, train, target, features)

n_estimators = 250  SMPAE = 31.4627
n_estimators = 270  SMPAE = 31.2528
n_estimators = 290  SMPAE = 31.1453
n_estimators = 310  SMPAE = 31.1384
n_estimators = 330  SMPAE = 31.1511
n_estimators = 350  SMPAE = 31.2092
best parametrs:
n_estimators = 310  

In [40]:
# Hyper-parameters of the final model, listed in param_names order.
# NOTE(review): n_estimators is 330 here, while the last n_estimators sweep in
# this notebook reported 310 as best - confirm this is intentional.
# NOTE(review): the repr recorded under the fit cell shows other values
# (e.g. max_depth=12, reg_alpha=3.2); that saved output looks stale - re-run
# the notebook top to bottom to refresh it.
final_params = [0.01, 330, 10, 5, 0, 0.9, 0.825, 4, 27, 0.1]

test = pd.read_csv("test.tsv")
param_dict = dict(zip(param_names, final_params))

regressor = XGBRegressor(**param_dict)

In [41]:
# Apply the same preprocessing to the test set as was applied to the training
# set: drop f31..f60, then one-hot encode `item_id modulo n_buckets`.
test.drop(["f{}".format(i) for i in range(31, 61)], axis=1, inplace=True)

n_buckets = 100
id_hash = test['item_id'].values % n_buckets
for i in range(n_buckets):
    # Vectorised comparison instead of a Python-level pass over every row for
    # every bucket; yields the same 0/1 integer column.
    test['id_hash_{}'.format(i)] = (id_hash == i).astype(np.int64)

In [42]:
# Fit the final model on the whole training set.  The fit call stays the last
# expression of the cell so the notebook still displays its return value.
full_X = train[features].values
full_y = train[target].values
regressor.fit(full_X, full_y)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=12,
       min_child_weight=4, missing=None, n_estimators=330, nthread=4,
       objective='reg:linear', reg_alpha=3.2, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.9)

In [43]:
# Predict the target for every test row using the same feature columns the
# model was fitted on.
test_matrix = test[features].values
final_answer = regressor.predict(test_matrix)

In [44]:
# Fill the provided submission template with the predictions and write it out.
# The output uses a comma separator even though the file name ends in .tsv,
# matching how the input files are read in this notebook.
sample_submission = pd.read_csv("sample_submission.tsv", sep=",")
sample_submission['y'] = final_answer
sample_submission.to_csv("submission1.tsv", index=False, sep=',')