In [68]:
import numpy as np
import xgboost as xgb
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn import model_selection, metrics
import matplotlib.pyplot as pl
import itertools

%matplotlib inline

Считываем данные

In [69]:
train = pd.read_csv("train.tsv")

Убеждаемся, что фичи с 1 по 30 полная копия фич с 31 по 60

In [70]:
for i in range(1, 31):
    for j_1, j_2 in zip(train['f{}'.format(i)], train['f{}'.format(i + 30)]):
        if j_1 != j_2:
            print("different!")

Удаляем повторы

In [71]:
train.drop(["f{}".format(i) for i in range(31, 61)], axis=1, inplace=True)

Похешируем ID и добавим новые булевские признаки для каждого значения хеша

In [72]:
n_buckets = 100 #int(train['item_id'].nunique() / 10)
id_hash = train['item_id'].values % n_buckets
for i in range(n_buckets):
    train['id_hash_{}'.format(i)] = [1 if id_hash[j] == i else 0 for j in range(len(id_hash))]

In [73]:
def SMAPE(regressor, X, y):
    prediction = regressor.predict(X)
    result = 0
    for ans, right_ans in zip(prediction, np.array(y)):
        result += abs(ans - right_ans) / (abs(right_ans) + abs(ans))
    return result * 200 / len(y)
    
def test_regressor(regressor, data, target, features):
    tscv = model_selection.TimeSeriesSplit(n_splits=5)
    score = []
    for train_index, test_index in tscv.split(data):
        X_train, X_test = data[features].values[train_index], data[features].values[test_index]
        y_train, y_test = data[target].values[train_index], data[target].values[test_index]
        regressor.fit(X_train, y_train, eval_metric='mae')
        score.append(SMAPE(regressor, X_test, y_test))

    return np.array(score).mean()

def print_params(params_arr, names_arr, params):
    for i, p, name in zip(range(len(params)), params, names_arr):
        if len(params_arr[i]) > 1:
            if isinstance(p, int):
                print("{} = {}".format(name, p), end="  ") 
            else:
                print("{} = {:.4f}".format(name, p), end="  ")  

def get_best_params(params_arr, names_arr, train, target, features):
    maximum = 200
    best_params = []
    for params in itertools.product(*params_arr):
        param_dict = {}
        for i, name in enumerate(names_arr):
            param_dict[name] = params[i]
        print_params(params_arr, names_arr, params)
        result = test_regressor(XGBRegressor(**param_dict), train, target, features)
        print("SMPAE = {:.4f}".format(result))
        if (result < maximum):
            maximum = result
            best_params = params
    print("best parametrs:")
    print_params(params_arr, names_arr, best_params)

In [74]:
target = ['y']
features = [x for x in train.columns if x not in ['y', 'item_id', 'Num', 'year', 'week', 'shift']]
param_names = ['learning_rate',
               'n_estimators',
               'max_depth',
               'min_child_weight',
               'gamma',
               'subsample',
               'colsample_bytree',
               'nthread',
               'seed',
               'reg_alpha'
              ]

Тьюним параметры для XGBRegressor

In [75]:
params_1 = [[0.1], range(20, 131, 10), [8], [1], [0], [0.9], [0.9], [4], [27], [0]]
get_best_params(params_1, param_names, train, target, features)

n_estimators = 20  SMPAE = 35.0989
n_estimators = 30  SMPAE = 33.5186
n_estimators = 40  SMPAE = 34.7030
n_estimators = 50  SMPAE = 37.3771
n_estimators = 60  SMPAE = 39.3887
n_estimators = 70  SMPAE = 39.8739
n_estimators = 80  SMPAE = 40.0137
n_estimators = 90  SMPAE = 39.9551
n_estimators = 100  SMPAE = 39.8284
n_estimators = 110  SMPAE = 39.7100
n_estimators = 120  SMPAE = 39.5515
n_estimators = 130  SMPAE = 39.5849
best parametrs:
n_estimators = 30  

In [77]:
params_1_b = [[0.1], range(22, 39, 2), [8], [1], [0], [0.9], [0.9], [4], [27], [0]]
get_best_params(params_1_b, param_names, train, target, features)

n_estimators = 22  SMPAE = 34.3834
n_estimators = 24  SMPAE = 34.0773
n_estimators = 26  SMPAE = 33.8042
n_estimators = 28  SMPAE = 33.6313
n_estimators = 30  SMPAE = 33.5186
n_estimators = 32  SMPAE = 33.6373
n_estimators = 34  SMPAE = 33.7620
n_estimators = 36  SMPAE = 33.9932
n_estimators = 38  SMPAE = 34.3528
best parametrs:
n_estimators = 30  

In [79]:
params_2 = [[0.1], [30], range(5, 11), range(1, 6), [0], [0.9], [0.9], [4], [27], [0]]
get_best_params(params_2, param_names, train, target, features)

max_depth = 5  min_child_weight = 1  SMPAE = 39.8802
max_depth = 5  min_child_weight = 2  SMPAE = 40.1103
max_depth = 5  min_child_weight = 3  SMPAE = 40.0799
max_depth = 5  min_child_weight = 4  SMPAE = 40.2420
max_depth = 5  min_child_weight = 5  SMPAE = 40.0912
max_depth = 6  min_child_weight = 1  SMPAE = 37.3249
max_depth = 6  min_child_weight = 2  SMPAE = 37.0260
max_depth = 6  min_child_weight = 3  SMPAE = 37.3472
max_depth = 6  min_child_weight = 4  SMPAE = 37.0988
max_depth = 6  min_child_weight = 5  SMPAE = 37.1733
max_depth = 7  min_child_weight = 1  SMPAE = 35.0644
max_depth = 7  min_child_weight = 2  SMPAE = 35.1661
max_depth = 7  min_child_weight = 3  SMPAE = 35.0477
max_depth = 7  min_child_weight = 4  SMPAE = 34.9126
max_depth = 7  min_child_weight = 5  SMPAE = 34.8727
max_depth = 8  min_child_weight = 1  SMPAE = 33.5186
max_depth = 8  min_child_weight = 2  SMPAE = 33.5132
max_depth = 8  min_child_weight = 3  SMPAE = 33.6401
max_depth = 8  min_child_weight = 4  SMPAE = 3

In [80]:
params_3 = [[0.1], [30], [10], [4], np.linspace(0, 0.5, 5), [0.9], [0.9], [4], [27], [0]]
get_best_params(params_3, param_names, train, target, features)

gamma = 0.0000  SMPAE = 31.4859
gamma = 0.1250  SMPAE = 31.4859
gamma = 0.2500  SMPAE = 31.4859
gamma = 0.3750  SMPAE = 31.4859
gamma = 0.5000  SMPAE = 31.4859
best parametrs:
gamma = 0.0000  

In [82]:
params_4 = [[0.1], [30], [10], [4], [0], np.linspace(0.75, 0.95, 4),
            np.linspace(0.75, 0.95, 4), [4], [27], [0]]
get_best_params(params_4, param_names, train, target, features)

subsample = 0.7500  colsample_bytree = 0.7500  SMPAE = 31.7255
subsample = 0.7500  colsample_bytree = 0.8167  SMPAE = 31.6240
subsample = 0.7500  colsample_bytree = 0.8833  SMPAE = 31.8093
subsample = 0.7500  colsample_bytree = 0.9500  SMPAE = 31.8161
subsample = 0.8167  colsample_bytree = 0.7500  SMPAE = 31.5569
subsample = 0.8167  colsample_bytree = 0.8167  SMPAE = 31.5071
subsample = 0.8167  colsample_bytree = 0.8833  SMPAE = 31.5612
subsample = 0.8167  colsample_bytree = 0.9500  SMPAE = 31.4612
subsample = 0.8833  colsample_bytree = 0.7500  SMPAE = 31.3801
subsample = 0.8833  colsample_bytree = 0.8167  SMPAE = 31.2875
subsample = 0.8833  colsample_bytree = 0.8833  SMPAE = 31.4840
subsample = 0.8833  colsample_bytree = 0.9500  SMPAE = 31.6642
subsample = 0.9500  colsample_bytree = 0.7500  SMPAE = 31.5401
subsample = 0.9500  colsample_bytree = 0.8167  SMPAE = 31.3151
subsample = 0.9500  colsample_bytree = 0.8833  SMPAE = 31.4755
subsample = 0.9500  colsample_bytree = 0.9500  SMPAE = 

In [83]:
params_5 = [[0.1], [30], [10], [4], [0], [0.8833], [0.8167], [4], [27],
            [10**(i) for i in [3, 2, 1, 0, -1, -2, -3]]]
get_best_params(params_5, param_names, train, target, features)

reg_alpha = 1000  SMPAE = 31.5757
reg_alpha = 100  SMPAE = 31.3410
reg_alpha = 10  SMPAE = 31.2685
reg_alpha = 1  SMPAE = 31.2730
reg_alpha = 0.1000  SMPAE = 31.2730
reg_alpha = 0.0100  SMPAE = 31.2730
reg_alpha = 0.0010  SMPAE = 31.2730
best parametrs:
reg_alpha = 10  

In [84]:
params_5_b = [[0.1], [30], [10], [4], [0], [0.8833], [0.8167], [4], [27],
              range(2, 20, 2)]
get_best_params(params_5_b, param_names, train, target, features)

reg_alpha = 2  SMPAE = 31.2730
reg_alpha = 4  SMPAE = 31.2730
reg_alpha = 6  SMPAE = 31.2685
reg_alpha = 8  SMPAE = 31.2685
reg_alpha = 10  SMPAE = 31.2685
reg_alpha = 12  SMPAE = 31.2685
reg_alpha = 14  SMPAE = 31.2542
reg_alpha = 16  SMPAE = 31.2376
reg_alpha = 18  SMPAE = 31.2377
best parametrs:
reg_alpha = 16  

In [86]:
params_6 = [[0.002], [500, 1000, 1500, 2000], [10], [4], [0], [0.8833], [0.8167], [4], [27], [16]]
get_best_params(params_6, param_names, train, target, features)

n_estimators = 500  SMPAE = 50.3103
n_estimators = 1000  SMPAE = 33.0605
n_estimators = 1500  SMPAE = 31.1308
n_estimators = 2000  SMPAE = 31.7216
best parametrs:
n_estimators = 1500  

In [63]:
params_6_b = [[0.01], range(250, 360, 20), [10], [4], [0], [0.8833], [0.8167], [4], [27], []]
get_best_params(params_6_b, param_names, train, target, features)

n_estimators = 250  SMPAE = 31.4627
n_estimators = 270  SMPAE = 31.2528
n_estimators = 290  SMPAE = 31.1453
n_estimators = 310  SMPAE = 31.1384
n_estimators = 330  SMPAE = 31.1511
n_estimators = 350  SMPAE = 31.2092
best parametrs:
n_estimators = 310  

In [49]:
final_params = [0.01, 310, 10, 4, 0, 0.8833, 0.8167, 4, 27, ]

test = pd.read_csv("test.tsv")
param_dict = {}
for i, name in enumerate(param_names):
    param_dict[name] = final_params[i]

regressor = XGBRegressor(**param_dict)

Работаем с признаками так же как и для train

In [50]:
test.drop(["f{}".format(i) for i in range(31, 61)], axis=1, inplace=True)

n_buckets = 100
id_hash = test['item_id'].values % n_buckets
for i in range(n_buckets):
    test['id_hash_{}'.format(i)] = [1 if id_hash[j] == i else 0 for j in range(len(id_hash))]

In [51]:
regressor.fit(train[features].values, train[target].values, eval_metric='mae')

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.825,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=10,
       min_child_weight=5, missing=None, n_estimators=310, nthread=4,
       objective='reg:linear', reg_alpha=0.1, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.9)

Оцениваем данные, которые можно оценить непосредственно с помощью последующих данных. Сделать это можно балогодаря тому, что $f30$ это с некоторым коэффициентом $y$ для недели с номером $номер\_текущей\_недели - shift$.

In [14]:
ID_dict = {}
for i in test["item_id"].values:
    ID_dict[i] = True

IDs = list(ID_dict.keys())
ID_indexes = {}
for i, ID in enumerate(IDs):
    ID_indexes[ID] = i

In [27]:
week_data_for_ID = np.ones([3, len(IDs)]) * -1
final_answer = np.zeros(len(test['week']))
unknown_index = []


Это как раз коэффициент между $y$ и $f30$. Я вычислил его просто по одному значению, но он примерно одинаковый для всех данных.

In [58]:
train[train['item_id'] == IDs[1]][['year', 'week', 'shift']].head()

Unnamed: 0,year,week,shift
102,2012,52,1
332,2013,1,2
564,2013,2,3
793,2013,1,1
1025,2013,2,2


In [59]:
magic = (train[train['week'] == 52]
 [train['year'] == 2012]
 [train['item_id'] == IDs[1]]
 [train['shift'] == 1]['y'].values[0] /
 train[train['week'] == 1]
 [train['year'] == 2013]
 [train['item_id'] == IDs[1]]
 [train['shift'] == 1]['f30'].values[0])

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [60]:
magic

1.6099949562878277

In [29]:
for i in range(len(test['week'])):
    week_num = test['week'].values[i] - test['shift'].values[i]
    if week_num > 2:
        week_data_for_ID[week_num - 3][ID_indexes[test['item_id'].values[i]]] = magic * test['f30'].values[i]

In [31]:
for i in range(len(test['week'])):
    value = week_data_for_ID[test['week'].values[i] - 3][ID_indexes[test['item_id'].values[i]]]
    if value == - 1:
        unknown_index.append(i)
    else:
        final_answer[i] = value

In [62]:
predict_for_unknown = regressor.predict(test[features].values[unknown_index])

In [64]:
for ind, ans in zip(unknown_index, predict_for_unknown):
    final_answer[ind] = ans

In [66]:
sample_submission = pd.read_csv("sample_submission.tsv")
sample_submission['y'] = final_answer
sample_submission.to_csv("submission1.tsv", sep=',', index=False)