In [1]:
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp
import pandas as pd
import numpy as np
import gc

<h3>NA imputed with 0</h3>

In [2]:
data = pd.read_pickle('./data/data.pkl')

X_train = data[data.date_block_num < 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month', 'ID'], axis=1)

del data
gc.collect()

In [3]:
#Search space
param_space = {'max_depth': hp.quniform('max_depth', 7, 13, 1),
               'subsample': hp.uniform('subsample', 0.7, 1),
               'n_estimators': hp.quniform('n_estimators', 1000, 2000, 100),
               'eta':hp.loguniform('eta', np.log(0.01), np.log(0.9))}

#objective
def objective(params):
    model = XGBRegressor(
        max_depth = int(params['max_depth']),
        n_estimators = int(params['n_estimators']),
        objective = 'reg:squarederror',
        min_child_weight=300,
        subsample = params['subsample'],
        colsample_bytree=0.8,
        eta=params['eta'])
    model.fit(
        X_train, 
        Y_train, 
        eval_metric="rmse", 
        eval_set=[(X_valid, Y_valid)], 
        verbose=False,
        early_stopping_rounds = 10)
    
    return mean_squared_error(Y_valid, model.predict(X_valid).clip(0, 20), squared=True)

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=50, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)


100%|█████████████████████████████████████████████| 50/50 [7:28:52<00:00, 538.65s/trial, best loss: 0.8500720262527466]
{'eta': 0.054842366281169515, 'max_depth': 11.0, 'n_estimators': 1600.0, 'subsample': 0.8995297930425445}


In [6]:
model = XGBRegressor(
    objective = 'reg:squarederror',
    max_depth=11,
    n_estimators=1600,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8995297930425445, 
    eta=0.054842366281169515)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_valid, Y_valid)], 
    verbose=10,
    early_stopping_rounds = 20)

[0]	validation_0-rmse:1.18261
Will train until validation_0-rmse hasn't improved in 20 rounds.
[10]	validation_0-rmse:0.956289
[20]	validation_0-rmse:0.930723
[30]	validation_0-rmse:0.932522
[40]	validation_0-rmse:0.937173
Stopping. Best iteration:
[20]	validation_0-rmse:0.930723



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.054842366281169515,
             gamma=0, importance_type='gain', learning_rate=0.1,
             max_delta_step=0, max_depth=11, min_child_weight=300, missing=None,
             n_estimators=1600, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.8995297930425445, verbosity=1)

In [7]:
data = pd.read_csv("./data/test.csv")
data['item_cnt_month'] = model.predict(X_test).clip(0, 20)
data[['ID', 'item_cnt_month']].to_csv("./features/feature_xgboost.csv", index = False)

In [8]:
pred = model.predict(pd.concat((X_train, X_valid))).clip(0, 20)
data = pd.concat((X_train, X_valid))
data['item_cnt_month'] = pred
data.to_csv("./features_train/feature_xgboost.csv")

<h3>NA not imputed</h3>

In [2]:
data = pd.read_pickle('./data/data_na.pkl')

X_train = data[data.date_block_num < 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month', 'ID'], axis=1)

del data
gc.collect()

0

In [None]:
#Search space
param_space = {'max_depth': hp.quniform('max_depth', 7, 13, 1),
               'subsample': hp.uniform('subsample', 0.7, 1),
               'n_estimators': hp.quniform('n_estimators', 1000, 2000, 100),
               'eta':hp.loguniform('eta', np.log(0.01), np.log(0.9))}

#objective
def objective(params):
    model = XGBRegressor(
        max_depth = int(params['max_depth']),
        n_estimators = int(params['n_estimators']),
        objective = 'reg:squarederror',
        min_child_weight=300,
        subsample = params['subsample'],
        colsample_bytree=0.8,
        eta=params['eta'])
    model.fit(
        X_train, 
        Y_train, 
        eval_metric="rmse", 
        eval_set=[(X_valid, Y_valid)], 
        verbose=False, 
        early_stopping_rounds = 10)
    
    return mean_squared_error(Y_valid, model.predict(X_valid), squared=True)

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=50, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)


  6%|██▊                                           | 3/50 [35:53<9:21:27, 716.75s/trial, best loss: 0.8658555746078491]

In [3]:
model = XGBRegressor(
    objective = 'reg:squarederror',
    max_depth=13,
    n_estimators=1200,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.9928380413657569, 
    eta=0.0816209890020063)

model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_valid, Y_valid)], 
    verbose=10,
    early_stopping_rounds = 20)

[0]	validation_0-rmse:1.1786
Will train until validation_0-rmse hasn't improved in 20 rounds.
[10]	validation_0-rmse:0.947804
[20]	validation_0-rmse:0.923671
[30]	validation_0-rmse:0.931383
Stopping. Best iteration:
[19]	validation_0-rmse:0.923338



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.0816209890020063,
             gamma=0, importance_type='gain', learning_rate=0.1,
             max_delta_step=0, max_depth=13, min_child_weight=300, missing=None,
             n_estimators=1200, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
             subsample=0.9928380413657569, verbosity=1)

In [4]:
data = pd.read_csv("./data/test.csv")
data['item_cnt_month'] = model.predict(X_test).clip(0, 20)
data[['ID', 'item_cnt_month']].to_csv("./features/feature_xgboost_na.csv", index = False)

In [5]:
pred = model.predict(pd.concat((X_train, X_valid))).clip(0, 20)
data = pd.concat((X_train, X_valid))
data['item_cnt_month'] = pred
data.to_csv("./features_train/feature_xgboost_na.csv")