In [1]:
from lightgbm import LGBMRegressor
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import gc

<h3>NA imputed with 0</h3>

In [2]:
data = pd.read_pickle('./data/data.pkl')

X_train = data[data.date_block_num < 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month', 'ID'], axis=1)

del data
gc.collect()

In [3]:
#Search space
param_space = {
    'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.1)),
    'max_depth': hp.quniform('max_depth', 7, 20, 1),
    'subsample': hp.uniform('subsample', 0.1, 0.9),
    'n_estimators': hp.quniform('n_estimators', 500, 2000, 100),
    'num_leaves': hp.quniform('num_leaves', 50, 250, 10)}

#objective
def objective(params):
    parameters = {
        'learning_rate': params['learning_rate'],
        'max_depth': int(params['max_depth']), 
        'boosting': 'gbdt', 
        'objective': 'regression',
        'n_estimators' : int(params['n_estimators']),
        'metric': 'rmse', 
        'is_training_metric': True, 
        'num_leaves': int(params['num_leaves']),
        'subsample' : params['subsample'],
        'feature_fraction': 0.9, 
        'bagging_fraction': 0.7, 
        'bagging_freq': 5}
    model = LGBMRegressor(**parameters)
    
    model.fit(
        X_train,
        Y_train, 
        eval_metric="rmse", 
        eval_set=[(X_valid, Y_valid)], 
        verbose=False, 
        early_stopping_rounds = 10)
    
    return mean_squared_error(Y_valid, model.predict(X_valid).clip(0, 20), squared=True)

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=1000, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)

100%|██████████████████████████████████████████| 1000/1000 [4:08:24<00:00, 14.90s/trial, best loss: 0.8638107966557266]
{'learning_rate': 0.03468199052620827, 'max_depth': 10.0, 'n_estimators': 700.0, 'num_leaves': 190.0, 'subsample': 0.8431376859767725}


In [17]:
parameters = {
        'learning_rate': 0.03468199052620827,
        'max_depth': 10, 
        'boosting': 'gbdt', 
        'objective': 'regression',
        'n_estimators' : 700,
        'metric': 'rmse', 
        'is_training_metric': True, 
        'num_leaves': 190,
        'subsample' : 0.8431376859767725}
model = LGBMRegressor(**parameters)
    
model.fit(
    X_train,
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_valid, Y_valid)], 
    verbose=10,
    early_stopping_rounds = 100)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's rmse: 1.09625
[20]	valid_0's rmse: 1.01699
[30]	valid_0's rmse: 0.978223
[40]	valid_0's rmse: 0.958632
[50]	valid_0's rmse: 0.948649
[60]	valid_0's rmse: 0.943686
[70]	valid_0's rmse: 0.942817
[80]	valid_0's rmse: 0.944902
[90]	valid_0's rmse: 0.948934
[100]	valid_0's rmse: 0.95182
[110]	valid_0's rmse: 0.954684
[120]	valid_0's rmse: 0.956235
[130]	valid_0's rmse: 0.956447
[140]	valid_0's rmse: 0.957937
[150]	valid_0's rmse: 0.958939
[160]	valid_0's rmse: 0.960486
Early stopping, best iteration is:
[64]	valid_0's rmse: 0.9426


LGBMRegressor(boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, importance_type='split',
              is_training_metric=True, learning_rate=0.03468199052620827,
              max_depth=10, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=700,
              n_jobs=-1, num_leaves=190, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=0.8431376859767725, subsample_for_bin=200000,
              subsample_freq=0)

In [15]:
data = pd.read_csv("./data/test.csv")
data['item_cnt_month'] = model.predict(X_test).clip(0, 20)
data[['ID', 'item_cnt_month']].to_csv("./features/feature_lgbm.csv", index = False)

In [16]:
pred = model.predict(pd.concat((X_train, X_valid))).clip(0, 20)
data = pd.concat((X_train, X_valid))
data['item_cnt_month'] = pred
data.to_csv("./features_train/feature_lgbm.csv")

<h3>NA not imputed</h3>

In [2]:
data = pd.read_pickle('./data/data_na.pkl')

X_train = data[data.date_block_num < 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_train = data[data.date_block_num < 33]['item_cnt_month']
X_valid = data[data.date_block_num == 33].drop(['item_cnt_month', 'ID'], axis=1)
Y_valid = data[data.date_block_num == 33]['item_cnt_month']
X_test = data[data.date_block_num == 34].drop(['item_cnt_month', 'ID'], axis=1)

del data
gc.collect()

0

In [None]:
#Search space
param_space = {
    'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.1)),
    'max_depth': hp.quniform('max_depth', 7, 20, 1),
    'subsample': hp.uniform('subsample', 0.1, 0.9),
    'n_estimators': hp.quniform('n_estimators', 500, 2000, 100),
    'num_leaves': hp.quniform('num_leaves', 50, 250, 10)}

#objective
def objective(params):
    parameters = {
        'learning_rate': params['learning_rate'],
        'max_depth': int(params['max_depth']), 
        'boosting': 'gbdt', 
        'objective': 'regression',
        'n_estimators' : int(params['n_estimators']),
        'metric': 'rmse', 
        'is_training_metric': True, 
        'num_leaves': int(params['num_leaves']),
        'subsample' : params['subsample']}
    model = LGBMRegressor(**parameters)
    
    model.fit(
        X_train,
        Y_train, 
        eval_metric="rmse", 
        eval_set=[(X_valid, Y_valid)], 
        verbose=False,
        early_stopping_rounds = 10)
    
    return mean_squared_error(Y_valid, model.predict(X_valid), squared=True)

#run
best = fmin(fn=objective, space=param_space, 
            max_evals=1000, 
            rstate=np.random.RandomState(777), 
            algo=tpe.suggest)
print(best)

 98%|█████████████████████████████████████████▉ | 976/1000 [3:54:04<04:36, 11.51s/trial, best loss: 0.8614991173810351]

In [9]:
parameters = {
        'learning_rate': 0.05478584195652779,
        'max_depth': 13, 
        'boosting': 'gbdt', 
        'objective': 'regression',
        'n_estimators' : 1400,
        'metric': 'rmse', 
        'is_training_metric': True, 
        'num_leaves': 140,
        'subsample' : 0.33150640732190106}
model = LGBMRegressor(**parameters)
    
model.fit(
    X_train,
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_valid, Y_valid)], 
    verbose=10,
    early_stopping_rounds = 100)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's rmse: 1.03835
[20]	valid_0's rmse: 0.95918
[30]	valid_0's rmse: 0.934791
[40]	valid_0's rmse: 0.930065
[50]	valid_0's rmse: 0.931467
[60]	valid_0's rmse: 0.934066
[70]	valid_0's rmse: 0.937915
[80]	valid_0's rmse: 0.938713
[90]	valid_0's rmse: 0.941475
[100]	valid_0's rmse: 0.941933
[110]	valid_0's rmse: 0.941998
[120]	valid_0's rmse: 0.942466
[130]	valid_0's rmse: 0.940318
[140]	valid_0's rmse: 0.939039
Early stopping, best iteration is:
[45]	valid_0's rmse: 0.92817


LGBMRegressor(boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, importance_type='split',
              is_training_metric=True, learning_rate=0.05478584195652779,
              max_depth=13, metric='rmse', min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=1400,
              n_jobs=-1, num_leaves=140, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=0.33150640732190106, subsample_for_bin=200000,
              subsample_freq=0)

In [12]:
data = pd.read_csv("./data/test.csv")
data['item_cnt_month'] = model.predict(X_test).clip(0, 20)
data[['ID', 'item_cnt_month']].to_csv("./features/feature_lgbm_na.csv", index = False)

In [13]:
pred = model.predict(pd.concat((X_train, X_valid))).clip(0, 20)
data = pd.concat((X_train, X_valid))
data['item_cnt_month'] = pred
data.to_csv("./features_train/feature_lgbm_na.csv")