In [1]:
import pandas as pd
import numpy as np
import time

import xgboost as xgb
import lightgbm as lgb

In [2]:
train_data = pd.read_pickle('pd_data_train.pkl')
valid_data = pd.read_pickle('pd_data_valid.pkl')
test_data = pd.read_pickle('pd_data_test.pkl')

In [3]:
from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

In [10]:
def get_X_y(data):
#     data = data[data.date_block_num > 11]
    X = data.drop(['date_block_num', 'item_cnt_month', 'shop_id', 'item_id'], axis=1)
    y = data['item_cnt_month']
    return X, y

In [5]:
train_X, train_y = get_X_y(train_data)
valid_X, valid_y = get_X_y(valid_data)
test_X, test_y = get_X_y(test_data)

In [6]:
del train_data
del valid_data
del test_data

In [9]:
categorical_columns = ['city_id', 'item_category_id', 'category_id', 'subcategory_id', 'subsubcategory_id', 'month', 'season']

In [10]:
train_data = lgb.Dataset(train_X, label=train_y, categorical_feature=categorical_columns)
train_data.save_binary('lgb_train_dataset.bin')

<lightgbm.basic.Dataset at 0x1b6e900f7c8>

In [11]:
valid_data = lgb.Dataset(valid_X, label=valid_y, categorical_feature=categorical_columns, reference=train_data)
valid_data.save_binary('lgb_valid_dataset.bin')

<lightgbm.basic.Dataset at 0x1b6e900f3c8>

In [12]:
test_data = lgb.Dataset(test_X, label=test_y, categorical_feature=categorical_columns)
test_data.save_binary('lgb_test_dataset.bin')

<lightgbm.basic.Dataset at 0x1b6e900fa48>

<h1>Use lightGBM</h1>

In [2]:
import pandas as pd
import numpy as np
import time

import xgboost as xgb
import lightgbm as lgb

In [3]:
train_data = lgb.Dataset('lgb_train_dataset.bin')

In [4]:
valid_data = lgb.Dataset('lgb_valid_dataset.bin')

In [4]:
from sklearn.model_selection import ParameterGrid, ParameterSampler

param_test = {
    'num_leaves':  [2**5, 2**7, 2**9],
    'subsample': [0.5, 0.7, 0.9], 
    'reg_alpha': [0, 1e-1, 1, 2, 5],
    'reg_lambda': [0, 1e-1, 1, 2, 5],
    'max_depth' : [3, 5, 7],
    'metric': ['rmse'],
    'num_threads': [4],
    'objective': ['regression_l2', None],
}

sampler = ParameterSampler(param_test, n_iter=25, random_state=123)

In [5]:
num_round = 100
early_stopping_rounds = 10

In [5]:
import os

In [6]:
trained_models_dir = 'trained_models_dir'
trained_models_dir = trained_models_dir + '/lgb'
if not os.path.exists(trained_models_dir):
    os.makedirs(trained_models_dir)

In [8]:
def get_inner_dirs(path):
    for file in os.listdir(path):
        if os.path.isdir(os.path.join(path, file)):
            yield file

In [9]:
def __create_model_directory(models_dir) -> str:
    inner_dirs = list(get_inner_dirs(models_dir))
    folder_name = 'gs_{}'.format(len(inner_dirs))

    model_dir = os.path.join(models_dir, folder_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    return model_dir, folder_name

In [7]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [16]:
def get_X_y(data):
    #     data = data[data.date_block_num > 11]
    X = data.drop(['date_block_num', 'item_cnt_month', 'shop_id', 'item_id'], axis=1)
    y = data['item_cnt_month']
    return X, y

In [8]:
import json

In [9]:
search_results = []
search_results_file = os.path.join(trained_models_dir, 'search_results.json')

In [13]:
for params in sampler:
        print('current parameters : {}'.format(params))

        model_dir, folder_name = __create_model_directory(trained_models_dir)
        print('model directory : {}'.format(model_dir))

        bst = lgb.train(params, train_data, num_round, valid_sets=[train_data, valid_data], early_stopping_rounds=early_stopping_rounds)
        bst.save_model(os.path.join(model_dir, 'lgb_model_filtered.txt'))
        
        _train_data = pd.read_pickle('pd_data_train.pkl')
        _valid_data = pd.read_pickle('pd_data_valid.pkl')
        _test_data = pd.read_pickle('pd_data_test.pkl')
        
        train_X, train_y = get_X_y(_train_data)
        valid_X, valid_y = get_X_y(_valid_data)
        test_X, test_y = get_X_y(_test_data)
        
        del _train_data
        del _valid_data
        del _test_data
        
        mean_cv_scores = [
            rmse(train_y, bst.predict(train_X)),
            rmse(valid_y, bst.predict(valid_X)),
            rmse(test_y, bst.predict(test_X)),
        ]
        
        write_message = dict(
            params=params,
            scores=dict(
                train=mean_cv_scores[0],
                val=mean_cv_scores[1],
                test=mean_cv_scores[2],
            ),
            model_path=model_dir,
        )

        search_results.append(write_message)

        with open(search_results_file, 'w') as file:
            json.dump(search_results, file)
            print('overwrite model parameters file ({})'.format(search_results_file))

current parameters : {'subsample': 0.5, 'reg_lambda': 1, 'reg_alpha': 2, 'objective': None, 'num_threads': 4, 'num_leaves': 128, 'metric': 'rmse', 'max_depth': 7}
model directory : trained_models_dir/lgb\gs_0
[1]	training's rmse: 1.27486	valid_1's rmse: 1.0985
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 1.2256	valid_1's rmse: 1.06527
[3]	training's rmse: 1.18356	valid_1's rmse: 1.0375
[4]	training's rmse: 1.14787	valid_1's rmse: 1.01458
[5]	training's rmse: 1.11706	valid_1's rmse: 0.996947
[6]	training's rmse: 1.09089	valid_1's rmse: 0.979421
[7]	training's rmse: 1.06901	valid_1's rmse: 0.967127
[8]	training's rmse: 1.04937	valid_1's rmse: 0.955443
[9]	training's rmse: 1.03325	valid_1's rmse: 0.947147
[10]	training's rmse: 1.01951	valid_1's rmse: 0.939646
[11]	training's rmse: 1.00778	valid_1's rmse: 0.933298
[12]	training's rmse: 0.997571	valid_1's rmse: 0.92762
[13]	training's rmse: 0.988772	valid_1's rmse: 0.923798
[14]	training's rmse: 0.981037

[88]	training's rmse: 0.975801	valid_1's rmse: 0.922611
[89]	training's rmse: 0.975569	valid_1's rmse: 0.92241
[90]	training's rmse: 0.975367	valid_1's rmse: 0.922359
[91]	training's rmse: 0.975226	valid_1's rmse: 0.922371
[92]	training's rmse: 0.974831	valid_1's rmse: 0.921995
[93]	training's rmse: 0.974617	valid_1's rmse: 0.921992
[94]	training's rmse: 0.974401	valid_1's rmse: 0.922066
[95]	training's rmse: 0.974282	valid_1's rmse: 0.922066
[96]	training's rmse: 0.973902	valid_1's rmse: 0.922006
[97]	training's rmse: 0.973483	valid_1's rmse: 0.921986
[98]	training's rmse: 0.973169	valid_1's rmse: 0.921969
[99]	training's rmse: 0.972893	valid_1's rmse: 0.92174
[100]	training's rmse: 0.972698	valid_1's rmse: 0.921784
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.972698	valid_1's rmse: 0.921784
overwrite model parameters file (trained_models_dir/lgb\search_results.json)
current parameters : {'subsample': 0.5, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'objective': '

[20]	training's rmse: 0.987333	valid_1's rmse: 0.920727
[21]	training's rmse: 0.984065	valid_1's rmse: 0.919543
[22]	training's rmse: 0.981012	valid_1's rmse: 0.918451
[23]	training's rmse: 0.978296	valid_1's rmse: 0.917301
[24]	training's rmse: 0.975566	valid_1's rmse: 0.916229
[25]	training's rmse: 0.973287	valid_1's rmse: 0.915935
[26]	training's rmse: 0.971639	valid_1's rmse: 0.915493
[27]	training's rmse: 0.969626	valid_1's rmse: 0.91508
[28]	training's rmse: 0.968211	valid_1's rmse: 0.915046
[29]	training's rmse: 0.966494	valid_1's rmse: 0.914246
[30]	training's rmse: 0.964831	valid_1's rmse: 0.913873
[31]	training's rmse: 0.96379	valid_1's rmse: 0.913845
[32]	training's rmse: 0.962505	valid_1's rmse: 0.913566
[33]	training's rmse: 0.961548	valid_1's rmse: 0.913375
[34]	training's rmse: 0.960317	valid_1's rmse: 0.913079
[35]	training's rmse: 0.959225	valid_1's rmse: 0.912623
[36]	training's rmse: 0.957987	valid_1's rmse: 0.912406
[37]	training's rmse: 0.957106	valid_1's rmse: 0.9

[60]	training's rmse: 0.940462	valid_1's rmse: 0.909842
[61]	training's rmse: 0.940051	valid_1's rmse: 0.909764
[62]	training's rmse: 0.939676	valid_1's rmse: 0.90973
[63]	training's rmse: 0.939226	valid_1's rmse: 0.909489
[64]	training's rmse: 0.938829	valid_1's rmse: 0.909715
[65]	training's rmse: 0.938261	valid_1's rmse: 0.90988
[66]	training's rmse: 0.937722	valid_1's rmse: 0.909822
[67]	training's rmse: 0.937377	valid_1's rmse: 0.909691
[68]	training's rmse: 0.936959	valid_1's rmse: 0.90955
[69]	training's rmse: 0.93669	valid_1's rmse: 0.909537
[70]	training's rmse: 0.935919	valid_1's rmse: 0.90914
[71]	training's rmse: 0.935039	valid_1's rmse: 0.908619
[72]	training's rmse: 0.934426	valid_1's rmse: 0.908284
[73]	training's rmse: 0.934067	valid_1's rmse: 0.907928
[74]	training's rmse: 0.933706	valid_1's rmse: 0.908078
[75]	training's rmse: 0.93323	valid_1's rmse: 0.906719
[76]	training's rmse: 0.932945	valid_1's rmse: 0.906576
[77]	training's rmse: 0.932592	valid_1's rmse: 0.90663

[4]	training's rmse: 1.14764	valid_1's rmse: 1.01452
[5]	training's rmse: 1.11683	valid_1's rmse: 0.996839
[6]	training's rmse: 1.09065	valid_1's rmse: 0.979593
[7]	training's rmse: 1.06902	valid_1's rmse: 0.967331
[8]	training's rmse: 1.04934	valid_1's rmse: 0.95571
[9]	training's rmse: 1.0332	valid_1's rmse: 0.946977
[10]	training's rmse: 1.01918	valid_1's rmse: 0.938938
[11]	training's rmse: 1.00719	valid_1's rmse: 0.932917
[12]	training's rmse: 0.996892	valid_1's rmse: 0.927882
[13]	training's rmse: 0.987001	valid_1's rmse: 0.924376
[14]	training's rmse: 0.979214	valid_1's rmse: 0.921028
[15]	training's rmse: 0.972616	valid_1's rmse: 0.917882
[16]	training's rmse: 0.966387	valid_1's rmse: 0.915683
[17]	training's rmse: 0.961488	valid_1's rmse: 0.913913
[18]	training's rmse: 0.956889	valid_1's rmse: 0.912733
[19]	training's rmse: 0.9527	valid_1's rmse: 0.911079
[20]	training's rmse: 0.949149	valid_1's rmse: 0.909649
[21]	training's rmse: 0.945676	valid_1's rmse: 0.908839
[22]	traini

[96]	training's rmse: 0.974006	valid_1's rmse: 0.922543
[97]	training's rmse: 0.973797	valid_1's rmse: 0.922514
[98]	training's rmse: 0.9735	valid_1's rmse: 0.922398
[99]	training's rmse: 0.973279	valid_1's rmse: 0.922466
[100]	training's rmse: 0.972916	valid_1's rmse: 0.922476
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.972916	valid_1's rmse: 0.922476
overwrite model parameters file (trained_models_dir/lgb\search_results.json)
current parameters : {'subsample': 0.9, 'reg_lambda': 5, 'reg_alpha': 0.1, 'objective': 'regression_l2', 'num_threads': 4, 'num_leaves': 512, 'metric': 'rmse', 'max_depth': 7}
model directory : trained_models_dir/lgb\gs_8
[1]	training's rmse: 1.27516	valid_1's rmse: 1.09856
Training until validation scores don't improve for 10 rounds
[2]	training's rmse: 1.22606	valid_1's rmse: 1.06591
[3]	training's rmse: 1.18425	valid_1's rmse: 1.03817
[4]	training's rmse: 1.14861	valid_1's rmse: 1.01586
[5]	training's rmse: 1.11782	valid_1's rmse:

[85]	training's rmse: 0.976524	valid_1's rmse: 0.921383
[86]	training's rmse: 0.976327	valid_1's rmse: 0.921241
[87]	training's rmse: 0.975943	valid_1's rmse: 0.920781
[88]	training's rmse: 0.975327	valid_1's rmse: 0.920586
[89]	training's rmse: 0.975013	valid_1's rmse: 0.920767
[90]	training's rmse: 0.974799	valid_1's rmse: 0.920746
[91]	training's rmse: 0.974527	valid_1's rmse: 0.920619
[92]	training's rmse: 0.974154	valid_1's rmse: 0.920591
[93]	training's rmse: 0.973858	valid_1's rmse: 0.920683
[94]	training's rmse: 0.973724	valid_1's rmse: 0.920582
[95]	training's rmse: 0.973511	valid_1's rmse: 0.920619
[96]	training's rmse: 0.973308	valid_1's rmse: 0.920567
[97]	training's rmse: 0.973079	valid_1's rmse: 0.920476
[98]	training's rmse: 0.972721	valid_1's rmse: 0.920364
[99]	training's rmse: 0.972527	valid_1's rmse: 0.920363
[100]	training's rmse: 0.972263	valid_1's rmse: 0.920245
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.972263	valid_1's rmse: 0.92024

[28]	training's rmse: 0.967351	valid_1's rmse: 0.915606
[29]	training's rmse: 0.965808	valid_1's rmse: 0.915119
[30]	training's rmse: 0.964663	valid_1's rmse: 0.915051
[31]	training's rmse: 0.963221	valid_1's rmse: 0.91445
[32]	training's rmse: 0.962139	valid_1's rmse: 0.914089
[33]	training's rmse: 0.96093	valid_1's rmse: 0.913661
[34]	training's rmse: 0.959781	valid_1's rmse: 0.913261
[35]	training's rmse: 0.958681	valid_1's rmse: 0.913067
[36]	training's rmse: 0.957815	valid_1's rmse: 0.913149
[37]	training's rmse: 0.956994	valid_1's rmse: 0.912819
[38]	training's rmse: 0.956336	valid_1's rmse: 0.912588
[39]	training's rmse: 0.955155	valid_1's rmse: 0.912194
[40]	training's rmse: 0.954055	valid_1's rmse: 0.911838
[41]	training's rmse: 0.953127	valid_1's rmse: 0.911661
[42]	training's rmse: 0.952266	valid_1's rmse: 0.911539
[43]	training's rmse: 0.951648	valid_1's rmse: 0.911223
[44]	training's rmse: 0.950953	valid_1's rmse: 0.911796
[45]	training's rmse: 0.950205	valid_1's rmse: 0.9

[28]	training's rmse: 0.928758	valid_1's rmse: 0.903284
[29]	training's rmse: 0.926645	valid_1's rmse: 0.902927
[30]	training's rmse: 0.924951	valid_1's rmse: 0.903682
[31]	training's rmse: 0.923143	valid_1's rmse: 0.902644
[32]	training's rmse: 0.92126	valid_1's rmse: 0.903033
[33]	training's rmse: 0.920194	valid_1's rmse: 0.902948
[34]	training's rmse: 0.918543	valid_1's rmse: 0.902208
[35]	training's rmse: 0.917635	valid_1's rmse: 0.902001
[36]	training's rmse: 0.916438	valid_1's rmse: 0.901773
[37]	training's rmse: 0.915523	valid_1's rmse: 0.901814
[38]	training's rmse: 0.914503	valid_1's rmse: 0.901732
[39]	training's rmse: 0.91328	valid_1's rmse: 0.901498
[40]	training's rmse: 0.911997	valid_1's rmse: 0.901718
[41]	training's rmse: 0.911317	valid_1's rmse: 0.901632
[42]	training's rmse: 0.910156	valid_1's rmse: 0.902193
[43]	training's rmse: 0.909557	valid_1's rmse: 0.902129
[44]	training's rmse: 0.908464	valid_1's rmse: 0.902444
[45]	training's rmse: 0.907918	valid_1's rmse: 0.9

[11]	training's rmse: 1.04008	valid_1's rmse: 0.946935
[12]	training's rmse: 1.03105	valid_1's rmse: 0.942223
[13]	training's rmse: 1.02257	valid_1's rmse: 0.937368
[14]	training's rmse: 1.01563	valid_1's rmse: 0.933998
[15]	training's rmse: 1.00947	valid_1's rmse: 0.931442
[16]	training's rmse: 1.00407	valid_1's rmse: 0.929074
[17]	training's rmse: 0.999237	valid_1's rmse: 0.92676
[18]	training's rmse: 0.994822	valid_1's rmse: 0.924843
[19]	training's rmse: 0.991438	valid_1's rmse: 0.923913
[20]	training's rmse: 0.988044	valid_1's rmse: 0.9223
[21]	training's rmse: 0.984782	valid_1's rmse: 0.92079
[22]	training's rmse: 0.982073	valid_1's rmse: 0.919608
[23]	training's rmse: 0.979641	valid_1's rmse: 0.91895
[24]	training's rmse: 0.976575	valid_1's rmse: 0.917619
[25]	training's rmse: 0.974118	valid_1's rmse: 0.916739
[26]	training's rmse: 0.972469	valid_1's rmse: 0.916629
[27]	training's rmse: 0.970266	valid_1's rmse: 0.915795
[28]	training's rmse: 0.968774	valid_1's rmse: 0.915585
[29

[51]	training's rmse: 0.993934	valid_1's rmse: 0.927363
[52]	training's rmse: 0.99229	valid_1's rmse: 0.92626
[53]	training's rmse: 0.991594	valid_1's rmse: 0.92608
[54]	training's rmse: 0.989979	valid_1's rmse: 0.925467
[55]	training's rmse: 0.989289	valid_1's rmse: 0.924849
[56]	training's rmse: 0.988811	valid_1's rmse: 0.925258
[57]	training's rmse: 0.988284	valid_1's rmse: 0.925475
[58]	training's rmse: 0.987747	valid_1's rmse: 0.925459
[59]	training's rmse: 0.987431	valid_1's rmse: 0.925365
[60]	training's rmse: 0.987132	valid_1's rmse: 0.925337
[61]	training's rmse: 0.986552	valid_1's rmse: 0.925111
[62]	training's rmse: 0.986062	valid_1's rmse: 0.924898
[63]	training's rmse: 0.985576	valid_1's rmse: 0.924849
[64]	training's rmse: 0.984669	valid_1's rmse: 0.924522
[65]	training's rmse: 0.984232	valid_1's rmse: 0.924414
[66]	training's rmse: 0.983832	valid_1's rmse: 0.924172
[67]	training's rmse: 0.983505	valid_1's rmse: 0.923866
[68]	training's rmse: 0.983297	valid_1's rmse: 0.92

[91]	training's rmse: 0.974527	valid_1's rmse: 0.920619
[92]	training's rmse: 0.974154	valid_1's rmse: 0.920591
[93]	training's rmse: 0.973858	valid_1's rmse: 0.920683
[94]	training's rmse: 0.973724	valid_1's rmse: 0.920582
[95]	training's rmse: 0.973511	valid_1's rmse: 0.920619
[96]	training's rmse: 0.973308	valid_1's rmse: 0.920567
[97]	training's rmse: 0.973079	valid_1's rmse: 0.920476
[98]	training's rmse: 0.972721	valid_1's rmse: 0.920364
[99]	training's rmse: 0.972527	valid_1's rmse: 0.920363
[100]	training's rmse: 0.972263	valid_1's rmse: 0.920245
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.972263	valid_1's rmse: 0.920245
overwrite model parameters file (trained_models_dir/lgb\search_results.json)
current parameters : {'subsample': 0.5, 'reg_lambda': 2, 'reg_alpha': 0.1, 'objective': None, 'num_threads': 4, 'num_leaves': 512, 'metric': 'rmse', 'max_depth': 5}
model directory : trained_models_dir/lgb\gs_18
[1]	training's rmse: 1.27977	valid_1's rmse: 

[25]	training's rmse: 1.03063	valid_1's rmse: 0.948627
[26]	training's rmse: 1.02858	valid_1's rmse: 0.947303
[27]	training's rmse: 1.02666	valid_1's rmse: 0.945891
[28]	training's rmse: 1.02489	valid_1's rmse: 0.944908
[29]	training's rmse: 1.02119	valid_1's rmse: 0.94173
[30]	training's rmse: 1.01993	valid_1's rmse: 0.941028
[31]	training's rmse: 1.01866	valid_1's rmse: 0.940676
[32]	training's rmse: 1.01764	valid_1's rmse: 0.940386
[33]	training's rmse: 1.01371	valid_1's rmse: 0.937972
[34]	training's rmse: 1.01048	valid_1's rmse: 0.935668
[35]	training's rmse: 1.00806	valid_1's rmse: 0.933975
[36]	training's rmse: 1.00719	valid_1's rmse: 0.933409
[37]	training's rmse: 1.00632	valid_1's rmse: 0.93331
[38]	training's rmse: 1.00519	valid_1's rmse: 0.932635
[39]	training's rmse: 1.0032	valid_1's rmse: 0.93137
[40]	training's rmse: 1.00245	valid_1's rmse: 0.931082
[41]	training's rmse: 1.00188	valid_1's rmse: 0.930934
[42]	training's rmse: 1.00141	valid_1's rmse: 0.930772
[43]	training'

[66]	training's rmse: 0.983904	valid_1's rmse: 0.925238
[67]	training's rmse: 0.98344	valid_1's rmse: 0.925367
[68]	training's rmse: 0.983078	valid_1's rmse: 0.925417
[69]	training's rmse: 0.982554	valid_1's rmse: 0.924851
[70]	training's rmse: 0.982236	valid_1's rmse: 0.924574
[71]	training's rmse: 0.981965	valid_1's rmse: 0.924513
[72]	training's rmse: 0.981681	valid_1's rmse: 0.924428
[73]	training's rmse: 0.981448	valid_1's rmse: 0.924372
[74]	training's rmse: 0.980987	valid_1's rmse: 0.924791
[75]	training's rmse: 0.980623	valid_1's rmse: 0.924572
[76]	training's rmse: 0.980439	valid_1's rmse: 0.924562
[77]	training's rmse: 0.97968	valid_1's rmse: 0.924152
[78]	training's rmse: 0.979252	valid_1's rmse: 0.924006
[79]	training's rmse: 0.97894	valid_1's rmse: 0.924014
[80]	training's rmse: 0.978642	valid_1's rmse: 0.923949
[81]	training's rmse: 0.978258	valid_1's rmse: 0.924135
[82]	training's rmse: 0.978023	valid_1's rmse: 0.924004
[83]	training's rmse: 0.977741	valid_1's rmse: 0.92

[13]	training's rmse: 1.07433	valid_1's rmse: 0.975212
[14]	training's rmse: 1.06868	valid_1's rmse: 0.972296
[15]	training's rmse: 1.06372	valid_1's rmse: 0.969
[16]	training's rmse: 1.05877	valid_1's rmse: 0.96569
[17]	training's rmse: 1.05472	valid_1's rmse: 0.963342
[18]	training's rmse: 1.05057	valid_1's rmse: 0.960502
[19]	training's rmse: 1.04751	valid_1's rmse: 0.959484
[20]	training's rmse: 1.0444	valid_1's rmse: 0.957097
[21]	training's rmse: 1.04115	valid_1's rmse: 0.955163
[22]	training's rmse: 1.03749	valid_1's rmse: 0.95312
[23]	training's rmse: 1.03493	valid_1's rmse: 0.95121
[24]	training's rmse: 1.03262	valid_1's rmse: 0.949404
[25]	training's rmse: 1.03064	valid_1's rmse: 0.948632
[26]	training's rmse: 1.02858	valid_1's rmse: 0.947308
[27]	training's rmse: 1.02667	valid_1's rmse: 0.945896
[28]	training's rmse: 1.02489	valid_1's rmse: 0.944912
[29]	training's rmse: 1.0212	valid_1's rmse: 0.941734
[30]	training's rmse: 1.01994	valid_1's rmse: 0.941032
[31]	training's rm

[54]	training's rmse: 0.944111	valid_1's rmse: 0.910308
[55]	training's rmse: 0.943261	valid_1's rmse: 0.909873
[56]	training's rmse: 0.942827	valid_1's rmse: 0.909852
[57]	training's rmse: 0.942279	valid_1's rmse: 0.909794
[58]	training's rmse: 0.94172	valid_1's rmse: 0.909725
[59]	training's rmse: 0.941295	valid_1's rmse: 0.90967
[60]	training's rmse: 0.94046	valid_1's rmse: 0.909842
[61]	training's rmse: 0.940049	valid_1's rmse: 0.909764
[62]	training's rmse: 0.939673	valid_1's rmse: 0.90973
[63]	training's rmse: 0.939223	valid_1's rmse: 0.909489
[64]	training's rmse: 0.938826	valid_1's rmse: 0.909715
[65]	training's rmse: 0.938258	valid_1's rmse: 0.90988
[66]	training's rmse: 0.937719	valid_1's rmse: 0.909822
[67]	training's rmse: 0.937375	valid_1's rmse: 0.909691
[68]	training's rmse: 0.936957	valid_1's rmse: 0.90955
[69]	training's rmse: 0.936688	valid_1's rmse: 0.909537
[70]	training's rmse: 0.935917	valid_1's rmse: 0.90914
[71]	training's rmse: 0.935036	valid_1's rmse: 0.908619

In [10]:
with open(search_results_file, 'r') as file:
    search_results = json.load(file)

In [11]:
val_scores = [record['scores']['val'] for record in search_results]
min_index = val_scores.index(min(val_scores))

In [12]:
best_model = search_results[min_index]
best_model

{'params': {'subsample': 0.5,
  'reg_lambda': 1,
  'reg_alpha': 2,
  'objective': None,
  'num_threads': 4,
  'num_leaves': 128,
  'metric': 'rmse',
  'max_depth': 7},
 'scores': {'train': 1.0427203767267288,
  'val': 0.89985226706254,
  'test': 0.8557145824524979},
 'model_path': 'trained_models_dir/lgb\\gs_0'}

In [13]:
bst = lgb.Booster(model_file=os.path.join(best_model['model_path'], 'lgb_model_filtered.txt'))

In [15]:
_train_data = pd.read_pickle('D:/ml_data/pd_data_train.pkl')
_valid_data = pd.read_pickle('D:/ml_data/pd_data_valid.pkl')
_test_data = pd.read_pickle('D:/ml_data/pd_data_test.pkl')

NameError: name 'get_X_y' is not defined

In [17]:
train_X, train_y = get_X_y(_train_data)
valid_X, valid_y = get_X_y(_valid_data)
test_X, test_y = get_X_y(_test_data)

del _train_data
del _valid_data
del _test_data

train_pred = bst.predict(train_X)
valid_pred = bst.predict(valid_X)
test_pred = bst.predict(test_X)

mean_cv_scores = [
    rmse(train_y, train_pred),
    rmse(valid_y, valid_pred),
    rmse(test_y, test_pred),
]

In [22]:
mean_cv_scores

[1.0427203767267288, 0.89985226706254, 0.8557145824524979]

In [20]:
import pickle

model_name = 'lgb'
with open('{}_train_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(train_pred, f)

with open('{}_valid_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(valid_pred, f)

with open('{}_test_pred.pickle'.format(model_name), 'wb') as f:
    pickle.dump(test_pred, f)

In [23]:
with open('{}_train.pickle'.format(model_name), 'wb') as f:
    pickle.dump(train_y, f)

with open('{}_valid.pickle'.format(model_name), 'wb') as f:
    pickle.dump(valid_y, f)

In [31]:
from module.data.read_data import *

test = test_file_processing().set_index('ID')
test.shape

(214200, 3)

In [3]:
test_data = pd.read_pickle('pd_data_test.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'pd_data_test.pkl'

In [33]:
test_X, test_y = get_X_y(test_data)

In [34]:
import pickle

Y_test = bst.predict(test_X).clip(0, 20)

print(test.shape)
print(Y_test.shape)

submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test
})

submission.to_csv('lgb_submission_filtered.csv', index=False)
pickle.dump(Y_test, open('lgb_test_filtered.pickle', 'wb'))

(214200, 3)
(214200,)


In [35]:
submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.916558
1,1,0.369198
2,2,1.303552
3,3,0.493867
4,4,4.457535


In [36]:
submission.min()

ID                0.0
item_cnt_month    0.0
dtype: float64

In [37]:
submission.max()

ID                214199.0000
item_cnt_month        19.8314
dtype: float64