In [190]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [191]:
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

In [192]:
# Read data
def get_data_by_store(store):
    
    # Read and contact basic feature
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)
    
    # Leave only relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read 
    # lags and mean encoding features
    # separately and drop items that we don't need.
    # As our Features Grids are aligned 
    # we can use index to keep only necessary rows
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]
    
    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]
    
    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit 
    
    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit 
    
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
    df = df[['id','d',TARGET]+features]
    
    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)
    
    return df, features

# Recombine Test set after training
def get_base_test():
    base_test = pd.DataFrame()

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        base_test = pd.concat([base_test, temp_df]).reset_index(drop=True)
    
    return base_test


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'sales_lag_'+str(LAG_DAY)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(LAG_DAY)).astype(np.float16)
    return lag_df[[col_name]]


def make_lag_roll(LAG_DAY):
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    lag_df = base_test[['id','d',TARGET]]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(shift_day).rolling(roll_wind).mean())
    return lag_df[[col_name]]

In [4]:
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 1400,
                    'boost_from_average': False,
                    'verbose': -1,
                } 

# Let's look closer on params

## 'boosting_type': 'gbdt'
# we have 'goss' option for faster training
# but it normally leads to underfit.
# Also there is good 'dart' mode
# but it takes forever to train
# and model performance depends 
# a lot on random factor 
# https://www.kaggle.com/c/home-credit-default-risk/discussion/60921

## 'objective': 'tweedie'
# Tweedie Gradient Boosting for Extremely
# Unbalanced Zero-inflated Data
# https://arxiv.org/pdf/1811.10192.pdf
# and many more articles about tweediie
#
# Strange (for me) but Tweedie is close in results
# to my own ugly loss.
# My advice here - make OWN LOSS function
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/140564
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/143070
# I think many of you already using it (after poisson kernel appeared) 
# (kagglers are very good with "params" testing and tuning).
# Try to figure out why Tweedie works.
# probably it will show you new features options
# or data transformation (Target transformation?).

## 'tweedie_variance_power': 1.1
# default = 1.5
# set this closer to 2 to shift towards a Gamma distribution
# set this closer to 1 to shift towards a Poisson distribution
# my CV shows 1.1 is optimal 
# but you can make your own choice

## 'metric': 'rmse'
# Doesn't mean anything to us
# as competition metric is different
# and we don't use early stoppings here.
# So rmse serves just for general 
# model performance overview.
# Also we use "fake" validation set
# (as it makes part of the training set)
# so even general rmse score doesn't mean anything))
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834

## 'subsample': 0.5
# Serves to fight with overfit
# this will randomly select part of data without resampling
# Chosen by CV (my CV can be wrong!)
# Next kernel will be about CV

##'subsample_freq': 1
# frequency for bagging
# default value - seems ok

## 'learning_rate': 0.03
# Chosen by CV
# Smaller - longer training
# but there is an option to stop 
# in "local minimum"
# Bigger - faster training
# but there is a chance to
# not find "global minimum" minimum

## 'num_leaves': 2**11-1
## 'min_data_in_leaf': 2**12-1
# Force model to use more features
# We need it to reduce "recursive"
# error impact.
# Also it leads to overfit
# that's why we use small 

# 'max_bin': 100
## l1, l2 regularizations
# https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
# Good tiny explanation
# l2 can work with bigger num_leaves
# but my CV doesn't show boost
                    
## 'n_estimators': 1400
# CV shows that there should be
# different values for each state/store.
# Current value was chosen 
# for general purpose.
# As we don't use any early stopings
# careful to not overfit Public LB.

##'feature_fraction': 0.5
# LightGBM will randomly select 
# part of features on each iteration (tree).
# We have maaaany features
# and many of them are "duplicates"
# and many just "noise"
# good values here - 0.5-0.7 (by CV)

## 'boost_from_average': False
# There is some "problem"
# to code boost_from_average for 
# custom loss
# 'True' makes training faster
# BUT carefull use it
# https://github.com/microsoft/LightGBM/issues/1514

In [193]:
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913               # End day of our train set
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = True               # Use or not pretrained models

#FEATURES to remove
## These features lead to overfit
## or values not present in test set
remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std'] 

#PATHS for Features
BASE     = 'grid_part_1.pkl'
PRICE    = 'grid_part_2.pkl'
CALENDAR = 'grid_part_3.pkl'
LAGS     = 'lags_df_28.pkl'
MEAN_ENC = 'mean_encoding_df.pkl'


# AUX(pretrained) Models paths

#STORES ids
STORES_IDS = pd.read_csv('sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())


#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = [col for col in range(SHIFT_DAY,SHIFT_DAY+N_LAGS)]
ROLS_SPLIT = []
for i in [1,7,14]:
    for j in [7,14,30,60]:
        ROLS_SPLIT.append([i,j])

In [8]:
for store_id in STORES_IDS:
    print('Train', store_id)
    
    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)
    
    # Masks for 
    # Train (All data less than 1913)
    # "Validation" (Last 28 days - not real validation set)
    # Test (All data greater than 1913 day, 
    #       with some gap for recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns], 
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')
    
    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], 
                       label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    print(grid_df.head())
    print(grid_df.tail())
    print(grid_df.info())
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
    !rm train_data.bin
    del train_data, valid_data, estimator
    gc.collect()
    
    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
                              id     d  sales        item_id    dept_id  \
0  HOBBIES_1_001_CA_1_validation  1814    0.0  HOBBIES_1_001  HOBBIES_1   
1  HOBBIES_1_002_CA_1_validation  1814    0.0  HOBBIES_1_002  HOBBIES_1   
2  HOBBIES_1_003_CA_1_validation  1814    0.0  HOBBIES_1_003  HOBBIES_1   
3  HOBBIES_1_004_CA_1_validation  1814    3.0  HOBBIES_1_004  HOBBIES_1   
4  HOBBIES_1_005_CA_1_validation  1814    0.0  HOBBIES_1_005  HOBBIES_1   

    cat_id  release  sell_price  price_max  price_min  ...  rolling_mean_7  \
0  HOBBIES      224    8.257812   9.578125   8.257812  ...        0.714355   
1  HOBBIES       20    3.970703   3.970703   3.970703  ...        1.428711   
2  HOBBIES      300    2.970703   2.970703   2.970703  ...        1.857422   
3  HOBBIES        5    4.640625   4.640625   4.339844  ...        1.713867   
4  HOBBIES       16    2.880859   3.080078   2.480469  ...        2.572266   

   rolling_std_7  rolling_mean_14  rolling_std_14  rolling_mean_30  \

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390216 entries, 0 to 390215
Data columns (total 63 columns):
id                  390216 non-null category
d                   390216 non-null int16
sales               304844 non-null float64
item_id             390216 non-null category
dept_id             390216 non-null category
cat_id              390216 non-null category
release             390216 non-null int16
sell_price          390216 non-null float16
price_max           390216 non-null float16
price_min           390216 non-null float16
price_std           390216 non-null float16
price_mean          390216 non-null float16
price_norm          390216 non-null float16
price_nunique       390216 non-null float16
item_nunique        390216 non-null int16
price_momentum      390195 non-null float16
price_momentum_m    390216 non-null float16
price_momentum_y    390216 non-null float16
event_name_1        39631 non-null category
event_type_1        39631 non-null category
event_name_

[100]	valid_0's rmse: 2.5055
[200]	valid_0's rmse: 2.46208
[300]	valid_0's rmse: 2.43993
[400]	valid_0's rmse: 2.42454
[500]	valid_0's rmse: 2.41549
[600]	valid_0's rmse: 2.40771
[700]	valid_0's rmse: 2.40195
[800]	valid_0's rmse: 2.39547
[900]	valid_0's rmse: 2.38985
[1000]	valid_0's rmse: 2.38472
[1100]	valid_0's rmse: 2.38037
[1200]	valid_0's rmse: 2.37576
[1300]	valid_0's rmse: 2.37126
[1400]	valid_0's rmse: 2.36679
Train CA_4
                              id     d  sales        item_id    dept_id  \
0  HOBBIES_1_001_CA_4_validation  1814    1.0  HOBBIES_1_001  HOBBIES_1   
1  HOBBIES_1_002_CA_4_validation  1814    0.0  HOBBIES_1_002  HOBBIES_1   
2  HOBBIES_1_003_CA_4_validation  1814    0.0  HOBBIES_1_003  HOBBIES_1   
3  HOBBIES_1_004_CA_4_validation  1814    0.0  HOBBIES_1_004  HOBBIES_1   
4  HOBBIES_1_005_CA_4_validation  1814    0.0  HOBBIES_1_005  HOBBIES_1   

    cat_id  release  sell_price  price_max  price_min  ...  rolling_mean_7  \
0  HOBBIES      224    8.257812   9.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390272 entries, 0 to 390271
Data columns (total 63 columns):
id                  390272 non-null category
d                   390272 non-null int16
sales               304900 non-null float64
item_id             390272 non-null category
dept_id             390272 non-null category
cat_id              390272 non-null category
release             390272 non-null int16
sell_price          390272 non-null float16
price_max           390272 non-null float16
price_min           390272 non-null float16
price_std           390272 non-null float16
price_mean          390272 non-null float16
price_norm          390272 non-null float16
price_nunique       390272 non-null float16
item_nunique        390272 non-null int16
price_momentum      390272 non-null float16
price_momentum_m    390272 non-null float16
price_momentum_y    390272 non-null float16
event_name_1        39637 non-null category
event_type_1        39637 non-null category
event_name_

[100]	valid_0's rmse: 1.71833
[200]	valid_0's rmse: 1.69794
[300]	valid_0's rmse: 1.68458
[400]	valid_0's rmse: 1.67864
[500]	valid_0's rmse: 1.67315
[600]	valid_0's rmse: 1.66851
[700]	valid_0's rmse: 1.66399
[800]	valid_0's rmse: 1.66003
[900]	valid_0's rmse: 1.65663
[1000]	valid_0's rmse: 1.65273
[1100]	valid_0's rmse: 1.64889
[1200]	valid_0's rmse: 1.6459
[1300]	valid_0's rmse: 1.64224
[1400]	valid_0's rmse: 1.63887
Train TX_3
                              id     d  sales        item_id    dept_id  \
0  HOBBIES_1_001_TX_3_validation  1814    0.0  HOBBIES_1_001  HOBBIES_1   
1  HOBBIES_1_002_TX_3_validation  1814    0.0  HOBBIES_1_002  HOBBIES_1   
2  HOBBIES_1_003_TX_3_validation  1814    0.0  HOBBIES_1_003  HOBBIES_1   
3  HOBBIES_1_004_TX_3_validation  1814    1.0  HOBBIES_1_004  HOBBIES_1   
4  HOBBIES_1_005_TX_3_validation  1814    1.0  HOBBIES_1_005  HOBBIES_1   

    cat_id  release  sell_price  price_max  price_min  ...  rolling_mean_7  \
0  HOBBIES      226    8.257812   8.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390244 entries, 0 to 390243
Data columns (total 63 columns):
id                  390244 non-null category
d                   390244 non-null int16
sales               304872 non-null float64
item_id             390244 non-null category
dept_id             390244 non-null category
cat_id              390244 non-null category
release             390244 non-null int16
sell_price          390244 non-null float16
price_max           390244 non-null float16
price_min           390244 non-null float16
price_std           390244 non-null float16
price_mean          390244 non-null float16
price_norm          390244 non-null float16
price_nunique       390244 non-null float16
item_nunique        390244 non-null int16
price_momentum      390237 non-null float16
price_momentum_m    390244 non-null float16
price_momentum_y    390244 non-null float16
event_name_1        39634 non-null category
event_type_1        39634 non-null category
event_name_

[100]	valid_0's rmse: 2.69105
[200]	valid_0's rmse: 2.58171
[300]	valid_0's rmse: 2.54574
[400]	valid_0's rmse: 2.52407
[500]	valid_0's rmse: 2.50655
[600]	valid_0's rmse: 2.49078
[700]	valid_0's rmse: 2.47712
[800]	valid_0's rmse: 2.46442
[900]	valid_0's rmse: 2.45332
[1000]	valid_0's rmse: 2.44306
[1100]	valid_0's rmse: 2.43294
[1200]	valid_0's rmse: 2.42588
[1300]	valid_0's rmse: 2.41622
[1400]	valid_0's rmse: 2.40552
Train WI_3
                              id     d  sales        item_id    dept_id  \
0  HOBBIES_1_001_WI_3_validation  1814    2.0  HOBBIES_1_001  HOBBIES_1   
1  HOBBIES_1_002_WI_3_validation  1814    0.0  HOBBIES_1_002  HOBBIES_1   
2  HOBBIES_1_003_WI_3_validation  1814    1.0  HOBBIES_1_003  HOBBIES_1   
3  HOBBIES_1_004_WI_3_validation  1814    3.0  HOBBIES_1_004  HOBBIES_1   
4  HOBBIES_1_005_WI_3_validation  1814    3.0  HOBBIES_1_005  HOBBIES_1   

    cat_id  release  sell_price  price_max  price_min  ...  rolling_mean_7  \
0  HOBBIES      227    8.257812   8

In [9]:
# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds.head()
all_preds.tail()

Predict | Day: 1
##########  0.77 min round |  0.77 min total |  37308.80 day sales |
Predict | Day: 2
##########  0.76 min round |  1.53 min total |  35335.42 day sales |
Predict | Day: 3
##########  0.74 min round |  2.27 min total |  34783.90 day sales |
Predict | Day: 4
##########  0.74 min round |  3.01 min total |  35285.85 day sales |
Predict | Day: 5
##########  0.75 min round |  3.76 min total |  41724.47 day sales |
Predict | Day: 6
##########  0.74 min round |  4.50 min total |  50966.54 day sales |
Predict | Day: 7
##########  0.74 min round |  5.24 min total |  53580.33 day sales |
Predict | Day: 8
##########  0.76 min round |  6.00 min total |  44119.60 day sales |
Predict | Day: 9
##########  0.74 min round |  6.74 min total |  44431.43 day sales |
Predict | Day: 10
##########  0.75 min round |  7.49 min total |  38864.02 day sales |
Predict | Day: 11
##########  0.75 min round |  8.24 min total |  40720.81 day sales |
Predict | Day: 12
##########  0.75 min round |  8.99

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_validation,0.373569,0.341489,0.343651,0.330851,0.378356,0.431511,0.42312,0.463942,0.487156,...,0.406368,0.521859,0.573814,0.373389,0.333836,0.301846,0.283167,0.321126,0.396755,0.410307
30486,FOODS_3_824_WI_3_validation,0.304983,0.266182,0.243182,0.240461,0.294768,0.369417,0.331245,0.434473,0.442417,...,0.339828,0.490687,0.559283,0.36313,0.275342,0.245768,0.248248,0.271593,0.345999,0.36134
30487,FOODS_3_825_WI_3_validation,0.674252,0.539741,0.48385,0.46328,0.636023,0.735115,0.909188,1.16686,1.090199,...,1.004481,1.54184,1.688544,1.077601,0.795546,0.742795,0.649474,0.735476,0.864346,0.916464
30488,FOODS_3_826_WI_3_validation,0.944403,0.93521,0.80811,0.792187,0.94941,1.160681,1.16634,1.236102,1.229442,...,0.986986,1.370988,1.43766,1.000928,0.884545,0.845057,0.827194,0.98056,1.152899,1.231655
30489,FOODS_3_827_WI_3_validation,0.239573,1.224351,1.22415,2.056664,2.718922,2.7482,2.40568,2.231635,2.395505,...,1.971739,2.425102,2.260457,1.649259,1.674562,1.553669,1.488599,1.722757,2.069605,1.927087


In [11]:
# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv('sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v'+str(VER)+'.csv', index=False) #0.47388

In [None]:
# next step:

# Improvement should come from:
# bayesian optimization: store_id
# Loss function: rmse-wrmse
# Stable CV: by week-year
# Good features reduction strategy
# Predictions stabilization with NN
# Trend prediction

In [6]:
#bayesian optimization
# CA_1
grid_df, features_columns = get_data_by_store('CA_1')

In [7]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4788267 entries, 0 to 4788266
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [13]:
grid_df.head()

Unnamed: 0,id,d,sales,item_id,dept_id,cat_id,release,sell_price,price_max,price_min,...,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
0,HOBBIES_1_008_CA_1_validation,1,12.0,HOBBIES_1_008,HOBBIES_1,HOBBIES,0,0.459961,0.5,0.419922,...,,,,,,,,,,
1,HOBBIES_1_009_CA_1_validation,1,2.0,HOBBIES_1_009,HOBBIES_1,HOBBIES,0,1.55957,1.769531,1.55957,...,,,,,,,,,,
2,HOBBIES_1_010_CA_1_validation,1,0.0,HOBBIES_1_010,HOBBIES_1,HOBBIES,0,3.169922,3.169922,2.970703,...,,,,,,,,,,
3,HOBBIES_1_012_CA_1_validation,1,0.0,HOBBIES_1_012,HOBBIES_1,HOBBIES,0,5.980469,6.519531,5.980469,...,,,,,,,,,,
4,HOBBIES_1_015_CA_1_validation,1,4.0,HOBBIES_1_015,HOBBIES_1,HOBBIES,0,0.700195,0.720215,0.680176,...,,,,,,,,,,


In [14]:
grid_df.tail()

Unnamed: 0,id,d,sales,item_id,dept_id,cat_id,release,sell_price,price_max,price_min,...,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
4788262,FOODS_3_823_CA_1_validation,1941,,FOODS_3_823,FOODS_3,FOODS,127,2.980469,2.980469,2.480469,...,,,,,,,,,,
4788263,FOODS_3_824_CA_1_validation,1941,,FOODS_3_824,FOODS_3,FOODS,0,2.480469,2.679688,2.470703,...,,,,,,,,,,
4788264,FOODS_3_825_CA_1_validation,1941,,FOODS_3_825,FOODS_3,FOODS,1,3.980469,4.378906,3.980469,...,,,,,,,,,,
4788265,FOODS_3_826_CA_1_validation,1941,,FOODS_3_826,FOODS_3,FOODS,211,1.280273,1.280273,1.280273,...,,,,,,,,,,
4788266,FOODS_3_827_CA_1_validation,1941,,FOODS_3_827,FOODS_3,FOODS,403,1.0,1.0,1.0,...,,,,,,,,,,


In [8]:
features_columns

['item_id',
 'dept_id',
 'cat_id',
 'release',
 'sell_price',
 'price_max',
 'price_min',
 'price_std',
 'price_mean',
 'price_norm',
 'price_nunique',
 'item_nunique',
 'price_momentum',
 'price_momentum_m',
 'price_momentum_y',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI',
 'tm_d',
 'tm_w',
 'tm_m',
 'tm_y',
 'tm_wm',
 'tm_dw',
 'tm_w_end',
 'enc_cat_id_mean',
 'enc_cat_id_std',
 'enc_dept_id_mean',
 'enc_dept_id_std',
 'enc_item_id_mean',
 'enc_item_id_std',
 'sales_lag_28',
 'sales_lag_29',
 'sales_lag_30',
 'sales_lag_31',
 'sales_lag_32',
 'sales_lag_33',
 'sales_lag_34',
 'sales_lag_35',
 'sales_lag_36',
 'sales_lag_37',
 'sales_lag_38',
 'sales_lag_39',
 'sales_lag_40',
 'sales_lag_41',
 'sales_lag_42',
 'rolling_mean_7',
 'rolling_std_7',
 'rolling_mean_14',
 'rolling_std_14',
 'rolling_mean_30',
 'rolling_std_30',
 'rolling_mean_60',
 'rolling_std_60',
 'rolling_mean_180',
 'rolling_std_180',
 'rolling_mean_tmp_1_7',
 

In [23]:
END_TRAIN-P_HORIZON #1913-28

1885

In [72]:
END_TRAIN

1913

In [73]:
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)

In [74]:
grid_df[train_mask][features_columns].shape

(4617523, 72)

In [75]:
grid_df[train_mask][TARGET].shape

(4617523,)

In [76]:
grid_df[valid_mask][features_columns].shape

(85372, 72)

In [77]:
grid_df[valid_mask][TARGET].shape

(85372,)

In [78]:
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])

In [96]:
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

In [97]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
from math import sqrt

In [98]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    tweedie_variance_power,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': tweedie_variance_power,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse

In [100]:
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9),
    'tweedie_variance_power': (1, 2),   
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}

In [101]:
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [102]:
print(LGB_BO.space.keys)

['bagging_fraction', 'bagging_freq', 'colsample_bytree', 'max_bin', 'max_depth', 'min_data_in_leaf', 'min_split_gain', 'num_leaves', 'reg_alpha', 'reg_lambda', 'tweedie_variance_power']


In [103]:
init_points = 3
n_iter = 7

In [104]:
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... | tweedi... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.96584	valid_1's rmse: 2.17239
[200]	training's rmse: 2.69563	valid_1's rmse: 2.05824
[300]	training's rmse: 2.60735	valid_1's rmse: 2.04816
[400]	training's rmse: 2.5637	valid_1's rmse: 2.04004
[500]	training's rmse: 2.53777	valid_1's rmse: 2.03699
Early stopping, best iteration is:
[457]	training's rmse: 2.54956	valid_1's rmse: 2.03638
| [0m 1       [0m | [0m-2.036   [0m | [0m 0.3492  [0m | [0m 14.38   [0m | [0m 0.1043  [0m | [0m 183.1   [0m | [0m

In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... | tweedi... |
|  8        | -2.024    |  0.8807   |  3.052    |  0.8889   |  115.7    |  7.263    |  3.016e+0 |  0.7596   |  1.018e+0 |  0.7816   |  0.5243   |  1.345    |

In [106]:
del train_data, valid_data, X_valid, y_valid

In [107]:
gc.collect()

637

In [108]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.345,
                    'metric': 'rmse',
                    'subsample': 0.8807,
                    'subsample_freq': 3,
                    'learning_rate': 0.03,
                    'num_leaves': 1018, 
                    'min_data_in_leaf': 3016, 
                    'feature_fraction': 0.8889,
                    'max_bin': 115,
                    'n_estimators': 1300,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':7,
                    'min_split_gain':0.7596,
                    'reg_alpha':0.7816,
                    'reg_lambda':0.5243,
                } 

In [109]:
store_id = 'CA_1'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.93576
#100: 2.02701

[100]	valid_0's rmse: 2.05702
[200]	valid_0's rmse: 2.02822
[300]	valid_0's rmse: 2.02647
[400]	valid_0's rmse: 2.02562
[500]	valid_0's rmse: 2.02493
[600]	valid_0's rmse: 2.02443
[700]	valid_0's rmse: 2.02385
[800]	valid_0's rmse: 2.02292
[900]	valid_0's rmse: 2.02236
[1000]	valid_0's rmse: 2.02229
[1100]	valid_0's rmse: 2.02208
[1200]	valid_0's rmse: 2.02166
[1300]	valid_0's rmse: 2.02141


In [119]:
# CA_2
grid_df, features_columns = get_data_by_store('CA_2')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4361148 entries, 0 to 4361147
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [123]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [124]:
print(LGB_BO.space.keys)

['bagging_fraction', 'bagging_freq', 'colsample_bytree', 'max_bin', 'max_depth', 'min_data_in_leaf', 'min_split_gain', 'num_leaves', 'reg_alpha', 'reg_lambda']


In [125]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.16031	valid_1's rmse: 2.00461
[200]	training's rmse: 2.09438	valid_1's rmse: 1.92762
[300]	training's rmse: 2.06843	valid_1's rmse: 1.90423
[400]	training's rmse: 2.05283	valid_1's rmse: 1.89178
[500]	training's rmse: 2.04255	valid_1's rmse: 1.88507
[600]	training's rmse: 2.03561	valid_1's rmse: 1.88174
[700]	training's rmse: 2.02934	valid_1's rmse: 1.87792
[800]	training's rmse: 2.02377	valid_1's rmse: 1.87501
[900]	training's rmse: 2.01995	valid_1's rmse: 1.87264
[1000]	training's r

[1400]	training's rmse: 1.95906	valid_1's rmse: 1.85692
[1500]	training's rmse: 1.95566	valid_1's rmse: 1.85686
[1600]	training's rmse: 1.95211	valid_1's rmse: 1.85628
[1700]	training's rmse: 1.94847	valid_1's rmse: 1.85649
Early stopping, best iteration is:
[1643]	training's rmse: 1.95055	valid_1's rmse: 1.85621
| [95m 6       [0m | [95m-1.856   [0m | [95m 0.5212  [0m | [95m 5.112   [0m | [95m 0.1614  [0m | [95m 298.3   [0m | [95m 0.3673  [0m | [95m 5.096e+0[0m | [95m 0.7937  [0m | [95m 1.014e+0[0m | [95m 0.2466  [0m | [95m 0.8621  [0m |
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.10025	valid_1's rmse: 1.95437
[200]	training's rmse: 2.05984	valid_1's rmse: 1.89991
[300]	training's rmse: 2.05205	valid_1's rmse: 1.89198
[400]	training's rmse: 2.04626	valid_1's rmse: 1.88706
[500]	training's rmse: 2.04215	valid_1's rmse: 1.8846
[600]	training's rmse: 2.03963	valid_1's rmse: 1.88193
[700]	training's rmse: 2.03716	valid_1'

In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
------------------------------
|  6        | -1.856    |  0.5212   |  5.112    |  0.1614   |  298.3    |  0.3673   |  5.096e+0 |  0.7937   |  1.014e+0 |  0.2466   |  0.8621   |

In [126]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

726

In [131]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.5212,
                    'subsample_freq': 5,
                    'learning_rate': 0.03,
                    'num_leaves': 1014, 
                    'min_data_in_leaf': 5096, 
                    'feature_fraction': 0.1614,
                    'max_bin': 298,
                    'n_estimators': 1700,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':0,
                    'min_split_gain':0.7937,
                    'reg_alpha':0.2466,
                    'reg_lambda':0.8621,
                } 

In [132]:
grid_df, features_columns = get_data_by_store('CA_2')
store_id = 'CA_2'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.78449
#100:1.89284

[100]	valid_0's rmse: 1.95079
[200]	valid_0's rmse: 1.88472
[300]	valid_0's rmse: 1.86512
[400]	valid_0's rmse: 1.85509
[500]	valid_0's rmse: 1.84702
[600]	valid_0's rmse: 1.8418
[700]	valid_0's rmse: 1.83643
[800]	valid_0's rmse: 1.83189
[900]	valid_0's rmse: 1.82839
[1000]	valid_0's rmse: 1.82515
[1100]	valid_0's rmse: 1.82187
[1200]	valid_0's rmse: 1.81818
[1300]	valid_0's rmse: 1.81514
[1400]	valid_0's rmse: 1.81263
[1500]	valid_0's rmse: 1.81043
[1600]	valid_0's rmse: 1.80831
[1700]	valid_0's rmse: 1.80575


In [133]:
grid_df, features_columns = get_data_by_store('CA_3')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4757313 entries, 0 to 4757312
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [137]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [138]:
print(LGB_BO.space.keys)

['bagging_fraction', 'bagging_freq', 'colsample_bytree', 'max_bin', 'max_depth', 'min_data_in_leaf', 'min_split_gain', 'num_leaves', 'reg_alpha', 'reg_lambda']


In [139]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.70364	valid_1's rmse: 2.55534
[200]	training's rmse: 3.51676	valid_1's rmse: 2.55846
Early stopping, best iteration is:
[105]	training's rmse: 3.68429	valid_1's rmse: 2.5538
| [0m 1       [0m | [0m-2.554   [0m | [0m 0.3714  [0m | [0m 10.4    [0m | [0m 0.7481  [0m | [0m 163.8   [0m | [0m 6.045   [0m | [0m 3.586e+0[0m | [0m 0.1033  [0m | [0m 2.709e+0[0m | [0m 0.7466  [0m | [0m 0.1662  [0m |
Training until validation scores don't improve for 100 rounds.
[100]	trai

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.97497	valid_1's rmse: 2.59331
[200]	training's rmse: 3.69989	valid_1's rmse: 2.65561
Early stopping, best iteration is:
[111]	training's rmse: 3.89801	valid_1's rmse: 2.57805
| [0m 10      [0m | [0m-2.578   [0m | [0m 0.1976  [0m | [0m 18.8    [0m | [0m 0.5336  [0m | [0m 297.7   [0m | [0m 4.085   [0m | [0m 5.987e+0[0m | [0m 0.1706  [0m | [0m 2.991e+0[0m | [0m 0.3584  [0m | [0m 0.105   [0m |


In [140]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

292

In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |

|  9        | -2.463    |  0.7422   |  2.388    |  0.7504   |  299.2    |  8.297    |  4.621e+0 |  0.3224   |  1.342e+0 |  0.1614   |  0.6339   |

In [141]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.7422,
                    'subsample_freq': 2,
                    'learning_rate': 0.03,
                    'num_leaves': 1342, 
                    'min_data_in_leaf': 4621, 
                    'feature_fraction': 0.7504,
                    'max_bin': 299,
                    'n_estimators': 1400,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':8,
                    'min_split_gain':0.3224,
                    'reg_alpha':0.1614,
                    'reg_lambda':0.6339,
                } 

In [142]:
grid_df, features_columns = get_data_by_store('CA_3')
store_id = 'CA_3'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#2.36679
#100:2.5055

[100]	valid_0's rmse: 2.52203
[200]	valid_0's rmse: 2.49136
[300]	valid_0's rmse: 2.46984
[400]	valid_0's rmse: 2.46083
[500]	valid_0's rmse: 2.45542
[600]	valid_0's rmse: 2.45215
[700]	valid_0's rmse: 2.44896
[800]	valid_0's rmse: 2.44647
[900]	valid_0's rmse: 2.44459
[1000]	valid_0's rmse: 2.44284
[1100]	valid_0's rmse: 2.44105
[1200]	valid_0's rmse: 2.44029
[1300]	valid_0's rmse: 2.43914
[1400]	valid_0's rmse: 2.4384


In [143]:
# CA_4
grid_df, features_columns = get_data_by_store('CA_4')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4652558 entries, 0 to 4652557
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [144]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [145]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.54042	valid_1's rmse: 1.34456
[200]	training's rmse: 1.51072	valid_1's rmse: 1.3379
[300]	training's rmse: 1.5005	valid_1's rmse: 1.33624
[400]	training's rmse: 1.49378	valid_1's rmse: 1.33476
[500]	training's rmse: 1.48939	valid_1's rmse: 1.33395
[600]	training's rmse: 1.48607	valid_1's rmse: 1.33319
[700]	training's rmse: 1.48317	valid_1's rmse: 1.33278
[800]	training's rmse: 1.48107	valid_1's rmse: 1.33248
[900]	training's rmse: 1.47857	valid_1's rmse: 1.33234
[1000]	training's rms

[1200]	training's rmse: 1.50419	valid_1's rmse: 1.33885
[1300]	training's rmse: 1.5032	valid_1's rmse: 1.33852
[1400]	training's rmse: 1.50195	valid_1's rmse: 1.33815
[1500]	training's rmse: 1.501	valid_1's rmse: 1.33801
[1600]	training's rmse: 1.50014	valid_1's rmse: 1.33779
[1700]	training's rmse: 1.49925	valid_1's rmse: 1.33764
[1800]	training's rmse: 1.4984	valid_1's rmse: 1.33733
[1900]	training's rmse: 1.49774	valid_1's rmse: 1.33727
[2000]	training's rmse: 1.49693	valid_1's rmse: 1.33703
[2100]	training's rmse: 1.49609	valid_1's rmse: 1.33689
Early stopping, best iteration is:
[2077]	training's rmse: 1.4963	valid_1's rmse: 1.33685
| [0m 10      [0m | [0m-1.337   [0m | [0m 0.8469  [0m | [0m 16.59   [0m | [0m 0.1789  [0m | [0m 281.3   [0m | [0m 3.993   [0m | [0m 5.976e+0[0m | [0m 0.7909  [0m | [0m 1.002e+0[0m | [0m 0.2905  [0m | [0m 0.4242  [0m |


In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
|  7        | -1.331    |  0.3801   |  4.931    |  0.2262   |  299.5    |  10.99    |  4.055e+0 |  0.8417   |  1.011e+0 |  0.1893   |  0.3858   |

In [146]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

425

In [147]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.3801,
                    'subsample_freq': 4,
                    'learning_rate': 0.03,
                    'num_leaves': 1011, 
                    'min_data_in_leaf': 4055, 
                    'feature_fraction': 0.2262,
                    'max_bin': 299,
                    'n_estimators': 1600,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':10,
                    'min_split_gain':0.8417,
                    'reg_alpha': 0.1893,
                    'reg_lambda':0.3858,
                } 

In [148]:
grid_df, features_columns = get_data_by_store('CA_4')
store_id = 'CA_4'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.28865
#100:1.3326

[100]	valid_0's rmse: 1.33982
[200]	valid_0's rmse: 1.33359
[300]	valid_0's rmse: 1.33138
[400]	valid_0's rmse: 1.32898
[500]	valid_0's rmse: 1.32795
[600]	valid_0's rmse: 1.32686
[700]	valid_0's rmse: 1.3261
[800]	valid_0's rmse: 1.32499
[900]	valid_0's rmse: 1.32439
[1000]	valid_0's rmse: 1.32361
[1100]	valid_0's rmse: 1.323
[1200]	valid_0's rmse: 1.3222
[1300]	valid_0's rmse: 1.32154
[1400]	valid_0's rmse: 1.32092
[1500]	valid_0's rmse: 1.32019
[1600]	valid_0's rmse: 1.31947


In [149]:
# TX_1
grid_df, features_columns = get_data_by_store('TX_1')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4797955 entries, 0 to 4797954
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [150]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [151]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.23722	valid_1's rmse: 1.66354
[200]	training's rmse: 2.16178	valid_1's rmse: 1.63279
[300]	training's rmse: 2.12833	valid_1's rmse: 1.62103
[400]	training's rmse: 2.10818	valid_1's rmse: 1.61476
[500]	training's rmse: 2.09568	valid_1's rmse: 1.6104
[600]	training's rmse: 2.08511	valid_1's rmse: 1.60665
[700]	training's rmse: 2.07579	valid_1's rmse: 1.60363
[800]	training's rmse: 2.06958	valid_1's rmse: 1.60171
[900]	training's rmse: 2.06331	valid_1's rmse: 1.60045
[1000]	training's rm

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.13491	valid_1's rmse: 1.60854
[200]	training's rmse: 2.09375	valid_1's rmse: 1.59751
[300]	training's rmse: 2.08115	valid_1's rmse: 1.5956
[400]	training's rmse: 2.07342	valid_1's rmse: 1.59437
[500]	training's rmse: 2.06666	valid_1's rmse: 1.5934
[600]	training's rmse: 2.06007	valid_1's rmse: 1.59344
Early stopping, best iteration is:
[534]	training's rmse: 2.06458	valid_1's rmse: 1.59313
| [95m 5       [0m | [95m-1.593   [0m | [95m 0.7015  [0m | [95m 12.96   [0m | [95m 0.4496  [0m | [95m 112.5   [0m | [95m 6.688   [0m | [95m 3.016e+0[0m | [95m 0.1475  [0m | [95m 1.009e+0[0m | [95m 0.8076  [0m | [95m 0.88    [0m |
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.14747	valid_1's rmse: 1.61588
[200]	training's rmse: 2.08973	valid_1's rmse: 1.60501
[300]	training's rmse: 2.07572	valid_1's rmse: 1.60248
[400]	training's rmse: 2.06582	valid_

In [152]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

60

In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
|  5        | -1.593    |  0.7015   |  12.96    |  0.4496   |  112.5    |  6.688    |  3.016e+0 |  0.1475   |  1.009e+0 |  0.8076   |  0.88     |

In [153]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.7015 ,
                    'subsample_freq': 12,
                    'learning_rate': 0.03,
                    'num_leaves': 1099, 
                    'min_data_in_leaf': 3016, 
                    'feature_fraction':  0.4496,
                    'max_bin': 112,
                    'n_estimators': 600,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':6,
                    'min_split_gain':0.1475,
                    'reg_alpha': 0.8076,
                    'reg_lambda': 0.88,
                } 

In [154]:
grid_df, features_columns = get_data_by_store('TX_1')
store_id = 'TX_1'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.5354
#100:1.60744

[100]	valid_0's rmse: 1.59904
[200]	valid_0's rmse: 1.58891
[300]	valid_0's rmse: 1.58737
[400]	valid_0's rmse: 1.58611
[500]	valid_0's rmse: 1.58533
[600]	valid_0's rmse: 1.58475


In [155]:
# TX_2
grid_df, features_columns = get_data_by_store('TX_2')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4807881 entries, 0 to 4807880
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [156]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [157]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.53618	valid_1's rmse: 1.7273
[200]	training's rmse: 2.46001	valid_1's rmse: 1.72316
[300]	training's rmse: 2.44144	valid_1's rmse: 1.71919
[400]	training's rmse: 2.4307	valid_1's rmse: 1.71792
[500]	training's rmse: 2.41913	valid_1's rmse: 1.71542
[600]	training's rmse: 2.41131	valid_1's rmse: 1.71448
Early stopping, best iteration is:
[549]	training's rmse: 2.41473	valid_1's rmse: 1.71396
| [0m 1       [0m | [0m-1.714   [0m | [0m 0.7902  [0m | [0m 16.11   [0m | [0m 0.6931  

In [158]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

229

In [None]:
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
|  4        | -1.712    |  0.8611   |  13.63    |  0.8275   |  298.4    |  0.7706   |  3.021e+0 |  0.6468   |  2.962e+0 |  0.2014   |  0.8631   |

In [159]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.8611 ,
                    'subsample_freq': 13,
                    'learning_rate': 0.03,
                    'num_leaves': 2962, 
                    'min_data_in_leaf': 3021, 
                    'feature_fraction': 0.8275,
                    'max_bin': 298,
                    'n_estimators': 150,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':0,
                    'min_split_gain':0.6468,
                    'reg_alpha': 0.2014,
                    'reg_lambda':  0.8631,
                }

In [160]:
grid_df, features_columns = get_data_by_store('TX_2')
store_id = 'TX_2'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#.163887
#1.71833

[100]	valid_0's rmse: 1.69645


In [161]:
# TX_3
grid_df, features_columns = get_data_by_store('TX_3')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4737167 entries, 0 to 4737166
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [162]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [163]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.35054	valid_1's rmse: 1.73475
[200]	training's rmse: 2.22485	valid_1's rmse: 1.73453
Early stopping, best iteration is:
[106]	training's rmse: 2.32758	valid_1's rmse: 1.73315
| [0m 1       [0m | [0m-1.733   [0m | [0m 0.1514  [0m | [0m 11.54   [0m | [0m 0.5272  [0m | [0m 278.7   [0m | [0m 7.944   [0m | [0m 5.486e+0[0m | [0m 0.5564  [0m | [0m 1.601e+0[0m | [0m 0.5006  [0m | [0m 0.1111  [0m |
Training until validation scores don't improve for 100 rounds.
[100]	tra

In [164]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

107

In [None]:
|  4        | -1.685    |  0.8975   |  1.521    |  0.2709   |  127.3    | -0.5098   |  5.995e+0 |  0.7792   |  2.992e+0 |  0.14     |  0.3187   |

In [165]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.8975  ,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2992, 
                    'min_data_in_leaf': 5995, 
                    'feature_fraction': 0.2709,
                    'max_bin': 127,
                    'n_estimators': 400,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':0,
                    'min_split_gain':0.7792,
                    'reg_alpha': 0.14 ,
                    'reg_lambda':  0.3187,
                }

In [166]:
grid_df, features_columns = get_data_by_store('TX_3')
store_id = 'TX_3'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.62453
#1.6915

[100]	valid_0's rmse: 1.68803
[200]	valid_0's rmse: 1.67352
[300]	valid_0's rmse: 1.6638
[400]	valid_0's rmse: 1.65826


In [170]:
# WI_1
grid_df, features_columns = get_data_by_store('WI_1')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4560767 entries, 0 to 4560766
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [171]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [172]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.70708	valid_1's rmse: 1.60331
[200]	training's rmse: 1.66748	valid_1's rmse: 1.59281
[300]	training's rmse: 1.64965	valid_1's rmse: 1.59201
[400]	training's rmse: 1.63769	valid_1's rmse: 1.59142
[500]	training's rmse: 1.62792	valid_1's rmse: 1.59082
Early stopping, best iteration is:
[474]	training's rmse: 1.63037	valid_1's rmse: 1.59053
| [0m 1       [0m | [0m-1.591   [0m | [0m 0.7091  [0m | [0m 11.13   [0m | [0m 0.6197  [0m | [0m 175.1   [0m | [0m 0.8421  [0m | [0m 4.

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.71193	valid_1's rmse: 1.60351
[200]	training's rmse: 1.67154	valid_1's rmse: 1.59234
[300]	training's rmse: 1.65547	valid_1's rmse: 1.59086
[400]	training's rmse: 1.64468	valid_1's rmse: 1.58931
[500]	training's rmse: 1.63574	valid_1's rmse: 1.58923
Early stopping, best iteration is:
[485]	training's rmse: 1.63682	valid_1's rmse: 1.58909
| [95m 10      [0m | [95m-1.589   [0m | [95m 0.3897  [0m | [95m 10.74   [0m | [95m 0.6065  [0m | [95m 101.7   [0m | [95m-0.6873  [0m | [95m 3.006e+0[0m | [95m 0.4203  [0m | [95m 1e+03   [0m | [95m 0.384   [0m | [95m 0.562   [0m |


In [173]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

462

In [None]:
|  10       | -1.589    |  0.3897   |  10.74    |  0.6065   |  101.7    | -0.6873   |  3.006e+0 |  0.4203   |  1e+03    |  0.384    |  0.562    |

In [174]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.3897 ,
                    'subsample_freq': 10,
                    'learning_rate': 0.03,
                    'num_leaves': 1000, 
                    'min_data_in_leaf': 3006, 
                    'feature_fraction': 0.6065,
                    'max_bin': 101,
                    'n_estimators': 500,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':0,
                    'min_split_gain':0.4203,
                    'reg_alpha': 0.384,
                    'reg_lambda': 0.562,
                }

In [175]:
grid_df, features_columns = get_data_by_store('WI_1')
store_id = 'WI_1'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#.152818
#100:1.59878

[100]	valid_0's rmse: 1.59786
[200]	valid_0's rmse: 1.57898
[300]	valid_0's rmse: 1.57013
[400]	valid_0's rmse: 1.56394
[500]	valid_0's rmse: 1.55878


In [176]:
# WI_2
grid_df, features_columns = get_data_by_store('WI_2')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4646580 entries, 0 to 4646579
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [177]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [178]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.68086	valid_1's rmse: 2.71022
[200]	training's rmse: 2.63946	valid_1's rmse: 2.63889
[300]	training's rmse: 2.6293	valid_1's rmse: 2.62378
[400]	training's rmse: 2.62124	valid_1's rmse: 2.61617
[500]	training's rmse: 2.6152	valid_1's rmse: 2.61159
[600]	training's rmse: 2.61044	valid_1's rmse: 2.60814
[700]	training's rmse: 2.60625	valid_1's rmse: 2.60451
[800]	training's rmse: 2.60246	valid_1's rmse: 2.60162
[900]	training's rmse: 2.59887	valid_1's rmse: 2.59938
[1000]	training's rms

[400]	training's rmse: 2.56555	valid_1's rmse: 2.5894
[500]	training's rmse: 2.55552	valid_1's rmse: 2.58716
[600]	training's rmse: 2.54729	valid_1's rmse: 2.58679
[700]	training's rmse: 2.53931	valid_1's rmse: 2.58425
[800]	training's rmse: 2.53158	valid_1's rmse: 2.5823
Early stopping, best iteration is:
[794]	training's rmse: 2.53217	valid_1's rmse: 2.58203
| [0m 9       [0m | [0m-2.582   [0m | [0m 0.9     [0m | [0m 1.0     [0m | [0m 0.9     [0m | [0m 100.0   [0m | [0m 12.0    [0m | [0m 4.176e+0[0m | [0m 0.9     [0m | [0m 3e+03   [0m | [0m 0.1     [0m | [0m 0.1     [0m |
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.65951	valid_1's rmse: 2.68838
[200]	training's rmse: 2.6127	valid_1's rmse: 2.61598
[300]	training's rmse: 2.59311	valid_1's rmse: 2.59978
[400]	training's rmse: 2.57923	valid_1's rmse: 2.59401
[500]	training's rmse: 2.56902	valid_1's rmse: 2.59208
[600]	training's rmse: 2.56137	valid_1's rmse: 2.5911
[700

In [179]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

293

In [None]:
|  10       | -2.581    |  0.9      |  1.0      |  0.8677   |  100.0    |  12.0     |  6e+03    |  0.1323   |  2.193e+0 |  0.1      |  0.1      |

In [180]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.9 ,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2193, 
                    'min_data_in_leaf': 6000, 
                    'feature_fraction': 0.8677,
                    'max_bin': 100,
                    'n_estimators': 1300,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':12,
                    'min_split_gain':0.1323,
                    'reg_alpha': 0.1,
                    'reg_lambda': 0.1,
                }

In [181]:
grid_df, features_columns = get_data_by_store('WI_2')
store_id = 'WI_2'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.76026
#100:2.69105

[100]	valid_0's rmse: 2.66479
[200]	valid_0's rmse: 2.57496
[300]	valid_0's rmse: 2.54272
[400]	valid_0's rmse: 2.52461
[500]	valid_0's rmse: 2.51358
[600]	valid_0's rmse: 2.50669
[700]	valid_0's rmse: 2.49751
[800]	valid_0's rmse: 2.49216
[900]	valid_0's rmse: 2.48606
[1000]	valid_0's rmse: 2.48031
[1100]	valid_0's rmse: 2.47645
[1200]	valid_0's rmse: 2.4702
[1300]	valid_0's rmse: 2.46611


In [182]:
# WI_3
grid_df, features_columns = get_data_by_store('WI_3')
print(grid_df.info())
train_mask = grid_df['d']<=(END_TRAIN-P_HORIZON)
valid_mask = (grid_df['d']>(END_TRAIN-P_HORIZON)) & (grid_df['d']<=END_TRAIN)
print(grid_df[train_mask][features_columns].shape)
print(grid_df[valid_mask][features_columns].shape)
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
X_valid=grid_df[valid_mask][features_columns]
y_valid=grid_df[valid_mask][TARGET]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4772041 entries, 0 to 4772040
Data columns (total 75 columns):
id                        category
d                         int16
sales                     float64
item_id                   category
dept_id                   category
cat_id                    category
release                   int16
sell_price                float16
price_max                 float16
price_min                 float16
price_std                 float16
price_mean                float16
price_norm                float16
price_nunique             float16
item_nunique              int16
price_momentum            float16
price_momentum_m          float16
price_momentum_y          float16
event_name_1              category
event_type_1              category
event_name_2              category
event_type_2              category
snap_CA                   category
snap_TX                   category
snap_WI                   category
tm_d                      int8
t

In [183]:
def LGB_bayesian(
    num_leaves, 
    bagging_fraction,
    bagging_freq,
    colsample_bytree,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda,
    min_split_gain,
    max_bin
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)
    bagging_freq=int(bagging_freq)
    max_bin=int(max_bin)
    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    assert type(bagging_freq)==int
    assert type(max_bin)==int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'tweedie_variance_power': 1.1,
              'bagging_fraction' : bagging_fraction,
              'bagging_freq':bagging_freq,
              'colsample_bytree' : colsample_bytree,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'max_bin':max_bin,
              'min_split_gain':min_split_gain,
              'objective': 'tweedie',
              'boosting_type':'gbdt',
              'boost_from_average': False,
              'learning_rate': 0.03,
              'metric':'rmse',
              'verbose': -1}    


    model= lgb.train(param, train_data, num_boost_round = 10000, early_stopping_rounds = 100, valid_sets = [train_data, valid_data], verbose_eval = 100)
    val_pred = model.predict(X_valid, num_iteration=model.best_iteration)  
    
    rmse =sqrt(mean_squared_error(val_pred, y_valid))

    return -rmse
bounds_LGB = {
    'num_leaves': (1000, 3000), 
    'min_data_in_leaf': (3000,6000),
    'bagging_fraction' : (0.1,0.9),
    'bagging_freq':(1,20),
    'max_bin':(100,300),
    'min_split_gain':(0.1,0.9),
    'colsample_bytree' : (0.1,0.9), 
    'reg_alpha': (0.1, 1), 
    'reg_lambda': (0.1, 1),
    'max_depth':(-1,12),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB)

In [184]:
init_points = 3
n_iter = 7
import warnings
warnings.simplefilter('ignore')
print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | baggin... | colsam... |  max_bin  | max_depth | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.53321	valid_1's rmse: 1.97885
[200]	training's rmse: 2.42836	valid_1's rmse: 1.90522
[300]	training's rmse: 2.38866	valid_1's rmse: 1.88852
[400]	training's rmse: 2.36621	valid_1's rmse: 1.88078
[500]	training's rmse: 2.35214	valid_1's rmse: 1.87409
[600]	training's rmse: 2.34267	valid_1's rmse: 1.87066
[700]	training's rmse: 2.33468	valid_1's rmse: 1.86736
[800]	training's rmse: 2.32799	valid_1's rmse: 1.86547
[900]	training's rmse: 2.32226	valid_1's rmse: 1.86358
[1000]	training's r

[900]	training's rmse: 2.32427	valid_1's rmse: 1.87496
[1000]	training's rmse: 2.32184	valid_1's rmse: 1.87402
[1100]	training's rmse: 2.31824	valid_1's rmse: 1.87278
[1200]	training's rmse: 2.31407	valid_1's rmse: 1.86967
[1300]	training's rmse: 2.31097	valid_1's rmse: 1.86868
[1400]	training's rmse: 2.30784	valid_1's rmse: 1.86681
[1500]	training's rmse: 2.30559	valid_1's rmse: 1.86575
[1600]	training's rmse: 2.30249	valid_1's rmse: 1.86499
[1700]	training's rmse: 2.2996	valid_1's rmse: 1.86382
[1800]	training's rmse: 2.29684	valid_1's rmse: 1.86185
[1900]	training's rmse: 2.29409	valid_1's rmse: 1.86059
[2000]	training's rmse: 2.2916	valid_1's rmse: 1.85946
[2100]	training's rmse: 2.28979	valid_1's rmse: 1.85932
[2200]	training's rmse: 2.28718	valid_1's rmse: 1.85784
[2300]	training's rmse: 2.28537	valid_1's rmse: 1.85704
[2400]	training's rmse: 2.28298	valid_1's rmse: 1.85593
[2500]	training's rmse: 2.2808	valid_1's rmse: 1.85484
[2600]	training's rmse: 2.27848	valid_1's rmse: 1.85

[4700]	training's rmse: 2.44476	valid_1's rmse: 1.93033
[4800]	training's rmse: 2.4428	valid_1's rmse: 1.92965
[4900]	training's rmse: 2.44143	valid_1's rmse: 1.92916
[5000]	training's rmse: 2.44012	valid_1's rmse: 1.92866
[5100]	training's rmse: 2.43852	valid_1's rmse: 1.92845
[5200]	training's rmse: 2.43744	valid_1's rmse: 1.92795
[5300]	training's rmse: 2.43602	valid_1's rmse: 1.92773
[5400]	training's rmse: 2.43487	valid_1's rmse: 1.92735
[5500]	training's rmse: 2.43374	valid_1's rmse: 1.9271
[5600]	training's rmse: 2.43217	valid_1's rmse: 1.92704
[5700]	training's rmse: 2.43111	valid_1's rmse: 1.92617
[5800]	training's rmse: 2.4298	valid_1's rmse: 1.9263
Early stopping, best iteration is:
[5720]	training's rmse: 2.43097	valid_1's rmse: 1.92606
| [0m 10      [0m | [0m-1.926   [0m | [0m 0.6814  [0m | [0m 5.737   [0m | [0m 0.1786  [0m | [0m 105.6   [0m | [0m 1.465   [0m | [0m 5.993e+0[0m | [0m 0.393   [0m | [0m 2.892e+0[0m | [0m 0.4791  [0m | [0m 0.4711  [0m |

In [None]:
|  2        | -1.845    |  0.83     |  5.833    |  0.6944   |  270.7    |  7.194    |  4.013e+0 |  0.6251   |  1.352e+0 |  0.6124   |  0.83     |

In [185]:
del train_data, valid_data, X_valid, y_valid
gc.collect()

46

In [186]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample':  0.83 ,
                    'subsample_freq': 5,
                    'learning_rate': 0.03,
                    'num_leaves': 1352, 
                    'min_data_in_leaf': 4013, 
                    'feature_fraction': 0.6944,
                    'max_bin': 270,
                    'n_estimators': 2200,
                    'boost_from_average': False,
                    'verbose': -1,
                    'max_depth':7,
                    'min_split_gain': 0.6251,
                    'reg_alpha': 0.6124,
                    'reg_lambda':  0.83,
                }

In [187]:
grid_df, features_columns = get_data_by_store('WI_3')
store_id = 'WI_3'
train_mask = grid_df['d']<=END_TRAIN
valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
preds_mask = grid_df['d']>(END_TRAIN-100)
    
    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype convertations
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
train_data = lgb.Dataset(grid_df[train_mask][features_columns], label=grid_df[train_mask][TARGET])
train_data.save_binary('train_data.bin')
train_data = lgb.Dataset('train_data.bin')
    
valid_data = lgb.Dataset(grid_df[valid_mask][features_columns], label=grid_df[valid_mask][TARGET])
    
    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively 
grid_df = grid_df[preds_mask].reset_index(drop=True)
keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
grid_df = grid_df[keep_cols]
grid_df.to_pickle('test_'+store_id+'.pkl')
del grid_df
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
seed_everything(SEED)
estimator = lgb.train(lgb_params,train_data,valid_sets = [valid_data],verbose_eval = 100,)
    
    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with, 
    # NULL or <= 0 means use best iteration
model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
pickle.dump(estimator, open(model_name, 'wb'))

    # Remove temporary files and objects 
    # to free some hdd space and ram memory
!rm train_data.bin
del train_data, valid_data, estimator
gc.collect()
# "Keep" models features for predictions
MODEL_FEATURES = features_columns
#1.76026
#100:1.92146

[100]	valid_0's rmse: 1.91709
[200]	valid_0's rmse: 1.87335
[300]	valid_0's rmse: 1.86237
[400]	valid_0's rmse: 1.85703
[500]	valid_0's rmse: 1.85066
[600]	valid_0's rmse: 1.84598
[700]	valid_0's rmse: 1.84332
[800]	valid_0's rmse: 1.83943
[900]	valid_0's rmse: 1.83649
[1000]	valid_0's rmse: 1.835
[1100]	valid_0's rmse: 1.83275
[1200]	valid_0's rmse: 1.83058
[1300]	valid_0's rmse: 1.828
[1400]	valid_0's rmse: 1.82607
[1500]	valid_0's rmse: 1.82547
[1600]	valid_0's rmse: 1.82288
[1700]	valid_0's rmse: 1.8216
[1800]	valid_0's rmse: 1.81999
[1900]	valid_0's rmse: 1.81763
[2000]	valid_0's rmse: 1.81648
[2100]	valid_0's rmse: 1.81538
[2200]	valid_0's rmse: 1.81451


In [188]:
# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with 
# a small part of the training data 
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time 
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1,29):    
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)
        
    for store_id in STORES_IDS:
        
        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin' 
        
        estimator = pickle.load(open(model_path, 'rb'))
        
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)
        store_mask = base_test['store_id']==store_id
        
        mask = (day_mask)&(store_mask)
        base_test[TARGET][mask] = estimator.predict(grid_df[mask][MODEL_FEATURES])
    
    # Make good column naming and add 
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()
        
    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df
    
all_preds = all_preds.reset_index(drop=True)
all_preds.head()
all_preds.tail()

Predict | Day: 1
##########  0.65 min round |  0.65 min total |  37004.71 day sales |
Predict | Day: 2
##########  0.67 min round |  1.32 min total |  35473.49 day sales |
Predict | Day: 3
##########  0.64 min round |  1.96 min total |  34997.51 day sales |
Predict | Day: 4
##########  0.69 min round |  2.65 min total |  35528.85 day sales |
Predict | Day: 5
##########  0.64 min round |  3.29 min total |  41545.06 day sales |
Predict | Day: 6
##########  0.65 min round |  3.94 min total |  50514.43 day sales |
Predict | Day: 7
##########  0.68 min round |  4.62 min total |  53595.14 day sales |
Predict | Day: 8
##########  0.66 min round |  5.27 min total |  44255.78 day sales |
Predict | Day: 9
##########  0.65 min round |  5.92 min total |  44148.37 day sales |
Predict | Day: 10
##########  0.67 min round |  6.59 min total |  39039.16 day sales |
Predict | Day: 11
##########  0.67 min round |  7.25 min total |  40913.01 day sales |
Predict | Day: 12
##########  0.66 min round |  7.91

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_validation,0.35156,0.318356,0.335796,0.337141,0.370245,0.438981,0.433703,0.477256,0.474865,...,0.484331,0.620101,0.629389,0.454026,0.385747,0.389512,0.370834,0.435877,0.501518,0.493276
30486,FOODS_3_824_WI_3_validation,0.265074,0.249082,0.257646,0.260829,0.304125,0.28526,0.298563,0.303073,0.301354,...,0.274593,0.320768,0.358263,0.273464,0.23021,0.22042,0.216433,0.244752,0.284998,0.28522
30487,FOODS_3_825_WI_3_validation,0.710566,0.664698,0.632484,0.630833,0.792454,0.874996,1.002033,1.147045,1.115036,...,0.910525,1.317883,1.371702,0.939114,0.696098,0.662452,0.629567,0.732431,0.890117,0.903989
30488,FOODS_3_826_WI_3_validation,0.84691,0.805053,0.761669,0.753137,0.888614,1.053512,1.120932,1.072092,1.083138,...,1.038968,1.371697,1.366273,0.938775,0.804774,0.770337,0.775346,0.959985,1.064001,1.061433
30489,FOODS_3_827_WI_3_validation,0.149663,0.773651,0.89218,1.143053,1.422115,1.882513,1.766871,1.675501,2.004725,...,1.469069,1.929914,1.881762,1.362755,1.210466,1.171472,1.135632,1.299249,1.714112,1.672511


In [189]:
# Reading competition sample submission and
# merging our predictions
# As we have predictions only for "_validation" data
# we need to do fillna() for "_evaluation" items
submission = pd.read_csv('sample_submission.csv')[['id']]
submission = submission.merge(all_preds, on=['id'], how='left').fillna(0)
submission.to_csv('submission_v2.csv', index=False)#0.49187