In [1]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

In [2]:
import pandas as pd
import numpy as np
import datetime
from pandas_summary import DataFrameSummary

In [3]:
df = pd.read_feather('train_normalized_data.fth')
df_test = pd.read_feather('test_normalized_data.fth')

In [4]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 
            'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 
            'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw']

In [5]:
contin_vars = ['CompetitionDistance', 
   'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 'Promo', 'SchoolHoliday']

In [6]:
from lightgbm import LGBMRegressor

In [7]:
y_out_columns = ['Sales']

In [8]:
split_type = 'last_week'

In [9]:
# Esta es para entrenar con todo
if split_type == 'no_split':
    df_train = df
elif split_type == 'last_week':
    # Esto divide en train y val
    df_train = df[df.Date < datetime.datetime(2015, 7, 1)]  
    df_val = df[df.Date >= datetime.datetime(2015, 7, 1)]
    print(f'Cantidad en val: {len(df_val)}, porcentaje: {len(df_train)/(len(df_train) + len(df_val))}')
elif split_type == 'random':
    # Splitting aleatorio
    np.random.seed(42)
    indexes = np.arange(len(df))
    np.random.shuffle(indexes)
    N = len(df)//5
    df_train = df[N:]
    df_val = df[:N]
    print(f'Cantidad en val: {len(df_val)}, porcentaje: {len(df_train)/(len(df_train) + len(df_val))}')

Cantidad en val: 30188, porcentaje: 0.9642465458145908


In [10]:
X_train = df_train[cat_vars + contin_vars]
if split_type != 'no_split':
    X_val = df_val[cat_vars + contin_vars]
X_test = df_test[cat_vars + contin_vars]

In [11]:
log_output = True

In [12]:
    
if log_output:
    # Escala logaritmica
    max_log_y = np.max(np.log(df[y_out_columns])).values
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    if split_type != 'no_split':
        y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    # Normalización
    y_mean = df_train[y_out_columns].mean().values
    y_std = df_train[y_out_columns].std().values
    y_train = (df_train[y_out_columns].values - y_mean)/y_std
    if split_type != 'no_split':
        y_val = (df_val[y_out_columns].values - y_mean)/y_std

In [13]:
from sklearn.model_selection import cross_val_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
parameters = {'learning_rate':[0.05, 0.1, 0.5], 'max_depth':[1, 7, -1], 'colsample_bytree':[0.75, 0.5]}

In [16]:
n_estimators = 4000
model = LGBMRegressor(n_estimators=n_estimators)

In [17]:
clf = GridSearchCV(model, parameters, n_jobs=-1, verbose=1, cv=3)

In [18]:
fit_params={ "eval_metric" : 'l2', 
            'verbose': 100,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': cat_vars}

if split_type != 'no_split':
    fit_params["eval_set"] = [(X_val, y_val.reshape(-1))]
    fit_params["early_stopping_rounds"] = 100

In [19]:
clf.fit(X_train, y_train.reshape(-1), **fit_params)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


New categorical_feature is ['Assortment', 'CompetitionMonthsOpen', 'CompetitionOpenSinceYear', 'Day', 'DayOfWeek', 'Events', 'Month', 'Promo2SinceYear', 'Promo2Weeks', 'PromoInterval', 'Promo_bw', 'Promo_fw', 'SchoolHoliday_bw', 'SchoolHoliday_fw', 'State', 'StateHoliday', 'StateHoliday_bool_bw', 'StateHoliday_bool_fw', 'Store', 'StoreType', 'Week', 'Year']


Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.000278391
[200]	valid_0's l2: 0.000218491
[300]	valid_0's l2: 0.00017791
[400]	valid_0's l2: 0.000159896
[500]	valid_0's l2: 0.000147508
[600]	valid_0's l2: 0.000140842
[700]	valid_0's l2: 0.000135917
[800]	valid_0's l2: 0.000132157
[900]	valid_0's l2: 0.000129317
[1000]	valid_0's l2: 0.000127673
[1100]	valid_0's l2: 0.000125337
[1200]	valid_0's l2: 0.000123143
[1300]	valid_0's l2: 0.000122342
[1400]	valid_0's l2: 0.000121807
[1500]	valid_0's l2: 0.000120654
[1600]	valid_0's l2: 0.000119963
[1700]	valid_0's l2: 0.000119497
[1800]	valid_0's l2: 0.000119125
[1900]	valid_0's l2: 0.000118999
[2000]	valid_0's l2: 0.000118526
[2100]	valid_0's l2: 0.000118163
[2200]	valid_0's l2: 0.000117807
[2300]	valid_0's l2: 0.000117589
[2400]	valid_0's l2: 0.00011766
Early stopping, best iteration is:
[2301]	valid_0's l2: 0.000117574


GridSearchCV(cv=3, estimator=LGBMRegressor(n_estimators=4000), n_jobs=-1,
             param_grid={'colsample_bytree': [0.75, 0.5],
                         'learning_rate': [0.05, 0.1, 0.5],
                         'max_depth': [1, 7, -1]},
             verbose=1)

In [20]:
clf

GridSearchCV(cv=3, estimator=LGBMRegressor(n_estimators=4000), n_jobs=-1,
             param_grid={'colsample_bytree': [0.75, 0.5],
                         'learning_rate': [0.05, 0.1, 0.5],
                         'max_depth': [1, 7, -1]},
             verbose=1)

In [21]:
clf.best_score_

0.8909274013558908

In [22]:
clf.best_params_

{'colsample_bytree': 0.75, 'learning_rate': 0.05, 'max_depth': 7}

In [27]:
model = LGBMRegressor(n_estimators = n_estimators, colsample_bytree= 0.75, learning_rate= 0.05, max_depth= 7)

In [28]:
model.fit(X_train, y_train, early_stopping_rounds = 100, eval_set = [(X_val, y_val.reshape(-1))])



[1]	valid_0's l2: 0.0013908
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.00136218
[3]	valid_0's l2: 0.00132783
[4]	valid_0's l2: 0.00129064
[5]	valid_0's l2: 0.00127108
[6]	valid_0's l2: 0.00124758
[7]	valid_0's l2: 0.00122573
[8]	valid_0's l2: 0.00120297
[9]	valid_0's l2: 0.0011795
[10]	valid_0's l2: 0.00116231
[11]	valid_0's l2: 0.00113863
[12]	valid_0's l2: 0.00111801
[13]	valid_0's l2: 0.00110746
[14]	valid_0's l2: 0.00108994
[15]	valid_0's l2: 0.00107546
[16]	valid_0's l2: 0.00106634
[17]	valid_0's l2: 0.00105325
[18]	valid_0's l2: 0.00104353
[19]	valid_0's l2: 0.00103335
[20]	valid_0's l2: 0.00102018
[21]	valid_0's l2: 0.00101569
[22]	valid_0's l2: 0.00100684
[23]	valid_0's l2: 0.00100031
[24]	valid_0's l2: 0.000989773
[25]	valid_0's l2: 0.000986771
[26]	valid_0's l2: 0.000978219
[27]	valid_0's l2: 0.000969663
[28]	valid_0's l2: 0.000968798
[29]	valid_0's l2: 0.000960621
[30]	valid_0's l2: 0.000950323
[31]	valid_0's l2: 0.000942366
[32]	valid_

LGBMRegressor(colsample_bytree=0.75, learning_rate=0.05, max_depth=7,
              n_estimators=4000)

In [29]:
model.score(X_val, y_val)

0.8971530477172273

In [30]:
if log_output:
    y_pred_train = np.exp(model.predict(X_train, verbose=1)*max_log_y)
    y_pred = np.exp(model.predict(X_val, verbose=1)*max_log_y)
    y_pred_test = np.exp(model.predict(X_test, verbose=1)*max_log_y)
else:
    y_pred_train = model.predict(X_train, verbose=1)*y_std + y_mean
    y_pred = model.predict(X_val, verbose=1)*y_std + y_mean
    y_pred_test = model.predict(X_test, verbose=1)*y_std + y_mean

In [31]:
sample_csv = pd.read_csv('dataset/rossmann/sample_submission.csv')
sample_csv['Sales'] = y_pred_test
sample_csv.head()

sample_csv.to_csv(f'submision_lightgbm_gs_lr05_maxdepth7_colsample075.csv', index=False)