# Light GBM

We use a tree-based algorithm because such models are powerful and do not impose strict assumptions on the features, such as linearity or independence. LightGBM is a fast gradient-boosting implementation with low memory usage.

In [None]:
#!pip install lightgbm

In [None]:
#conda install numpy

Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime  import datetime  
from datetime import timedelta  
import lightgbm as lgb
from sklearn import preprocessing, metrics
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.lightgbm

Read data

In [2]:
# Load the pickled train / validation / test splits from ../01_preprocessed_data.
# NOTE: pickle files can execute arbitrary code on load; only read trusted files.
X_train, y_train = (
    pd.read_pickle('../01_preprocessed_data/X_train.pkl'),
    pd.read_pickle('../01_preprocessed_data/y_train.pkl'),
)
X_val, y_val = (
    pd.read_pickle('../01_preprocessed_data/X_val.pkl'),
    pd.read_pickle('../01_preprocessed_data/y_val.pkl'),
)
# The test split is loaded without a target file.
X_test = pd.read_pickle('../01_preprocessed_data/X_test.pkl')

In [3]:
# Sample submission file; its 'id' column is used later to build the final
# submission frame.
data_location = '../00_data/sample_submission.csv'
submission = pd.read_csv(filepath_or_buffer=data_location)

Set parameters of the light GBM and select features to fit.

In [4]:
# Sanity check: print (rows, columns) of every split, features first, then targets.
for split in (X_train, X_val, X_test, y_train, y_val):
    print(split.shape)

(42644016, 35)
(1706991, 35)
(853720, 35)
(42644016, 1)
(1706991, 1)


In [5]:
# Baseline LightGBM parameters used for the single (non-grid) training run.
params = dict(
    boosting_type='gbdt',
    metric='rmse',
    objective='poisson',
    n_jobs=-1,
    seed=0,
    learning_rate=0.1,
    bagging_fraction=0.75,
    bagging_freq=10,
    colsample_bytree=0.75,
)

# Columns that must never be passed to the model as features.
not_features = [
    'd',
    'id',
    'demand',
    'date',
    'start_date',
]

In [6]:
# Search space for the grid search: every key maps to the list of values to
# try. Only objective, learning_rate and bagging_fraction have more than one
# candidate, giving 2 * 3 * 3 = 18 combinations.
param_grid = dict(
    boosting_type=['gbdt'],
    metric=['rmse'],
    objective=['poisson', 'tweedie'],
    n_jobs=[-1],
    seed=[0],
    learning_rate=[0.05, 0.075, 0.1],
    bagging_fraction=[0.5, 0.75, 1],
    bagging_freq=[10],
    colsample_bytree=[0.75],
    num_iterations=[1000],
    early_stopping_round=[100],
)

In [7]:
# Keep every X_test column that is not listed in `not_features`.
is_feature = ~X_test.columns.isin(not_features)
features = X_test.columns[is_feature]

In [8]:
# Wrap the selected feature columns and the targets in LightGBM datasets.
# The column selections are passed inline rather than bound to names so no
# extra reference to the sliced frames is kept by this cell.
train_set = lgb.Dataset(X_train[features], y_train) #, categorical_feature = categorical_features)
val_set = lgb.Dataset(X_val[features], y_val)#,  categorical_feature = categorical_features)

Train the model

In [None]:
# Single training run with the baseline `params`; both the training and the
# validation set are evaluated, with a progress line every 100 iterations.
model = lgb.train(
    params=params,
    train_set=train_set,
    valid_sets=[train_set, val_set],
    verbose_eval=100,
)

In [10]:
# Grid search: train one booster per hyper-parameter combination, write one
# submission file per combination, and remember the combination with the
# lowest validation RMSE.

# Autologging only needs to be enabled once, not on every iteration.
mlflow.lightgbm.autolog()

# The 'evaluation' rows of the sample submission are passed through unchanged
# and do not depend on the model, so they are computed once outside the loop.
is_evaluation = submission['id'].str.contains('evaluation', regex=False)
evaluation_rows = submission.loc[is_evaluation, 'id'].tolist()
evaluation = submission[is_evaluation]

best_score, best_params, best_model = float('inf'), None, None

for i, g in enumerate(ParameterGrid(param_grid)):
    print(i)
    print(g)
    model = lgb.train(g, train_set,
                      valid_sets = [train_set, val_set], verbose_eval = 100)

    # 'valid_1' is the name LightGBM assigns to val_set (second entry of
    # valid_sets), as shown in the training log.
    val_rmse = model.best_score['valid_1']['rmse']
    if val_rmse < best_score:
        best_score, best_params, best_model = val_rmse, g, model

    y_test = model.predict(X_test[features])
    # 'demand' is listed in not_features, so adding it to X_test does not
    # change the feature set used for prediction.
    X_test['demand'] = y_test

    # Reshape long (one row per id and day) to wide (one row per id with
    # columns F1..F28), the layout of the submission file.
    predictions = X_test[['id', 'days_from_start', 'demand']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'days_from_start', values = 'demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(day + 1) for day in range(28)]

    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
    final.to_csv('../04_submissions/lightGBM_{}.csv'.format(i), index = False)
    print("------------------------------------")

# Later cells score and plot `model`; point it at the best configuration found
# instead of whichever combination happened to be trained last.
if best_model is not None:
    model = best_model
    print('Best grid configuration (val rmse {}): {}'.format(best_score, best_params))

0
{'bagging_fraction': 0.5, 'bagging_freq': 10, 'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'early_stopping_round': 100, 'learning_rate': 0.05, 'metric': 'rmse', 'n_jobs': -1, 'num_iterations': 1000, 'objective': 'poisson', 'seed': 0}


  all_param_names, _, _, all_default_values = inspect.getargspec(fn)  # pylint: disable=W1505
  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 2.69298	valid_1's rmse: 2.24843
[200]	training's rmse: 2.63642	valid_1's rmse: 2.21803
[300]	training's rmse: 2.60652	valid_1's rmse: 2.2151
[400]	training's rmse: 2.58173	valid_1's rmse: 2.21307
[500]	training's rmse: 2.56216	valid_1's rmse: 2.21115
Early stopping, best iteration is:
[460]	training's rmse: 2.56838	valid_1's rmse: 2.20982
------------------------------------
1
{'bagging_fraction': 0.5, 'bagging_freq': 10, 'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'early_stopping_round': 100, 'learning_rate': 0.05, 'metric': 'rmse', 'n_jobs': -1, 'num_iterations': 1000, 'objective': 'tweedie', 'seed': 0}
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 2.72587	valid_1's rmse: 2.22466
[200]	training's rmse: 2.65079	valid_1's rmse: 2.2141
Early stopping, best iteration is:
[141]	training's rmse: 2.68364	valid_1's rmse: 2.2075
-----------------------------------

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 2.68291	valid_1's rmse: 2.23915
[200]	training's rmse: 2.6232	valid_1's rmse: 2.22732
[300]	training's rmse: 2.57767	valid_1's rmse: 2.22419
Early stopping, best iteration is:
[284]	training's rmse: 2.58499	valid_1's rmse: 2.22277
------------------------------------
15
{'bagging_fraction': 1, 'bagging_freq': 10, 'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'early_stopping_round': 100, 'learning_rate': 0.075, 'metric': 'rmse', 'n_jobs': -1, 'num_iterations': 1000, 'objective': 'tweedie', 'seed': 0}
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 2.67852	valid_1's rmse: 2.21663
Early stopping, best iteration is:
[94]	training's rmse: 2.68539	valid_1's rmse: 2.21517
------------------------------------
16
{'bagging_fraction': 1, 'bagging_freq': 10, 'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'early_stopping_round': 100, 'learning_rate': 0.1, 'metric': 'rm

In [11]:
# Number of hyper-parameter combinations the grid search iterates over.
grid_size = len(ParameterGrid(param_grid))
print(grid_size)

18


Calculate the RMSE on the validation set

In [None]:
# Predict demand for the test split with the current `model`.
# NOTE(review): the validation-scoring cell computes the same prediction again.
y_test = model.predict(X_test[features])

In [None]:
# Score the current model on the validation split.
val_pred = model.predict(X_val[features])
# scikit-learn metrics are documented as metric(y_true, y_pred); pass the
# ground truth first (the value is unchanged for MSE, which is symmetric).
val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))

# Keep actuals, predictions and the per-row absolute error next to the
# validation features so the error can be analysed per segment below.
X_val['demand'] = y_val
X_val['demand_pred'] = val_pred
X_val['abs_difference'] = (X_val['demand'] - X_val['demand_pred']).abs()
print(f'Our val rmse score is {val_score}')

# Predict the test split and attach the prediction for the submission step.
y_test = model.predict(X_test[features])
X_test['demand'] = y_test


In [None]:
# Mean absolute error on the validation split; ground truth goes first, as in
# scikit-learn's metric(y_true, y_pred) signature.
print(f'Our val mae score is {metrics.mean_absolute_error(y_val, val_pred)}')

We look at the mean absolute error by forecastability category (demand type) in order to understand which category needs the most improvement in prediction.

In [None]:
# Mean absolute error per demand type.
# Columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of keys is deprecated in pandas 1.0 and raises in
# pandas 2.0. The grouping key itself is not averaged; it is already present
# as the group label after reset_index().
X_val.groupby('demand_type')[['abs_difference']].agg(['mean']).reset_index()

In [None]:
# Scatter of absolute error (y) against actual demand (x) on the validation set.
demans_vs_error = sns.scatterplot(x=X_val['demand'], y=X_val['abs_difference'])
demans_vs_error.set_title('Distribution of error over demand')
# The x-axis holds the demand and the y-axis the absolute error; label both
# accordingly (the x-axis was previously labelled 'absolute error').
demans_vs_error.set_xlabel('demand')
demans_vs_error.set_ylabel('absolute error')

Surprisingly, smooth time series have a large mean absolute error. Erratic time series also have a large mean absolute error, which is expected: series of this type are typically difficult to predict.

## Feature importance

In [None]:
def plotImp(model, X , num = 20):
    """Draw a horizontal bar chart of the model's largest feature importances.

    Parameters
    ----------
    model : object exposing ``feature_importance()``
        Trained model; its importance values are paired positionally with
        ``X.columns``.
    X : pandas.DataFrame
        Frame whose column labels name the features. Only ``X.columns`` is read.
    num : int, default 20
        Number of top features (by importance, descending) to show.

    Side effects
    ------------
    Calls ``sns.set(font_scale=5)``, which changes the global seaborn settings
    for later plots, saves the figure to 'lgbm_importances-01.png' in the
    working directory, and shows it.
    """
    importance_table = pd.DataFrame(
        {'Value': model.feature_importance(), 'Feature': X.columns}
    )
    # Highest importance first, truncated to the requested number of rows.
    top_features = importance_table.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=(40, 20))
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()
# plotImp only reads the column labels of its second argument, so pass an
# empty slice instead of copying every row of the training frame.
plotImp(model, X_train.head(0)[features], 30)

'item_id' and 'id' are the most important features. 'days_from_start' reflects the trend in the data.

## Transform prediction

Transform predictions to the right format

In [None]:
# Reshape the long test predictions (one row per id and day) into the wide
# submission layout: one row per id with columns F1..F28.
predictions = (
    X_test[['id', 'days_from_start', 'demand']]
    .pivot(index = 'id', columns = 'days_from_start', values = 'demand')
    .reset_index()
)
predictions.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

# Rows of the sample submission whose id contains 'evaluation' are kept as
# they are in the sample file.
is_evaluation = submission['id'].str.contains('evaluation', regex=False)
evaluation_rows = submission.loc[is_evaluation, 'id'].tolist()
evaluation = submission[is_evaluation]

# Ids that received a prediction, followed by the untouched evaluation rows.
validation = submission[['id']].merge(predictions, on = 'id')
final = pd.concat([validation, evaluation])
final.head()


Save the submission, the features and the score to files.

In [None]:
# Write the submission frame to CSV without the DataFrame index.
final.to_csv('../04_submissions/lightGBM_no_fe.csv', index = False)

In [None]:
# Build the record written to the log file: feature names, then the
# validation score, then the training objective.
# NOTE: this rebinds `features` to a plain list that also holds the score and
# the objective, so it can no longer be used to select columns afterwards.
features = X_test.columns[~X_test.columns.isin(not_features)]
features = features.to_list()
features.append(str(val_score))
# Log the objective the scored model was actually trained with. After the grid
# search, `model` may come from a grid entry whose objective differs from
# params['objective']; fall back to `params` if the booster carries no params.
trained_params = getattr(model, 'params', None) or params
features.append(str(trained_params.get('objective', params['objective'])))


In [None]:
# Append this run's record to the log file. The file is opened in append mode,
# so the record must end with a newline; otherwise its last line is fused with
# the first line of the next run's record.
with open("../04_submissions/lgb_features_score.txt", "a") as outfile:
    outfile.write("\n".join(features) + "\n")


## Score

The score of this submission is 0.58584, which is better than the naive prediction.