# Gradient Boosting model

## Import data and set up

In [1]:
# import relevant packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

%matplotlib inline

In [2]:
# set up paths
# Directory holding the pre-split CSV files that the next cell reads; the path
# is relative to the notebook's working directory.
data_dir = "../data"

In [3]:
# Read the four pre-split CSV files from data_dir, in the same order as the
# names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)

# Sanity check: one (rows, columns) tuple per line, in the order loaded above.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(397900, 25)
(99476, 25)
(397900, 1)
(99476, 1)


In [4]:
def metric(preds, actuals):
    """Return the root mean squared percentage error (RMSPE), in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Note the argument order is ``(preds, actuals)``, which is the reverse of
    scikit-learn's ``(y_true, y_pred)`` convention.

    Parameters
    ----------
    preds : array-like
        Predicted values. Any shape; flattened to 1-D before use.
    actuals : array-like
        Ground-truth values with the same number of elements as ``preds``.
        Each error is divided by the matching actual, so a zero in ``actuals``
        yields ``inf``/``nan`` in the result.

    Returns
    -------
    float
        RMSPE expressed as a percentage (10.0 means 10 %).

    Raises
    ------
    AssertionError
        If the flattened inputs do not have the same length.
    """
    # np.asarray lets lists, pandas Series and DataFrames be passed directly;
    # ndarrays go through unchanged (no copy, no dtype change).
    preds = np.asarray(preds).reshape(-1)
    actuals = np.asarray(actuals).reshape(-1)
    assert preds.shape == actuals.shape, (
        f"preds and actuals differ in size: {preds.shape} vs {actuals.shape}"
    )
    # ||relative error||_2 / sqrt(n) is the root of the mean squared relative error.
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [5]:
# List every column in the training frame as loaded, before feature selection.
X_train.columns

Index(['Date', 'Store', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Month', 'Year',
       'Weekday', 'Holiday', 'StoreType_enc', 'Assortment_enc', 'Store_enc',
       'Customers_enc'],
      dtype='object')

In [6]:
# Features fed to the model; every other column of the loaded frames is dropped.
train_cols = [
    'Promo',
    'SchoolHoliday',
    'CompetitionDistance',
    'Month',
    'Weekday',
    'Holiday',
    'Customers_enc',
    'StoreType_enc',
    'Assortment_enc',
    'Store_enc',
]

# Apply the same selection (and column order) to both splits.
X_train, X_test = X_train[train_cols], X_test[train_cols]

# Confirm that train and test now expose identical columns.
print(X_train.columns, X_test.columns, sep="\n")

Index(['Promo', 'SchoolHoliday', 'CompetitionDistance', 'Month', 'Weekday',
       'Holiday', 'Customers_enc', 'StoreType_enc', 'Assortment_enc',
       'Store_enc'],
      dtype='object')
Index(['Promo', 'SchoolHoliday', 'CompetitionDistance', 'Month', 'Weekday',
       'Holiday', 'Customers_enc', 'StoreType_enc', 'Assortment_enc',
       'Store_enc'],
      dtype='object')


Logarithmically transforming y during training (and converting back after prediction) gives better performance.

In [7]:
# Train on log(Sales); predictions are mapped back with np.exp before scoring.
# NOTE: this overwrites the column in place, so re-running the cell applies the
# log a second time - reload the data before executing it again.
y_train['Sales'] = np.log(y_train['Sales'])

In [8]:
# Convert both single-column target frames into flat 1-D NumPy arrays.
# After this cell the names refer to ndarrays (no .to_numpy), so it cannot be
# re-run without reloading the data first.
y_train, y_test = (
    target_frame.to_numpy().flatten() for target_frame in (y_train, y_test)
)

## Gradient Boosting

In [None]:
# Base XGBoost regressor handed to the grid search below.
# n_jobs=-1 asks XGBoost to use all available cores; random_state fixes the seed.
model = xgb.XGBRegressor(n_jobs=-1, random_state=42)

In [None]:
# Hyper-parameter grid explored by the grid search (2 x 2 x 1 x 1 x 2 = 8 combinations).
parameter_space = {
    # number of boosting rounds, i.e. trees added to the ensemble
    'n_estimators': [500, 1000],
    # learning rate (XGBoost's native name; alias of learning_rate)
    'eta': [0.03, 0.1],
    # fraction of training rows sampled for each tree
    'subsample': [0.7],
    # fraction of feature columns sampled for each tree
    'colsample_bytree': [0.7],
    # maximum depth of each individual tree
    'max_depth': [5, 7],
}

In [None]:
def _rmspe_on_original_scale(y_true, y_pred):
    """Adapt ``metric`` to the calling convention used by ``make_scorer``.

    scikit-learn invokes the score function as ``score_func(y_true, y_pred)``,
    whereas ``metric`` is declared as ``metric(preds, actuals)``. Passing
    ``metric`` straight to ``make_scorer`` therefore swaps the two roles and
    normalises the error by the predictions instead of the actual values.

    The model is fitted on log-transformed sales, so both arrays arrive on the
    log scale. They are mapped back with ``np.exp`` so that cross-validation
    optimises the same quantity that is reported on the test set.

    Parameters
    ----------
    y_true : ndarray
        Log-scale targets of the validation fold.
    y_pred : ndarray
        Log-scale predictions for the same rows.

    Returns
    -------
    float
        RMSPE in percent, computed on the original (non-log) scale.
    """
    return metric(np.exp(y_pred), np.exp(y_true))


# greater_is_better=False marks this as a loss: scikit-learn negates the value
# so that "higher is better" still holds when ranking parameter settings.
my_scorer = make_scorer(_rmspe_on_original_scale, greater_is_better=False)

In [None]:
%%time
regr = GridSearchCV(model, parameter_space, scoring=my_scorer, 
                   n_jobs=None, cv=5, return_train_score = True)
regr.fit(X_train, y_train)

In [None]:
# Summarise the grid search: (printed label, attribute of the fitted search).
# Rows are printed in order: the refitted best estimator, its parameter
# setting, the scorer used on the held-out folds, and the mean CV score of the
# best setting (negated by scikit-learn, since my_scorer uses greater_is_better=False).
for label, attr in (
    ('Best estimator:\n', 'best_estimator_'),
    ('Best parameters found:\n', 'best_params_'),
    ('Scorer used on the held out data to choose the best parameters for the model:', 'scorer_'),
    ('Best mean cross-validated score:', 'best_score_'),
):
    print(label, getattr(regr, attr))

In [None]:
# Full cross-validation log: one row per parameter combination tried.
cv_results = pd.DataFrame(regr.cv_results_)
cv_results

In [None]:
# Evaluate the tuned model on the held-out test set.
# The model was fitted on log(Sales), so its predictions are on the log scale ...
log_y_pred = regr.predict(X_test)
# ... and are mapped back to sales with exp before being compared with y_test.
y_pred = np.exp(log_y_pred)
# RMSPE in percent on the original scale.
value = metric(y_pred, y_test)
print(value)

In [9]:
# Feature importance
# Fit one model with fixed hyper-parameters (all values taken from the search
# grid) so that its feature importances can be inspected.
model = xgb.XGBRegressor(
    n_jobs=-1,
    random_state=42,
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
)
model.fit(X_train, y_train)

# Predictions come back on the log scale; undo the transform before scoring.
log_y_pred = model.predict(X_test)
y_pred = np.exp(log_y_pred)

# Test-set RMSPE in percent.
value = metric(y_pred, y_test)
print(value)

# One importance value per training column, in column order.
print(model.feature_importances_)

19.70245719234581
[0.28944603 0.013727   0.01660584 0.01660286 0.04595874 0.01525655
 0.03869773 0.07505293 0.0292779  0.45937443]


In [None]:
# plot feature importance
# Label each bar with its feature name so the chart can be read on its own;
# importances are reported in the column order the model was fitted on.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(list(X_train.columns), model.feature_importances_)
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
ax.set_title("XGBoost feature importance")
# Rotate the labels so the longer feature names do not overlap.
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Show the feature names so the importance values above can be matched to
# columns by position (assumes importances follow the column order used in fit).
X_train.columns