# Demand forecasting construction and fitting

In [None]:
# Import pandas for data manipulation
import pandas as pd

# Import matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import xgboost for model and plotting util and pickle for saving it
from xgboost import XGBRegressor, plot_importance
import pickle

# Import sklearn cross val score and time series splits for model tuning
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score, GridSearchCV

## Processing input data

In [None]:
# Path to the merged 2014-2016 Excel input file
file_path = '../data/MergedData2014-2016.xlsx'

# Map the verbose spreadsheet headers onto short column names
column_names = {
    '# items demanded': 'demand',
    'Avg temp in 0.1oC': 'temperature',
    'Rainfall in 24h in 0.1mm': 'precipitation',
}

# Read the sheet (parsing 'Date' as datetimes) and apply the short names
train = pd.read_excel(file_path, parse_dates=['Date']).rename(columns=column_names)

# Derive the weekday name from each date
train['weekday'] = train['Date'].dt.day_name()

# One-hot encode the weekday and append the indicator columns to the data
weekday_dummies = pd.get_dummies(train['weekday'])
train = pd.concat([train, weekday_dummies], axis=1)

# Lagged demand features: positions without a value after shifting (the
# first `lag` rows, plus any missing demand) get the overall mean demand
demand_mean = train['demand'].mean()
for lag_column, lag in {'t-7': 7, 't-14': 14}.items():
    train[lag_column] = train['demand'].shift(lag).fillna(demand_mean)

## Split data into features and target

In [None]:
# Target: the demand column
y_train = train['demand']

# Features: every remaining column except the raw date and the weekday
# string (the weekday is already represented by its dummy columns)
X_train = train.drop(columns=['Date', 'demand', 'weekday'])

## Train and tune regressor on train set (with time series cross validation)

In [None]:
# Hyper-parameter grid for the grid search (3 x 3 x 3 = 27 combinations)
params_grid = dict(
    n_estimators=[150, 300, 500],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.005],
)

In [None]:
# Time-series cross-validation with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# Base regressor with default settings; the grid search supplies the values
# of the parameters listed in params_grid
xgb = XGBRegressor()

# Exhaustive search over params_grid, scored by negative mean squared error
cv = GridSearchCV(
    estimator=xgb,
    param_grid=params_grid,
    scoring='neg_mean_squared_error',
    cv=tscv,
)

cv.fit(X_train, y_train)

## Evaluate model

In [None]:
# Gather the grid-search results in one dataframe: one row per tested
# parameter set, together with its test score on the split labelled 'split4'
scores = pd.Series(cv.cv_results_['split4_test_score'], name='scores')
GS_results = pd.DataFrame(cv.cv_results_['params']).assign(scores=scores)

In [None]:
# Bar chart of the error for every tested parameter combination
with sns.axes_style("white"):
    f, g = plt.subplots(figsize=(15, 12))
    # One bar per (learning_rate, max_depth, n_estimators) combination
    param_sets = GS_results.set_index(['learning_rate',
                                       'max_depth',
                                       'n_estimators']).index
    # The scores are negated errors, so flip the sign to plot positive errors
    errors = GS_results['scores'] * -1
    g = sns.barplot(x=param_sets, y=errors);
    g.set_title("Interim test set errors for all tested parameter sets");
    plt.xticks(rotation=60);
#     g.figure.savefig('../figs/grid_scores_train.png', 
#                      format='png', dpi=250);

## Re-train and save best model for later use

This cell has been turned into raw text so that the saved model is not overwritten, which would harm the reproducibility of the results.

In [None]:
# Refit a fresh regressor on the full training data, using the best
# parameter set found by the grid search
best_params = cv.best_params_
xgb_tuned = XGBRegressor(**best_params)
xgb_tuned.fit(X_train, y_train)

#### THE LINES BELOW HAVE BEEN COMMENTED OUT BECAUSE OVERWRITING THE MODEL
#### WOULD HARM REPRODUCIBILITY. PLEASE LEAVE COMMENTED OUT

# Save the tuned and trained xgb model in ./xgb_tuned.p
# pickle.dump(xgb_tuned, open('./xgb_tuned.p', 'wb')) 

## Create importance plot for best model

In [None]:
# Load the previously saved model from disk, so the plot is based on the
# stored model file rather than on a model fitted in this session.
# The context manager closes the file handle as soon as loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./xgb_tuned.p', 'rb') as model_file:
    m = pickle.load(model_file)

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 12))
    # Draw the importance of each feature, measured by 'gain', on `ax`
    plot_importance(m, importance_type='gain', 
                    ax=ax, title="Feature importance of trained XGB model")
#     ax.figure.savefig('../figs/xgb_feature_importance.png', 
#                      format='png', dpi=250);