# Standard classifiers

In [None]:
import datetime
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
# Lift pandas' display truncation so frames are shown with all rows and columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [None]:
# Date stamp (YYYYMMDD) of this run; used below to tag the saved model file.
today = format(datetime.datetime.now(), '%Y%m%d')

## Reading the data

In [None]:
# Build the input path inside the configured data directory and load the
# pickled object into `df`.
# NOTE(review): `config` is not imported in the visible cells, and the file
# name passed to os.path.join is an empty string -- fill in the real name.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted
# source.
input_file = os.path.join(config.path_data, '')
df = pd.read_pickle(input_file)

## Train/test split

In [None]:
# Chronological split: assumes the rows of `df` are already sorted by time, so
# the first `train_size` rows form the training period and the rest the test
# period. No shuffling, to avoid leaking future information into training.
train_size = 195  # TODO: better to define the test size instead (e.g. last 60 days)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]
# Alternative: split by fraction instead of by row count, e.g.
#   train_df, test_df = np.split(df, [int(0.60 * len(df))])

In [None]:
# The training set is unbalanced. When DOWNSAMPLING_RATIO is truthy, keep every
# row with jam == 1 and a random sample of rows with jam == 0 that is
# DOWNSAMPLING_RATIO times as large. A falsy ratio (0 / None) leaves train_df
# unchanged.
# NOTE(review): DOWNSAMPLING_RATIO is not defined in the visible cells --
# confirm where it is set.
if DOWNSAMPLING_RATIO:
    np.random.seed(29)  # reproducible draw

    idx_jam = train_df[train_df.jam == 1].index.values
    idx_no_jam = train_df[train_df.jam == 0].index.values

    # Sample without replacement so no row is duplicated, and never ask for
    # more majority-class rows than exist.
    no_jam_to_draw = min(int(len(idx_jam) * DOWNSAMPLING_RATIO), len(idx_no_jam))
    drawn_idx_no_jam = np.random.choice(
        idx_no_jam, size=no_jam_to_draw, replace=False
    )

    # The drawn values are index labels, so select by label membership (not
    # by position). A boolean mask also keeps the rows in their original
    # order, which the time-based split and cross-validation rely on.
    keep_mask = train_df.index.isin(np.concatenate([idx_jam, drawn_idx_no_jam]))
    train_df = train_df[keep_mask]

In [None]:
# Separate the response column from the features: X keeps every other column
# as a DataFrame, y is the response as a NumPy array.
train_X = train_df.drop(columns=response)
train_y = train_df[response].values

test_X = test_df.drop(columns=response)
test_y = test_df[response].values

## Scaler?

## Feature selection and training

In [None]:
# Quick feature ranking: fit a default-sized forest on the training data and
# print the features from most to least important.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_X, train_y)

ranked_features = sorted(zip(rf.feature_importances_, train_X.columns), reverse=True)
for i, (importance, feature) in enumerate(ranked_features, start=1):
    print(i, '-', round(importance, 3), '\t', feature)

In [None]:
# Hyper-parameter search for the random forest.
mod_RF = RandomForestClassifier()  # or GradientBoostingClassifier, or XGBClassifier

search_grid_RF = {
    'bootstrap': [True],
    'max_depth': [4, 5, 6, 7],
    'max_features': [3, 4, 5, 6, 7],
    'n_estimators': [100, 200],
    'min_samples_leaf': [10, 25, 50],
    'min_samples_split': [25, 50],
    # Kept in the grid so that best_params_ carries the seed along when the
    # final model is rebuilt from it.
    'random_state': [29],
}

#### Attention: DO NOT USE REGULAR CV WITH TIME DATA!!
# TimeSeriesSplit always validates on rows that come after the training rows,
# so the rows of train_X must be in chronological order.
# scoring='roc_auc' ranks by predicted probabilities; wrapping roc_auc_score
# in make_scorer would score the hard 0/1 predictions instead.
# The `iid` argument is no longer accepted by current scikit-learn releases.
param_search_RF = GridSearchCV(
    estimator=mod_RF,
    param_grid=search_grid_RF,
    scoring='roc_auc',
    cv=TimeSeriesSplit(n_splits=3),
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

param_search_RF.fit(train_X, train_y)

print(param_search_RF.best_params_)

In [None]:
# Rebuild the forest with the best hyper-parameters from the grid search and
# fit it on the full training set.
mod_opt_RF = RandomForestClassifier(**param_search_RF.best_params_)
mod_opt_RF = mod_opt_RF.fit(train_X, train_y)

In [None]:
# Threshold for accuracy calculation: scores strictly above this value are
# treated as the positive class in the accuracy and confusion-matrix cells.
threshold_RF = 0.3

In [None]:
# Distribution of scores by class, for training and test data side by side.
# The score is column 1 of predict_proba.
score_train_RF = mod_opt_RF.predict_proba(train_X)[:, 1]
score_test_RF = mod_opt_RF.predict_proba(test_X)[:, 1]

fig, ax = plt.subplots(ncols=2, figsize=(15, 5))

score_panels = [
    ('Training score', train_y, score_train_RF),
    ('Test score', test_y, score_test_RF),
]
for panel_ax, (panel_title, panel_y, panel_score) in zip(ax, score_panels):
    pd.DataFrame({'y': panel_y, panel_title: panel_score}).boxplot(
        column=panel_title, by='y', showfliers=True, ax=panel_ax
    )
    # Title the subplot that was just drawn; plt.title() would act on the
    # implicit "current" axes instead.
    panel_ax.set_title(panel_title)

fig.suptitle('Distribution of scores by class')

plt.show()

In [None]:
# ROC curve of the tuned random forest on training and test data.
fpr_train_RF, tpr_train_RF, thresholds_train_RF = metrics.roc_curve(
    train_y, score_train_RF, pos_label=1
)
fpr_test_RF, tpr_test_RF, thresholds_test_RF = metrics.roc_curve(
    test_y, score_test_RF, pos_label=1
)

fig = plt.figure(figsize=(10, 6))
plt.plot(fpr_train_RF, tpr_train_RF, label='training data')
plt.plot(fpr_test_RF, tpr_test_RF, label='test data')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.title('ROC-curve')
plt.show()

In [None]:
# AUC and accuracy on training and test data. Accuracy treats scores strictly
# above threshold_RF as the positive class.
auc_train_RF = metrics.roc_auc_score(train_y, score_train_RF)
accuracy_train_RF = metrics.accuracy_score(train_y, score_train_RF > threshold_RF)

auc_test_RF = metrics.roc_auc_score(test_y, score_test_RF)
accuracy_test_RF = metrics.accuracy_score(test_y, score_test_RF > threshold_RF)

print('Training data:\nAUC: {auc}\tAccuracy: {acc}\n'
      .format(auc=auc_train_RF, acc=accuracy_train_RF))

print('Test data:\nAUC: {auc}\tAccuracy: {acc}'
      .format(auc=auc_test_RF, acc=accuracy_test_RF))

In [None]:
# Confusion matrix on the test data, with scores above threshold_RF counted
# as the positive class.
# NOTE(review): plot_confusion_matrix is not defined in the visible cells --
# presumably a plotting helper defined elsewhere; confirm.
pred_test_RF = score_test_RF > threshold_RF
cm = metrics.confusion_matrix(y_true=test_y, y_pred=pred_test_RF)

plot_confusion_matrix(cm, [0, 1])

In [None]:
# Summary table: one row per model with its test AUC and test accuracy.
# NOTE(review): the *_GBM values are not computed in the visible cells.
df_models = pd.DataFrame({
    'model': ['GBM', 'RF'],
    'AUC': [auc_test_GBM, auc_test_RF],
    'accuracy': [accuracy_test_GBM, accuracy_test_RF],
})

df_models

In [None]:
# Overlay the test-set ROC curves of all models in one figure.
# NOTE(review): the *_GBM curves are not computed in the visible cells.
roc_curves_test = (
    ('RF', fpr_test_RF, tpr_test_RF),
    ('GBM', fpr_test_GBM, tpr_test_GBM),
)
for curve_label, curve_fpr, curve_tpr in roc_curves_test:
    plt.plot(curve_fpr, curve_tpr, label=curve_label)

plt.legend()
plt.title('ROC-curve on test data for all models')
plt.show()

## Saving the model

In [None]:
# Pickle the chosen model into the configured model directory, with the run
# date in the file name.
# NOTE(review): neither `config` nor `mod_opt_GBM` is defined in the visible
# cells -- confirm which model is meant to be saved.
model_file_name = 'mod_opt_{}.pkl'.format(today)
output_file_best_model = os.path.join(config.path_model, model_file_name)

with open(output_file_best_model, 'wb') as model_file:
    pickle.dump(mod_opt_GBM, model_file)

## Alternative: rolling-window retraining

In [None]:
# Rolling-window evaluation: in each iteration the model is fitted on one
# window of rows and used to predict the window that follows it; predictions
# and observations are collected per iteration.
# NOTE(review): `xgb`, `plot_importance`, `window_size` and `train_y_1` are
# not defined or imported in the visible cells -- confirm where they come from.
predictions_3 = list()
observations_3 = list()

# NOTE(review): a classifier is combined with a regression objective
# ('reg:squarederror') and, in fit() below, the 'mae' metric -- confirm whether
# a regressor or a classification objective/metric is intended.
reg_3 = xgb.XGBClassifier(n_estimators=100, objective='reg:squarederror')

n_records = len(train_X)
# Number of window steps; train_size is reused here as the training-window length.
n_iterations = round((n_records-(train_size+window_size))/window_size)

for i in range(0, n_iterations):

    # Start of the current training window (`y` is an offset here, not a target).
    y = i * window_size
  
    # .loc slices by index label and includes both end points.
    # presumably the index is a consecutive integer range -- TODO confirm
    train_X_iter = train_X.loc[y: y + train_size] 
    train_y_1_iter = train_y_1.loc[y: y + train_size]
    
    # Evaluation window: the window_size labels right after the training window.
    eval_X_iter = train_X.loc[1 + y + train_size: y + train_size + window_size]
    eval_y_1_iter = train_y_1.loc[1 + y + train_size : y + train_size + window_size]
    
    # The same estimator object is fitted again in every iteration; the
    # evaluation window is passed as the second eval_set entry.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() may
    # not be supported by newer xgboost releases -- verify against the
    # installed version.
    reg_3.fit(train_X_iter, train_y_1_iter.values.ravel(),
            eval_metric='mae',
            eval_set=[(train_X_iter, train_y_1_iter.values.ravel()), (eval_X_iter, eval_y_1_iter.values.ravel())],
            early_stopping_rounds=20,
            verbose=False) # fit 
       
    # One new figure per iteration; it is not closed explicitly in this cell.
    fig, ax = plt.subplots(figsize=(8, 2))
    plot_importance(reg_3, ax = ax, height=0.5,  max_num_features= 12)

    # Store this window's predictions next to the observed values.
    pred = reg_3.predict(eval_X_iter)
    predictions_3.append(pred)
    obs = eval_y_1_iter
    observations_3.append(obs)