In [None]:
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# used below — confirm that a blanket 'ignore' is really intended.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, \
    GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

from scipy.stats import spearmanr

In [None]:
# Path of the HDF5 store holding the modelling data (relative to the working directory).
dataset_path = '../data/model_data.h5'

# Load Data

In [None]:
# Load the 'model_data' table and drop the raw OHLC price columns.
# mode='r' opens the store read-only: HDFStore's default mode is 'a', which
# would create an empty file when the path is wrong instead of failing.
with pd.HDFStore(dataset_path, mode='r') as store:
    data = (store['model_data']
            .drop(['Open', 'Close', 'Low', 'High'], axis=1))

In [None]:
# Remove every column whose name contains 'lag', then display the result.
lag_columns = [name for name in data.columns if 'lag' in name]
data = data.drop(lag_columns, axis=1)
data

In [None]:
# Print a summary of the remaining columns (dtypes, non-null counts, memory usage).
data.info()

In [None]:
# Split the frame into targets and features.
# y stays a DataFrame: filter(like=...) keeps every column whose name contains
# 'target_5d'. X is everything except the four target columns and 'Volume'.
columns_to_drop = ['target_10d', 'target_1d', 'target_21d', 'target_5d']
y = data.filter(like='target_5d')
X = data.drop(columns=columns_to_drop + ['Volume'])

In [None]:

class MultipleTimeSeriesCV(BaseCrossValidator):
    """Generate (train_idx, test_idx) pairs for data indexed by a 'Date' level.

    Windows are laid out backwards from the most recent date. Split ``i``
    tests on ``test_period_length`` consecutive unique dates and trains on the
    ``train_period_length + lookahead - 1`` unique dates that precede them,
    separated by a gap of ``lookahead - 1`` dates.

    Parameters
    ----------
    n_splits : int
        Number of train/test pairs to generate.
    train_period_length : int
        Base length of each training window, in unique dates.
    test_period_length : int
        Length of each test window, in unique dates.
    lookahead : int
        Forecast horizon used to size the gap between train and test windows.
        The default ``None`` cannot be used in the window arithmetic, so an
        integer must be supplied before calling ``split``.
    shuffle : bool
        If True, the training positions of every split are returned in random
        order (drawn from NumPy's global random state).
    """

    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` row positions for each split.

        ``X`` must have an index level named 'Date'. ``y`` and ``groups`` are
        accepted for API compatibility and ignored. An ``IndexError`` is
        raised when ``X`` holds fewer unique dates than the windows require.
        """
        unique_dates = X.index.get_level_values('Date').unique()
        # Most recent date first: position 0 is the end of the first test window.
        days = sorted(unique_dates, reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            # All four values are positions in the descending `days` list.
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        # After reset_index the frame has a default integer index, so the
        # selected index labels are row positions in X.
        dates = X.reset_index()[['Date']]
        for train_start, train_end, test_start, test_end in split_idx:
            # Windows are half-open: (start date, end date].
            train_idx = dates[(dates.Date > days[train_start])
                              & (dates.Date <= days[train_end])].index
            test_idx = dates[(dates.Date > days[test_start])
                             & (dates.Date <= days[test_end])].index
            if self.shuffle:
                # np.random.shuffle works in place and Index objects are
                # immutable, so reorder through a random permutation instead
                # of shuffling a temporary list copy (which has no effect).
                train_idx = train_idx[np.random.permutation(len(train_idx))]
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits; all arguments are ignored."""
        return self.n_splits

In [None]:
# Cross-validation layout, expressed in unique trading dates.
# MultipleTimeSeriesCV.split() indexes the descending date list up to position
# n_splits * test_period_length + train_period_length + 2 * (lookahead - 1) = 2308,
# so X must contain at least 2309 unique dates or split() raises IndexError.
n_splits = 20
train_period_length = 300
test_period_length = 100
lookahead = 5  # presumably matches the 5-day target horizon — TODO confirm

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          train_period_length=train_period_length,
                          test_period_length=test_period_length,
                          lookahead=lookahead)

In [None]:
# Utility functions

def display_score(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must expose ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    report_lines = (('scores: ', lambda s: s),
                    ('mean: ', lambda s: s.mean()),
                    ('standard deviation: ', lambda s: s.std()))
    # Each statistic is computed only when its own line is printed.
    for label, compute in report_lines:
        print(label, compute(scores))


def rank_correl(y, y_pred):
    """Return the Spearman rank correlation between ``y`` and ``y_pred``.

    ``axis=None`` makes scipy ravel both inputs before ranking. Only the
    correlation coefficient (the first element of scipy's result) is returned;
    the p-value is discarded.
    """
    coefficient, _ = spearmanr(y, y_pred, axis=None)
    return coefficient

# Scorer wrapping rank_correl so it can be passed as `scoring` to scikit-learn's
# cross-validation and grid-search utilities.
ic = make_scorer(rank_correl)


def get_cross_val_score(model, X, y, score_fun, cv, n_jobs=-1):
    """Cross-validate ``model`` and print a summary of the fold scores.

    The estimator is scored with ``score_fun`` on every split produced by
    ``cv``, using ``n_jobs`` parallel workers. The per-fold scores, their mean
    and their standard deviation are printed; nothing is returned.
    """
    settings = dict(scoring=score_fun, cv=cv, n_jobs=n_jobs, verbose=1)
    fold_scores = cross_val_score(model, X, y, **settings)
    display_score(fold_scores)

# Decision Tree Regressor

In [None]:
# Single, fully grown regression tree used as the baseline learner.
# max_features=None considers every feature at each split. That is what the
# former 'auto' value meant for regressors; 'auto' itself is no longer
# accepted by current scikit-learn releases.
dt_reg = DecisionTreeRegressor(max_depth=None,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               max_features=None)


In [None]:
# Score the decision tree with the rank-correlation scorer on the time-series folds.
get_cross_val_score(dt_reg, X, y, ic, cv)

# Random Forest Regressor

In [None]:
# Random forest of 100 fully grown trees, each fit on a bootstrap sample.
# - max_features=None uses all features per split (the meaning of the former
#   'auto' value for regressors, which current scikit-learn rejects).
# - The min_impurity_split keyword is omitted: it was only ever passed as None
#   here and has been removed from scikit-learn's estimators.
rf_reg = RandomForestRegressor(n_estimators=100,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                bootstrap=True,
                                oob_score=False,
                                n_jobs=-1,
                                random_state=None,
                                verbose=0,
                                warm_start=False)

In [None]:
# Score the random forest with the rank-correlation scorer on the time-series folds.
get_cross_val_score(rf_reg, X, y, ic, cv)

# Extra Trees Regressor

In [None]:
# Extremely randomized trees: 100 fully grown trees, here with bootstrap
# sampling switched on explicitly.
# - max_features=None uses all features per split (the meaning of the former
#   'auto' value for regressors, which current scikit-learn rejects).
# - The min_impurity_split keyword is omitted: it was only ever passed as None
#   here and has been removed from scikit-learn's estimators.
ext_reg = ExtraTreesRegressor(n_estimators=100,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_features=None,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                bootstrap=True,
                                oob_score=False,
                                n_jobs=-1,
                                random_state=None,
                                verbose=0,
                                warm_start=False)

In [None]:
# Score the extra-trees ensemble with the rank-correlation scorer on the time-series folds.
get_cross_val_score(ext_reg, X, y, ic, cv)

# AdaBoost Regressor

In [None]:
# AdaBoost regressor with at most 100 boosting rounds and the 'square' loss;
# the base estimator and learning rate are left at the library defaults.
ada_reg = AdaBoostRegressor(n_estimators=100,
                            loss='square')

In [None]:
# Score the AdaBoost ensemble with the rank-correlation scorer on the time-series folds.
get_cross_val_score(ada_reg, X, y, ic, cv)

# Gradient Boosting Regressor

In [None]:
# Gradient boosting with 250 stages.
# - max_features=None uses all features per split (the meaning of the former
#   'auto' value for regressors, which current scikit-learn rejects).
# - The min_impurity_split keyword is omitted: it was only ever passed as None
#   here and has been removed from scikit-learn's estimators.
# NOTE(review): max_depth=None asks for unrestricted tree depth in every
# stage — confirm this is intended rather than a shallow depth.
grad_reg = GradientBoostingRegressor(n_estimators=250,
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        random_state=None,
                                        verbose=0,
                                        warm_start=False)

In [None]:
# Score the gradient-boosting model on the time-series folds.
# n_jobs=1 evaluates the folds serially, unlike the helper's default of -1.
# NOTE(review): the reason for running this model serially is not visible here.
get_cross_val_score(grad_reg, X, y, ic, cv, n_jobs=1)


It seems that the best learner is the `ExtraTreesRegressor`. Let's fine-tune it to find
the best parameters.

In [27]:
# Earlier grid for an AdaBoost search, kept for reference only:
# param_grid = {'n_estimators': [50, 100, 250],
#               'learning_rate': [0.5, 0.75, 1, 1.25],
#               'loss': ['linear', 'square', 'exponential']}

# Search space for the tree ensemble: 3 * 3 * 3 * 3 = 81 candidates.
# None means "use all features" and replaces the former 'auto' value, which
# current scikit-learn rejects (every fit for that candidate would fail).
param_grid = {'n_estimators': [50, 100, 250],
              'max_depth': [5, 15, None],
              'min_samples_leaf': [5, 25, 100],
              'max_features': [None, 'sqrt', 'log2']}


In [28]:

from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scored with the rank-correlation scorer
# on the time-series folds. refit=True retrains the best candidate on all data.
# The estimator is the ExtraTreesRegressor, matching the notebook's stated
# choice of best learner (the RandomForestRegressor was being tuned instead).
# NOTE(review): confirm against the recorded cross-validation scores.
gridsearch_reg = GridSearchCV(estimator=ext_reg,
                              param_grid=param_grid,
                              scoring=ic,
                              n_jobs=-1,
                              cv=cv,
                              refit=True,
                              return_train_score=True,
                              verbose=1)

In [None]:
# Fit on the 5-day target as a 1-D array. to_numpy() replaces Series.ravel(),
# which is deprecated in recent pandas releases.
gridsearch_reg.fit(X, y['target_5d'].to_numpy())

Fitting 20 folds for each of 243 candidates, totalling 4860 fits


In [None]:
# Best parameter combination, already refit on the full data because the
# search was created with refit=True.
best_learner = gridsearch_reg.best_estimator_

In [None]:
import pickle

# Persist the refit estimator next to the notebook.
# NOTE: unpickling executes arbitrary code, so this file should only ever be
# loaded from a trusted location.
best_model_filename = './best_tree_model.pkl'

with open(best_model_filename, 'wb') as file:
    pickle.dump(best_learner, file)


In [None]:
# Parameter combination that achieved the best mean cross-validated score.
gridsearch_reg.best_params_

In [None]:
# Mean cross-validated rank correlation of the best candidate, to two decimals.
f'{gridsearch_reg.best_score_:.2f}'

## Evaluate the Best Model

In [None]:
# Print prediction, realised target and absolute error for the last 15 rows.
# NOTE(review): these rows are part of the data the refit estimator was
# trained on — confirm this is meant as an in-sample sanity check.
tail_predictions = best_learner.predict(X.iloc[-15:])
tail_targets = y.target_5d.values[-15:]
for predicted, actual in zip(tail_predictions, tail_targets):
    spread = np.abs(predicted - actual)
    print('Predicted: ', predicted)
    print('Actual: ', actual)
    print('Spread: ', spread, end='\n\n')

In [None]:
# Spearman rank correlation between predictions and targets on the same 15 rows.
print(rank_correl(best_learner.predict(X.iloc[-15:]), y.target_5d.values[-15:]))


## Feature Importance

In [None]:

# Horizontal bar chart of the 20 largest feature importances of the best
# learner, sorted ascending so the most important feature is drawn at the top.
fig, ax = plt.subplots(figsize=(12, 5))

importances = pd.Series(best_learner.feature_importances_, index=X.columns)
top_importances = importances.sort_values(ascending=False).iloc[:20]
top_importances.sort_values().plot.barh(ax=ax, title='Feature Importance')

sns.despine()

fig.tight_layout();

In [None]:
# List the feature names used by the models.
X.columns