In [None]:
%%capture
# Execute the feature-engineering notebook inside this kernel; %%capture hides the output it produces.
# NOTE(review): later cells use `all_factors` and `targets_df`, which are not defined in this notebook —
# presumably they are created by the notebook run here; confirm.
%run 04_alphaml_feat_eng.ipynb

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

from scipy.stats import spearmanr
from tqdm import tqdm


In [None]:
from mle_quant_utils import quant_helper, quant_factors, mle_utils
import project_7_helper as project_helper

In [None]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16, 8)
sns.set_context("talk")

In [None]:
import yaml
import os

# Retrieve parameters from the project configuration file.
# yaml.safe_load is used instead of a bare yaml.load(stream): calling yaml.load without an
# explicit Loader is deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
# safe_load also refuses to construct arbitrary Python objects from the file.
with open("../conf.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

# Output folder (kept with its trailing "/") and the two output file names, all read from
# the 'output' section of the configuration.
OUTPATH = "../data/" + cfg['output']['main'] + "/" + cfg['output']['alpha_factors_ml']['folder'] + "/"
OUTFILE1 = cfg['output']['alpha_factors_ml']['features']
OUTFILE2 = cfg['output']['alpha_factors_ml']['targets']

In [None]:
# Modelling settings read from the 'models' -> 'alpha_ml' section of the configuration:
# the configured target column name and the train/valid/test split settings.
target_col, splits = (cfg['models']['alpha_ml'][key] for key in ('target_col', 'splits'))

# Data Partitioning

In [None]:
# Partition the factor matrix and the target series into train / validation / test sets,
# using the three entries of `splits` loaded from the configuration.
# NOTE(review): the target column is hard-coded as 'target' although `target_col` is read
# from the config — confirm whether `targets_df[target_col]` was intended.
(X_train, X_valid, X_test,
 y_train, y_valid, y_test) = mle_utils.train_valid_test_split(
    all_factors,
    targets_df['target'],
    splits['train'],
    splits['valid'],
    splits['test'],
)

# Model Building: Hyper Parameter Tuning

In [None]:
# Constants feeding the search space (n_days is not referenced inside this cell).
n_days = 10
n_stocks = 500
clf_random_state = 123

# Candidate random-forest settings: 2 min_samples_leaf values x 3 max_features values,
# with every other entry fixed to a single value, i.e. 6 combinations in total.
clf_parameters = ParameterGrid(dict(
    criterion=['entropy'],
    n_estimators=[250],
    max_features=[0.5, 0.75, 1.0],
    min_samples_leaf=n_stocks * np.array([10, 20]),
    oob_score=[True],
    n_jobs=[-1],
    random_state=[clf_random_state],
))

Recall from the lesson that we’ll choose a `min_samples_leaf` value small enough to allow the tree to fit the data in as much detail as possible, but not so small that it overfits. We can first propose 500, which is the number of assets in the estimation universe. Since we have about 500 stocks in the stock universe, we’ll want at least 500 stocks in a leaf for the leaf to make a prediction that is representative. It’s common to multiply this by 2, 3, 5 or 10, so we’d have `min_samples_leaf` values of 500, 1000, 1500, 2500, and 5000. If we were to try these values, we’d notice that the model is “too good to be true” on the training data. A good rule of thumb for what is considered “too good to be true”, and therefore a sign of overfitting, is a Sharpe ratio greater than 4. Based on this, we recommend using a `min_samples_leaf` of 10 * 500, or 5,000.

Feel free to try other values for these parameters, but also keep in mind that making too many small adjustments to hyper-parameters can lead to overfitting even the validation data, and therefore lead to less generalizable performance on the out-of-sample test set.  So when trying different parameter values, choose values that are different enough in scale (i.e. 10, 20, 100 instead of 10,11,12).

In [None]:
# Fit one NoOverlapVoter-wrapped random forest per tree count, then store its train, validation
# and out-of-bag scores in results['voting'] and keep the fitted model in models['voting'].
# NOTE(review): n_trees_l, results and models are not defined in the visible cells — presumably they
# come from another cell or from the %run notebook; confirm this cell survives Restart & Run All.
# NOTE(review): clf_parameters is built above as a ParameterGrid, which is an iterable of dicts rather
# than a mapping, so `**clf_parameters` looks like it will raise TypeError. The grid also carries
# 'n_estimators', which would collide with the positional n_trees. This loop appears to predate the
# grid — confirm the intended tuning loop.
for n_trees in tqdm(n_trees_l, desc='Training Models', unit='Model'):
    clf_nov = mle_utils.NoOverlapVoter(RandomForestClassifier(n_trees, **clf_parameters), n_skip_samples=4)
    clf_nov.fit(X_train, y_train)
    # Row for this tree count, in the order: train score, validation score, OOB score.
    results['voting'].loc[n_trees, :] = [clf_nov.score(X_train, y_train.values), clf_nov.score(X_valid, y_valid.values), clf_nov.oob_score_]
    models['voting'].append(clf_nov)