This module presents two methods to find the best hyperparameters for the random forest and gradient boosting models.

In [1]:
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from classes.DataLoader import DataLoader

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

import pandas as pd
import numpy as np
from classes import GradientBoosting, RandomForestClassifier
from classes.hyperparameters_tuning import HyperparametersTuning as hp

# Import data and create in-sample and out-of-sample data

In [2]:
# Load the raw dataset; the first CSV column is used as the index.
data = pd.read_csv("data/dataset.csv", index_col=0)

# Rolling standard deviation over a 5-row window (the last 5 days).
df_trv = data.rolling(window=5).std()
df_trv.index = pd.to_datetime(df_trv.index)
# The first 4 rows never see a full 5-row window, so skip them.
df_trv = df_trv.iloc[4:]
df = df_trv["S&P500"].dropna()

# Min-max scaling.
# NOTE(review): mi/ma are taken over the whole series, before the 2000-2007
# restriction below, so the retained slice need not span exactly [0, 1] --
# confirm this is intended.
mi, ma = df.min(), df.max()
df = (df - mi) / (ma - mi)

# Keep only observations dated 2000 through 2007 (label-based slice).
df = df.loc["2000":"2007"]

# Row positions at 25 % and 75 % of the retained sample.
pivot_index_training_1 = round(len(df) * 0.25)
pivot_index_training_2 = round(len(df) * 0.75)


In [3]:
# Cut the series into three consecutive slices at the two pivot positions.
df_train_1, df_train_2, df_test = (
    df.iloc[:pivot_index_training_1],
    df.iloc[pivot_index_training_1:pivot_index_training_2],
    df.iloc[pivot_index_training_2:],
)

# The series serves both as the loader input and as the target (Y).
ite_train_1 = DataLoader(
    df=df_train_1,
    Y=df_train_1.values,
    window_size=30,
    batch_size=df_train_1.shape[0],
)
# batch_size equals the number of rows, so a single batch is expected here.
for batch in ite_train_1:
    X_train_1, y_train_1 = batch

In [4]:
# Loaders for the second training slice and for the test slice, each
# configured with a batch size equal to the slice's row count.
ite_train_2 = DataLoader(
    df=df_train_2,
    Y=df_train_2.values,
    window_size=30,
    batch_size=df_train_2.shape[0],
)
ite_test = DataLoader(
    df=df_test,
    Y=df_test.values,
    window_size=30,
    batch_size=df_test.shape[0],
)

# A single batch is expected; unpack it into features and targets.
for batch in ite_train_2:
    X_train_2, y_train_2 = batch

# Finding the best hyperparameters for the library GB model with CBB

In [5]:
# Fixed seed so the search is repeatable
# (assumes the tuner draws from NumPy's global RNG -- TODO confirm).
np.random.seed(42)

# Candidate values for each hyperparameter.
hyp = {
    'min_samples_split': [2, 3],
    'max_depth': [2, 3, 5],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.005, 0.01, 0.015],
}
h = hp(n_iter=20, model=GradientBoostingRegressor, dicto=hyp)

# Search on the second training slice with blocks of size 100 and the
# tuner's default bootstrap type.
hyper_gb_lib = h.block_bootstrap(
    X_train_2,
    y_train_2,
    size_block=100,
    type_tree='GB',
)

In [6]:
# Show the tuple returned by the search. Judging by the value ranges it appears
# to be ordered like the keys of hyp -- TODO confirm against HyperparametersTuning.
hyper_gb_lib

(3, 5, 30, 0.015)

# Finding the best hyperparameters for the library RF model with CBB

In [7]:
# Fixed seed so the search is repeatable
# (assumes the tuner draws from NumPy's global RNG -- TODO confirm).
np.random.seed(42)

# Candidate values for each hyperparameter.
# NOTE(review): scikit-learn's RandomForestRegressor has no learning_rate
# argument; presumably the tuner ignores that entry when type_tree='RF' -- verify.
hyp = {
    'min_samples_split': [2, 3],
    'max_depth': [2, 3, 5],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.005, 0.01, 0.015],
}
h = hp(n_iter=20, model=RandomForestRegressor, dicto=hyp)

# Search on the second training slice with blocks of size 100 and the
# tuner's default bootstrap type.
hyper_rf_lib = h.block_bootstrap(
    X_train_2,
    y_train_2,
    size_block=100,
    type_tree='RF',
)

In [8]:
# Show the tuple returned by the search. Judging by the value ranges it appears
# to be ordered like the keys of hyp -- TODO confirm against HyperparametersTuning.
hyper_rf_lib

(3, 5, 30, 0.01)

# Finding the best hyperparameters for the library GB model with the stationary bootstrap

In [9]:
# Fixed seed so the search is repeatable
# (assumes the tuner draws from NumPy's global RNG -- TODO confirm).
np.random.seed(42)

# Candidate values for each hyperparameter.
hyp = {
    'min_samples_split': [2, 3],
    'max_depth': [2, 3, 5],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.005, 0.01, 0.015],
}
h = hp(n_iter=20, model=GradientBoostingRegressor, dicto=hyp)

# NOTE(review): this rebinds hyper_gb_lib, so the result of the earlier search
# run without type_boot is no longer reachable under this name.
hyper_gb_lib = h.block_bootstrap(
    X_train_2,
    y_train_2,
    size_block=100,
    type_tree='GB',
    type_boot='stationary',
)

In [10]:
# Show the tuple returned by the search. Judging by the value ranges it appears
# to be ordered like the keys of hyp -- TODO confirm against HyperparametersTuning.
hyper_gb_lib

(2, 5, 30, 0.015)

# Finding the best hyperparameters for the library RF model with the stationary bootstrap

In [11]:
# Fixed seed so the search is repeatable
# (assumes the tuner draws from NumPy's global RNG -- TODO confirm).
np.random.seed(42)

# Candidate values for each hyperparameter.
# NOTE(review): scikit-learn's RandomForestRegressor has no learning_rate
# argument; presumably the tuner ignores that entry when type_tree='RF' -- verify.
hyp = {
    'min_samples_split': [2, 3],
    'max_depth': [2, 3, 5],
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.005, 0.01, 0.015],
}
h = hp(n_iter=20, model=RandomForestRegressor, dicto=hyp)

# NOTE(review): this rebinds hyper_rf_lib, so the result of the earlier search
# run without type_boot is no longer reachable under this name.
hyper_rf_lib = h.block_bootstrap(
    X_train_2,
    y_train_2,
    size_block=100,
    type_tree='RF',
    type_boot='stationary',
)

In [12]:
# Show the tuple returned by the search. Judging by the value ranges it appears
# to be ordered like the keys of hyp -- TODO confirm against HyperparametersTuning.
hyper_rf_lib

(3, 5, 10, 0.005)