In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GridSearchCV, PredefinedSplit

pd.set_option('display.max_columns', 500)

def train_test_split(X, y, train_idx=None, test_idx=None):
    """Slice features and targets into train and test partitions by label.

    Parameters
    ----------
    X, y
        Objects indexed with ``.loc`` (features and targets respectively).
    train_idx, test_idx
        Labels handed to ``.loc`` to pick the training and the test rows.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_part = (X.loc[train_idx], y.loc[train_idx])
    test_part = (X.loc[test_idx], y.loc[test_idx])
    return train_part + test_part

def load_split_data(suffix, split=False):
    """Read the pickled feature/target pair ``data/X_<suffix>.pkl`` / ``data/y_<suffix>.pkl``.

    When ``split`` is falsy the pair ``(X, y)`` is returned as loaded.
    When ``split`` is truthy the rows of ``X`` labelled '2018' through '2020'
    form the training partition and the rows from '2021' onward the test
    partition; the result is ``(X_train, y_train, X_test, y_test)``.
    """
    X = pd.read_pickle(f'data/X_{suffix}.pkl')
    y = pd.read_pickle(f'data/y_{suffix}.pkl')
    if not split:
        return X, y
    # The index labels are taken from X and reused to slice y as well.
    train_labels = X.loc['2018':'2020'].index
    test_labels = X.loc['2021':].index
    return train_test_split(X, y, train_labels, test_labels)

def get_columns(X, lookbacks):
    """Return X's column labels, minus those whose lookback suffix exceeds ``lookbacks``.

    A column counts as a lookback column when its name ends in an underscore
    followed by digits (e.g. ``foo_20``). Such a column is dropped only when
    that number is strictly greater than ``lookbacks``; a suffix equal to
    ``lookbacks`` is kept. Columns without a numeric suffix, and labels that
    are not strings, are always kept.

    Parameters
    ----------
    X
        Object exposing ``.columns`` (an iterable of column labels).
    lookbacks
        Largest numeric suffix to keep.

    Returns
    -------
    list
        The surviving column labels, in their original order.
    """
    suffix_re = re.compile(r'^.*_([0-9]+)$')

    def keep(label):
        # A non-string label cannot carry a lookback suffix; keep it instead
        # of letting the regex raise TypeError.
        if not isinstance(label, str):
            return True
        m = suffix_re.match(label)
        return m is None or not (int(m[1]) > lookbacks)

    # Single filtering pass; avoids calling list.remove() once per dropped
    # column, which rescans the list each time.
    return [c for c in X.columns if keep(c)]

def get_cv_train_test_split(X):
    """Build a single-fold ``PredefinedSplit`` from X's date-labelled rows.

    One entry of -1 is produced for every row of ``X.loc['2018':'2020']`` and
    one entry of 0 for every row of ``X.loc['2021':]``; in ``PredefinedSplit``
    -1 marks a sample that is never put in a test set and 0 assigns the
    sample to test fold 0.

    The fold array is positional and only has entries for rows from '2018'
    onward, so the data passed to the search must be sliced the same way
    (``X.loc['2018':]``).
    NOTE(review): this assumes the index is sorted chronologically so the
    2018-2020 rows precede the 2021+ rows - confirm against the data.

    Parameters
    ----------
    X
        Object whose ``.loc`` accepts the year-string slices above.

    Returns
    -------
    PredefinedSplit
        Splitter yielding exactly one train/test fold.
    """
    n_train = len(X.loc['2018':'2020'])
    n_test = len(X.loc['2021':])
    test_fold = np.append(
        np.full(n_train, -1, dtype=int),  # train-only rows
        np.full(n_test, 0, dtype=int),    # rows of the single test fold
    )
    # The former bare ps.get_n_splits() call discarded its result and has
    # been removed.
    return PredefinedSplit(test_fold)

# AdaBoost GridSearch

In [2]:
from sklearn.ensemble import AdaBoostClassifier

# Load the unsplit frames, build the single train/test fold and keep only
# columns whose lookback suffix is at most 15.
X, y = load_split_data('20210806i')
ps = get_cv_train_test_split(X)
columns = get_columns(X, 15)

# 3 x 3 x 2 = 18 candidates, each fitted once on the predefined fold.
parameters = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.5, 1, 3],
    'algorithm': ['SAMME', 'SAMME.R'],
}

c = AdaBoostClassifier(random_state=42)
# Only rows from '2018' onward are passed in, matching the rows the
# predefined fold array was built from.
clf = (
    GridSearchCV(
        estimator=c,
        param_grid=parameters,
        scoring='precision',
        cv=ps,
        n_jobs=-1,
        verbose=4,
    )
    .fit(X.loc['2018':][columns], y.loc['2018':].buy)
)

Fitting 1 folds for each of 18 candidates, totalling 18 fits


 0.40973783        nan        nan 0.         0.52199413 0.51705566
 0.49160671 0.50972763 0.48931116 0.40973783 0.40973783 0.40973783]


Best performer:
`n_estimators = 100, learning_rate=1, algorithm='SAMME'`

Next steps:
 - check recall
 - increase n_estimators
 - Try with SAMME.R

In [3]:
# One row per grid candidate, ordered by rank_test_score (rank 1 first).
(
    pd.DataFrame(clf.cv_results_)
    .sort_values(by='rank_test_score')
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_learning_rate,param_n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
5,344.774518,0.0,2.652911,0.0,SAMME,1.0,100,"{'algorithm': 'SAMME', 'learning_rate': 1, 'n_...",0.589286,0.589286,0.0,1
10,192.026799,0.0,1.179845,0.0,SAMME.R,0.5,50,"{'algorithm': 'SAMME.R', 'learning_rate': 0.5,...",0.521994,0.521994,0.0,2
4,188.86821,0.0,1.447132,0.0,SAMME,1.0,50,"{'algorithm': 'SAMME', 'learning_rate': 1, 'n_...",0.521739,0.521739,0.0,3
11,345.47764,0.0,2.782592,0.0,SAMME.R,0.5,100,"{'algorithm': 'SAMME.R', 'learning_rate': 0.5,...",0.517056,0.517056,0.0,4
13,189.377849,0.0,1.429181,0.0,SAMME.R,1.0,50,"{'algorithm': 'SAMME.R', 'learning_rate': 1, '...",0.509728,0.509728,0.0,5
12,58.321123,0.0,1.051191,0.0,SAMME.R,1.0,10,"{'algorithm': 'SAMME.R', 'learning_rate': 1, '...",0.491607,0.491607,0.0,6
14,344.901181,0.0,2.732697,0.0,SAMME.R,1.0,100,"{'algorithm': 'SAMME.R', 'learning_rate': 1, '...",0.489311,0.489311,0.0,7
17,77.257511,0.0,2.805502,0.0,SAMME.R,3.0,100,"{'algorithm': 'SAMME.R', 'learning_rate': 3, '...",0.409738,0.409738,0.0,8
15,43.570549,0.0,0.860701,0.0,SAMME.R,3.0,10,"{'algorithm': 'SAMME.R', 'learning_rate': 3, '...",0.409738,0.409738,0.0,8
6,45.217148,0.0,0.871673,0.0,SAMME,3.0,10,"{'algorithm': 'SAMME', 'learning_rate': 3, 'n_...",0.409738,0.409738,0.0,8


# GradientBoost GridSearch

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Load the unsplit frames, build the single train/test fold and keep only
# columns whose lookback suffix is at most 15.
X, y = load_split_data('20210806i')
ps = get_cv_train_test_split(X)
columns = get_columns(X, 15)

# One hyper-parameter is searched at a time; the commented-out entries are
# grids from earlier runs, kept for reference.
parameters = {
    #'n_estimators': [100,500,1000],
    #'learning_rate': [0.05,0.1,0.5],
    #'loss': ['deviance', 'exponential'],
    #'criterion': ['friedman_mse', 'mse'],
    #'max_depth': [2,3,4],
    'max_features': ['sqrt', 'log2', len(columns)],
}

c = GradientBoostingClassifier(random_state=42)
# Only rows from '2018' onward are passed in, matching the rows the
# predefined fold array was built from.
clf = (
    GridSearchCV(
        estimator=c,
        param_grid=parameters,
        scoring='precision',
        cv=ps,
        n_jobs=-1,
        verbose=4,
    )
    .fit(X.loc['2018':][columns], y.loc['2018':].buy)
)

Fitting 1 folds for each of 3 candidates, totalling 3 fits


In [5]:
# One row per grid candidate, ordered by rank_test_score (rank 1 first).
# NOTE: the output kept under this cell is from an earlier run that searched
# `n_estimators`; that grid entry is now commented out in the search cell.
(
    pd.DataFrame(clf.cv_results_)
    .sort_values(by='rank_test_score')
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,773.067866,0.0,0.110704,0.0,100,{'n_estimators': 100},0.520073,0.520073,0.0,1
1,3849.020907,0.0,0.24136,0.0,500,{'n_estimators': 500},0.492887,0.492887,0.0,2
2,7669.900866,0.0,0.391952,0.0,1000,{'n_estimators': 1000},0.486782,0.486782,0.0,3


In [4]:
# Unsorted cv_results_ of whichever search `clf` currently holds.
# NOTE: the output kept under this cell is from an earlier run that searched
# `loss`; that grid entry is now commented out in the search cell.
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_loss,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,832.186143,0.0,0.129625,0.0,deviance,{'loss': 'deviance'},0.520073,0.520073,0.0,1
1,832.672845,0.0,0.110704,0.0,exponential,{'loss': 'exponential'},0.514113,0.514113,0.0,2


In [4]:
# Unsorted cv_results_ of whichever search `clf` currently holds.
# NOTE: the output kept under this cell is from an earlier run that searched
# `criterion`; that grid entry is now commented out in the search cell.
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,799.27582,0.0,0.122669,0.0,friedman_mse,{'criterion': 'friedman_mse'},0.520073,0.520073,0.0,1
1,798.065028,0.0,0.115691,0.0,mse,{'criterion': 'mse'},0.520073,0.520073,0.0,1


In [6]:
# Unsorted cv_results_ of whichever search `clf` currently holds.
# NOTE: the output kept under this cell is from an earlier run that searched
# `max_depth`; that grid entry is now commented out in the search cell.
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,537.019942,0.0,0.103722,0.0,2,{'max_depth': 2},0.51073,0.51073,0.0,3
1,795.071842,0.0,0.113696,0.0,3,{'max_depth': 3},0.520073,0.520073,0.0,2
2,1045.575934,0.0,0.125664,0.0,4,{'max_depth': 4},0.522759,0.522759,0.0,1


In [8]:
# Unsorted cv_results_ of whichever search `clf` currently holds; the output
# kept under this cell shows the `max_features` search, one row per candidate.
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,35.75438,0.0,0.128656,0.0,sqrt,{'max_features': 'sqrt'},0.520408,0.520408,0.0,2
1,15.588313,0.0,0.130652,0.0,log2,{'max_features': 'log2'},0.573171,0.573171,0.0,1
2,793.825928,0.0,0.112705,0.0,542,{'max_features': 542},0.520073,0.520073,0.0,3
