In [35]:
#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
dataset= pd.read_csv("SP.csv")
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1985-01-02,167.199997,167.199997,165.190002,165.369995,165.369995,67820000
1,1985-01-03,165.369995,166.110001,164.380005,164.570007,164.570007,88880000
2,1985-01-04,164.550003,164.550003,163.360001,163.679993,163.679993,77480000
3,1985-01-07,163.679993,164.710007,163.679993,164.240005,164.240005,86190000
4,1985-01-08,164.240005,164.589996,163.910004,163.990005,163.990005,92110000
...,...,...,...,...,...,...,...
8941,2020-06-23,3138.699951,3154.899902,3127.120117,3131.290039,3131.290039,4704830000
8942,2020-06-24,3114.399902,3115.010010,3032.129883,3050.330078,3050.330078,5587200000
8943,2020-06-25,3046.600098,3086.250000,3024.010010,3083.760010,3083.760010,4815420000
8944,2020-06-26,3073.199951,3073.729980,3004.629883,3009.050049,3009.050049,8098120000


In [5]:
df = pd.DataFrame(dataset)

In [6]:
#Removing unneccessary columns
df1=df.drop(['Date','Open','High','Low','Close'], axis=1)

In [7]:
#Creating 'Today' column based on Percentage Change
df1['Today'] = df1['Adj Close'].pct_change(1)

In [8]:
#Creating 'Direction' column based on the percentage change of that day.
df1.loc[df1['Today'] > 0 , 'Direction'] = 1
df1.loc[df1['Today'] < 0 , 'Direction'] = 0

In [10]:
#Dropping unnecessary columns
data = df1.drop(['Adj Close','Volume'], axis=1)

In [11]:
data

Unnamed: 0,Today,Direction
0,,
1,-0.004838,0.0
2,-0.005408,0.0
3,0.003421,1.0
4,-0.001522,0.0
...,...,...
8941,0.004307,1.0
8942,-0.025855,0.0
8943,0.010959,1.0
8944,-0.024227,0.0


In [12]:
#Removing Null values
df3 = data.dropna()

In [14]:
df3['Direction'].value_counts()

1.0    4824
0.0    4112
Name: Direction, dtype: int64

In [13]:
#Over sampling for Balancing the data
from imblearn.over_sampling import SMOTE



In [16]:
#Creating Dependent and Independent variables
X = df3.drop(['Direction'], axis=1)
Y = df3['Direction']

In [17]:
# Implementing Oversampling 
oversample = SMOTE()
X_res,Y_res=oversample.fit_sample(X,Y)



In [18]:
X_res.shape,Y_res.shape

((9648, 1), (9648,))

In [19]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(Y_res)))

Original dataset shape Counter({1.0: 4824, 0.0: 4112})
Resampled dataset shape Counter({0.0: 4824, 1.0: 4824})


In [21]:
X1 = pd.DataFrame(X_res)
Y1 = pd.DataFrame(Y_res)

In [22]:
data2 = pd.concat([X1,Y1],axis=1)
data3 = pd.DataFrame(data2)
data3

Unnamed: 0,0,0.1
0,-0.004838,0.0
1,-0.005408,0.0
2,0.003421,1.0
3,-0.001522,0.0
4,0.007256,1.0
...,...,...
9643,-0.002398,0.0
9644,-0.003066,0.0
9645,-0.005806,0.0
9646,-0.003208,0.0


In [23]:
data3.columns= ['Today','Direction']

In [24]:
#Creating lag features required
data3['lag_1'] = data3['Today'].shift(1)
data3['lag_2'] = data3['Today'].shift(2)
data3['lag_3'] = data3['Today'].shift(3)
data3['lag_4'] = data3['Today'].shift(4)
data3['lag_5'] = data3['Today'].shift(5)

In [26]:
df4 = data3.dropna()
df4

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,0.018949,1.0,0.007256,-0.001522,0.003421,-0.005408,-0.004838
6,-0.002377,0.0,0.018949,0.007256,-0.001522,0.003421,-0.005408
7,0.015484,1.0,-0.002377,0.018949,0.007256,-0.001522,0.003421
8,0.001759,1.0,0.015484,-0.002377,0.018949,0.007256,-0.001522
9,0.002225,1.0,0.001759,0.015484,-0.002377,0.018949,0.007256
...,...,...,...,...,...,...,...
9643,-0.002398,0.0,-0.005232,-0.002182,-0.006695,-0.001711,-0.000137
9644,-0.003066,0.0,-0.002398,-0.005232,-0.002182,-0.006695,-0.001711
9645,-0.005806,0.0,-0.003066,-0.002398,-0.005232,-0.002182,-0.006695
9646,-0.003208,0.0,-0.005806,-0.003066,-0.002398,-0.005232,-0.002182


In [27]:
#Correlation plot
rs = np.random.RandomState(0)
corr = df4.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
Today,1.0,0.654304,-0.031364,0.013759,0.029806,0.003869,0.028811
Direction,0.654304,1.0,0.01567,0.039851,0.041363,0.053528,0.044709
lag_1,-0.031364,0.01567,1.0,-0.031374,0.013773,0.029772,0.00384
lag_2,0.013759,0.039851,-0.031374,1.0,-0.031391,0.013772,0.029772
lag_3,0.029806,0.041363,0.013773,-0.031391,1.0,-0.031418,0.013749
lag_4,0.003869,0.053528,0.029772,0.013772,-0.031418,1.0,-0.031404
lag_5,0.028811,0.044709,0.00384,0.029772,0.013749,-0.031404,1.0


In [28]:
#Removing highly correlatedd feature
df5 = df4.drop(['Today'],axis=1)

In [29]:
finalDF = pd.DataFrame(df5)

In [30]:
finalDF

Unnamed: 0,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,1.0,0.007256,-0.001522,0.003421,-0.005408,-0.004838
6,0.0,0.018949,0.007256,-0.001522,0.003421,-0.005408
7,1.0,-0.002377,0.018949,0.007256,-0.001522,0.003421
8,1.0,0.015484,-0.002377,0.018949,0.007256,-0.001522
9,1.0,0.001759,0.015484,-0.002377,0.018949,0.007256
...,...,...,...,...,...,...
9643,0.0,-0.005232,-0.002182,-0.006695,-0.001711,-0.000137
9644,0.0,-0.002398,-0.005232,-0.002182,-0.006695,-0.001711
9645,0.0,-0.003066,-0.002398,-0.005232,-0.002182,-0.006695
9646,0.0,-0.005806,-0.003066,-0.002398,-0.005232,-0.002182


In [32]:
#Create pre-processed dataset
finalDF.to_csv('Pre-Processed S&P 500.csv')

In [33]:
#Creating Input and Output variables
X2= finalDF.drop(['Direction'], axis=1)
Y2= finalDF['Direction']

In [34]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate

In [41]:
#Hyper Parameter Optimization
from sklearn.model_selection import GridSearchCV

In [37]:
# Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.20)
print (X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(7714, 5) (7714,)
(1929, 5) (1929,)


In [39]:
#Adaptive Boosting
model= DecisionTreeClassifier(criterion='entropy', max_depth=7)
#List of Hyper Parameters
param_grid = {
    'n_estimators': [100,200],
    'learning_rate': [0.001,0.01,0.1,0.2,0.5]
}
#GridSearchCV
GridAdaBoost = GridSearchCV(AdaBoostClassifier(base_estimator=model),param_grid=param_grid,cv=5)
GridAdaBoost.fit(X_train,Y_train)
Pred = GridAdaBoost.predict(X_test)
GridAdaBoost.best_score_


0.5624856996115675

In [45]:
GridAdaBoost.best_params_

{'learning_rate': 0.001, 'n_estimators': 100}

In [47]:
#Optimised Parameters
OptAdaBoost= AdaBoostClassifier(base_estimator=model,n_estimators=100,learning_rate=0.001)
OptAdaBoost.fit(X_train,Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='entropy',
                                                         max_depth=7,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                       

In [48]:
ABpred = OptAdaBoost.predict(X_test)
accuracy = accuracy_score(Y_test,ABpred)
accuracy

0.543286677034733

In [49]:
# Gradient Boost Optimisation
parameters = {
    "n_estimators":[5,50,100,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,0.5,1,10]
    }
#passing the scoring function in the GridSearchCV
GridGradBoost = GridSearchCV(GradientBoostingClassifier(), parameters,cv=5, n_jobs=-1)
GridGradBoost.fit(X_train, Y_train)
GridGradBoost.best_score_

0.5653361946718917

In [50]:
GridGradBoost.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}

In [51]:
OptGradBoost= GridGradBoost.best_estimator_

In [52]:
OptGradBoost.fit(X_train,Y_train)
GBpred = OptGradBoost.predict(X_test)
accuracy = accuracy_score(Y_test,GBpred)

In [53]:
accuracy

0.5609123898392949

In [54]:
param_grid={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}
GridXGBoost = GridSearchCV(XGBClassifier(), param_grid=param_grid, n_jobs= -1,cv=5, verbose=3)
GridXGBoost.fit(X_train,Y_train)

Fitting 5 folds for each of 3840 candidates, totalling 19200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 1564 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 2044 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 3868 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 4604 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 5404 tasks      | elapsed: 25.8min
[Parallel(n_jobs=-1)]: Done 6268 tasks      | elapsed: 31.5min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | e

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_po...ght=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'learning_rate': [0.05, 0.1,

In [55]:
GridXGBoost.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.2,
 'learning_rate': 0.05,
 'max_depth': 12,
 'min_child_weight': 7}

In [58]:
# Instantiate the machine learning classifiers
AdaBoostModel = AdaBoostClassifier(base_estimator=model, n_estimators=100, learning_rate=0.001)
GradientBoostModel = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=9)
XGBoostModel = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.3,
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [60]:
scoring = {'accuracy':make_scorer(accuracy_score),
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score),
           'AUC':make_scorer(roc_auc_score)}

In [59]:
# Define the models evaluation function
def models_evaluation(X2, Y2, folds):
    
    '''
    X2 : data set features
    Y2 : data set target
    folds : number of cross-validation folds
    
    '''
    
    # Perform cross-validation to each machine learning classifier
    Ada = cross_validate(AdaBoostModel, X2, Y2, cv=folds, scoring=scoring)
    Grad = cross_validate(GradientBoostModel, X2, Y2, cv=folds, scoring=scoring)
    XG = cross_validate(XGBoostModel, X2, Y2, cv=folds, scoring=scoring)
    

    # Create a data frame with the models perfoamnce metrics scores
    models_scores_table = pd.DataFrame({'Ada Boost Classifier':[Ada['test_accuracy'].mean(),
                                                               Ada['test_precision'].mean(),
                                                               Ada['test_recall'].mean(),
                                                               Ada['test_f1_score'].mean(),
                                                                Ada['test_AUC'].mean()],
                                       
                                      'Gradient Boost Classifier':[Grad['test_accuracy'].mean(),
                                                                   Grad['test_precision'].mean(),
                                                                   Grad['test_recall'].mean(),
                                                                   Grad['test_f1_score'].mean(),
                                                                   Grad['test_AUC'].mean()],
                                       
                                      'XG Boost Classifier':[XG['test_accuracy'].mean(),
                                                       XG['test_precision'].mean(),
                                                       XG['test_recall'].mean(),
                                                       XG['test_f1_score'].mean(),
                                                       XG['test_AUC'].mean()]},
                                       
                                      
                                      
                                      index= ['Accuracy', 'Precision', 'Recall', 'F1 Score','AUC'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)
  
# Run models_evaluation function
models_evaluation(X2, Y2, 10)

Unnamed: 0,Ada Boost Classifier,Gradient Boost Classifier,XG Boost Classifier,Best Score
Accuracy,0.525677,0.52578,0.518935,Gradient Boost Classifier
Precision,0.549132,0.548411,0.532359,Ada Boost Classifier
Recall,0.724169,0.615296,0.574657,Ada Boost Classifier
F1 Score,0.604435,0.568671,0.54609,Ada Boost Classifier
AUC,0.525661,0.525768,0.518931,Gradient Boost Classifier


In [61]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/b0/e7/b7057e4a6832f3bec0cb36fda4913bf84a6dc610c92a2d3543442f4154a1/h2o-3.30.1.1.tar.gz (129.3MB)
[K     |████████████████████████████████| 129.3MB 1.4MB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.30.1.1-py2.py3-none-any.whl size=129358602 sha256=3aed56dc35858933c8e6b756f4650cd80fcd9fd7a054fb4ee36e429fcf224f10
  Stored in directory: /root/.cache/pip/wheels/33/ac/52/165c35d747abdb629c3c9fb7e087f360c662d8cb58824caed8
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.3 h2o-3.30.1.1


In [78]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 min 29 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_unknownUser_to8sxn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.179 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [63]:
from sklearn.model_selection import train_test_split

In [64]:
Y_train = Y_train.astype('category')
Y_test = Y_test.astype('category')

In [65]:
nfolds = 5

In [66]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5
6508,0.007759,0.000947,-0.000439,0.001125,-0.002691
2980,-0.004621,-0.001365,0.005417,0.003663,0.002619
5349,-0.001028,0.000983,-0.002582,0.006021,-0.006015
9412,-0.006071,-0.005257,-0.014285,-0.004149,-0.001479
7047,-0.002440,-0.009379,0.005488,-0.007589,0.011487
...,...,...,...,...,...
2636,0.000783,0.009739,0.005569,-0.008284,-0.001463
7395,-0.004701,0.000422,0.009673,0.001519,-0.001374
9015,-0.002837,-0.009121,-0.001690,-0.008606,-0.000668
707,-0.039206,0.090994,0.053327,-0.204669,-0.051597


In [67]:
train_data = pd.concat([X_train,Y_train],axis=1)

In [68]:
train_data

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,Direction
6508,0.007759,0.000947,-0.000439,0.001125,-0.002691,1.0
2980,-0.004621,-0.001365,0.005417,0.003663,0.002619,1.0
5349,-0.001028,0.000983,-0.002582,0.006021,-0.006015,0.0
9412,-0.006071,-0.005257,-0.014285,-0.004149,-0.001479,0.0
7047,-0.002440,-0.009379,0.005488,-0.007589,0.011487,0.0
...,...,...,...,...,...,...
2636,0.000783,0.009739,0.005569,-0.008284,-0.001463,1.0
7395,-0.004701,0.000422,0.009673,0.001519,-0.001374,0.0
9015,-0.002837,-0.009121,-0.001690,-0.008606,-0.000668,0.0
707,-0.039206,0.090994,0.053327,-0.204669,-0.051597,0.0


In [69]:
test_data = pd.concat([X_test,Y_test],axis=1)

In [84]:
train= h2o.H2OFrame(train_data)
test= h2o.H2OFrame(test_data)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [85]:
train

lag_1,lag_2,lag_3,lag_4,lag_5,Direction
0.00775928,0.000946533,-0.000439287,0.00112485,-0.00269058,1
-0.00462065,-0.00136466,0.00541736,0.00366266,0.00261891,1
-0.00102841,0.000983281,-0.00258229,0.00602057,-0.00601494,0
-0.00607125,-0.00525669,-0.0142847,-0.00414929,-0.00147875,0
-0.0024403,-0.00937869,0.00548811,-0.0075892,0.0114867,0
-0.00843206,0.00801616,-0.00908366,-0.00639784,-0.00761525,1
-0.011725,-0.00445815,0.00412338,-0.0015629,0.00063883,1
-0.00381122,-0.00185194,0.00203303,0.00161005,0.00503296,1
-0.00526923,-0.00716818,-0.00126063,-0.0139172,-0.000539655,0
-0.00705464,0.0022295,0.00169042,0.00495363,0.00189021,1




In [86]:
x=train.columns

In [87]:
y="Direction"

In [88]:
x.remove(y)

In [89]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [90]:
train[y]

Direction
1
1
0
0
0
1
1
1
0
1




In [91]:
#1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      ntrees=100,
                                      max_depth=3,
                                      min_rows=2,
                                      learn_rate=0.001,
                                      nfolds=nfolds,
                                      fold_assignment="Stratified",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
my_gbm.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [92]:
# Train and cross-validate a RF
my_rf = H2ORandomForestEstimator(ntrees=50,
                                 nfolds=nfolds,
                                 fold_assignment="Stratified",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(x=x, y=y, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [93]:
from h2o.estimators import H2OXGBoostEstimator

In [94]:
my_xgb = H2OXGBoostEstimator(booster='gbtree',normalize_type="tree",nfolds=nfolds,fold_assignment="Stratified",keep_cross_validation_predictions=True,seed=None,
                             ntrees=100,max_depth=10,min_rows=1 ,min_split_improvement=0.3, learn_rate=0.1)
my_xgb.train(x=x,y=y,training_frame=train)


xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [95]:
# Train a stacked ensemble using the GBM and GLM above
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                       base_models=[my_gbm, my_rf,my_xgb])
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [96]:
# Compare to base learner performance on the test set
perf_gbm_test = my_gbm.model_performance(test)
perf_rf_test = my_rf.model_performance(test)
perf_xgb_test = my_xgb.model_performance(test)
baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc(),perf_xgb_test.auc())
stack_auc_test = perf_stack_test.auc()
print("Ensemble Test AUC:  {0}".format(stack_auc_test))

Ensemble Test AUC:  0.581866689315879


In [None]:
# Generate predictions on a test set
pred = ensemble.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [97]:
from h2o.automl import H2OAutoML

In [98]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train, leaderboard_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [103]:
# View the AutoML Leaderboard
lb = aml.leaderboard
Results= lb.head()

In [106]:
AutoMLresults= h2o.as_list(Results)
AutoMLresults.to_csv('S&P 500 AutoML.csv')
