In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
dataset= pd.read_csv("NYSE.csv")
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1985-01-02,1009.049988,1009.049988,1009.049988,1009.049988,1009.049988,0
1,1985-01-03,1005.030029,1005.030029,1005.030029,1005.030029,1005.030029,0
2,1985-01-04,1000.270020,1000.270020,1000.270020,1000.270020,1000.270020,0
3,1985-01-07,1003.340027,1003.340027,1003.340027,1003.340027,1003.340027,0
4,1985-01-08,1002.500000,1002.500000,1002.500000,1002.500000,1002.500000,0
...,...,...,...,...,...,...,...
8941,2020-06-23,12157.370117,12176.429688,12068.089844,12077.740234,12077.740234,4704830000
8942,2020-06-24,11956.290039,11961.620117,11650.910156,11726.540039,11726.540039,5587200000
8943,2020-06-25,11680.389648,11866.200195,11640.019531,11865.110352,11865.110352,4815420000
8944,2020-06-26,11796.400391,11814.009766,11578.809570,11604.429688,11604.429688,8098120000


In [3]:
df = pd.DataFrame(dataset)

In [4]:
#Removing unneccessary columns
df1=df.drop(['Date','Open','High','Low','Close'], axis=1)

In [5]:
#Creating 'Today' column based on Percentage Change
df1['Today'] = df1['Adj Close'].pct_change(1)

In [6]:
#Creating 'Direction' column based on the percentage change of that day.
df1.loc[df1['Today'] > 0 , 'Direction'] = 1
df1.loc[df1['Today'] < 0 , 'Direction'] = 0

In [7]:
#Dropping unnecessary columns
data = df1.drop(['Adj Close','Volume'], axis=1)

In [8]:
data

Unnamed: 0,Today,Direction
0,,
1,-0.003984,0.0
2,-0.004736,0.0
3,0.003069,1.0
4,-0.000837,0.0
...,...,...
8941,0.004059,1.0
8942,-0.029078,0.0
8943,0.011817,1.0
8944,-0.021970,0.0


In [9]:
#Removing Null values
df3 = data.dropna()

In [10]:
df3['Direction'].value_counts()

1.0    4816
0.0    4117
Name: Direction, dtype: int64

In [11]:
#Over sampling for Balancing the data
from imblearn.over_sampling import SMOTE



In [12]:
#Creating Dependent and Independent variables
X = df3.drop(['Direction'], axis=1)
Y = df3['Direction']

In [13]:
# Implementing Oversampling 
oversample = SMOTE()
X_res,Y_res=oversample.fit_sample(X,Y)



In [14]:
X_res.shape,Y_res.shape

((9632, 1), (9632,))

In [15]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(Y_res)))

Original dataset shape Counter({1.0: 4816, 0.0: 4117})
Resampled dataset shape Counter({0.0: 4816, 1.0: 4816})


In [16]:
X1 = pd.DataFrame(X_res)
Y1 = pd.DataFrame(Y_res)

In [17]:
data2 = pd.concat([X1,Y1],axis=1)
data3 = pd.DataFrame(data2)
data3

Unnamed: 0,0,0.1
0,-0.003984,0.0
1,-0.004736,0.0
2,0.003069,1.0
3,-0.000837,0.0
4,0.006643,1.0
...,...,...
9627,-0.005868,0.0
9628,-0.001447,0.0
9629,-0.001313,0.0
9630,-0.001471,0.0


In [18]:
data3.columns= ['Today','Direction']

In [19]:
#Creating lag features required
data3['lag_1'] = data3['Today'].shift(1)
data3['lag_2'] = data3['Today'].shift(2)
data3['lag_3'] = data3['Today'].shift(3)
data3['lag_4'] = data3['Today'].shift(4)
data3['lag_5'] = data3['Today'].shift(5)

In [20]:
df4 = data3.dropna()
df4

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,0.017708,1.0,0.006643,-0.000837,0.003069,-0.004736,-0.003984
6,-0.001548,0.0,0.017708,0.006643,-0.000837,0.003069,-0.004736
7,0.014433,1.0,-0.001548,0.017708,0.006643,-0.000837,0.003069
8,0.002134,1.0,0.014433,-0.001548,0.017708,0.006643,-0.000837
9,0.003146,1.0,0.002134,0.014433,-0.001548,0.017708,0.006643
...,...,...,...,...,...,...,...
9627,-0.005868,0.0,-0.021188,-0.014360,-0.006349,-0.022368,-0.005237
9628,-0.001447,0.0,-0.005868,-0.021188,-0.014360,-0.006349,-0.022368
9629,-0.001313,0.0,-0.001447,-0.005868,-0.021188,-0.014360,-0.006349
9630,-0.001471,0.0,-0.001313,-0.001447,-0.005868,-0.021188,-0.014360


In [21]:
#Correlation plot
rs = np.random.RandomState(0)
corr = df4.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
Today,1.0,0.635064,-0.00796,0.016637,0.029453,0.014161,0.026756
Direction,0.635064,1.0,0.039331,0.042291,0.04061,0.043078,0.034297
lag_1,-0.00796,0.039331,1.0,-0.007967,0.016651,0.029424,0.014121
lag_2,0.016637,0.042291,-0.007967,1.0,-0.00797,0.016652,0.02942
lag_3,0.029453,0.04061,0.016651,-0.00797,1.0,-0.007983,0.016637
lag_4,0.014161,0.043078,0.029424,0.016652,-0.007983,1.0,-0.007975
lag_5,0.026756,0.034297,0.014121,0.02942,0.016637,-0.007975,1.0


In [22]:
#Removing highly correlatedd feature
df5 = df4.drop(['Today'],axis=1)

In [23]:
finalDF = pd.DataFrame(df5)

In [24]:
finalDF

Unnamed: 0,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,1.0,0.006643,-0.000837,0.003069,-0.004736,-0.003984
6,0.0,0.017708,0.006643,-0.000837,0.003069,-0.004736
7,1.0,-0.001548,0.017708,0.006643,-0.000837,0.003069
8,1.0,0.014433,-0.001548,0.017708,0.006643,-0.000837
9,1.0,0.002134,0.014433,-0.001548,0.017708,0.006643
...,...,...,...,...,...,...
9627,0.0,-0.021188,-0.014360,-0.006349,-0.022368,-0.005237
9628,0.0,-0.005868,-0.021188,-0.014360,-0.006349,-0.022368
9629,0.0,-0.001447,-0.005868,-0.021188,-0.014360,-0.006349
9630,0.0,-0.001313,-0.001447,-0.005868,-0.021188,-0.014360


In [26]:
#Create pre-processed dataset
finalDF.to_csv('Pre-Processed NYSE.csv')

In [27]:
#Creating Input and Output variables
X2= finalDF.drop(['Direction'], axis=1)
Y2= finalDF['Direction']

In [28]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate

In [29]:
#Hyper Parameter Optimization
from sklearn.model_selection import GridSearchCV

In [30]:
# Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.20)
print (X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(7701, 5) (7701,)
(1926, 5) (1926,)


In [31]:
#Adaptive Boosting
model= DecisionTreeClassifier(criterion='entropy', max_depth=7)
#List of Hyper Parameters
param_grid = {
    'n_estimators': [100,200],
    'learning_rate': [0.001,0.01,0.1,0.2,0.5]
}
#GridSearchCV
GridAdaBoost = GridSearchCV(AdaBoostClassifier(base_estimator=model),param_grid=param_grid,cv=5)
GridAdaBoost.fit(X_train,Y_train)
Pred = GridAdaBoost.predict(X_test)
GridAdaBoost.best_score_


0.5494099800264628

In [32]:
GridAdaBoost.best_params_

{'learning_rate': 0.001, 'n_estimators': 100}

In [33]:
#Optimised Parameters
OptAdaBoost= AdaBoostClassifier(base_estimator=model,n_estimators=100,learning_rate=0.001)
OptAdaBoost.fit(X_train,Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='entropy',
                                                         max_depth=7,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                       

In [34]:
ABpred = OptAdaBoost.predict(X_test)
accuracy = accuracy_score(Y_test,ABpred)
accuracy

0.5695742471443406

In [35]:
# Gradient Boost Optimisation
parameters = {
    "n_estimators":[5,50,100,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,0.5,1,10]
    }
#passing the scoring function in the GridSearchCV
GridGradBoost = GridSearchCV(GradientBoostingClassifier(), parameters,cv=5, n_jobs=-1)
GridGradBoost.fit(X_train, Y_train)
GridGradBoost.best_score_

0.5570706321582377

In [36]:
GridGradBoost.best_params_

{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}

In [37]:
OptGradBoost= GridGradBoost.best_estimator_

In [38]:
OptGradBoost.fit(X_train,Y_train)
GBpred = OptGradBoost.predict(X_test)
accuracy = accuracy_score(Y_test,GBpred)

In [39]:
accuracy

0.5597092419522326

In [40]:
param_grid={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}
GridXGBoost = GridSearchCV(XGBClassifier(), param_grid=param_grid, n_jobs= -1,cv=5, verbose=3)
GridXGBoost.fit(X_train,Y_train)

Fitting 5 folds for each of 3840 candidates, totalling 19200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1564 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 2044 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 3868 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 4604 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 5404 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 6268 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | e

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_po...ght=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'learning_rate': [0.05, 0.1,

In [41]:
GridXGBoost.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.4,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 5}

In [42]:
GridXGBoost.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.4,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [43]:
# Instantiate the machine learning classifiers
AdaBoostModel = AdaBoostClassifier(base_estimator=model, n_estimators=100, learning_rate=0.001)
GradientBoostModel = GradientBoostingClassifier(n_estimators=50, learning_rate=0.01, max_depth=7)
XGBoostModel = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.4,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [44]:
scoring = {'accuracy':make_scorer(accuracy_score),
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score),
           'AUC':make_scorer(roc_auc_score)}

In [45]:
# Define the models evaluation function
def models_evaluation(X2, Y2, folds):
    
    '''
    X2 : data set features
    Y2 : data set target
    folds : number of cross-validation folds
    
    '''
    
    # Perform cross-validation to each machine learning classifier
    Ada = cross_validate(AdaBoostModel, X2, Y2, cv=folds, scoring=scoring)
    Grad = cross_validate(GradientBoostModel, X2, Y2, cv=folds, scoring=scoring)
    XG = cross_validate(XGBoostModel, X2, Y2, cv=folds, scoring=scoring)
    

    # Create a data frame with the models perfoamnce metrics scores
    models_scores_table = pd.DataFrame({'Ada Boost Classifier':[Ada['test_accuracy'].mean(),
                                                               Ada['test_precision'].mean(),
                                                               Ada['test_recall'].mean(),
                                                               Ada['test_f1_score'].mean(),
                                                                Ada['test_AUC'].mean()],
                                       
                                      'Gradient Boost Classifier':[Grad['test_accuracy'].mean(),
                                                                   Grad['test_precision'].mean(),
                                                                   Grad['test_recall'].mean(),
                                                                   Grad['test_f1_score'].mean(),
                                                                   Grad['test_AUC'].mean()],
                                       
                                      'XG Boost Classifier':[XG['test_accuracy'].mean(),
                                                       XG['test_precision'].mean(),
                                                       XG['test_recall'].mean(),
                                                       XG['test_f1_score'].mean(),
                                                       XG['test_AUC'].mean()]},
                                       
                                      
                                      
                                      index= ['Accuracy', 'Precision', 'Recall', 'F1 Score','AUC'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)
  
# Run models_evaluation function
models_evaluation(X2, Y2, 10)

Unnamed: 0,Ada Boost Classifier,Gradient Boost Classifier,XG Boost Classifier,Best Score
Accuracy,0.540908,0.531867,0.505578,Ada Boost Classifier
Precision,0.55863,0.555754,0.526032,Ada Boost Classifier
Recall,0.747373,0.748812,0.620441,Gradient Boost Classifier
F1 Score,0.625511,0.617522,0.557263,Ada Boost Classifier
AUC,0.540863,0.53182,0.505554,Ada Boost Classifier


In [46]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/b0/e7/b7057e4a6832f3bec0cb36fda4913bf84a6dc610c92a2d3543442f4154a1/h2o-3.30.1.1.tar.gz (129.3MB)
[K     |████████████████████████████████| 129.3MB 85kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.30.1.1-py2.py3-none-any.whl size=129358602 sha256=13cdd37cfc61253840fc20d939c1f0758630b08fdb194a2e77c901e306d45a62
  Stored in directory: /root/.cache/pip/wheels/33/ac/52/165c35d747abdb629c3c9fb7e087f360c662d8cb58824caed8
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.3 h2o-3.30.1.1


In [47]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.8" 2020-07-14; OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmprwb1ld0v
  JVM stdout: /tmp/tmprwb1ld0v/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmprwb1ld0v/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_unknownUser_ddaoxc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.180 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [48]:
from sklearn.model_selection import train_test_split

In [49]:
Y_train = Y_train.astype('category')
Y_test = Y_test.astype('category')

In [50]:
nfolds = 5

In [51]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5
2592,-0.005310,0.003503,0.002748,-0.002520,0.000912
6936,0.002383,0.004660,0.006683,-0.001969,0.015664
838,-0.000275,-0.003099,-0.004096,0.000063,0.004862
4834,0.006918,-0.015405,0.010093,-0.016865,-0.015878
7929,0.002479,0.013451,0.018637,0.018812,-0.020619
...,...,...,...,...,...
7156,0.008942,-0.015201,-0.003954,0.005892,-0.016679
8057,0.008265,0.008813,-0.001574,0.001397,-0.007852
7656,0.002441,0.001737,0.000487,-0.006866,-0.000952
1411,0.010630,0.000707,-0.029126,-0.018402,-0.010393


In [52]:
train_data = pd.concat([X_train,Y_train],axis=1)

In [53]:
train_data

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,Direction
2592,-0.005310,0.003503,0.002748,-0.002520,0.000912,0.0
6936,0.002383,0.004660,0.006683,-0.001969,0.015664,0.0
838,-0.000275,-0.003099,-0.004096,0.000063,0.004862,1.0
4834,0.006918,-0.015405,0.010093,-0.016865,-0.015878,1.0
7929,0.002479,0.013451,0.018637,0.018812,-0.020619,0.0
...,...,...,...,...,...,...
7156,0.008942,-0.015201,-0.003954,0.005892,-0.016679,1.0
8057,0.008265,0.008813,-0.001574,0.001397,-0.007852,1.0
7656,0.002441,0.001737,0.000487,-0.006866,-0.000952,0.0
1411,0.010630,0.000707,-0.029126,-0.018402,-0.010393,1.0


In [54]:
test_data = pd.concat([X_test,Y_test],axis=1)

In [55]:
train= h2o.H2OFrame(train_data)
test= h2o.H2OFrame(test_data)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [56]:
train

lag_1,lag_2,lag_3,lag_4,lag_5,Direction
-0.00530987,0.00350291,0.00274768,-0.00251987,0.000912085,0
0.00238283,0.00465951,0.00668344,-0.00196948,0.0156638,0
-0.000275051,-0.00309869,-0.0040956,6.34869e-05,0.00486239,1
0.00691839,-0.0154049,0.0100933,-0.0168652,-0.0158777,1
0.00247861,0.0134514,0.0186367,0.0188118,-0.0206186,0
-0.0540926,0.00270943,-0.0259858,-0.00476639,-0.00548864,0
-0.00640101,0.00112896,0.0251351,0.0296398,-0.00996439,0
0.000418277,-0.00190042,0.00929249,0.00517415,-0.00119262,0
-0.000811837,-0.00310562,0.0109445,0.0132051,-0.00998535,1
0.0101248,0.00069824,0.00781078,0.000911729,0.00103402,1




In [57]:
x=train.columns

In [58]:
y="Direction"

In [59]:
x.remove(y)

In [60]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [61]:
train[y]

Direction
0
0
1
1
0
0
0
0
1
1




In [62]:
#1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      ntrees=50,
                                      max_depth=7,
                                      min_rows=2,
                                      learn_rate=0.01,
                                      nfolds=nfolds,
                                      fold_assignment="Stratified",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
my_gbm.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [63]:
# Train and cross-validate a RF
my_rf = H2ORandomForestEstimator(ntrees=50,
                                 nfolds=nfolds,
                                 fold_assignment="Stratified",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(x=x, y=y, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [64]:
from h2o.estimators import H2OXGBoostEstimator

In [65]:
my_xgb = H2OXGBoostEstimator(booster='gbtree',normalize_type="tree",nfolds=nfolds,fold_assignment="Stratified",keep_cross_validation_predictions=True,seed=None,
                             ntrees=100,max_depth=6,min_rows=1 ,min_split_improvement=0.4, learn_rate=0.05)
my_xgb.train(x=x,y=y,training_frame=train)


xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [66]:
# Train a stacked ensemble using the GBM and GLM above
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                       base_models=[my_gbm, my_rf,my_xgb])
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [67]:
# Compare to base learner performance on the test set
perf_gbm_test = my_gbm.model_performance(test)
perf_rf_test = my_rf.model_performance(test)
perf_xgb_test = my_xgb.model_performance(test)
baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc(),perf_xgb_test.auc())
stack_auc_test = perf_stack_test.auc()
print("Ensemble Test AUC:  {0}".format(stack_auc_test))

Ensemble Test AUC:  0.5822839188359243


In [68]:
# Generate predictions on a test set
pred = ensemble.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [69]:
from h2o.automl import H2OAutoML

In [70]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train, leaderboard_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [71]:
# View the AutoML Leaderboard
lb = aml.leaderboard
Results= lb.head()
Results

In [72]:
AutoMLresults= h2o.as_list(Results)
AutoMLresults.to_csv('NYSE 500 AutoML.csv')
