In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
dataset= pd.read_csv("NASDAQ.csv")
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1985-01-02,247.100006,247.100006,245.899994,245.899994,245.899994,48210000
1,1985-01-03,246.000000,246.699997,246.000000,246.399994,246.399994,52250000
2,1985-01-04,246.100006,246.100006,245.800003,246.100006,246.100006,55800000
3,1985-01-07,246.000000,246.199997,245.899994,245.899994,245.899994,53200000
4,1985-01-08,246.100006,246.399994,246.000000,246.000000,246.000000,64850000
...,...,...,...,...,...,...,...
8941,2020-06-23,10130.830078,10221.849609,10112.440430,10131.370117,10131.370117,5712450000
8942,2020-06-24,10092.919922,10137.500000,9842.219727,9909.169922,9909.169922,5549440000
8943,2020-06-25,9899.360352,10023.280273,9810.469727,10017.000000,10017.000000,4709620000
8944,2020-06-26,9995.120117,10000.669922,9749.070313,9757.219727,9757.219727,7279230000


In [3]:
df = pd.DataFrame(dataset)

In [4]:
#Removing unneccessary columns
df1=df.drop(['Date','Open','High','Low','Close'], axis=1)

In [5]:
#Creating 'Today' column based on Percentage Change
df1['Today'] = df1['Adj Close'].pct_change(1)

In [6]:
#Creating 'Direction' column based on the percentage change of that day.
df1.loc[df1['Today'] > 0 , 'Direction'] = 1
df1.loc[df1['Today'] < 0 , 'Direction'] = 0

In [7]:
#Dropping unnecessary columns
data = df1.drop(['Adj Close','Volume'], axis=1)

In [8]:
data

Unnamed: 0,Today,Direction
0,,
1,0.002033,1.0
2,-0.001217,0.0
3,-0.000813,0.0
4,0.000407,1.0
...,...,...
8941,0.007447,1.0
8942,-0.021932,0.0
8943,0.010882,1.0
8944,-0.025934,0.0


In [9]:
#Removing Null values
df3 = data.dropna()

In [10]:
df3['Direction'].value_counts()

1.0    4978
0.0    3940
Name: Direction, dtype: int64

In [11]:
#Over sampling for Balancing the data
from imblearn.over_sampling import SMOTE



In [12]:
#Creating Dependent and Independent variables
X = df3.drop(['Direction'], axis=1)
Y = df3['Direction']

In [13]:
# Implementing Oversampling 
oversample = SMOTE()
X_res,Y_res=oversample.fit_sample(X,Y)



In [14]:
X_res.shape,Y_res.shape

((9956, 1), (9956,))

In [15]:
from collections import Counter
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(Y_res)))

Original dataset shape Counter({1.0: 4978, 0.0: 3940})
Resampled dataset shape Counter({1.0: 4978, 0.0: 4978})


In [16]:
X1 = pd.DataFrame(X_res)
Y1 = pd.DataFrame(Y_res)

In [17]:
data2 = pd.concat([X1,Y1],axis=1)
data3 = pd.DataFrame(data2)
data3

Unnamed: 0,0,0.1
0,0.002033,1.0
1,-0.001217,0.0
2,-0.000813,0.0
3,0.000407,1.0
4,0.005285,1.0
...,...,...
9951,-0.007819,0.0
9952,-0.011588,0.0
9953,-0.003584,0.0
9954,-0.001231,0.0


In [18]:
data3.columns= ['Today','Direction']

In [19]:
#Creating lag features required
data3['lag_1'] = data3['Today'].shift(1)
data3['lag_2'] = data3['Today'].shift(2)
data3['lag_3'] = data3['Today'].shift(3)
data3['lag_4'] = data3['Today'].shift(4)
data3['lag_5'] = data3['Today'].shift(5)

In [20]:
df4 = data3.dropna()
df4

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,0.013748,1.0,0.005285,0.000407,-0.000813,-0.001217,0.002033
6,0.005983,1.0,0.013748,0.005285,0.000407,-0.000813,-0.001217
7,0.013085,1.0,0.005983,0.013748,0.005285,0.000407,-0.000813
8,0.009002,1.0,0.013085,0.005983,0.013748,0.005285,0.000407
9,0.009310,1.0,0.009002,0.013085,0.005983,0.013748,0.005285
...,...,...,...,...,...,...,...
9951,-0.007819,0.0,-0.014856,-0.010798,-0.005267,-0.009359,-0.002977
9952,-0.011588,0.0,-0.007819,-0.014856,-0.010798,-0.005267,-0.009359
9953,-0.003584,0.0,-0.011588,-0.007819,-0.014856,-0.010798,-0.005267
9954,-0.001231,0.0,-0.003584,-0.011588,-0.007819,-0.014856,-0.010798


In [21]:
#Correlation plot
rs = np.random.RandomState(0)
corr = df4.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Today,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
Today,1.0,0.658334,0.030718,0.027095,0.052585,0.047864,0.044771
Direction,0.658334,1.0,0.107247,0.076462,0.075758,0.094461,0.082151
lag_1,0.030718,0.107247,1.0,0.030719,0.027088,0.052562,0.047857
lag_2,0.027095,0.076462,0.030719,1.0,0.030718,0.027084,0.052562
lag_3,0.052585,0.075758,0.027088,0.030718,1.0,0.030702,0.027073
lag_4,0.047864,0.094461,0.052562,0.027084,0.030702,1.0,0.030662
lag_5,0.044771,0.082151,0.047857,0.052562,0.027073,0.030662,1.0


In [22]:
#Removing highly correlatedd feature
df5 = df4.drop(['Today'],axis=1)

In [23]:
finalDF = pd.DataFrame(df5)

In [24]:
finalDF

Unnamed: 0,Direction,lag_1,lag_2,lag_3,lag_4,lag_5
5,1.0,0.005285,0.000407,-0.000813,-0.001217,0.002033
6,1.0,0.013748,0.005285,0.000407,-0.000813,-0.001217
7,1.0,0.005983,0.013748,0.005285,0.000407,-0.000813
8,1.0,0.013085,0.005983,0.013748,0.005285,0.000407
9,1.0,0.009002,0.013085,0.005983,0.013748,0.005285
...,...,...,...,...,...,...
9951,0.0,-0.014856,-0.010798,-0.005267,-0.009359,-0.002977
9952,0.0,-0.007819,-0.014856,-0.010798,-0.005267,-0.009359
9953,0.0,-0.011588,-0.007819,-0.014856,-0.010798,-0.005267
9954,0.0,-0.003584,-0.011588,-0.007819,-0.014856,-0.010798


In [25]:
#Create pre-processed dataset
finalDF.to_csv('Pre-Processed NASDAQ.csv')

In [26]:
#Creating Input and Output variables
X2= finalDF.drop(['Direction'], axis=1)
Y2= finalDF['Direction']

In [27]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate

In [28]:
#Hyper Parameter Optimization
from sklearn.model_selection import GridSearchCV

In [29]:
# Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.20)
print (X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(7960, 5) (7960,)
(1991, 5) (1991,)


In [30]:
#Adaptive Boosting
model= DecisionTreeClassifier(criterion='entropy', max_depth=7)
#List of Hyper Parameters
param_grid = {
    'n_estimators': [100,200],
    'learning_rate': [0.001,0.01,0.1,0.2,0.5]
}
#GridSearchCV
GridAdaBoost = GridSearchCV(AdaBoostClassifier(base_estimator=model),param_grid=param_grid,cv=5)
GridAdaBoost.fit(X_train,Y_train)
Pred = GridAdaBoost.predict(X_test)
GridAdaBoost.best_score_


0.5919597989949749

In [31]:
GridAdaBoost.best_params_

{'learning_rate': 0.001, 'n_estimators': 100}

In [32]:
#Optimised Parameters
OptAdaBoost= AdaBoostClassifier(base_estimator=model,n_estimators=100,learning_rate=0.001)
OptAdaBoost.fit(X_train,Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='entropy',
                                                         max_depth=7,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                       

In [34]:
ABpred = OptAdaBoost.predict(X_test)
accuracy = accuracy_score(Y_test,ABpred)
accuracy

0.5891511803114013

In [35]:
# Gradient Boost Optimisation
parameters = {
    "n_estimators":[5,50,100,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,0.5,1,10]
    }
#passing the scoring function in the GridSearchCV
GridGradBoost = GridSearchCV(GradientBoostingClassifier(), parameters,cv=5, n_jobs=-1)
GridGradBoost.fit(X_train, Y_train)
GridGradBoost.best_score_

0.6052763819095477

In [36]:
GridGradBoost.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}

In [37]:
OptGradBoost= GridGradBoost.best_estimator_

In [38]:
OptGradBoost.fit(X_train,Y_train)
GBpred = OptGradBoost.predict(X_test)
accuracy = accuracy_score(Y_test,GBpred)

In [39]:
accuracy

0.5806127574083375

In [40]:
param_grid={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}
GridXGBoost = GridSearchCV(XGBClassifier(), param_grid=param_grid, n_jobs= -1,cv=5, verbose=3)
GridXGBoost.fit(X_train,Y_train)

Fitting 5 folds for each of 3840 candidates, totalling 19200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 508 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1148 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1564 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 2044 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 3868 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 4604 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 5404 tasks      | elapsed: 27.5min
[Parallel(n_jobs=-1)]: Done 6268 tasks      | elapsed: 33.5min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | e

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_po...ght=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5, 0.7],
                         'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
                         'learning_rate': [0.05, 0.1,

In [43]:
GridXGBoost.best_params_

{'colsample_bytree': 0.7,
 'gamma': 0.2,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 3}

In [44]:
GridXGBoost.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [49]:
# Instantiate the machine learning classifiers
AdaBoostModel = AdaBoostClassifier(base_estimator=model, n_estimators=100, learning_rate=0.001)
GradientBoostModel = GradientBoostingClassifier(n_estimators=5, learning_rate=0.01, max_depth=5)
XGBoostModel = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [46]:
scoring = {'accuracy':make_scorer(accuracy_score),
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score),
           'AUC':make_scorer(roc_auc_score)}

In [50]:
# Define the models evaluation function
def models_evaluation(X2, Y2, folds):
    
    '''
    X2 : data set features
    Y2 : data set target
    folds : number of cross-validation folds
    
    '''
    
    # Perform cross-validation to each machine learning classifier
    Ada = cross_validate(AdaBoostModel, X2, Y2, cv=folds, scoring=scoring)
    Grad = cross_validate(GradientBoostModel, X2, Y2, cv=folds, scoring=scoring)
    XG = cross_validate(XGBoostModel, X2, Y2, cv=folds, scoring=scoring)
    

    # Create a data frame with the models perfoamnce metrics scores
    models_scores_table = pd.DataFrame({'Ada Boost Classifier':[Ada['test_accuracy'].mean(),
                                                               Ada['test_precision'].mean(),
                                                               Ada['test_recall'].mean(),
                                                               Ada['test_f1_score'].mean(),
                                                                Ada['test_AUC'].mean()],
                                       
                                      'Gradient Boost Classifier':[Grad['test_accuracy'].mean(),
                                                                   Grad['test_precision'].mean(),
                                                                   Grad['test_recall'].mean(),
                                                                   Grad['test_f1_score'].mean(),
                                                                   Grad['test_AUC'].mean()],
                                       
                                      'XG Boost Classifier':[XG['test_accuracy'].mean(),
                                                       XG['test_precision'].mean(),
                                                       XG['test_recall'].mean(),
                                                       XG['test_f1_score'].mean(),
                                                       XG['test_AUC'].mean()]},
                                       
                                      
                                      
                                      index= ['Accuracy', 'Precision', 'Recall', 'F1 Score','AUC'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)
  
# Run models_evaluation function
models_evaluation(X2, Y2, 10)

Unnamed: 0,Ada Boost Classifier,Gradient Boost Classifier,XG Boost Classifier,Best Score
Accuracy,0.541858,0.533717,0.542761,XG Boost Classifier
Precision,0.583371,0.584785,0.582143,Gradient Boost Classifier
Recall,0.708104,0.706479,0.682394,Ada Boost Classifier
F1 Score,0.615339,0.606843,0.605584,Ada Boost Classifier
AUC,0.541792,0.533635,0.542701,XG Boost Classifier


In [51]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/b0/e7/b7057e4a6832f3bec0cb36fda4913bf84a6dc610c92a2d3543442f4154a1/h2o-3.30.1.1.tar.gz (129.3MB)
[K     |████████████████████████████████| 129.3MB 77kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.30.1.1-py2.py3-none-any.whl size=129358602 sha256=b299b4ab13740246f635948a33ed67684ef8e314597893bfebc3f8b0596b73ea
  Stored in directory: /root/.cache/pip/wheels/33/ac/52/165c35d747abdb629c3c9fb7e087f360c662d8cb58824caed8
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.3 h2o-3.30.1.1


In [52]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.8" 2020-07-14; OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpw46ez7jh
  JVM stdout: /tmp/tmpw46ez7jh/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpw46ez7jh/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.1
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_unknownUser_k496l4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.180 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [53]:
from sklearn.model_selection import train_test_split

In [54]:
Y_train = Y_train.astype('category')
Y_test = Y_test.astype('category')

In [55]:
nfolds = 5

In [56]:
X_train

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5
4570,0.004080,-0.002485,0.005947,0.038752,-0.000328
8036,-0.004388,-0.002281,0.004856,0.003730,-0.003608
7879,-0.010227,0.012562,0.002966,0.004043,-0.001809
9939,-0.015460,-0.000748,-0.023726,-0.034702,-0.001445
7460,0.001153,0.007519,-0.008710,0.002049,0.004517
...,...,...,...,...,...
707,-0.014850,-0.008699,-0.090107,-0.022612,-0.044899
6978,-0.015207,-0.007602,-0.004213,0.004539,0.004869
1899,-0.001312,-0.002687,-0.001046,-0.000488,-0.004940
2163,0.003664,-0.003094,-0.002013,0.005856,-0.000137


In [57]:
train_data = pd.concat([X_train,Y_train],axis=1)

In [58]:
train_data

Unnamed: 0,lag_1,lag_2,lag_3,lag_4,lag_5,Direction
4570,0.004080,-0.002485,0.005947,0.038752,-0.000328,1.0
8036,-0.004388,-0.002281,0.004856,0.003730,-0.003608,1.0
7879,-0.010227,0.012562,0.002966,0.004043,-0.001809,0.0
9939,-0.015460,-0.000748,-0.023726,-0.034702,-0.001445,0.0
7460,0.001153,0.007519,-0.008710,0.002049,0.004517,0.0
...,...,...,...,...,...,...
707,-0.014850,-0.008699,-0.090107,-0.022612,-0.044899,1.0
6978,-0.015207,-0.007602,-0.004213,0.004539,0.004869,0.0
1899,-0.001312,-0.002687,-0.001046,-0.000488,-0.004940,1.0
2163,0.003664,-0.003094,-0.002013,0.005856,-0.000137,1.0


In [59]:
test_data = pd.concat([X_test,Y_test],axis=1)

In [60]:
train= h2o.H2OFrame(train_data)
test= h2o.H2OFrame(test_data)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [61]:
train

lag_1,lag_2,lag_3,lag_4,lag_5,Direction
0.00408002,-0.00248481,0.00594714,0.0387517,-0.000328217,1
-0.0043883,-0.00228116,0.00485576,0.00372985,-0.0036083,1
-0.0102269,0.0125615,0.0029665,0.00404281,-0.00180934,0
-0.0154605,-0.000748384,-0.0237262,-0.0347016,-0.00144545,0
0.00115339,0.00751929,-0.00871025,0.00204895,0.00451744,0
0.00784253,0.0128582,0.00628678,-0.00860336,0.0118659,1
0.00500215,0.0104377,0.0592899,0.00801778,-0.0122365,1
0.0225267,-0.0387953,-0.0156614,0.0162738,-0.0247815,0
-0.00954916,0.0035658,0.00990327,-0.00825896,0.00900901,1
-0.0117715,-0.0417432,0.0340138,0.0477642,-0.0493907,1




In [62]:
x=train.columns

In [63]:
y="Direction"

In [64]:
x.remove(y)

In [65]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [66]:
train[y]

Direction
1
1
0
0
0
1
1
0
1
1




In [68]:
#1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      ntrees=5,
                                      max_depth=5,
                                      min_rows=2,
                                      learn_rate=0.01,
                                      nfolds=nfolds,
                                      fold_assignment="Stratified",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
my_gbm.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [69]:
# Train and cross-validate a RF
my_rf = H2ORandomForestEstimator(ntrees=50,
                                 nfolds=nfolds,
                                 fold_assignment="Stratified",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(x=x, y=y, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [70]:
from h2o.estimators import H2OXGBoostEstimator

In [71]:
my_xgb = H2OXGBoostEstimator(booster='gbtree',normalize_type="tree",nfolds=nfolds,fold_assignment="Stratified",keep_cross_validation_predictions=True,seed=None,
                             ntrees=100,max_depth=6,min_rows=1 ,min_split_improvement=0.3, learn_rate=0.05)
my_xgb.train(x=x,y=y,training_frame=train)


xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [72]:
# Train a stacked ensemble using the GBM and GLM above
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                       base_models=[my_gbm, my_rf,my_xgb])
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [73]:
# Compare to base learner performance on the test set
perf_gbm_test = my_gbm.model_performance(test)
perf_rf_test = my_rf.model_performance(test)
perf_xgb_test = my_xgb.model_performance(test)
baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc(),perf_xgb_test.auc())
stack_auc_test = perf_stack_test.auc()
print("Ensemble Test AUC:  {0}".format(stack_auc_test))

Ensemble Test AUC:  0.6245928289427184


In [74]:
# Generate predictions on a test set
pred = ensemble.predict(test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [75]:
from h2o.automl import H2OAutoML

In [76]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train, leaderboard_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [79]:
# View the AutoML Leaderboard
lb = aml.leaderboard
Results= lb.head()

In [80]:
AutoMLresults= h2o.as_list(Results)
AutoMLresults.to_csv('NASDAQ AutoML.csv')
