<a href="https://colab.research.google.com/github/obliquesignal/algo-trading/blob/master/O'Reilly_Non_linear_ML_Class_Gradient_Boosting_Machines_Classification_and_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Market Data

In [0]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

import pandas_datareader.data as pdr
import fix_yahoo_finance as yf
yf.pdr_override()

import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [0]:
#Import data
start = datetime(2020, 1, 1)
end = datetime(2020, 5, 30)

stock = pdr.get_data_yahoo('AAPL', start, end) #Apple Inc. stock
market = pdr.get_data_yahoo('SPY', start, end) #S&P 500 index
vix = pdr.get_data_yahoo('^VIX', start, end)   #Volatility index
dxy = pdr.get_data_yahoo('UUP', start, end)    #Dollar index
junk = pdr.get_data_yahoo('JNK', start, end)   #Junk bond index

In [0]:
#Create target dataframe
target = pd.DataFrame()
#Use adjusted closing prices instead of closing prices to adjust for corporate actions such as dividends, splits and mergers
target['return'] = (stock['Open']-stock['Adj Close'].shift(1))/stock['Adj Close'].shift(1) #Returns based on buying on the close the day before and selling on the open the day after
target = target.dropna() #get rid of the NaNs
target['direction'] = np.where(target['return'] > 0, 1, -1) #Overnight direction of the stock
target.head()

Unnamed: 0_level_0,return,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-03,-0.005622,-1
2020-01-06,-0.007214,-1
2020-01-07,0.005221,1
2020-01-08,0.000944,1
2020-01-09,0.018513,1


In [0]:
#Create features dataframe
features = pd.DataFrame()
features['market'] = market['Adj Close'].pct_change(1)*100
#VIX is volatility index and is measured in percentage terms
features['vix'] = vix['Adj Close'].diff()
features['dxy'] = dxy['Adj Close'].pct_change(1)*100
features['junk'] = junk['Adj Close'].pct_change(1)*100
features = features.dropna()
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-22,0.189907,-1.370001,0.297287,0.330134
2020-05-26,1.232056,-0.15,-0.741011,0.857514
2020-05-27,1.4879,-0.389999,-0.037327,0.296584
2020-05-28,-0.184495,0.969999,-0.522786,0.059147
2020-05-29,0.445591,-1.08,-0.150146,0.472856


In [0]:
lastknown = features[-1:] #Values of features from the last trading session
features = features[:-1] #Subtracts last row from the features matrix so that it aligns with labels vector
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-21,-0.690394,1.540001,0.223464,-0.020008
2020-05-22,0.189907,-1.370001,0.297287,0.330134
2020-05-26,1.232056,-0.15,-0.741011,0.857514
2020-05-27,1.4879,-0.389999,-0.037327,0.296584
2020-05-28,-0.184495,0.969999,-0.522786,0.059147


#Gradient boosting classifier for overnight direction

In [0]:
#Get rid of return column for classifiers
targetclass = target.drop(axis=1, columns='return')
targetclass = targetclass[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetclass.head()                                                                        

Unnamed: 0_level_0,direction
Date,Unnamed: 1_level_1
2020-01-06,-1
2020-01-07,1
2020-01-08,1
2020-01-09,1
2020-01-10,1


In [0]:
#Train and test classifier using Gini impurity performance metric
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

classifier = DecisionTreeClassifier(random_state=1) 
features_train, features_test, targetclass_train, targetclass_test = train_test_split(features, targetclass, test_size = 0.25, random_state=0)
classifier.fit(features_train, targetclass_train)
print("Training score:", classifier.score(features_train, targetclass_train))
print("Testing score:", classifier.score(features_test, targetclass_test))

Training score: 1.0
Testing score: 0.6153846153846154


In [0]:
#Train and test classifier using entropy performance metric
classifier_entropy = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=5) #Maximum depth of tree is used to prevent overfitting of test data 
features_train, features_test, targetclass_train, targetclass_test = train_test_split(features, targetclass, test_size = 0.25, random_state=0)
classifier_entropy.fit(features_train, targetclass_train)
print("Training score:", classifier_entropy.score(features_train, targetclass_train))
print("Testing score:", classifier_entropy.score(features_test, targetclass_test))

Training score: 0.8533333333333334
Testing score: 0.6153846153846154


In [0]:
from sklearn.metrics import confusion_matrix
targetclass_predict = classifier_entropy.predict(features_test)
confusion = confusion_matrix(targetclass_test, targetclass_predict)
confusion

array([[7, 2],
       [8, 9]])

In [0]:
print("Tomorrow's change:", classifier_entropy.predict(lastknown))
print("Probability of change", classifier_entropy.predict_proba(lastknown))
lastknown

Tomorrow's change: [1]
Probability of change [[0.23076923 0.76923077]]


Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-29,0.445591,-1.08,-0.150146,0.472856


In [0]:
#Inferring the importance of each feature
print(features.columns)
print(classifier.feature_importances_)
print(classifier_entropy.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.14421886 0.25135688 0.40100262 0.20342164]
[0.13178643 0.43639185 0.43182172 0.        ]


#Gradient boosting regressor for overnight value changes

In [0]:
#Get rid of direction column for regressors
targetvalue = target.drop(axis=1, columns='direction')
targetvalue = targetvalue[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetvalue.head()                                                                        


Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2020-01-06,-0.007214
2020-01-07,0.005221
2020-01-08,0.000944
2020-01-09,0.018513
2020-01-10,0.008235


In [0]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

regressor = GradientBoostingRegressor(random_state=1)
targetvalue = np.ravel(targetvalue) #Need to covert column vector into a 1-d array
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=0)
regressor.fit(features_train, targetvalue_train)
print("Training score:", regressor.score(features_train, targetvalue_train))
print("Testing score:", regressor.score(features_test, targetvalue_test))
print("Tomorrow's value change:", regressor.predict(lastknown))

Training score: 0.9837754509542622
Testing score: 0.12510029327343664
Tomorrow's value change: [0.01847716]


In [0]:
regressor_mae = GradientBoostingRegressor(criterion="mae", random_state=1, learning_rate=0.5, max_depth=5, n_estimators=200, min_samples_split=5, min_samples_leaf=5)
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=1)
regressor_mae.fit(features_train, targetvalue_train)
print("Training score:", regressor_mae.score(features_train, targetvalue_train))
print("Testing score:", regressor_mae.score(features_test, targetvalue_test))
print("Tomorrow's value change:", regressor_mae.predict(lastknown))

Training score: 0.7073913786259723
Testing score: -0.19985868607942225
Tomorrow's value change: [0.01451818]


In [0]:
#Inferring the importance of each feature
print(features.columns)
print(regressor.feature_importances_)
print(regressor_mae.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.28852166 0.30128416 0.19779523 0.21239896]
[0.33172201 0.15070916 0.22058911 0.29697972]
