In [3]:
#Use stock indicators with machine learning to try to predict the direction of stock price 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import yfinance as yf

In [5]:
df=yf.Ticker('GOOG').history(start='2019-06-03', end='2019-12-17')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,5130600,0,0
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,2833500,0,0
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,2168400,0,0
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1703200,0,0
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1802400,0,0
...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1094100,0,0
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,850400,0,0
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1281000,0,0
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1549600,0,0


In [17]:
#calculate function to calculate the simple Moving Average (SMA) and the Exponential Moving Average(EMA)
#typical time for moving average are 15,20 and 30
#create the simple moving moving average
def SMA(data,period=30,column='Close'):
    return data[column].rolling(window=period).mean()
#create the Exponential Moving Average (EMA)
def EMA(data, period=20,column='Close'):
    return data[column].ewm(span=period, adjust=False).mean()

In [18]:
#creating a function to Calculate the Moving Average Convergence / Divergence(MACD)
def MACD(data,period_long=26,period_short=12,period_signal=9,column='Close'):
    #calculate the Short Term EMA
    ShortEMA= EMA(data,period=period_short,column=column)
    #calculate the long term EMA 
    longEMA = EMA(data,period=period_long,column=column)
    #calculate and store the MACD into the data frame
    data['MACD']= ShortEMA-longEMA
    #calculate the signal line and store it into the data frame 
    data['signal_line']= EMA(data,period=period_signal, column='MACD')
    return data

In [19]:
#create a function to calculate the relative strength index(RSI)
def RSI(data,period=14,column='Close'):
    delta = data[column].diff(1)
    delta = delta.dropna()
    up=delta.copy()
    down=delta.copy()
    up[up<0] = 0
    down[down > 0] = 0
    data['up']= up
    data['down']=down
    AVG_Gain = SMA(data, period, column='up')
    AVG_loss = abs(SMA(data,period, column='down'))
    RS =AVG_Gain /AVG_loss
    RSI =100.0-(100.0/(1.0+RS))
    data['RSI']=RSI
    return data

In [20]:
#Add the indicator to the data set
MACD(df)
RSI(df)
df['SMA']=SMA(df)
df['EMA']=EMA(df)
#show the data 
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,MACD,signal_line,up,down,RSI,SMA,EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,5130600,0,0,0.000000,0.000000,,,,,1036.229980
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,2833500,0,0,1.341772,0.268354,16.820068,0.000000,,,1037.831892
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,2168400,0,0,1.513789,0.517441,0.000000,-10.830078,,,1038.249804
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1703200,0,0,1.800425,0.774038,2.119995,0.000000,,,1038.829819
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1802400,0,0,3.735540,1.366339,21.700073,0.000000,,,1041.421269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1094100,0,0,15.726414,13.913536,1.099976,0.000000,62.510735,1304.410335,1313.003203
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,850400,0,0,16.384554,14.407740,0.359985,0.000000,70.052514,1307.157003,1316.052424
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1281000,0,0,17.132277,14.952647,5.250000,0.000000,72.606290,1310.123002,1319.311243
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1549600,0,0,17.328210,15.427760,0.000000,-2.440063,75.083593,1313.047001,1322.027311


In [22]:
#create the Target column
df['Target']=np.where(df['Close'].shift(-1)>df['Close'],1,0)

In [23]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,MACD,signal_line,up,down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,5130600,0,0,0.000000,0.000000,,,,,1036.229980,1
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,2833500,0,0,1.341772,0.268354,16.820068,0.000000,,,1037.831892,0
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,2168400,0,0,1.513789,0.517441,0.000000,-10.830078,,,1038.249804,1
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1703200,0,0,1.800425,0.774038,2.119995,0.000000,,,1038.829819,1
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1802400,0,0,3.735540,1.366339,21.700073,0.000000,,,1041.421269,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1094100,0,0,15.726414,13.913536,1.099976,0.000000,62.510735,1304.410335,1313.003203,1
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,850400,0,0,16.384554,14.407740,0.359985,0.000000,70.052514,1307.157003,1316.052424,1
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1281000,0,0,17.132277,14.952647,5.250000,0.000000,72.606290,1310.123002,1319.311243,0
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1549600,0,0,17.328210,15.427760,0.000000,-2.440063,75.083593,1313.047001,1322.027311,1


In [27]:
#remove the first 29 days of dat
df=df[29:]#to rid of some null values
#show the data
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,MACD,signal_line,up,down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-07-15,1146.859985,1150.819946,1139.400024,1150.339966,903800,0,0,20.316557,17.334901,5.439941,0.000000,62.063456,1096.864327,1113.511647,1
2019-07-16,1146.000000,1158.579956,1145.000000,1153.579956,1238800,0,0,21.003375,18.068596,3.239990,0.000000,78.393463,1100.775993,1117.327677,0
2019-07-17,1150.969971,1158.359985,1145.770020,1146.349976,1170000,0,0,20.725375,18.599952,0.000000,-7.229980,77.945728,1103.885990,1120.091705,0
2019-07-18,1141.739990,1147.604980,1132.729980,1146.329956,1290700,0,0,20.269786,18.933919,0.000000,-0.020020,80.494359,1107.356323,1122.590586,0
2019-07-19,1148.189941,1151.140015,1129.619995,1130.099976,1647200,0,0,18.387148,18.824565,0.000000,-16.229980,69.422722,1110.214990,1123.305766,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1094100,0,0,15.726414,13.913536,1.099976,0.000000,62.510735,1304.410335,1313.003203,1
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,850400,0,0,16.384554,14.407740,0.359985,0.000000,70.052514,1307.157003,1316.052424,1
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1281000,0,0,17.132277,14.952647,5.250000,0.000000,72.606290,1310.123002,1319.311243,0
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1549600,0,0,17.328210,15.427760,0.000000,-2.440063,75.083593,1313.047001,1322.027311,1


In [28]:
#Split the data set into a feature or independent data srt (x) and a Target or dependent data set (Y)
keep_columns=['Close','MACD','signal_line','RSI','EMA']
x = df[keep_columns].values
y=df['Target'].values

In [38]:
#split the data again
X_train,x_test,y_train, y_test = train_test_split(x,y,test_size=0.2, random_state = 2)

In [32]:
#create and train the  decision tree classifier model
tree= DecisionTreeClassifier().fit(X_train,y_train)

In [33]:
#check the model performance on our training data
print(tree.score(X_train,y_train))

1.0


In [47]:
#check the model performance on our testing data
print(tree.score(x_test,y_test))

0.6818181818181818


In [48]:
# get the model metrics
tree_predictions=tree.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,tree_predictions))

              precision    recall  f1-score   support

           0       0.80      0.62      0.70        13
           1       0.58      0.78      0.67         9

    accuracy                           0.68        22
   macro avg       0.69      0.70      0.68        22
weighted avg       0.71      0.68      0.68        22



In [49]:
tree_predictions

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0])

In [50]:
y_test

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1])