In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go

In [2]:
data_path = "Data/Historical_data_5Y.csv"

In [3]:
HD_5Y = pd.read_csv(data_path)
HD_5Y.drop(HD_5Y.columns[HD_5Y.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
HD_5Y['Date'] = pd.to_datetime( HD_5Y['Date'])
HD_5Y

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-01-31,13.20,13.20,12.90,13.05,13.05,7582
1,2022-01-28,13.20,13.20,12.95,13.05,13.05,5303
2,2022-01-27,13.05,13.20,12.95,13.05,13.05,5538
3,2022-01-26,13.25,13.25,12.95,13.05,13.05,6805
4,2022-01-25,13.25,13.25,13.05,13.05,13.05,1968
...,...,...,...,...,...,...,...
1266,2017-02-08,12.85,13.14,12.85,13.14,0.03,1770
1267,2017-02-07,13.12,13.13,13.00,13.00,0.03,2882
1268,2017-02-06,13.12,13.12,13.12,13.12,0.03,76
1269,2017-02-03,13.10,13.12,13.10,13.12,0.03,288


In [4]:
fig = px.line(HD_5Y, x=HD_5Y.iloc[:60]['Date'], y=HD_5Y.iloc[:60]['Close'])
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")

fig.update_layout(template="plotly_dark")
fig.show()

In [5]:
HD_5Y_smooth = HD_5Y.ewm(0.7).mean()

In [6]:
HD_5Y_smooth

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,13.200000,13.200000,12.900000,13.050000,13.05,7582.000000
1,13.200000,13.200000,12.935417,13.050000,13.05,5967.708333
2,13.105142,13.200000,12.944639,13.050000,13.05,5695.967177
3,13.192875,13.230282,12.947886,13.050000,13.05,6367.648422
4,13.226880,13.242020,13.008673,13.050000,13.05,3748.618196
...,...,...,...,...,...,...
1266,12.976237,13.151969,12.968241,13.143974,0.03,1094.331792
1267,13.060803,13.139046,12.986923,13.059283,0.03,2145.901326
1268,13.095625,13.127843,13.065203,13.094999,0.03,928.312311
1269,13.098198,13.123229,13.085672,13.109705,0.03,551.658010


In [7]:
fig = px.line(HD_5Y_smooth, x=HD_5Y.iloc[:60]['Date'], y=HD_5Y_smooth.iloc[:60]['Close'])
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")

fig.update_layout(template="plotly_dark")
fig.show()

In [8]:
HD_5Y_smooth.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [9]:
# Calculate RSI
delta = HD_5Y_smooth['Close'].diff()
up = delta.clip(lower=0)
down = -1*delta.clip(upper=0)
ema_up = up.ewm(com=13, adjust=False).mean()
ema_down = down.ewm(com=13, adjust=False).mean()
rs = ema_up/ema_down
HD_5Y_smooth['RSI'] = 100 - (100/(1 + rs))

In [10]:
# Calculate the MACD signal
exp1 = HD_5Y_smooth['Close'].ewm(span=12).mean()
exp2 = HD_5Y_smooth['Close'].ewm(span=26).mean()
macd = exp1 - exp2
macd_signal = macd.ewm(span=9).mean()
HD_5Y_smooth['MACD'] = macd_signal - macd

In [11]:
# Stochastic Oscillator
high14= HD_5Y_smooth['High'].rolling(14).max()
low14 = HD_5Y_smooth['Low'].rolling(14).min()
HD_5Y_smooth['STOCH'] = (HD_5Y_smooth['Close'] - low14)*100/(high14 - low14)

In [12]:
# Williams Percentage Range
HD_5Y_smooth['%R'] = -100*(high14 - HD_5Y_smooth['Close'])/(high14 - low14)

In [13]:
days=6
# Price Rate of Change
ct_n = HD_5Y_smooth['Close'].shift(days)
HD_5Y_smooth['PROC'] = (HD_5Y_smooth['Close'] - ct_n)/ct_n

In [14]:
HD_5Y_smooth.dropna(inplace=True)


In [15]:
HD_5Y_smooth

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,RSI,MACD,STOCH,%R,PROC
13,13.315484,13.317128,13.067778,13.165946,13.165946,8779.133215,64.373716,-0.001980,64.988226,-35.011774,0.002500
14,13.306376,13.307053,13.145556,13.185978,13.185978,5104.342876,66.799156,-0.003998,69.627357,-30.372643,-0.001206
15,13.214390,13.302904,13.148170,13.253050,13.253050,2503.551173,73.342991,-0.008129,85.160277,-14.839723,0.006202
16,13.235337,13.330608,13.149246,13.192432,13.192432,2830.285869,61.537955,-0.007080,68.969212,-31.030788,0.007058
17,13.273374,13.312603,13.120278,13.167472,13.167472,3561.882501,57.438330,-0.004666,63.363784,-36.636216,0.007413
...,...,...,...,...,...,...,...,...,...,...,...
1266,12.976237,13.151969,12.968241,13.143974,0.030000,1094.331792,57.807450,0.003696,53.896063,-46.103937,0.007272
1267,13.060803,13.139046,12.986923,13.059283,0.030000,2145.901326,50.373708,0.010005,27.922096,-72.077904,-0.002407
1268,13.095625,13.127843,13.065203,13.094999,0.030000,928.312311,53.112082,0.011691,38.875820,-61.124180,-0.000990
1269,13.098198,13.123229,13.085672,13.109705,0.030000,551.658010,54.231977,0.011705,43.386177,-56.613823,0.000041


In [16]:
# Also calculate moving averages for features
HD_5Y_smooth['ema50'] = HD_5Y_smooth['Close'] / HD_5Y_smooth['Close'].ewm(50).mean()
HD_5Y_smooth['ema21'] = HD_5Y_smooth['Close'] / HD_5Y_smooth['Close'].ewm(21).mean()
HD_5Y_smooth['ema15'] = HD_5Y_smooth['Close'] / HD_5Y_smooth['Close'].ewm(14).mean()
HD_5Y_smooth['ema5'] = HD_5Y_smooth['Close'] / HD_5Y_smooth['Close'].ewm(5).mean()

In [17]:
HD_5Y_smooth

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,RSI,MACD,STOCH,%R,PROC,ema50,ema21,ema15,ema5
13,13.315484,13.317128,13.067778,13.165946,13.165946,8779.133215,64.373716,-0.001980,64.988226,-35.011774,0.002500,1.000000,1.000000,1.000000,1.000000
14,13.306376,13.307053,13.145556,13.185978,13.185978,5104.342876,66.799156,-0.003998,69.627357,-30.372643,-0.001206,1.000753,1.000742,1.000734,1.000691
15,13.214390,13.302904,13.148170,13.253050,13.253050,2503.551173,73.342991,-0.008129,85.160277,-14.839723,0.006202,1.003849,1.003790,1.003740,1.003486
16,13.235337,13.330608,13.149246,13.192432,13.192432,2830.285869,61.537955,-0.007080,68.969212,-31.030788,0.007058,0.999449,0.999413,0.999384,0.999251
17,13.273374,13.312603,13.120278,13.167472,13.167472,3561.882501,57.438330,-0.004666,63.363784,-36.636216,0.007413,0.998065,0.998064,0.998065,0.998095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266,12.976237,13.151969,12.968241,13.143974,0.030000,1094.331792,57.807450,0.003696,53.896063,-46.103937,0.007272,1.025525,1.012107,1.007537,1.001835
1267,13.060803,13.139046,12.986923,13.059283,0.030000,2145.901326,50.373708,0.010005,27.922096,-72.077904,-0.002407,1.018540,1.005330,1.000975,0.996147
1268,13.095625,13.127843,13.065203,13.094999,0.030000,928.312311,53.112082,0.011691,38.875820,-61.124180,-0.000990,1.020898,1.007710,1.003464,0.999059
1269,13.098198,13.123229,13.085672,13.109705,0.030000,551.658010,54.231977,0.011705,43.386177,-56.613823,0.000041,1.021603,1.008436,1.004284,1.000151


In [18]:
HD_5Y_smooth.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'RSI', 'MACD',
       'STOCH', '%R', 'PROC', 'ema50', 'ema21', 'ema15', 'ema5'],
      dtype='object')

Instead of using the actual volume value (which changes over time), we normalize it with a moving volume average

In [19]:
HD_5Y_smooth['normVol'] = HD_5Y_smooth['Volume'] / HD_5Y_smooth['Volume'].ewm(5).mean()

# Applying Machine Learning model 

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

### RandomForestClassifier: Predict Stock Market Direction

In [21]:
# Set class labels to classify
HD_5Y_smooth['Return'] = HD_5Y_smooth['Close'].pct_change(1).shift(-1)
HD_5Y_smooth['class'] = np.where(HD_5Y_smooth['Return'] > 0, 1, 0)
HD_5Y_smooth.dropna(inplace=True)
# Data to predict
predictors = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'RSI', 'MACD',
       'STOCH', '%R', 'PROC', 'ema50', 'ema21', 'ema15', 'ema5', 'normVol']
X = HD_5Y_smooth[predictors]
y = HD_5Y_smooth['class']

In [22]:
HD_5Y_smooth.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'RSI', 'MACD',
       'STOCH', '%R', 'PROC', 'ema50', 'ema21', 'ema15', 'ema5', 'normVol',
       'Return', 'class'],
      dtype='object')

In [23]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train the model
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(X_train, y_train)

# Test the model by doing some predictions
y_pred = rfc.predict(X_test)

# See how accurate the predictions are
report = classification_report(y_test, y_pred)
print('Model accuracy', accuracy_score(y_test, y_pred, normalize=True))
print(report)

Model accuracy 0.6613756613756614
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       185
           1       0.69      0.61      0.65       193

    accuracy                           0.66       378
   macro avg       0.66      0.66      0.66       378
weighted avg       0.66      0.66      0.66       378



- Dimentionnality reduction and Prediction

Using components such that 95% of variance is retained

In [24]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca.shape

(1257, 1)

In [25]:
pca.explained_variance_ratio_

array([0.99959599])

In [26]:
pca.n_components_

1

In [27]:
X_pca

array([[ 6815.68217167],
       [ 3140.88743509],
       [  540.07337813],
       ...,
       [  182.50488054],
       [-1035.1001348 ],
       [-1411.76110202]])

In [28]:
pd.DataFrame(X_pca)

Unnamed: 0,0
0,6815.682172
1,3140.887435
2,540.073378
3,866.834598
4,1598.439920
...,...
1252,-1834.340429
1253,-869.104044
1254,182.504881
1255,-1035.100135


In [29]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [30]:
# Train the model
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(X_train_pca, y_train)

# Test the model by doing some predictions
y_pred = rfc.predict(X_test_pca)

# See how accurate the predictions are
report = classification_report(y_test, y_pred)
print('Model accuracy', accuracy_score(y_test, y_pred, normalize=True))
print(report)

Model accuracy 0.5277777777777778
              precision    recall  f1-score   support

           0       0.56      0.54      0.55       134
           1       0.50      0.52      0.51       118

    accuracy                           0.53       252
   macro avg       0.53      0.53      0.53       252
weighted avg       0.53      0.53      0.53       252



 Selecting only two components

In [31]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_pca.shape

(1257, 2)

In [32]:
X_pca

array([[ 6815.68217167,   -32.80300819],
       [ 3140.88743509,   -35.85776767],
       [  540.07337813,   -55.98945022],
       ...,
       [  182.50488054,    28.47652198],
       [-1035.1001348 ,    14.18940646],
       [-1411.76110202,     8.17555469]])

In [33]:
pd.DataFrame(X_pca, columns=['principal component 1', 'principal component 2'])

Unnamed: 0,principal component 1,principal component 2
0,6815.682172,-32.803008
1,3140.887435,-35.857768
2,540.073378,-55.989450
3,866.834598,-30.936420
4,1598.439920,-22.920795
...,...,...
1252,-1834.340429,-3.654030
1253,-869.104044,-7.649563
1254,182.504881,28.476522
1255,-1035.100135,14.189406


In [37]:
pca.explained_variance_ratio_

array([9.99595992e-01, 3.77661019e-04])

In [38]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)
# Train the model
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(X_train_pca, y_train)

# Test the model by doing some predictions
y_pred = rfc.predict(X_test_pca)

# See how accurate the predictions are
report = classification_report(y_test, y_pred)
print('Model accuracy', accuracy_score(y_test, y_pred, normalize=True))
print(report)

Model accuracy 0.5198412698412699
              precision    recall  f1-score   support

           0       0.55      0.54      0.54       134
           1       0.49      0.50      0.49       118

    accuracy                           0.52       252
   macro avg       0.52      0.52      0.52       252
weighted avg       0.52      0.52      0.52       252



### Features selection 

In [39]:
predictors = ['RSI', 'MACD','STOCH', '%R', 'PROC', 'ema50', 'ema21', 'ema15', 'ema5', 'normVol']
X = HD_5Y_smooth[predictors]
y = HD_5Y_smooth['class']
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train the model
rfc = RandomForestClassifier(random_state=0)
rfc = rfc.fit(X_train, y_train)

# Test the model by doing some predictions
y_pred = rfc.predict(X_test)

# See how accurate the predictions are
report = classification_report(y_test, y_pred)
print('Model accuracy', accuracy_score(y_test, y_pred, normalize=True))
print(report)

Model accuracy 0.5846560846560847
              precision    recall  f1-score   support

           0       0.60      0.59      0.60       195
           1       0.57      0.57      0.57       183

    accuracy                           0.58       378
   macro avg       0.58      0.58      0.58       378
weighted avg       0.58      0.58      0.58       378

