In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import yfinance as yf
import math
import plotly_express as px

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from functions import grangerTests, plot_seasonal_decompose, daySignal, addDateParts

In [37]:
#define the ticker symbol
tickerSymbol = 'ETH-USD' #'MSFT'

# Date period
start = dt.datetime(2010,1,1)
end = dt.datetime.now()

#get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
#get the historical prices for this ticker
df = tickerData.history(period='1D', start=start, end=end)

# Removing features with constant values
df = df.loc[:,df.apply(pd.Series.nunique) != 1]

# Some calculated variables
df['day_change'] = df['Close'] - df['Open']
df['day_change_pct'] = (df['day_change'] / df['Open']) * 100
df['day_change_signal'] = df['day_change'].apply(daySignal)

# Lagging close, high and low by one day
for feature in ['Close', 'High', 'Low', 'Volume']:
    df[f"{feature}_lag1"] = df[feature].shift(1).bfill() 

# Adding date features
df = addDateParts(df)

# Exit datetime index
df = df.reset_index()

# View dataframe
df


weekofyear and week have been deprecated, please use DatetimeIndex.isocalendar().week instead, which returns a Series. To exactly reproduce the behavior of week and weekofyear and return an Index, you may call pd.Int64Index(idx.isocalendar().week)



Unnamed: 0,Date,Open,High,Low,Close,Volume,day_change,day_change_pct,day_change_signal,Close_lag1,High_lag1,Low_lag1,Volume_lag1,day_of_year,day_of_week,week,month
0,2017-11-09 00:00:00+00:00,308.644989,329.451996,307.056000,320.884003,893249984,12.239014,3.965402,1,320.884003,329.451996,307.056000,8.932500e+08,9,3,45,11
1,2017-11-10 00:00:00+00:00,320.670990,324.717987,294.541992,299.252991,885985984,-21.417999,-6.679120,-1,320.884003,329.451996,307.056000,8.932500e+08,10,4,45,11
2,2017-11-11 00:00:00+00:00,298.585999,319.453003,298.191986,314.681000,842300992,16.095001,5.390407,1,299.252991,324.717987,294.541992,8.859860e+08,11,5,45,11
3,2017-11-12 00:00:00+00:00,314.690002,319.153015,298.513000,307.907990,1613479936,-6.782013,-2.155141,-1,314.681000,319.453003,298.191986,8.423010e+08,12,6,45,11
4,2017-11-13 00:00:00+00:00,307.024994,328.415009,307.024994,316.716003,1041889984,9.691010,3.156424,1,307.907990,319.153015,298.513000,1.613480e+09,13,0,46,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1903,2023-01-25 00:00:00+00:00,1556.807495,1632.241699,1530.797852,1611.711060,10598973448,54.903564,3.526677,1,1556.604248,1639.723877,1551.389771,8.180275e+09,25,2,4,1
1904,2023-01-26 00:00:00+00:00,1611.080933,1626.198242,1586.598145,1603.105957,8395315241,-7.974976,-0.495008,-1,1611.711060,1632.241699,1530.797852,1.059897e+10,26,3,4,1
1905,2023-01-27 00:00:00+00:00,1603.080078,1617.000854,1565.244995,1598.156494,8124465373,-4.923584,-0.307133,-1,1603.105957,1626.198242,1586.598145,8.395315e+09,27,4,4,1
1906,2023-01-28 00:00:00+00:00,1598.125366,1604.704102,1565.390137,1572.435059,5803653357,-25.690308,-1.607528,-1,1598.156494,1617.000854,1565.244995,8.124465e+09,28,5,4,1


## Target variable

In [38]:
feat_ex = 'Close'
px.line(df, x='Date', y=[feat_ex])

## Pre-processing

In [39]:
# Last year as testing period
split = '2022-01-31' 
# Drop columns not available at open
drop_cols = ['Date', 'High', 'Low', 'Volume', 'day_change','day_change_pct', 'day_change_signal']
# Split on date
train = df[df['Date'] <= split].drop(columns=drop_cols)
test = df[df['Date'] > split].drop(columns=drop_cols)
# Features and target
X_train = train.drop(columns=feat_ex)
X_test = test.drop(columns=feat_ex)
y_train = train[feat_ex]
y_test = test[feat_ex]
# Scale features
scaler = MinMaxScaler(feature_range=(0,1))
X_train_scaled = scaler.fit_transform(X_train)#.reshape(-1,1))
X_test_scaled = scaler.transform(X_test)
# Target as np array
y_train, y_test = np.array(y_train), np.array(y_test)
# Reshape for LSTM
X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_lstm = np.reshape(X_test_scaled, (X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

In [40]:
print(f"The shape of the training data is {X_train_lstm.shape} features and {y_train.shape} target")
print("------------------------------")
print(f"The shape of the testing data is {X_test_lstm.shape} features and {y_test.shape} target")

The shape of the training data is (1545, 9, 1) features and (1545,) target
------------------------------
The shape of the testing data is (363, 9, 1) features and (363,) target


## LSTM

In [41]:
model = keras.Sequential()
model.add(layers.LSTM(100, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
model.add(layers.LSTM(100, return_sequences=False))
model.add(layers.Dense(25))
model.add(layers.Dense(1))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 9, 100)            40800     
                                                                 
 lstm_7 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 25)                2525      
                                                                 
 dense_7 (Dense)             (None, 1)                 26        
                                                                 
Total params: 123,751
Trainable params: 123,751
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(X_train_lstm, y_train, batch_size= 1, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [44]:
hist_df = pd.DataFrame(history.history)
px.line(hist_df, y=['loss'])

In [45]:
results_df = df[['Date']]
results_df = results_df[results_df['Date'] > split]
predictions = model.predict(X_test_lstm)
results_df['predictions'] = predictions
results_df = results_df.set_index('Date').join(df[['Date', feat_ex]].set_index('Date'), how='outer')
results_df



Unnamed: 0_level_0,predictions,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-09 00:00:00+00:00,,320.884003
2017-11-10 00:00:00+00:00,,299.252991
2017-11-11 00:00:00+00:00,,314.681000
2017-11-12 00:00:00+00:00,,307.907990
2017-11-13 00:00:00+00:00,,316.716003
...,...,...
2023-01-25 00:00:00+00:00,1595.706787,1611.711060
2023-01-26 00:00:00+00:00,1651.965210,1603.105957
2023-01-27 00:00:00+00:00,1643.726562,1598.156494
2023-01-28 00:00:00+00:00,1638.670044,1572.435059


In [46]:
px.line(results_df, x=results_df.index, y=[feat_ex, 'predictions'])

In [47]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

perf_dict = dict()
model_name = 'lstm'
perf_dict[model_name] = {
    'r2': r2_score(y_test, predictions),
    'mae': mean_absolute_error(y_test, predictions),
    'mape': mean_absolute_percentage_error(y_test, predictions)*100,
    'rmse': mean_squared_error(y_test, predictions, squared=False),
}

pd.DataFrame(perf_dict).T

Unnamed: 0,mae,mape,r2,rmse
lstm,80.214831,4.173377,0.974303,112.745836
