In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import yfinance as yf
import math
import plotly_express as px
from datetime import datetime, timedelta

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from functions import grangerTests, plot_seasonal_decompose, daySignal, addDateParts

In [37]:
#define the ticker symbol
tickerSymbol = 'ETH-USD' #'MSFT'

# Date period
start = dt.datetime(2010,1,1)
end = dt.datetime.now()

#get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
#get the historical prices for this ticker
df = tickerData.history(period='1D', start=start, end=end)

# Removing features with constant values
df = df.loc[:,df.apply(pd.Series.nunique) != 1]

# Some calculated variables
df['day_change'] = df['Close'] - df['Open']
df['day_change_pct'] = (df['day_change'] / df['Open']) * 100
df['day_change_signal'] = df['day_change'].apply(daySignal)

# Lagging close, high and low by one day
for feature in ['Close', 'High', 'Low', 'Volume']:
    df[f"{feature}_lag1"] = df[feature].shift(1).bfill() 

# Adding date features
df = addDateParts(df)

# Exit datetime index
df = df.reset_index()

# View dataframe
df


weekofyear and week have been deprecated, please use DatetimeIndex.isocalendar().week instead, which returns a Series. To exactly reproduce the behavior of week and weekofyear and return an Index, you may call pd.Int64Index(idx.isocalendar().week)



Unnamed: 0,Date,Open,High,Low,Close,Volume,day_change,day_change_pct,day_change_signal,Close_lag1,High_lag1,Low_lag1,Volume_lag1,day_of_year,day_of_week,week,month
0,2017-11-09 00:00:00+00:00,308.644989,329.451996,307.056000,320.884003,893249984,12.239014,3.965402,1,320.884003,329.451996,307.056000,8.932500e+08,9,3,45,11
1,2017-11-10 00:00:00+00:00,320.670990,324.717987,294.541992,299.252991,885985984,-21.417999,-6.679120,-1,320.884003,329.451996,307.056000,8.932500e+08,10,4,45,11
2,2017-11-11 00:00:00+00:00,298.585999,319.453003,298.191986,314.681000,842300992,16.095001,5.390407,1,299.252991,324.717987,294.541992,8.859860e+08,11,5,45,11
3,2017-11-12 00:00:00+00:00,314.690002,319.153015,298.513000,307.907990,1613479936,-6.782013,-2.155141,-1,314.681000,319.453003,298.191986,8.423010e+08,12,6,45,11
4,2017-11-13 00:00:00+00:00,307.024994,328.415009,307.024994,316.716003,1041889984,9.691010,3.156424,1,307.907990,319.153015,298.513000,1.613480e+09,13,0,46,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1904,2023-01-26 00:00:00+00:00,1611.080933,1626.198242,1586.598145,1603.105957,8395315241,-7.974976,-0.495008,-1,1611.711060,1632.241699,1530.797852,1.059897e+10,26,3,4,1
1905,2023-01-27 00:00:00+00:00,1603.080078,1617.000854,1565.244995,1598.156494,8124465373,-4.923584,-0.307133,-1,1603.105957,1626.198242,1586.598145,8.395315e+09,27,4,4,1
1906,2023-01-28 00:00:00+00:00,1598.125366,1604.704102,1565.390137,1572.435059,5803653357,-25.690308,-1.607528,-1,1598.156494,1617.000854,1565.244995,8.124465e+09,28,5,4,1
1907,2023-01-29 00:00:00+00:00,1572.629517,1653.724976,1568.984863,1646.155640,8801292300,73.526123,4.675362,1,1572.435059,1604.704102,1565.390137,5.803653e+09,29,6,4,1


## Target variable

In [38]:
feat_ex = 'Close'
px.line(df, x='Date', y=[feat_ex])

## Pre-processing

In [39]:
# Last year as testing period
split = '2022-01-01' 
# Drop columns not available at open
drop_cols = ['Date', 'High', 'Low', 'Volume', 'day_change','day_change_pct', 'day_change_signal']
# Split on date
train = df[df['Date'] <= split].drop(columns=drop_cols)
test = df[df['Date'] > split].drop(columns=drop_cols)
# Features and target
X_train = train.drop(columns=feat_ex)
X_test = test.drop(columns=feat_ex)
y_train = train[feat_ex]
y_test = test[feat_ex]
# Scale features
scaler = MinMaxScaler(feature_range=(0,1))
X_train_scaled = scaler.fit_transform(X_train)#.reshape(-1,1))
X_test_scaled = scaler.transform(X_test)
# Target as np array
y_train, y_test = np.array(y_train), np.array(y_test)
# Reshape for LSTM
X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_lstm = np.reshape(X_test_scaled, (X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

In [40]:
print(f"The shape of the training data is {X_train_lstm.shape} features and {y_train.shape} target")
print("------------------------------")
print(f"The shape of the testing data is {X_test_lstm.shape} features and {y_test.shape} target")

The shape of the training data is (1515, 9, 1) features and (1515,) target
------------------------------
The shape of the testing data is (394, 9, 1) features and (394,) target


In [41]:
# Create model
model = keras.Sequential()
model.add(layers.LSTM(100, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
model.add(layers.LSTM(100, return_sequences=False))
model.add(layers.Dense(25))
model.add(layers.Dense(1))
# Compile & fit model
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_lstm, y_train, batch_size= 1, epochs=30, verbose=1)
# Predict next point
y_pred = model.predict(X_test_lstm)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [42]:
results_df = df[['Date']]
results_df = results_df[results_df['Date'] > split]
predictions = model.predict(X_test_lstm)
results_df['predictions'] = predictions
results_df = results_df.set_index('Date').join(df[['Date', feat_ex]].set_index('Date'), how='outer')
# Plot results
px.line(results_df, x=results_df.index, y=[feat_ex, 'predictions'])



In [43]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

perf_dict = dict()
model_name = 'lstm'
perf_dict[model_name] = {
    'r2': r2_score(y_test, predictions),
    'mae': mean_absolute_error(y_test, predictions),
    'mape': mean_absolute_percentage_error(y_test, predictions)*100,
    'rmse': mean_squared_error(y_test, predictions, squared=False),
}

pd.DataFrame(perf_dict).T

Unnamed: 0,mae,mape,r2,rmse
lstm,63.872607,3.385038,0.985739,89.991952


### Walk forward LSTM

In [44]:
# Last year as testing period
split = '2023-01-01' 
# Drop columns not available at open
drop_cols = ['Date', 'High', 'Low', 'Volume', 'day_change','day_change_pct', 'day_change_signal']
# Split index
split_idx = df[df['Date']==split].index[0]
# Walk forward predictions
test_idx = df.index[df.index > split_idx]
predictions = list()
for i in test_idx:
    # Split on date
    train = df[df.index <= split_idx].drop(columns=drop_cols)
    test = df[df.index == i].drop(columns=drop_cols)
    # Features and target
    X_train = train.drop(columns=feat_ex)
    X_test = test.drop(columns=feat_ex)
    y_train = train[feat_ex]
    y_test = test[feat_ex]
    # Scale features
    scaler = MinMaxScaler(feature_range=(0,1))
    X_train_scaled = scaler.fit_transform(X_train)#.reshape(-1,1))
    X_test_scaled = scaler.transform(X_test)
    # Target as np array
    y_train, y_test = np.array(y_train), np.array(y_test)
    # Reshape for LSTM
    X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
    X_test_lstm = np.reshape(X_test_scaled, (X_test_scaled.shape[0], X_test_scaled.shape[1], 1))
    # Iterate split index
    split_idx += 1
    # Create model
    model = keras.Sequential()
    model.add(layers.LSTM(100, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
    model.add(layers.LSTM(100, return_sequences=False))
    model.add(layers.Dense(25))
    model.add(layers.Dense(1))
    # Compile & fit model
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train_lstm, y_train, batch_size= 1, epochs=10, verbose=2)
    # Predict next point
    y_pred = model.predict(X_test_lstm)
    predictions.append(y_pred[0])
    

Epoch 1/10
1880/1880 - 20s - loss: 1088593.8750 - 20s/epoch - 11ms/step
Epoch 2/10
1880/1880 - 15s - loss: 69614.3125 - 15s/epoch - 8ms/step
Epoch 3/10
1880/1880 - 15s - loss: 24051.5547 - 15s/epoch - 8ms/step
Epoch 4/10
1880/1880 - 17s - loss: 20579.8574 - 17s/epoch - 9ms/step
Epoch 5/10
1880/1880 - 17s - loss: 18442.4199 - 17s/epoch - 9ms/step
Epoch 6/10
1880/1880 - 17s - loss: 15603.4883 - 17s/epoch - 9ms/step
Epoch 7/10
1880/1880 - 19s - loss: 12987.7705 - 19s/epoch - 10ms/step
Epoch 8/10
1880/1880 - 17s - loss: 14261.1357 - 17s/epoch - 9ms/step
Epoch 9/10
1880/1880 - 18s - loss: 12446.2480 - 18s/epoch - 10ms/step
Epoch 10/10
1880/1880 - 20s - loss: 12142.2383 - 20s/epoch - 11ms/step
Epoch 1/10
1881/1881 - 25s - loss: 1143457.1250 - 25s/epoch - 13ms/step
Epoch 2/10
1881/1881 - 20s - loss: 71959.4375 - 20s/epoch - 11ms/step
Epoch 3/10
1881/1881 - 20s - loss: 26274.4121 - 20s/epoch - 11ms/step
Epoch 4/10
1881/1881 - 21s - loss: 26814.3457 - 21s/epoch - 11ms/step
Epoch 5/10
1881/1881 

In [48]:
wf_preds = np.array(predictions)

In [54]:
results_df = df[['Date']]
results_df = results_df[results_df['Date'] > split]
results_df['predictions'] = wf_preds
results_df = results_df.set_index('Date').join(df[['Date', feat_ex]].set_index('Date'), how='outer')
px.line(results_df, x=results_df.index, y=[feat_ex, 'predictions'])

In [68]:
results_df.dropna()

Unnamed: 0_level_0,predictions,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-02 00:00:00+00:00,1135.134033,1214.656616
2023-01-03 00:00:00+00:00,1181.916016,1214.778809
2023-01-04 00:00:00+00:00,1087.960815,1256.526611
2023-01-05 00:00:00+00:00,1224.827637,1250.438599
2023-01-06 00:00:00+00:00,1110.419922,1269.379028
2023-01-07 00:00:00+00:00,1118.065063,1264.270386
2023-01-08 00:00:00+00:00,1245.755981,1287.359497
2023-01-09 00:00:00+00:00,1300.317871,1321.53894
2023-01-10 00:00:00+00:00,1246.17627,1336.58606
2023-01-11 00:00:00+00:00,1297.191528,1387.932739


In [78]:
#perf_dict = dict()
model_name = 'wf lstm'
perf_dict[model_name] = {
    'r2': r2_score(results_df.dropna()[feat_ex].values, results_df.dropna()['predictions'].values),
    'mae': mean_absolute_error(results_df.dropna()[feat_ex].values, results_df.dropna()['predictions'].values),
    'mape': mean_absolute_percentage_error(results_df.dropna()[feat_ex].values, results_df.dropna()['predictions'].values)*100,
    'rmse': mean_squared_error(results_df.dropna()[feat_ex].values, results_df.dropna()['predictions'].values, squared=False),
}

pd.DataFrame(perf_dict).T

Unnamed: 0,mae,mape,r2,rmse
wf lstm,63.842984,4.467202,0.642189,90.238848


In [93]:
hist_df = pd.DataFrame(history.history)
px.line(hist_df, y=['loss'])

## Cross validation

In [97]:
drop_cols = ['Date', 'High', 'Low', 'Volume', 'day_change','day_change_pct', 'day_change_signal']
# Drop columns not available at open
X = df.drop(columns=drop_cols)
# Drop target
X = X.drop(columns=feat_ex)
# Create target
y = df[feat_ex]

In [98]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(max_train_size=None, n_splits=5)
split = 1
pred_dict = dict()
for train_index, test_index in tscv.split(X):
    print(f"split: {split}")
    print("TRAIN:", train_index.max(), "TEST:", test_index.max()) 
    #print("TRAIN:", train_index, "TEST:", test_index) 
    X_train, X_test = X[:train_index.max()], X[train_index.max():test_index.max()]
    y_train, y_test = y[:train_index.max()], y[train_index.max():test_index.max()]
    print(f"Training {X_train.shape} features and {y_train.shape} target")
    print("---------------")
    print(f"Testing {X_test.shape} features and {y_test.shape} target")
    # Scale features
    scaler = MinMaxScaler(feature_range=(0,1))
    X_train_scaled = scaler.fit_transform(X_train)#.reshape(-1,1))
    X_test_scaled = scaler.transform(X_test)
    # Target as np array
    y_train, y_test = np.array(y_train), np.array(y_test)
    # Reshape for LSTM
    X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
    X_test_lstm = np.reshape(X_test_scaled, (X_test_scaled.shape[0], X_test_scaled.shape[1], 1))
    # Define model
    model = keras.Sequential()
    model.add(layers.LSTM(100, return_sequences=True, input_shape=(X_train_lstm.shape[1], 1)))
    model.add(layers.LSTM(100, return_sequences=False))
    model.add(layers.Dense(25))
    model.add(layers.Dense(1))
    # Compile and fit
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train_lstm, y_train, batch_size= 1, epochs=30)
    # Predict test set
    predictions = model.predict(X_test_lstm)
    #pred_dict[f"split: {split}"] = predictions
    perf_dict[f"split: {split}"] = {
        'r2': r2_score(y_test, predictions),
        'mae': mean_absolute_error(y_test, predictions),
        'mape': mean_absolute_percentage_error(y_test, predictions)*100,
        'rmse': mean_squared_error(y_test, predictions, squared=False),
    }
    # Iterate split
    split += 1

split: 1
TRAIN: 317 TEST: 635
Training (317, 9) features and (317,) target
---------------
Testing (318, 9) features and (318,) target
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
split: 2
TRAIN: 635 TEST: 953
Training (635, 9) features and (635,) target
---------------
Testing (318, 9) features and (318,) target
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
split: 3
TRAIN: 953 TEST: 12

In [101]:
pd.DataFrame(perf_dict).T.sort_values(by='rmse')

Unnamed: 0,r2,mae,mape,rmse
split: 2,0.803235,13.725543,7.995576,16.078393
split: 1,0.258963,41.663847,31.227426,50.849238
split: 5,0.984885,58.036998,3.528295,79.640428
lstm,0.980619,69.24333,3.787606,97.913734
split: 3,0.724314,194.612701,11.501801,376.405679
split: 4,-0.846521,822.937895,22.911566,1025.27943
