### '@author: Andrii Koval, Yulia Khlyaka, Pavlo Mospan'

In [21]:
import numpy as np
import pandas as pd
from math import sqrt
from fbprophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.models import load_model
from datetime import datetime, date, timedelta

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import yfinance as yf
from pathlib import Path

In [2]:
def prepare_data(path=f'{Path().parent.absolute().parent}/data/BTC-USD.csv', start_date='2017-01-01'):
    """Loads .csv file and prepares data for future use.

    Args:
        path (str): path to .csv file

    Returns:
        df: preprocessed DataFrame

    @author: Andrii Koval
    """
    df = pd.read_csv(path)
    df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')
    df['Diff'] = np.diff(df['Close'], prepend=[0])
    df['Direction'] = np.where(df['Diff'] >= 0, 'green', 'red')
    df['Diff_abs'] = np.abs(df['Diff'])
    df = df[df['Date'] > start_date]

    return df

## Prophet

In [3]:
df = prepare_data()
df = df.rename(columns={"Date": "ds", "Close": "y"})

model = Prophet()
model.fit(df)

today = datetime.today()
end = datetime.strptime('2021-03-01', '%Y-%m-%d')
diff = (end - today).days
future = model.make_future_dataframe(periods=diff, freq='D')
forecast = model.predict(future)

forecast['y_actual'] = model.history['y']
forecast = forecast.fillna(0)

forecast = forecast[forecast['ds'] <= '2021-02-01']
rmse = mean_squared_error(forecast['yhat'], forecast['y_actual'], squared=False)
print('Root Mean Squared Error: ', rmse)

forecast.tail()

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
Root Mean Squared Error:  1593.707977659229


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat,y_actual
1483,2021-01-28,28603.938992,29960.998925,34696.392143,28603.938992,28603.938992,3931.91563,3931.91563,3931.91563,-23.684358,-23.684358,-23.684358,3955.599988,3955.599988,3955.599988,0.0,0.0,0.0,32535.854622,31649.605469
1484,2021-01-29,28691.862442,30383.992891,34966.055908,28691.862442,28691.862442,4003.256325,4003.256325,4003.256325,9.186569,9.186569,9.186569,3994.069756,3994.069756,3994.069756,0.0,0.0,0.0,32695.118767,34316.386719
1485,2021-01-30,28779.785892,30693.67272,35330.917618,28779.785892,28779.785892,4018.342472,4018.342472,4018.342472,-28.512113,-28.512113,-28.512113,4046.854585,4046.854585,4046.854585,0.0,0.0,0.0,32798.128363,34269.523438
1486,2021-01-31,28867.709341,30572.166154,35356.34275,28867.709341,28867.709341,4092.096402,4092.096402,4092.096402,-21.115814,-21.115814,-21.115814,4113.212216,4113.212216,4113.212216,0.0,0.0,0.0,32959.805743,33114.359375
1487,2021-02-01,28955.632791,30884.251275,35624.817958,28955.632791,28955.632791,4236.671158,4236.671158,4236.671158,44.645616,44.645616,44.645616,4192.025542,4192.025542,4192.025542,0.0,0.0,0.0,33192.303949,33537.175781


## ARIMA

In [6]:
def load_data_arima():
    """Loads .csv file and prepares data for future use.

    Args:
        path (str): path to .csv file

    Returns:
        df: preprocessed DataFrame

    @author: Yulia Khlyaka
    """
    # loading data
    path = f'{Path().parent.absolute().parent}/data/BTC-USD.csv'
    df = pd.read_csv(path)

    df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')
    # # Producing and visualizing forecasts
    cols = ['Open', 'Low', 'High', 'Volume', 'Adj Close']
    df.drop(cols, axis=1, inplace=True)   
    return df

# Reading in the data
df_a = load_data_arima()
# Resampling to daily frequency
df_a.index = df_a.Date
df_a = df_a.resample('D').mean()

# Model
mod = SARIMAX(df_a, order=(0, 1, 1), seasonal_order=(1, 1, 1, 12))
res = mod.fit(disp=-1)
# Prediction
start = datetime.strptime('2017-01-01', '%Y-%m-%d')
end = datetime.strptime('2021-02-01', '%Y-%m-%d')
a_forecast = res.get_prediction(start=start, end=end)
a_forecast = a_forecast.predicted_mean
a_forecast = a_forecast.reset_index()
a_forecast.columns = ['ds', 'yhat']

# actual values
history_df = load_data_arima()
start_date = datetime.strptime('2016-12-31', '%Y-%m-%d')
end_date = datetime.strptime('2021-02-02', '%Y-%m-%d')
history_df = df_arima[(df_arima['Date'] > start_date) & (df_arima['Date'] < end_date)]
history_df = history_df.Close
a_forecast['y_actual'] = history_df

# Forecast Error
a_forecast['error'] = pow((a_forecast['y_actual'] - a_forecast['yhat']), 2)
print('Root Mean Squared Error 01.01.2017-01.02.2021: ', sqrt(a_forecast['error'].mean()))


Root Mean Squared Error 01.01.2017-01.02.2021:  646.257946471278


## LSTM - 1 feature (Bidirectional)

In [22]:
df = yf.download('BTC-USD', start='2017-01-01', end = '2021-02-01')
df = df.reset_index()

scaler = MinMaxScaler()

close_price = df.Close.values.reshape(-1, 1)

scaled_close = scaler.fit_transform(close_price)

SEQ_LEN = 20
whole_data = np.empty([1, 20, 1])
whole_targets = np.empty([1, 1])

def to_sequences(data, seq_len):
    """
    Spliting data into sequences of the preset length 
    and obtaining shape: [batch_size, sequence_length, n_features]
    
    :param data: array of values
    :param seq_len: sequence length
    
    :return: array of sequenced values
    
    @author: Pavlo Mospan
    """
    d = []

    for index in range(len(data) - seq_len + 1):
        d.append(data[index: index + seq_len])

    return np.array(d)

def preprocess(data_raw, seq_len, train_split):
    """
    Building sequences by creating a sequence of a specified length at position 0. 
    Then shifting one position to the right (e.g. 1) and creating another sequence. 
    
    :param data_raw: array of values
    :param seq_len: sequence length
    :param train_split: percentage of train/test split
    
    :return: arrays of trained and test values
    
    @author: Pavlo Mospan
    """

    global whole_data, whole_targets
    whole_data = to_sequences(data_raw, seq_len)
    whole_targets = whole_data[:, -1, :]
    print('whole_data: ', whole_data.shape)
    print('whole_targets: ', whole_targets.shape)

    num_train = int(train_split * whole_data.shape[0])

    X_train = whole_data[:num_train, :, :]
    y_train = whole_data[:num_train, -1, :]

    X_test = whole_data[num_train:, :, :]
    y_test = whole_data[num_train:, -1, :]

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = preprocess(scaled_close, SEQ_LEN, train_split = 0.95)

X_train.shape, X_test.shape

[*********************100%***********************]  1 of 1 completed
whole_data:  (1470, 20, 1)
whole_targets:  (1470, 1)


((1396, 20, 1), (74, 20, 1))

In [23]:
def get_prediction_array(X):
  """
  Inserts an other array to the end of the whole_data sequence in order to make
  array for predicting tomorrow's value

  returns: array of shape (None, 20, 1)
  
  @author: Pavlo Mospan
  """
  next = len(X)
  a = np.insert(X,[next],X[next-1], axis=0)
  for i_ in range(len(a[100]) - 1):
    if(i_ == 20):
      a[next][i_][0] == a[next][i_ - 1][0]
    else:
      a[next][i_][0] = a[next][i_ + 1][0]
  
  return a

In [18]:
targ_array = get_prediction_array(whole_targets)
pred_array = get_prediction_array(whole_data)
pred_array.shape, targ_array.shape 

((1471, 20, 1), (1471, 1))

In [19]:
model = load_model('../models/lstm/bidirect_lstm/BidirectLSTM_BTC-1f.h5')

y_hat = model.predict(pred_array)

y_true_inverse = scaler.inverse_transform(targ_array)
y_hat_inverse = scaler.inverse_transform(y_hat)

print('Predicted BTC price for the next day: {} and true BTC price for the next day: {}'.format(y_hat_inverse[-1], y_true_inverse[-1]))

Predicted BTC price for the next day: [34196.31] and true BTC price for the next day: [33114.359375]


In [20]:
print('Root mean squared error: ', mean_squared_error(y_hat_inverse, y_true_inverse, squared=False))

Root mean squared error:  424.92938066897835


## LSTM - 4 features

In [2]:
today = datetime.today().strftime('%Y-%m-%d')
start_date = '2015-08-07'
end_date = '2021-02-01'
idx = pd.date_range(start_date, today)

df_A = yf.download("BTC-USD", start=start_date, end=end_date)
df_A = df_A["Close"]
df_A = df_A.reindex(idx, fill_value=0, method="nearest")

# The S&P 500 - stock market index that measures the stock performance of 500 large companies listed on stock exchanges in the US. 
df_B = yf.download("%5EGSPC", start=start_date, end=end_date)
df_B = df_B["Close"]
df_B = df_B.reindex(idx, fill_value=0, method="nearest")

# US Dollar/USDX - Index - Cash
df_C = yf.download("DX-Y.NYB", start=start_date, end=end_date)
df_C = df_C["Close"]
df_C = df_C.reindex(idx, fill_value=0, method="nearest")

# Gold
df_D = yf.download("GC=F", start=start_date, end=end_date)
df_D = df_D["Close"]
df_D = df_D.reindex(idx, fill_value=0, method="nearest")

timeseries = pd.concat([df_A, df_B, df_C, df_D], axis=1)

timeseries.columns=(["Close-btc", "Close-sp500", "Close-dxy", "Close-gold"])
btc = timeseries['Close-btc'].copy()

scl = MinMaxScaler()

timeseries[["Close-btc", "Close-sp500", "Close-dxy", "Close-gold"]] = scl.fit_transform(timeseries[
                                                ["Close-btc", "Close-sp500", "Close-dxy", "Close-gold"]])

timeseries["Close-btc-output"] = timeseries["Close-btc"]
timeseries["Close-btc-output"] = timeseries["Close-btc-output"].shift(-1)
#timeseries.drop(timeseries.tail(1).index, inplace=True)
timeseries.loc[today, 'Close-btc-output']

yesterday = date.today() - timedelta(days=1)
yesterday = yesterday.strftime('%Y-%m-%d')

timeseries.loc[today, 'Close-btc-output'] = timeseries.loc[yesterday, 'Close-btc-output']
print(timeseries)
array = timeseries.values

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
            Close-btc  Close-sp500  Close-dxy  Close-gold  Close-btc-output
2015-08-07   0.001702     0.122634   0.610204    0.043270          0.001244
2015-08-08   0.001244     0.122634   0.610204    0.043270          0.001345
2015-08-09   0.001345     0.135766   0.582994    0.053363          0.001330
2015-08-10   0.001330     0.135766   0.582994    0.053363          0.001476
2015-08-11   0.001476     0.125841   0.591837    0.056760          0.001377
...               ...          ...        ...         ...               ...
2021-02-13   0.810697     0.930355   0.135374    0.795943          0.810697
2021-02-14   0.810697     0.930355   0.135374    0.795943          0.810697
2021-02-15   0.810697     0.930355  

In [3]:
#number of days to take in a row before Y
X_quantity = 3  
mod = len(array) % X_quantity

# how many predictions to make
times = 7

# deleting first-mod values to have /mod-zero array
for i_ in range(mod):
  array = np.delete(array, 0, 0)

# for splitting into train/test
division = X_quantity * times
split = len(array) - division
print('Division: ', division, 'Split is on: ', split)

predict = array[split:]
train = array[:split]

predict, predict.shape

Division:  21 Split is on:  2001


(array([[0.77460817, 0.96645073, 0.12721103, 0.78654943, 0.84031329],
        [0.84031329, 0.93035508, 0.13537448, 0.79594288, 0.83915866],
        [0.83915866, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731, 0.93035508, 0.13537448, 0.79594288, 0.81069731],
        [0.81069731,

In [4]:
def get_X_values(values):
  """
  Returns numpay.ndarray of sequence of X_quantity days

  Args:
    values: array of predict/train values

  @author: Pavlo Mospan
  """
  x = []
  ready_X = []
  COUNT = 1
  for i_ in values:
    x.extend(i_)
    if COUNT % X_quantity == 0:
      ready_X.append(x)
      x = []

    COUNT += 1

  ready_X = np.array(ready_X)
  return ready_X

def get_Y_targets(targets):
  """
  Returns numpay.ndarray of target values

  Args:
    targets: array of predict/train targets

  @author: Pavlo Mospan
  """
  ready_Y = []
  for i_ in range(int(len(targets) / X_quantity)):
    i_ += 1
    ready_Y.append(targets[i_ * X_quantity -1])

  ready_Y = np.array(ready_Y)
  return ready_Y


# taking 4 features 
predict_values = predict[:, :-1] 
# taking target values
predict_target = predict[:, -1]

# taking 4 features
train_values = train[:, :-1]
# taking target values
train_target = train[:, -1]


In [5]:
def get_array(g):
  """
  Gets an array of type (None, 1), shapes it into the shape (None, 4), then scales and returns only predicted values
  
  :param: array of shape (None, 1)
  :return: array ready for predictions with shape (None, 4)

  @author: Pavlo Mospan
  """
  g = np.insert(g, [1], .4, axis = 1)
  g = np.insert(g, [2], .4, axis = 1)
  g = np.insert(g, [3], .4, axis = 1)

  array_ = scl.inverse_transform(g)
  array_ready = []
  for i in range(len(array_[:, :1])):
    array_ready.append(array_[i,:1][0])

  return array_ready

In [6]:
model = load_model('../models/lstm/LSTM_MULTI-3.h5')



In [7]:
predict_all = get_X_values(array[:,:-1])
predict_all.shape

(674, 12)

In [8]:
predict_all = predict_all.reshape((predict_all.shape[0], 1, predict_all.shape[1]))
predict_all.shape

(674, 1, 12)

In [9]:
yhat = model.predict(predict_all) 
    
pred = get_array(yhat)
yhat.shape , len(pred)

((674, 1), 674)

In [10]:
t = timeseries.reset_index()
timestamp = pd.DataFrame()
timestamp['data'] = t['index'].copy()
timestamp

Unnamed: 0,data
0,2015-08-07
1,2015-08-08
2,2015-08-09
3,2015-08-10
4,2015-08-11
...,...
2017,2021-02-13
2018,2021-02-14
2019,2021-02-15
2020,2021-02-16


In [11]:
DAY = 0
ds = []
y_actual = []
tomorrow = date.today() + timedelta(days=1)
tomorrow = tomorrow.strftime('%Y-%m-%d')

for i_ in range(len(pred)):
  if i_ == (len(pred) - 1):
    ds.append(tomorrow)
    y_actual.append(0)
  else:
    DAY = DAY + 3
    ds.append(timestamp.iloc[DAY][0].strftime('%Y-%m-%d'))
    y_actual.append(btc.iloc[DAY])

In [12]:
print('Root mean squared error: ', mean_squared_error(y_actual, pred, squared=False))

Root mean squared error:  1339.3126780735515
