In [1]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import warnings
warnings.filterwarnings("ignore")

<h1 style="text-align:center">🏰 Data Modelling</h1> 

<h3 style="text-align:center">Predict BTC prices</h3> 

# 🎖 1. Get newest data points

## ➡️ Define symbol and time interval

In [2]:
symbol = "BTCUSDT"
PERIOD = "15m"

In [3]:
!jupyter nbconvert --to python ../obtain/get_newest_price.ipynb
!python ../obtain/get_newest_price.py

[NbConvertApp] Converting notebook ../obtain/get_newest_price.ipynb to python
[NbConvertApp] Writing 1547 bytes to ../obtain/get_newest_price.py


In [4]:
pd_df = pd.read_csv(f"../../datastore/price/{symbol}_{PERIOD}.csv")
pd_df = pd_df.iloc[-900:]
pd_df

Unnamed: 0,Kline open time,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
171635,1672440300000,16588.59,16592.48,16581.06,16583.27,1081.54287,1672441199999,1.793969e+07,28099,506.76685,8.406024e+06
171636,1672441200000,16582.81,16599.82,16568.00,16593.88,1710.26619,1672442099999,2.837227e+07,40712,852.56365,1.414426e+07
171637,1672442100000,16593.88,16599.73,16586.94,16595.48,904.07320,1672442999999,1.500147e+07,29503,449.23096,7.454311e+06
171638,1672443000000,16595.03,16618.58,16589.96,16615.56,1374.84320,1672443899999,2.283034e+07,35425,661.07609,1.097769e+07
171639,1672443900000,16615.97,16618.75,16598.58,16607.48,997.15888,1672444799999,1.655963e+07,28520,444.77113,7.386217e+06
...,...,...,...,...,...,...,...,...,...,...,...
172530,1673245800000,17209.58,17218.64,17203.04,17203.43,1333.28379,1673246699999,2.294574e+07,42014,689.32453,1.186355e+07
172531,1673246700000,17199.71,17200.45,17180.00,17192.45,1615.35143,1673247599999,2.776759e+07,37437,767.42892,1.319210e+07
172532,1673247600000,17189.25,17196.91,17183.92,17192.58,1687.41328,1673248499999,2.900809e+07,45603,862.10860,1.482077e+07
172533,1673248500000,17192.59,17215.48,17189.65,17203.17,1906.65641,1673249399999,3.279738e+07,51790,992.10321,1.706594e+07


## ➡️ Select features

### 📌 Using Open time as index to visualize later

In [5]:
pd_df["Open price"] = pd_df["Open price"].apply(lambda price: float(price))
pd_df = pd_df.set_index("Kline open time").sort_index() 
pd_df.index = pd.to_datetime(pd_df.index, unit='ms') + pd.Timedelta('07:00:00')
pd_df

Unnamed: 0_level_0,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Kline open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-12-31 05:45:00,16588.59,16592.48,16581.06,16583.27,1081.54287,1672441199999,1.793969e+07,28099,506.76685,8.406024e+06
2022-12-31 06:00:00,16582.81,16599.82,16568.00,16593.88,1710.26619,1672442099999,2.837227e+07,40712,852.56365,1.414426e+07
2022-12-31 06:15:00,16593.88,16599.73,16586.94,16595.48,904.07320,1672442999999,1.500147e+07,29503,449.23096,7.454311e+06
2022-12-31 06:30:00,16595.03,16618.58,16589.96,16615.56,1374.84320,1672443899999,2.283034e+07,35425,661.07609,1.097769e+07
2022-12-31 06:45:00,16615.97,16618.75,16598.58,16607.48,997.15888,1672444799999,1.655963e+07,28520,444.77113,7.386217e+06
...,...,...,...,...,...,...,...,...,...,...
2023-01-09 13:30:00,17209.58,17218.64,17203.04,17203.43,1333.28379,1673246699999,2.294574e+07,42014,689.32453,1.186355e+07
2023-01-09 13:45:00,17199.71,17200.45,17180.00,17192.45,1615.35143,1673247599999,2.776759e+07,37437,767.42892,1.319210e+07
2023-01-09 14:00:00,17189.25,17196.91,17183.92,17192.58,1687.41328,1673248499999,2.900809e+07,45603,862.10860,1.482077e+07
2023-01-09 14:15:00,17192.59,17215.48,17189.65,17203.17,1906.65641,1673249399999,3.279738e+07,51790,992.10321,1.706594e+07


### 📌 Visualize prices as candle stick

In [6]:
fig = go.Figure(data=[go.Candlestick(x=pd_df.index,
                open=pd_df['Open price'],
                high=pd_df['High price'],
                low=pd_df['Low price'],
                close=pd_df['Close price'])])
# set new height and width
fig.update_layout(
    height=800,
    width=1000,
    title_text="BTC/USDT price",
    yaxis_title="Price (BTC/USDT)",
    xaxis_title="Date",
    xaxis_rangeslider_visible=True
)

fig.show()

### 📌 Using open price as feature and target as well

In [7]:
dataset = pd_df.filter(["Open price","High price","Low price","Close price"]).values
dataset[:5]

array([[16588.59, 16592.48, 16581.06, 16583.27],
       [16582.81, 16599.82, 16568.  , 16593.88],
       [16593.88, 16599.73, 16586.94, 16595.48],
       [16595.03, 16618.58, 16589.96, 16615.56],
       [16615.97, 16618.75, 16598.58, 16607.48]])

# 🎖 2. Prepare train-test set

✅ Train-Test ratio: `80%` train, `20%` test <br>
✅ Train-Valid ratio: `70%` train, `30%` valid

In [8]:
TRAIN_TEST_LENGTH = int(len(dataset) * 0.8)
TRAIN_VALID_LENGTH = int(TRAIN_TEST_LENGTH * 0.7)

## ➡️ Scale data

In [9]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dataset)
scaled_data[:5]

array([[0.11711945, 0.10336799, 0.14650169, 0.10991259],
       [0.1092406 , 0.11318962, 0.12927396, 0.12438123],
       [0.12433037, 0.11306919, 0.15425813, 0.12656312],
       [0.12589796, 0.13829232, 0.15824187, 0.15394581],
       [0.15444173, 0.1385198 , 0.16961271, 0.14292727]])

## ➡️ Prepare time series data

In [10]:
def get_split_data(start, end = None):
    data = scaled_data[start:end]
    X = []
    Y = []
    for i in range(60,len(data)):
        X.append(data[i-60:i,:])
        Y.append(data[i,:])

    X, Y = np.array(X), np.array(Y)
    return X,Y

## ➡️ Split train-test set

In [11]:
X_origin_train, Y_origin_train = get_split_data(0, TRAIN_TEST_LENGTH)
X_test, Y_test = get_split_data(TRAIN_TEST_LENGTH-60, )

print(X_origin_train.shape, Y_origin_train.shape)
print(X_test.shape, Y_test.shape)

(660, 60, 4) (660, 4)
(180, 60, 4) (180, 4)


## ➡️ Split train-valid set

In [32]:
X_train, Y_train = get_split_data(0, TRAIN_VALID_LENGTH)
X_valid, Y_valid = get_split_data(TRAIN_VALID_LENGTH-60, TRAIN_TEST_LENGTH)

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

(443, 60, 4) (443, 4)
(217, 60, 4) (217, 4)


# 🎖 3. Naive training

## ➡️ Visualize predictions

In [29]:
def visualize_prediction(preds):
    data = pd_df[["Open price"]]
    train = data[:TRAIN_VALID_LENGTH]
    valid = data[TRAIN_VALID_LENGTH:TRAIN_TEST_LENGTH].reset_index()
    valid["Predict"] = preds[:, 0]
    valid = valid.set_index("Kline open time")
    valid["Predict"]= valid["Predict"].apply(lambda price: float(price))

    concat_df = pd.concat([train, valid], axis=0)
    fig = px.line(concat_df[["Open price","Predict"]], title="BTC/USDT price" , width=1000, height=800)
    fig.show()

In [34]:
X_train.shape

(443, 60, 4)

## ➡️ Linear Regression

### 📌 Build model and train

In [35]:
from sklearn.linear_model import LinearRegression

In [36]:
model = LinearRegression()
model.fit(X_train.reshape(X_train.shape[0], -1), Y_train)

LinearRegression()

### 📌 Predict and transform to the original scale

In [41]:
predictions = model.predict(X_valid.reshape(X_valid.shape[0], -1))
predictions = scaler.inverse_transform(np.array(predictions))
predictions.shape

(217, 4)

### 📌 Visualize the prediction

In [42]:
visualize_prediction(predictions)

## ➡️ Gated Recurrent Unit (GRU)

### 📌 Build model and train

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU

2023-01-09 14:47:02.546045: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [60]:
model = Sequential()
model.add(GRU(200, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(GRU(100))
model.add(Dense(X_train.shape[2]))
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, Y_train, batch_size=1, epochs=1)



<keras.callbacks.History at 0x7f7a9af69310>

### 📌 Predict and transform to the original scale

In [61]:
predictions = model.predict(X_valid)
predictions = scaler.inverse_transform(predictions)
predictions.shape



(217, 4)

### 📌 Visualize the prediction

In [46]:
visualize_prediction(predictions)

## ➡️ Seasonal Auto Regressive Integrated Moving Average (SARIMAX)

In [86]:
from statsmodels.tsa.api import SARIMAX

### 📌 Build model and train

In [None]:
predictions = list()
for t in range(X_valid.shape[0]):
    model = SARIMAX(X_valid[t][1])
    model_fit = model.fit()
    output = model_fit.forecast()
    pred_price = output[0]
    predictions.append(pred_price)

### 📌 Predict and transform to the original scale

In [None]:
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
predictions.shape

### 📌 Visualize the prediction

In [26]:
visualize_prediction(predictions)        

# 🎖 4. Model selection and evaluation

In [27]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, r2_score
from estimator import LSTMEstimator, SarimaxEstimator, LinearRegressionEstimator

## ➡️ Naive cross validation to select model

### 📌 Cross validation method

In [28]:
tscv = TimeSeriesSplit(n_splits=3)

- The method that can be used for cross-validating the time-series model is cross-validation on a rolling basis. Start with a small subset of data for training purpose, forecast for the later data points and then checking the accuracy for the forecasted data points. The same forecasted data points are then included as part of the next training dataset and subsequent data points are forecasted. <br>
![](https://miro.medium.com/max/640/1*XcqvKVTQ6U_zszSD52lSqA.webp)

### 📌 Metric method

In [29]:
metric = make_scorer(r2_score)

- $R^2$ score is used to evaluate the performance of a linear regression model. It is the amount of the variation in the output dependent attribute which is predictable from the input independent variable(s) <br>
![](https://vitalflux.com/wp-content/uploads/2019/07/R-squared-formula-function-of-SSE-and-SST.jpg)

### 📌 Evaluate models

In [30]:
estimators = [
    LinearRegressionEstimator(),
    LSTMEstimator(), 
    SarimaxEstimator(), 
]

In [None]:
scores_dict = {}

for estimator in estimators:
    scores = cross_val_score(estimator, X_origin_train, Y_origin_train, scoring=metric, cv=tscv, n_jobs=-1)
    print(estimator.__class__.__name__, scores.mean())
    scores_dict[estimator.__class__.__name__] = scores
    
scores_df = pd.DataFrame(scores_dict)
scores_df.index = scores_df.index.map(lambda x: f'fold_{x+1}')
scores_df.loc['mean'] = scores_df.mean()

### 📌 Score and reveal best model

In [32]:
scores_df

Unnamed: 0,LinearRegressionEstimator,LSTMEstimator,SarimaxEstimator
fold_1,0.700039,0.287115,0.686077
fold_2,0.862796,0.687337,0.807752
fold_3,0.942435,0.821581,0.914218
mean,0.83509,0.598678,0.802682


- The best best_estimator is one that have highest mean of scores

In [33]:
best_estimator = scores_df.loc['mean'].idxmin()
best_estimator

'LSTMEstimator'

## ➡️ Select best model with best hyperparameters using grid search

In [34]:
from estimator import Estimator, LinearRegressionEstimator, SarimaxEstimator, LSTMEstimator 
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline

### 📌 Cross validation and metric method

- Using aforementioned methods

In [35]:
tscv = TimeSeriesSplit()
metric = make_scorer(r2_score)

### 📌 Evaluate hyperparameters and models 

In [36]:
parameters = [
    {
        'reg__estimator': [LinearRegressionEstimator()]
    },
    {
        'reg__estimator': [SarimaxEstimator()],
        'reg__estimator__order': [(1, 1, 1), (3, 1, 1)],
        'reg__estimator__seasonal_order': [(1, 1, 1, 12), (3, 1, 1, 12)]
    },
    {
        'reg__estimator': [LSTMEstimator()],
        'reg__estimator__epochs': [1, 5, 10],
        'reg__estimator__batch_size': [4, 16, 32],
        'reg__estimator__neurons': [50, 100, 200]
    },
]

In [None]:
pipeline = Pipeline(
    steps=[("reg", Estimator())]
)

grid_search = GridSearchCV(pipeline, parameters, scoring=metric, cv=tscv, n_jobs=2, verbose=0)
grid_search.fit(X_train, Y_train)

- Best model with best hyperparameters

In [38]:
grid_search.best_params_

{'reg__estimator': SarimaxEstimator(order=(1, 1, 1)),
 'reg__estimator__order': (1, 1, 1),
 'reg__estimator__seasonal_order': (1, 1, 1, 12)}

# 🎖 5. Predict future prices

- We utilize the best model to predict prices

In [None]:
from statsmodels.tsa.api import SARIMAX

data = pd_df.filter(["Open price"])

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data).reshape(-1)

predictions = list()
model = SARIMAX(scaled_data, order=(1,1,1), seasonal_order=(1,1,1,12))
model_fit = model.fit()
output = model_fit.forecast()
pred_price = output[0]
predictions.append(pred_price)

In [40]:
pred_price = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
pred_price

array([[17218.5754658]])