In [1]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import warnings
warnings.filterwarnings("ignore")

<h1 style="text-align:center">🏰 Data Modelling</h1> 

<h3 style="text-align:center">Predict BTC prices</h3> 

# 🎖 1. Get newest data points

## ➡️ Define symbol and time interval

In [2]:
symbol = "BTCUSDT"
PERIOD = "15m"

In [3]:
!jupyter nbconvert --execute --to notebook --inplace ../obtain/get_newest_price.ipynb

[NbConvertApp] Converting notebook ../obtain/get_newest_price.ipynb to notebook
[NbConvertApp] Writing 14773 bytes to ../obtain/get_newest_price.ipynb


In [4]:
pd_df = pd.read_csv(f"../../datastore/processed/{symbol}_{PERIOD}.csv")
pd_df = pd_df.iloc[-3000:]
pd_df

Unnamed: 0,Kline open time,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
169430,1670715000000,17122.40,17140.30,17113.19,17121.63,2090.79715,1670715899999,3.580622e+07,33777,937.10800,1.604848e+07
169431,1670715900000,17122.10,17131.73,17092.00,17127.49,2747.31029,1670716799999,4.701130e+07,42257,1528.78458,2.616088e+07
169432,1670716800000,17127.49,17130.98,17121.92,17128.75,1261.27226,1670717699999,2.160077e+07,28877,617.70589,1.057909e+07
169433,1670717700000,17128.99,17143.71,17127.32,17141.38,1125.88568,1670718599999,1.929091e+07,26377,616.50333,1.056340e+07
169434,1670718600000,17141.79,17145.06,17130.66,17135.28,942.34718,1670719499999,1.615027e+07,25332,450.48498,7.720600e+06
...,...,...,...,...,...,...,...,...,...,...,...
172425,1673410500000,17422.80,17423.48,17404.45,17406.81,979.55166,1673411399999,1.705655e+07,37483,452.05611,7.871567e+06
172426,1673411400000,17406.45,17418.02,17391.00,17416.18,1457.02145,1673412299999,2.535729e+07,45961,737.84522,1.284141e+07
172427,1673412300000,17416.53,17417.56,17403.62,17405.20,538.80086,1673413199999,9.381003e+06,17426,265.52046,4.623036e+06
172428,1673413200000,17407.98,17420.54,17397.27,17416.27,1464.23045,1673414099999,2.549018e+07,42776,808.88019,1.408156e+07


## ➡️ Select features

### 📌 Using Open time as index to visualize later

In [5]:
pd_df["Open price"] = pd_df["Open price"].apply(lambda price: float(price))
pd_df = pd_df.set_index("Kline open time").sort_index() 
pd_df.index = pd.to_datetime(pd_df.index, unit="ms") + pd.Timedelta('07:00:00')
pd_df

Unnamed: 0_level_0,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Kline open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-12-11 06:30:00,17122.40,17140.30,17113.19,17121.63,2090.79715,1670715899999,3.580622e+07,33777,937.10800,1.604848e+07
2022-12-11 06:45:00,17122.10,17131.73,17092.00,17127.49,2747.31029,1670716799999,4.701130e+07,42257,1528.78458,2.616088e+07
2022-12-11 07:00:00,17127.49,17130.98,17121.92,17128.75,1261.27226,1670717699999,2.160077e+07,28877,617.70589,1.057909e+07
2022-12-11 07:15:00,17128.99,17143.71,17127.32,17141.38,1125.88568,1670718599999,1.929091e+07,26377,616.50333,1.056340e+07
2022-12-11 07:30:00,17141.79,17145.06,17130.66,17135.28,942.34718,1670719499999,1.615027e+07,25332,450.48498,7.720600e+06
...,...,...,...,...,...,...,...,...,...,...
2023-01-11 11:15:00,17422.80,17423.48,17404.45,17406.81,979.55166,1673411399999,1.705655e+07,37483,452.05611,7.871567e+06
2023-01-11 11:30:00,17406.45,17418.02,17391.00,17416.18,1457.02145,1673412299999,2.535729e+07,45961,737.84522,1.284141e+07
2023-01-11 11:45:00,17416.53,17417.56,17403.62,17405.20,538.80086,1673413199999,9.381003e+06,17426,265.52046,4.623036e+06
2023-01-11 12:00:00,17407.98,17420.54,17397.27,17416.27,1464.23045,1673414099999,2.549018e+07,42776,808.88019,1.408156e+07


### 📌 Visualize prices as candle stick

In [6]:
fig = go.Figure(data=[go.Candlestick(x=pd_df.index,
                open=pd_df['Open price'],
                high=pd_df['High price'],
                low=pd_df['Low price'],
                close=pd_df['Close price'])])
# set new height and width
fig.update_layout(
    height=800,
    width=1000,
    title_text="BTC/USDT price",
    yaxis_title="Price (BTC/USDT)",
    xaxis_title="Date",
    xaxis_rangeslider_visible=True
)

fig.show()

### 📌 Using open price as feature and target as well

> Still only use Open Price because there are some models that can't run multivariate 👍 <strong>Model only uses univariate</strong>

In [7]:
dataset = pd_df.filter(["Open price"]).values
dataset[:5]

array([[17122.4 ],
       [17122.1 ],
       [17127.49],
       [17128.99],
       [17141.79]])

# 🎖 2. Prepare train-test set

✅ Train-Test ratio: `80%` train, `20%` test <br>
✅ Train-Valid ratio: `70%` train, `30%` valid

In [8]:
TRAIN_TEST_LENGTH = int(len(dataset) * 0.8)
TRAIN_VALID_LENGTH = int(TRAIN_TEST_LENGTH * 0.7)

## ➡️ Scale data

In [9]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dataset)
scaled_data[:5]

array([[0.38379281],
       [0.38363723],
       [0.38643247],
       [0.38721036],
       [0.3938484 ]])

## ➡️ Prepare time series data

In [10]:
def get_split_data(start, end = None):
    data = scaled_data[start:end]
    X = []
    Y = []
    for i in range(60,len(data)):
        X.append(data[i-60:i,:])
        Y.append(data[i,:])

    X, Y = np.array(X), np.array(Y)
    return X,Y

## ➡️ Split train-test set

In [11]:
X_origin_train, Y_origin_train = get_split_data(0, TRAIN_TEST_LENGTH)
X_test, Y_test = get_split_data(TRAIN_TEST_LENGTH-60, )

print(X_origin_train.shape, Y_origin_train.shape)
print(X_test.shape, Y_test.shape)

(2340, 60, 1) (2340, 1)
(600, 60, 1) (600, 1)


## ➡️ Split train-valid set

In [12]:
X_train, Y_train = get_split_data(0, TRAIN_VALID_LENGTH)
X_valid, Y_valid = get_split_data(TRAIN_VALID_LENGTH-60, TRAIN_TEST_LENGTH)

print(X_train.shape, Y_train.shape)
print(X_valid.shape, Y_valid.shape)

(1620, 60, 1) (1620, 1)
(720, 60, 1) (720, 1)


# 🎖 3. Naive training

## ➡️ Visualize predictions

In [13]:
def visualize_prediction(preds, variant="train"):
    if variant == "train":
        train_end = TRAIN_VALID_LENGTH
        valid_start = TRAIN_VALID_LENGTH
        valid_end = TRAIN_TEST_LENGTH
    else:
        train_end = TRAIN_TEST_LENGTH
        valid_start = TRAIN_TEST_LENGTH
        valid_end = None
        
    
    data = pd_df[["Open price"]]
    train = data[:train_end]
    valid = data[valid_start:valid_end].reset_index()
    valid["Predict"] = preds
    
    if variant == "test":
        print(valid)
        
    valid = valid.set_index("Kline open time")
    valid["Predict"]= valid["Predict"].apply(lambda price: float(price))

    concat_df = pd.concat([train, valid], axis=0)
    fig = px.line(concat_df[["Open price","Predict"]], title="BTC/USDT price" , width=1000, height=800)
    fig.show()

In [14]:
X_train.shape

(1620, 60, 1)

## ➡️ Linear Regression

### 📌 Build model and train

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
model = LinearRegression()
model.fit(X_train.reshape(X_train.shape[0], -1), Y_train)

LinearRegression()

### 📌 Predict and transform to the original scale

In [17]:
predictions = model.predict(X_valid.reshape(X_valid.shape[0], -1))
predictions = scaler.inverse_transform(np.array(predictions))
predictions.shape

(720, 1)

### 📌 Visualize the prediction

In [18]:
visualize_prediction(predictions)

## ➡️ Gated Recurrent Unit (GRU)

### 📌 Build model and train

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU

2023-01-11 12:18:12.388607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
model = Sequential()
model.add(GRU(200, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(GRU(100))
model.add(Dense(X_train.shape[2]))
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, Y_train, batch_size=1, epochs=1)

2023-01-11 12:18:16.708500: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




<keras.callbacks.History at 0x7f8b4fdb0ac0>

### 📌 Predict and transform to the original scale

In [21]:
predictions = model.predict(X_valid)
predictions = scaler.inverse_transform(predictions)
predictions.shape



(720, 1)

### 📌 Visualize the prediction

In [22]:
visualize_prediction(predictions)

## ➡️ Seasonal Auto Regressive Integrated Moving Average (SARIMAX)

In [23]:
from statsmodels.tsa.api import SARIMAX

In [24]:
dataset = pd_df.filter(["Open price"]).values
dataset[:5]
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dataset)
scaled_data[:5]
X_SARIMAX_valid, Y_SARIMAX_valid = get_split_data(TRAIN_VALID_LENGTH-60, TRAIN_TEST_LENGTH)

### 📌 Build model and train

In [None]:
predictions = list()
for t in range(X_SARIMAX_valid.shape[0]):
    model = SARIMAX(X_SARIMAX_valid[t])
    model_fit = model.fit()
    output = model_fit.forecast()
    pred_price = output[0]
    predictions.append(pred_price)

### 📌 Predict and transform to the original scale

In [26]:
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
predictions.shape

(720, 1)

### 📌 Visualize the prediction

In [27]:
visualize_prediction(predictions)        

# 🎖 4. Model selection and evaluation

In [28]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, r2_score
from estimator import GRUEstimator, SarimaxEstimator, LinearRegressionEstimator

## ➡️ Naive cross validation to select model

### 📌 Cross validation method

In [29]:
tscv = TimeSeriesSplit(n_splits=3)

- The method that can be used for cross-validating the time-series model is cross-validation on a rolling basis. Start with a small subset of data for training purpose, forecast for the later data points and then checking the accuracy for the forecasted data points. The same forecasted data points are then included as part of the next training dataset and subsequent data points are forecasted. <br> <br>
![](https://miro.medium.com/max/640/1*XcqvKVTQ6U_zszSD52lSqA.webp)

### 📌 Metric method

In [30]:
metric = make_scorer(r2_score)

- $R^2$ score is used to evaluate the performance of a linear regression model. It is the amount of the variation in the output dependent attribute which is predictable from the input independent variable(s) <br> <br>
![](https://vitalflux.com/wp-content/uploads/2019/07/R-squared-formula-function-of-SSE-and-SST.jpg)

### 📌 Evaluate models

In [31]:
estimators = [
    LinearRegressionEstimator(),
    GRUEstimator(), 
    SarimaxEstimator(), 
]

In [None]:
scores_dict = {}

for estimator in estimators:
    scores = cross_val_score(estimator, X_origin_train, Y_origin_train, scoring=metric, cv=tscv, n_jobs=-1, verbose=3)
    print(estimator.__class__.__name__, scores.mean())
    scores_dict[estimator.__class__.__name__] = scores
    
scores_df = pd.DataFrame(scores_dict)
scores_df.index = scores_df.index.map(lambda x: f'iter_{x+1}')
scores_df.loc['mean'] = scores_df.mean()

### 📌 Score and reveal best model

In [33]:
scores_df

Unnamed: 0,LinearRegressionEstimator,GRUEstimator,SarimaxEstimator
iter_1,0.932009,0.883754,-18.378754
iter_2,0.982565,0.969673,-12.007933
iter_3,0.979066,0.968548,-5.905806
mean,0.964546,0.940658,-12.097498


- The best best_estimator is one that have highest mean of scores

In [34]:
best_estimator = scores_df.loc['mean'].idxmax()
best_estimator

'LinearRegressionEstimator'

## ➡️ Select best model with best hyperparameters using grid search

In [35]:
from estimator import Estimator, LinearRegressionEstimator, SarimaxEstimator, GRUEstimator
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline

### 📌 Cross validation and metric method

- Using aforementioned methods

In [36]:
NUM_SPLIT = 5

In [37]:
tscv = TimeSeriesSplit(n_splits=NUM_SPLIT)
metric = make_scorer(r2_score)

### 📌 Evaluate hyperparameters and models 

In [38]:
parameters = [
    {
        'reg__estimator': [LinearRegressionEstimator()],
        'reg__estimator__fit_intercept': [True, False],
    },
    {
        'reg__estimator': [SarimaxEstimator()],
        'reg__estimator__order': [(1, 1, 1), (3, 1, 1)],
        'reg__estimator__seasonal_order': [(1, 1, 1, 12), (3, 1, 1, 12)]
    },
    {
        'reg__estimator': [GRUEstimator()],
        'reg__estimator__epochs': [1, 5, 10],
        'reg__estimator__batch_size': [4, 16, 32],
        'reg__estimator__neurons': [50, 100, 200]
    },
]

In [None]:
pipeline = Pipeline(
    steps=[("reg", Estimator())]
)

grid_search = GridSearchCV(pipeline, parameters, scoring=metric, cv=tscv, n_jobs=2, verbose=3)
grid_search.fit(X_train, Y_train)

- Best model with best hyperparameters

In [40]:
def GridSearch_table_plot(grid_clf, param_name,
                          negative=True,
                          display_all_params=True):
    clf = grid_clf.best_estimator_
    clf_params = grid_clf.best_params_
    if negative:
        clf_score = -grid_clf.best_score_
    else:
        clf_score = grid_clf.best_score_
    clf_stdev = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]
    cv_results = grid_clf.cv_results_

    print("best parameters: {}".format(clf_params))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        import pprint
        pprint.pprint(clf.get_params())

    # pick out the best results
    # =========================
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')

    best_row = scores_df.iloc[0, :]
    if negative:
        best_mean = -best_row['mean_test_score']
    else:
        best_mean = best_row['mean_test_score']
    best_stdev = best_row['std_test_score']
    best_param = best_row['param_' + param_name]

    # display the top 'num_results' results
    result_table = pd.DataFrame(cv_results).sort_values(by='rank_test_score')
    result_table.to_csv(f'../../datastore/model_scores/grid_search_cv_{NUM_SPLIT}_len_{dataset.shape[0]}.csv', index=False)
    display(result_table)

    # plot the results
    # scores_df = scores_df.sort_values(by='param_' + param_name)
    if negative:
        means = -scores_df['mean_test_score']
    else:
        means = scores_df['mean_test_score']
    stds = scores_df['std_test_score']
    params = scores_df['param_' + param_name]
        
GridSearch_table_plot(grid_search, "reg__estimator", negative=False)

best parameters: {'reg__estimator': GRUEstimator(batch_size=4, epochs=10, neurons=100), 'reg__estimator__batch_size': 4, 'reg__estimator__epochs': 10, 'reg__estimator__neurons': 100}
best score:      0.84205 (+/-0.19607)
{'memory': None,
 'reg': Estimator(estimator=GRUEstimator(batch_size=4, epochs=10, neurons=100)),
 'reg__estimator': GRUEstimator(batch_size=4, epochs=10, neurons=100),
 'reg__estimator__batch_size': 4,
 'reg__estimator__epochs': 10,
 'reg__estimator__neurons': 100,
 'steps': [('reg',
            Estimator(estimator=GRUEstimator(batch_size=4, epochs=10, neurons=100)))],
 'verbose': False}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg__estimator,param_reg__estimator__fit_intercept,param_reg__estimator__order,param_reg__estimator__seasonal_order,param_reg__estimator__batch_size,param_reg__estimator__epochs,param_reg__estimator__neurons,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,21.441244,11.130295,0.445718,0.177663,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,4.0,10.0,100.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.983355,0.908559,0.901061,0.454905,0.962351,0.842046,0.196075,1
14,32.442763,15.506453,0.419484,0.025973,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,4.0,10.0,200.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.981993,0.90912,0.913335,0.389639,0.962452,0.831308,0.222602,2
0,0.019346,0.00611,0.000871,0.000418,LinearRegressionEstimator(),True,,,,,,{'reg__estimator': LinearRegressionEstimator()...,0.989257,0.91213,0.901482,0.361894,0.965341,0.826021,0.234341,3
1,0.026481,0.005837,0.000839,0.000286,LinearRegressionEstimator(),False,,,,,,{'reg__estimator': LinearRegressionEstimator()...,0.989257,0.91213,0.901482,0.361894,0.965341,0.826021,0.234341,3
12,20.480924,10.643572,0.30482,0.032636,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,4.0,10.0,50.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.975527,0.874505,0.883241,0.378521,0.964616,0.815282,0.2222,5
11,16.568253,7.728859,0.368986,0.010379,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,4.0,5.0,200.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.976733,0.868526,0.878833,0.347376,0.951141,0.804522,0.232274,6
10,11.237536,5.335863,0.324232,0.02182,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,4.0,5.0,100.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.980173,0.871363,0.860844,0.372359,0.933247,0.803597,0.21992,7
23,12.861785,5.548403,0.419696,0.038287,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,16.0,10.0,200.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.978532,0.839922,0.836825,0.348807,0.957005,0.792218,0.229236,8
19,5.043652,1.683054,0.34356,0.029701,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,16.0,5.0,100.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.961765,0.864831,0.823806,0.292035,0.943381,0.777164,0.247756,9
32,9.734932,3.804288,0.403287,0.048812,"GRUEstimator(batch_size=4, epochs=10, neurons=...",,,,32.0,10.0,200.0,"{'reg__estimator': GRUEstimator(batch_size=4, ...",0.971919,0.861326,0.836435,0.25797,0.92384,0.770298,0.260525,10


# 🎖 5. Testing

### 📌 Evaluate with test set

In [59]:
# predictions = []
# for t in range(X_test.shape[0]):
#     model = SARIMAX(X_test[t], seasonal_order=(1, 1, 1, 12), order=(1, 1, 1))
#     model_fit = model.fit()
#     output = model_fit.forecast()
#     pred_price = output[0]
#     predictions.append(pred_price)
    
model = GRUEstimator(epochs=10, batch_size=4, neurons=100)
model.fit(X_origin_train, Y_origin_train)

predictions = model.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### 📌 Metric method

In [64]:
r2_score(Y_test, np.array(predictions).reshape(-1, 1))

-25944337404.991287

- ✅ Incredible score

### 📌 Visualize result

In [63]:
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))

In [62]:
visualize_prediction(predictions, variant="test")

        Kline open time  Open price       Predict
0   2023-01-05 06:30:00    16852.83  16835.824219
1   2023-01-05 06:45:00    16859.50  16847.941406
2   2023-01-05 07:00:00    16850.36  16853.750000
3   2023-01-05 07:15:00    16862.72  16843.992188
4   2023-01-05 07:30:00    16865.94  16856.937500
..                  ...         ...           ...
595 2023-01-11 11:15:00    17422.80  17412.058594
596 2023-01-11 11:30:00    17406.45  17419.578125
597 2023-01-11 11:45:00    17416.53  17401.060547
598 2023-01-11 12:00:00    17407.98  17413.714844
599 2023-01-11 12:15:00    17415.87  17404.105469

[600 rows x 3 columns]


# 🎖 5. Predict future prices

## ➡️ Utilize the best model to predict prices

In [65]:
data = pd_df.filter(["Open price"])[-60:]

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data).reshape(1, 60, 1)

# predictions = list()
# model = SARIMAX(scaled_data, order=(1,1,1), seasonal_order=(1,1,1,12))
# model_fit = model.fit()
# output = model_fit.forecast()
# pred_price = output[0]
# predictions.append(pred_price)

predictions = model.predict(scaled_data)



In [67]:
pred_price = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))

## ➡️ Next price

In [68]:
strtime = (pd_df.index[-1] + pd.Timedelta('00:15:00')).strftime("%Y-%m-%d %H:%M")
print(f"Predict price at {strtime} is {pred_price[0][0]}")

Predict price at 2023-01-11 12:30 is 17416.34375
