In [82]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense 

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [83]:
# Trading pair and candle interval; together they select the price CSV
# that the loading cell reads.
symbol, PERIOD = "BTCUSDT", "15m"

In [84]:
# Convert the collector notebook to a plain Python script, then run that
# script in a separate process (so none of its variables enter this kernel).
# Presumably this refreshes the price CSV loaded below — confirm against
# ../collect/get_newest_price.ipynb.
!jupyter nbconvert --to python ../collect/get_newest_price.ipynb
!python ../collect/get_newest_price.py

[NbConvertApp] Converting notebook ../collect/get_newest_price.ipynb to python
[NbConvertApp] Writing 1547 bytes to ../collect/get_newest_price.py


# Original data

In [100]:
# Load the candle history for the configured symbol and interval.
price_csv_path = f"../../datastore/price/{symbol}_{PERIOD}.csv"
pd_df = pd.read_csv(price_csv_path)
# Optional: uncomment to keep only the most recent 1800 rows for quicker experiments.
# pd_df = pd_df.iloc[-1800:]
pd_df

Unnamed: 0,Kline open time,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
0,1517500800000,9326.23,9340.00,9195.89,9217.00,487.476964,1517501699999,4.506285e+06,4546,171.643712,1.586177e+06
1,1517501700000,9208.09,9275.00,9130.00,9145.00,652.169234,1517502599999,6.004393e+06,6930,334.699485,3.083885e+06
2,1517502600000,9144.98,9257.40,9020.00,9184.99,1028.146331,1517503499999,9.361539e+06,8632,376.048437,3.429102e+06
3,1517503500000,9184.98,9270.00,9011.51,9066.66,770.943401,1517504399999,7.037635e+06,6722,356.335133,3.256645e+06
4,1517504400000,9058.78,9156.98,9040.00,9106.22,490.060336,1517505299999,4.458077e+06,4557,244.406918,2.225572e+06
...,...,...,...,...,...,...,...,...,...,...,...
172342,1673076600000,16942.51,16948.00,16942.51,16947.01,818.700220,1673077499999,1.387305e+07,27676,408.431860,6.921030e+06
172343,1673077500000,16947.01,16948.78,16945.32,16948.09,862.531950,1673078399999,1.461734e+07,28234,441.381230,7.480149e+06
172344,1673078400000,16948.18,16954.95,16935.92,16936.50,1137.070630,1673079299999,1.926988e+07,31337,528.659310,8.959437e+06
172345,1673079300000,16936.74,16944.78,16926.99,16929.12,1501.223120,1673080199999,2.542451e+07,39622,766.439690,1.298056e+07


### Format open price and time

In [101]:
# The guard makes this cell safe to re-run: once "Kline open time" has become
# the index there is nothing left to do (re-running the unguarded version
# raised KeyError in set_index and would also have shifted the clock twice).
if "Kline open time" in pd_df.columns:
    pd_df = (
        pd_df
        # Vectorised cast instead of a per-row Python lambda.
        .astype({"Open price": float})
        .set_index("Kline open time")
        .sort_index()
    )
    # Open times are epoch milliseconds; the fixed +07:00 offset shifts them
    # to a UTC+7 wall clock (presumably the author's local time — confirm).
    pd_df.index = pd.to_datetime(pd_df.index, unit="ms") + pd.Timedelta("07:00:00")
pd_df

Unnamed: 0_level_0,Open price,High price,Low price,Close price,Volume,Kline Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Kline open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-02-01 23:00:00,9326.23,9340.00,9195.89,9217.00,487.476964,1517501699999,4.506285e+06,4546,171.643712,1.586177e+06
2018-02-01 23:15:00,9208.09,9275.00,9130.00,9145.00,652.169234,1517502599999,6.004393e+06,6930,334.699485,3.083885e+06
2018-02-01 23:30:00,9144.98,9257.40,9020.00,9184.99,1028.146331,1517503499999,9.361539e+06,8632,376.048437,3.429102e+06
2018-02-01 23:45:00,9184.98,9270.00,9011.51,9066.66,770.943401,1517504399999,7.037635e+06,6722,356.335133,3.256645e+06
2018-02-02 00:00:00,9058.78,9156.98,9040.00,9106.22,490.060336,1517505299999,4.458077e+06,4557,244.406918,2.225572e+06
...,...,...,...,...,...,...,...,...,...,...
2023-01-07 14:30:00,16942.51,16948.00,16942.51,16947.01,818.700220,1673077499999,1.387305e+07,27676,408.431860,6.921030e+06
2023-01-07 14:45:00,16947.01,16948.78,16945.32,16948.09,862.531950,1673078399999,1.461734e+07,28234,441.381230,7.480149e+06
2023-01-07 15:00:00,16948.18,16954.95,16935.92,16936.50,1137.070630,1673079299999,1.926988e+07,31337,528.659310,8.959437e+06
2023-01-07 15:15:00,16936.74,16944.78,16926.99,16929.12,1501.223120,1673080199999,2.542451e+07,39622,766.439690,1.298056e+07


In [87]:
# Candlestick trace built from the four OHLC columns, indexed by open time.
candles = go.Candlestick(
    x=pd_df.index,
    open=pd_df['Open price'],
    high=pd_df['High price'],
    low=pd_df['Low price'],
    close=pd_df['Close price'],
)
fig = go.Figure(data=[candles])

# Figure size, titles, and a range slider under the x-axis.
layout_options = dict(
    height=800,
    width=1000,
    title_text="BTC/USDT price",
    yaxis_title="Price (BTC/USDT)",
    xaxis_title="Date",
    xaxis_rangeslider_visible=True,
)
fig.update_layout(**layout_options)

fig.show()

# Prepare dataset

### Price column

In [102]:
# Open-price column as a 2-D array (one row per candle, one column).
dataset = pd_df.filter(items=["Open price"]).to_numpy()
dataset.shape

(172347, 1)

### Splitting ratio

In [103]:
# Number of leading rows used for training; the remaining rows are held out.
TRAIN_FRACTION = 0.8
TRAIN_DATA_LENGTH = int(TRAIN_FRACTION * len(dataset))

### Scale data to [0, 1]

In [104]:
# Fit the scaler on the training rows only, so the held-out rows do not leak
# their min/max into the scaling, then apply it to the whole series.
# Held-out values can therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(dataset[:TRAIN_DATA_LENGTH])
scaled_data = scaler.transform(dataset)
scaled_data

array([[0.09397191],
       [0.0921697 ],
       [0.09120696],
       ...,
       [0.21024387],
       [0.21006935],
       [0.20995311]])

### Split training set

In [105]:
# Training portion of the scaled series.
train_data = scaled_data[:TRAIN_DATA_LENGTH]

# Each sample is the 60 values preceding position `end`; its target is the
# value at `end`.
window_ends = range(60, len(train_data))
X_train = np.array([train_data[end - 60:end, :] for end in window_ends])
Y_train = np.array([train_data[end, :] for end in window_ends])
print(X_train.shape, Y_train.shape)

(137817, 60, 1) (137817, 1)


### Split test set

In [106]:
# Start 60 rows before the split so the first held-out value already has a
# full 60-step input window.
test_data = scaled_data[TRAIN_DATA_LENGTH-60:]

# Same windowing as the training set: 60 preceding values -> next value.
window_ends = range(60, len(test_data))
X_test = np.array([test_data[end - 60:end, :] for end in window_ends])
Y_test = np.array([test_data[end, :] for end in window_ends])
print(X_test.shape, Y_test.shape)

(34470, 60, 1) (34470, 1)


# Training

## Long short-term memory (LSTM) model

In [108]:
# Two stacked LSTM layers followed by two dense layers; the final layer
# emits one value per input feature.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
model = Sequential([
    LSTM(200, return_sequences=True, input_shape=(n_timesteps, n_features)),
    LSTM(200, return_sequences=False),
    Dense(100),
    Dense(n_features),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# A single pass over the training windows.
model.fit(X_train, Y_train, batch_size=128, epochs=1)



<keras.callbacks.History at 0x7f8b52915850>

### Predict and transform [0, 1] to the original scale

In [109]:
# Predict in scaled space, then map the outputs back to price units.
scaled_predictions = model.predict(X_test)
predictions = scaler.inverse_transform(scaled_predictions)
predictions[:5]



array([[43702.75 ],
       [43711.11 ],
       [43717.445],
       [43714.87 ],
       [43714.23 ]], dtype=float32)

### Visualize the prediction

In [110]:
# Single-column frame holding only the open price, indexed by open time.
data = pd_df.filter(items=["Open price"])
data.shape

(172347, 1)

In [111]:
# Rows the model was trained on (no prediction column).
train = data[:TRAIN_DATA_LENGTH]

# Held-out rows with the model's predictions attached. assign() returns a new
# frame and aligns the array by position, so the index never has to be reset
# and restored; astype(float) widens the float32 model output to float64.
valid = data[TRAIN_DATA_LENGTH:].assign(Predict=predictions[:, 0].astype(float))

# Full history; "Predict" is NaN for the training rows.
lstm_df = pd.concat([train, valid], axis=0)

In [115]:
# Actual vs. predicted open price; dropna() keeps only rows that have a prediction.
price_vs_prediction = lstm_df[["Open price","Predict"]].dropna()
fig = px.line(price_vs_prediction, title="BTC/USDT price", width=1000, height=800)
fig.show()

## Autoregressive integrated moving average (ARIMA) model

In [73]:
from statsmodels.tsa.arima.model import ARIMA

### Scale data to [0, 1] and split train-test set

In [74]:
# Fit the scaler on the training rows only, so the held-out rows do not leak
# their min/max into the scaling, then apply it to the whole series.
# Held-out values can therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data[:TRAIN_DATA_LENGTH])
scaled_data = scaler.transform(data)
train, test = scaled_data[:TRAIN_DATA_LENGTH], scaled_data[TRAIN_DATA_LENGTH:]
test.shape

(360, 1)

### Train model

In [None]:
# Walk-forward evaluation: refit on everything seen so far, forecast one step,
# then reveal the true value before moving to the next step.
history = train.copy().tolist()

predictions = []
for actual_price in test:
    model = ARIMA(history, order=(5, 1, 0))
    model_fit = model.fit()
    pred_price = model_fit.forecast()[0]
    predictions.append(pred_price)
    history.append(actual_price)

### Visualize the prediction

In [81]:
train_df = data[:TRAIN_DATA_LENGTH]
# Take an explicit copy of the held-out slice: adding a column to a bare slice
# of `data` is what triggers pandas' SettingWithCopyWarning.
test_df = data[TRAIN_DATA_LENGTH:].copy()

# Map the scaled forecasts back to price units before attaching them.
test_df["Predict"] = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).reshape(-1)
arima_df = pd.concat([train_df, test_df], axis=0)

fig = px.line(arima_df, title="BTC/USDT price", width=1000, height=800)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Model selection with cross validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from estimator import (
    ArimaEstimator,
    LSTMEstimator,
    SarimaxEstimator,
    AutoRegEstimator,
    ExponentialSmoothingEstimator,
    HoltEstimator,
)

# Cross-validation strategy: five time-ordered splits.
tscv = TimeSeriesSplit(n_splits=5)

# Mean squared error as the score, so lower is better.
metric = make_scorer(mean_squared_error)

# Candidate models to compare.
estimators = [
    ArimaEstimator(),
    SarimaxEstimator(),
    AutoRegEstimator(),
    ExponentialSmoothingEstimator(),
    HoltEstimator(),
    LSTMEstimator(),
]

# Per-fold scores keyed by estimator class name.
scores_dict = {}

for estimator in estimators:
    estimator_name = estimator.__class__.__name__
    fold_scores = cross_val_score(estimator, X_train, Y_train, scoring=metric, cv=tscv, n_jobs=-1)
    print(estimator_name, fold_scores.mean())
    scores_dict[estimator_name] = fold_scores

# One column per estimator, one row per fold, plus a final "mean" row;
# the table is also written to disk.
scores_df = pd.DataFrame(scores_dict)
scores_df.index = scores_df.index.map(lambda x: f'fold_{x+1}')
scores_df.loc['mean'] = scores_df.mean()
scores_df.to_csv('../../datastore/scores.csv')

In [111]:
# Show the per-fold scores and their mean for every estimator.
scores_df

Unnamed: 0,ArimaEstimator,SarimaxEstimator,AutoRegEstimator,ExponentialSmoothingEstimator,HoltEstimator,LSTMEstimator
fold_1,2.7e-05,3.2e-05,2.2e-05,2e-05,2.2e-05,0.000558
fold_2,2.7e-05,5e-05,2.5e-05,2.4e-05,2.6e-05,3.8e-05
fold_3,3.6e-05,5.2e-05,3.4e-05,3.1e-05,3.3e-05,6.4e-05
fold_4,0.00084,0.001099,0.000818,0.000777,0.000807,0.00477
fold_5,0.000769,0.001182,0.000754,0.000696,0.000722,0.003198
mean,0.00034,0.000483,0.000331,0.00031,0.000322,0.001726


In [112]:
# The estimator whose mean score (MSE) across folds is lowest.
mean_scores = scores_df.loc['mean']
best_estimator = mean_scores.idxmin()
best_estimator

'ExponentialSmoothingEstimator'

In [None]:
%%capture
from estimator import ExponentialSmoothingEstimator as Estimator
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

tscv = TimeSeriesSplit()

metric = make_scorer(mean_squared_error, greater_is_better=False)

param_grid = {
    'trend': ["add", "mul", "additive", "multiplicative", None],
    'damped_trend': [True, False],
    'seasonal': ["add", "mul", "additive", "multiplicative", None],
    'seasonal_periods': [7, 30, 365],
}

estimator = Estimator()
estimator.get_params().keys()
grid_search = GridSearchCV(estimator, param_grid, scoring=metric, cv=tscv, n_jobs=-1, verbose=0)
grid_search.fit(X_train, Y_train)

In [114]:
# Display the estimator configured with the best-scoring parameter combination.
grid_search.best_estimator_

ExponentialSmoothingEstimator(seasonal_periods=7)

# Params selection with grid search

In [115]:
from statsmodels.tsa.api import ExponentialSmoothing

# Single-column frame holding only the open price.
data = pd_df.filter(["Open price"])

# Fit the scaler on the training rows only, so the held-out rows do not leak
# their min/max into the scaling, then apply it to the whole series and
# flatten to 1-D. Held-out values can therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data[:TRAIN_DATA_LENGTH])
scaled_data = scaler.transform(data).reshape(-1)
train, test = scaled_data[:TRAIN_DATA_LENGTH], scaled_data[TRAIN_DATA_LENGTH:]

# tolist() already builds a new list, so no explicit copy is needed.
history = train.tolist()

# Walk-forward evaluation: refit on everything seen so far, forecast one step,
# then reveal the true value before moving to the next step.
predictions = []
for actual_price in test:
    # NOTE(review): `seasonal` is left at its default, so seasonal_periods=7
    # may have no effect on the fitted model — confirm against statsmodels.
    model = ExponentialSmoothing(history, seasonal_periods=7)
    model_fit = model.fit()
    pred_price = model_fit.forecast()[0]
    predictions.append(pred_price)
    history.append(actual_price)

In [116]:

train_df = data[:TRAIN_DATA_LENGTH]
# Take an explicit copy of the held-out slice: adding a column to a bare slice
# of `data` is what triggers pandas' SettingWithCopyWarning.
test_df = data[TRAIN_DATA_LENGTH:].copy()

# Map the scaled forecasts back to price units before attaching them.
test_df["Predict"] = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).reshape(-1)

# Concatenate train_df and test_df to get the full data set.
full_df = pd.concat([train_df, test_df], axis=0)

fig = px.line(full_df, height=800, width=1000)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [117]:
# Fit on the entire (unscaled) open-price series and take the first value of
# the default forecast.
open_prices = data["Open price"].to_numpy()
model = ExponentialSmoothing(open_prices, seasonal_periods=7)
model_fit = model.fit()
pred_price = model_fit.forecast()[0]

In [118]:
# Display the forecast from the model fitted on the full series.
pred_price

16832.28719995541