In [1]:
import requests
import json

import pandas as pd
from pandas.tseries.frequencies import to_offset

import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import math

from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

import time
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
segment_size = 100

# 1727 rows
data = pd.read_csv('../Dataset/STB.csv')

# Calculate the number of segments in the dataset
print("len(data)", len(data))
num_segments = int(len(data)/segment_size)
# Đọc vào tập dữ liệu bị phân mảnh ngang
data_chunks = []
for i in range(num_segments):
    chunk = pd.read_csv(f'../Dataset/STB.csv')
    data_chunks.append(chunk)
data = pd.concat(data_chunks, ignore_index=True)
data['date'] = pd.to_datetime(data.trunc_time)
# Áp dụng moving average với cửa sổ 5 và lưu vào cột 'ma_5'
data['ma_5'] = data['close_price'].rolling(window=5,min_periods=1).mean()

# Xóa bỏ các dòng có giá trị null do moving average
data.dropna(inplace=True)
# Chia tập dữ liệu thành tập huấn luyện và tập kiểm tra
# train_data = data.sample(frac=0.8, random_state=42)
# test_data = data.drop(train_data.index)

len(data) 1727


In [3]:
closedf = data[['trunc_time','close_price','ma_5']]
print(closedf)
print("Shape of close dataframe:", closedf.shape)

       trunc_time  close_price          ma_5
0      2016-01-04        12600  12600.000000
1      2016-01-05        12300  12450.000000
2      2016-01-06        12800  12566.666667
3      2016-01-07        12600  12575.000000
4      2016-01-08        12600  12580.000000
...           ...          ...           ...
29354  2022-11-21        16700  16450.000000
29355  2022-11-22        16900  16810.000000
29356  2022-11-23        17500  17080.000000
29357  2022-11-24        18200  17280.000000
29358  2022-11-25        18900  17640.000000

[29359 rows x 3 columns]
Shape of close dataframe: (29359, 3)


In [4]:
import copy

close_stock = closedf.copy()
del closedf['trunc_time']

X_data = copy.deepcopy(closedf)
Y_data = copy.deepcopy(closedf)

del X_data['close_price']
del Y_data['ma_5']

scaler=MinMaxScaler(feature_range=(0,1))

X_data=scaler.fit_transform(np.array(X_data).reshape(-1,1))
Y_data=scaler.fit_transform(np.array(Y_data).reshape(-1,1))

print(X_data, X_data.shape)
print(Y_data, Y_data.shape)

[[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.34010695]
 [0.34723708]
 [0.3600713 ]] (29359, 1)
[[0.18563923]
 [0.17513135]
 [0.19264448]
 ...
 [0.35726795]
 [0.38178634]
 [0.40630473]] (29359, 1)


In [5]:
# training_size=int(len(closedf)*0.8)
# test_size=len(closedf)-training_size
# train_data,test_data=closedf[0:training_size,:],closedf[training_size:len(closedf),:1]
# print("train_data: ", train_data.shape)
# print("test_data: ", test_data.shape)

In [6]:
# convert an array of values into a dataset matrix
# def create_dataset(dataset, time_step=1):
#     dataX, dataY = [], []
#     for i in range(len(dataset)-time_step-1):
#         a = dataset[i:(i+time_step), 0]   ###i=0, 0,1,2,3-----99   100 
#         dataX.append(a)
#         dataY.append(dataset[i + time_step, 0])
#     return np.array(dataX), np.array(dataY)
def create_dataset(data_set):
    training_size=int(len(data_set)*0.8)
    # test_size=len(data_set)-training_size
    train_data,test_data=data_set[0:training_size,:],data_set[training_size:len(data_set),:1]
    print("train_data: ", train_data)
    print("test_data: ", test_data)
    return train_data,test_data

In [7]:
# reshape into X=t,t+1,t+2,t+3 and Y=t+4
time_step = 10
# print(type(X_data))
X_train, X_test = create_dataset(X_data)
y_train, y_test = create_dataset(Y_data)

print("X_train: ", X_train)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)
# print("type(X_test)",type(X_test))

train_data:  [[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.13333333]
 [0.13048128]
 [0.12905526]]
test_data:  [[0.1315508 ]
 [0.13368984]
 [0.14438503]
 ...
 [0.34010695]
 [0.34723708]
 [0.3600713 ]]
train_data:  [[0.18563923]
 [0.17513135]
 [0.19264448]
 ...
 [0.12434326]
 [0.13485114]
 [0.13660245]]
test_data:  [[0.15236427]
 [0.15061296]
 [0.17688266]
 ...
 [0.35726795]
 [0.38178634]
 [0.40630473]]
X_train:  [[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.13333333]
 [0.13048128]
 [0.12905526]]
y_train:  (23487, 1)
X_test:  (5872, 1)
y_test (5872, 1)


In [10]:
# regressor = RandomForestRegressor(n_estimators=100, random_state=0)
# regressor.fit(X_train, y_train)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna

# Define the objective function for Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    criterion = trial.suggest_categorical('criterion', ['poisson', 'squared_error', 'absolute_error', 'friedman_mse'])
    random_state = 42

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        criterion=criterion,
        random_state=random_state
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    return mse

# Use Optuna to optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=15)
best_params = study.best_params
print("Optuna: ", best_params)

regressor = RandomForestRegressor(**best_params)
regressor.fit(X_train, y_train)

[32m[I 2023-04-23 13:07:17,862][0m A new study created in memory with name: no-name-53ea060b-bb06-4f0f-ab1f-bd97fd426226[0m
[32m[I 2023-04-23 13:07:24,020][0m Trial 0 finished with value: 9.794384355581547e-05 and parameters: {'n_estimators': 254, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 'auto', 'bootstrap': True, 'criterion': 'squared_error'}. Best is trial 0 with value: 9.794384355581547e-05.[0m
[32m[I 2023-04-23 13:45:07,113][0m Trial 1 finished with value: 0.00011015529657856413 and parameters: {'n_estimators': 661, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'auto', 'bootstrap': False, 'criterion': 'absolute_error'}. Best is trial 0 with value: 9.794384355581547e-05.[0m
[32m[I 2023-04-23 13:45:23,603][0m Trial 2 finished with value: 0.00019037296487355742 and parameters: {'n_estimators': 321, 'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False

Optuna:  {'n_estimators': 531, 'max_depth': 23, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'friedman_mse'}


In [11]:
# Lets Do the prediction 

RF_train_predict=regressor.predict(X_train)
RF_test_predict=regressor.predict(X_test)
# print("Train data prediction:", train_predict)
# # print("Test data prediction:", test_predict)
RF_train_predict = RF_train_predict.reshape(-1,1)
RF_test_predict = RF_test_predict.reshape(-1,1)

print("Train data prediction:", RF_train_predict.shape)
print("Test data prediction:", RF_test_predict.shape)

Train data prediction: (23487, 1)
Test data prediction: (5872, 1)


In [12]:
# Transform back to original form

RF_train_predict = scaler.inverse_transform(RF_train_predict)
RF_test_predict = scaler.inverse_transform(RF_test_predict)
RF_original_ytrain = scaler.inverse_transform(y_train.reshape(-1,1)) 
RF_original_ytest = scaler.inverse_transform(y_test.reshape(-1,1)) 



In [13]:
# Evaluation metrices RMSE and MAE
RF_RMSE_train = math.sqrt(mean_squared_error(RF_original_ytrain,RF_train_predict))
RF_MSE_train = mean_squared_error(RF_original_ytrain,RF_train_predict)
RF_MAE_train = mean_absolute_error(RF_original_ytrain,RF_train_predict)

RF_RMSE_test = math.sqrt(mean_squared_error(RF_original_ytest,RF_test_predict))
RF_MSE_test = mean_squared_error(RF_original_ytest,RF_test_predict)
RF_MAE_test = mean_absolute_error(RF_original_ytest,RF_test_predict)

print("Train data RMSE: ", RF_RMSE_train)
print("Train data MSE: ", RF_MSE_train)
print("Test data MAE: ", RF_MAE_train)
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", RF_RMSE_test)
print("Test data MSE: ", RF_MSE_test)
print("Test data MAE: ", RF_MAE_test)

Train data RMSE:  254.63785142125946
Train data MSE:  64840.43537643541
Test data MAE:  127.61935020312761
-------------------------------------------------------------------------------------
Test data RMSE:  259.4511457485762
Test data MSE:  67314.89703024893
Test data MAE:  125.77442914674934


In [14]:
RF_EV_train = explained_variance_score(RF_original_ytrain, RF_train_predict)
RF_EV_test = explained_variance_score(RF_original_ytest, RF_test_predict)

print("Train data explained variance regression score:", RF_EV_train)
print("Test data explained variance regression score:", RF_EV_test)

Train data explained variance regression score: 0.9985887825998605
Test data explained variance regression score: 0.9987088078369664


In [15]:
RF_r2_train = r2_score(RF_original_ytrain, RF_train_predict)
RF_r2_test = r2_score(RF_original_ytest, RF_test_predict)

print("Train data R2 score:", RF_r2_train)
print("Test data R2 score:", RF_r2_test)

Train data R2 score: 0.9985887825998605
Test data R2 score: 0.9987087625421559


In [16]:
RF_MGD_train = mean_gamma_deviance(RF_original_ytrain, RF_train_predict)
RF_MGD_test = mean_gamma_deviance(RF_original_ytest, RF_test_predict)
RF_MPD_train = mean_poisson_deviance(RF_original_ytrain, RF_train_predict)
RF_MPD_test = mean_poisson_deviance(RF_original_ytest, RF_test_predict)
print("Train data MGD: ", RF_MGD_train)
print("Test data MGD: ", RF_MGD_test)
print("----------------------------------------------------------------------")
print("Train data MPD: ", RF_MPD_train)
print("Test data MPD: ",RF_MPD_test)

Train data MGD:  0.0003134675295040991
Test data MGD:  0.0003048511810084298
----------------------------------------------------------------------
Train data MPD:  4.321812876581981
Test data MPD:  4.32041649011323


In [17]:
# shift train predictions for plotting
from itertools import cycle
import plotly.express as px
print(RF_train_predict.shape)
print(RF_test_predict.shape)

look_back=time_step
trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(RF_train_predict)+look_back, :] = RF_train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# shift test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(RF_train_predict)+(look_back*2)+1:len(closedf)-1, :] = RF_test_predict
print("Test predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])


plotdf = pd.DataFrame({'date': close_stock['trunc_time'],
                       'original_close': close_stock['ma_5'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

(23487, 1)
(5872, 1)
Train predicted data:  (29359, 2)


ValueError: could not broadcast input array from shape (5872,1) into shape (5850,2)

[32m[I 2023-04-23 11:12:59,252][0m A new study created in memory with name: no-name-55042f78-35b4-40b1-b583-282cf1135922[0m
