In [178]:
import requests
import json

import pandas as pd
from pandas.tseries.frequencies import to_offset

import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import math

from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

import time
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [179]:
# Load the STB price history from the project's Dataset folder.
data = pd.read_csv('../Dataset/STB.csv')

# Smooth the closing price with a 5-row moving average stored in 'ma_5'.
# min_periods=1 lets the first rows average over however many rows are
# available, so the window itself does not introduce leading NaN values.
close_ma = data['close_price'].rolling(window=5, min_periods=1).mean()
data['ma_5'] = close_ma

# Drop every row that still contains a null in any column.
data = data.dropna()
print(data)

     trunc_time  open_price  high_price  low_price  close_price    volume   
0    2020-10-06       13750       14400      13700        14200  27112100  \
1    2020-10-07       14050       14300      13700        13700  19465920   
2    2020-10-08       13800       13850      13450        13550  21328780   
3    2020-10-09       13600       13800      13450        13550  12474800   
4    2020-10-12       13750       13800      13300        13300  17962060   
..          ...         ...         ...        ...          ...       ...   
532  2022-11-21       17300       17400      16700        16700  17502900   
533  2022-11-22       16600       17800      16600        16900  30775500   
534  2022-11-23       16900       17900      16900        17500  25665600   
535  2022-11-24       17300       18200      16900        18200  18314400   
536  2022-11-25       18500       18950      18000        18900  24962000   

             ma_5  
0    14200.000000  
1    13950.000000  
2    13816.6666

In [180]:
# Keep only the date column and the smoothed close used for modelling.
feature_columns = ['trunc_time', 'ma_5']
closedf = data[feature_columns]
print(closedf)
print("Shape of close dataframe:", closedf.shape)

     trunc_time          ma_5
0    2020-10-06  14200.000000
1    2020-10-07  13950.000000
2    2020-10-08  13816.666667
3    2020-10-09  13750.000000
4    2020-10-12  13660.000000
..          ...           ...
532  2022-11-21  16450.000000
533  2022-11-22  16810.000000
534  2022-11-23  17080.000000
535  2022-11-24  17280.000000
536  2022-11-25  17640.000000

[537 rows x 2 columns]
Shape of close dataframe: (537, 2)


In [181]:
# Untouched copy (date + smoothed close) kept for plotting against predictions.
# It is built from `data` rather than from `closedf` because `closedf` is rebound
# to an ndarray below; deriving it from `closedf` made a second run of this cell
# fail when deleting the 'trunc_time' column from an array.
close_stock = data[['trunc_time', 'ma_5']].copy()

# Scale the smoothed close into [0, 1]. From here on `closedf` is a
# (n_rows, 1) ndarray of scaled values, not a DataFrame.
# NOTE(review): the scaler is fitted on every row, including the tail that is
# later split off as test data - confirm this is acceptable for the evaluation.
scaler = MinMaxScaler(feature_range=(0, 1))
ma_values = close_stock[['ma_5']].to_numpy().reshape(-1, 1)
closedf = scaler.fit_transform(ma_values)
print(closedf.shape)

(537, 1)


In [182]:
# Chronological split: the first 80% of rows train the model, the rest test it.
n_rows = len(closedf)
training_size = int(n_rows * 0.8)
test_size = n_rows - training_size

train_data = closedf[:training_size, :]
test_data = closedf[training_size:n_rows, :1]

print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)

train_data:  (429, 1)
test_data:  (108, 1)


In [183]:
# Convert an array of values into a sliding-window dataset matrix.
def create_dataset(dataset, time_step=1, include_last=False):
    """Build (window, next value) samples from column 0 of a 2-D array.

    Each sample ``X[i]`` holds ``time_step`` consecutive values
    ``dataset[i : i + time_step, 0]`` and its target ``y[i]`` is the value
    immediately after the window, ``dataset[i + time_step, 0]``.

    Parameters
    ----------
    dataset : 2-D array indexed as ``dataset[row, 0]``.
    time_step : number of consecutive rows in each input window.
    include_last : when False (the default, and the historical behaviour) the
        loop stops one window early, so the final row of ``dataset`` is never
        used as a target. Pass True to also emit that last valid window.

    Returns
    -------
    (X, y) : ``X`` is an array of windows, ``y`` the matching targets. When
        ``dataset`` is too short to form a sample, both are empty arrays.
    """
    # The extra "- 1" is kept as the default because existing callers may
    # depend on the resulting sample count; include_last=True removes it.
    n_samples = len(dataset) - time_step - (0 if include_last else 1)

    dataX, dataY = [], []
    for start in range(n_samples):
        stop = start + time_step
        dataX.append(dataset[start:stop, 0])
        dataY.append(dataset[stop, 0])
    return np.array(dataX), np.array(dataY)

In [184]:
# Window both splits: each sample is `time_step` consecutive scaled values
# and the target is the value that immediately follows the window.
time_step = 10
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

for label, arr in (("X_train: ", X_train),
                   ("y_train: ", y_train),
                   ("X_test: ", X_test),
                   ("y_test", y_test)):
    print(label, arr.shape)

X_train:  (418, 10)
y_train:  (418,)
X_test:  (97, 10)
y_test (97,)


In [194]:
# Random forest with 100 trees; random_state is fixed so the fit is repeatable.
rf_params = dict(n_estimators=100, random_state=0)
regressor = RandomForestRegressor(**rf_params)
regressor.fit(X_train, y_train)

In [195]:
# Predict on both splits and reshape each result into a single-column 2-D array.
RF_train_predict = regressor.predict(X_train).reshape(-1, 1)
RF_test_predict = regressor.predict(X_test).reshape(-1, 1)

print("Train data prediction:", RF_train_predict.shape)
print("Test data prediction:", RF_test_predict.shape)

Train data prediction: (418, 1)
Test data prediction: (97, 1)


In [196]:
# Map the scaled targets and predictions back to the original price scale.
RF_original_ytrain = scaler.inverse_transform(y_train.reshape(-1, 1))
RF_original_ytest = scaler.inverse_transform(y_test.reshape(-1, 1))

# NOTE: the prediction arrays are overwritten with their own inverse transform,
# so running this cell twice in a row rescales them twice. Re-run the
# prediction cell first if this cell has to be executed again.
RF_train_predict = scaler.inverse_transform(RF_train_predict)
RF_test_predict = scaler.inverse_transform(RF_test_predict)



In [197]:
# Evaluation metrics on the original price scale: RMSE, MSE and MAE.
# The MSE is computed once per split and the RMSE is derived from it.
RF_MSE_train = mean_squared_error(RF_original_ytrain, RF_train_predict)
RF_RMSE_train = math.sqrt(RF_MSE_train)
RF_MAE_train = mean_absolute_error(RF_original_ytrain, RF_train_predict)

RF_MSE_test = mean_squared_error(RF_original_ytest, RF_test_predict)
RF_RMSE_test = math.sqrt(RF_MSE_test)
RF_MAE_test = mean_absolute_error(RF_original_ytest, RF_test_predict)

print("Train data RMSE: ", RF_RMSE_train)
print("Train data MSE: ", RF_MSE_train)
# This line previously carried the label "Test data MAE: " although it prints
# the training-set value.
print("Train data MAE: ", RF_MAE_train)
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", RF_RMSE_test)
print("Test data MSE: ", RF_MSE_test)
print("Test data MAE: ", RF_MAE_test)

Train data RMSE:  121.85894445832918
Train data MSE:  14849.602344498157
Test data MAE:  83.60909090909217
-------------------------------------------------------------------------------------
Test data RMSE:  481.11770234024397
Test data MSE:  231474.2435051556
Test data MAE:  377.1608247422693


In [198]:
# Explained variance score for each split, computed on the original price scale.
RF_EV_train, RF_EV_test = (
    explained_variance_score(actual, predicted)
    for actual, predicted in (
        (RF_original_ytrain, RF_train_predict),
        (RF_original_ytest, RF_test_predict),
    )
)

print("Train data explained variance regression score:", RF_EV_train)
print("Test data explained variance regression score:", RF_EV_test)

Train data explained variance regression score: 0.9995976975005243
Test data explained variance regression score: 0.9865455858538779


In [199]:
# R2 score for each split, computed on the original price scale.
RF_r2_train, RF_r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in (
        (RF_original_ytrain, RF_train_predict),
        (RF_original_ytest, RF_test_predict),
    )
)

print("Train data R2 score:", RF_r2_train)
print("Test data R2 score:", RF_r2_test)

Train data R2 score: 0.9995973901371498
Test data R2 score: 0.9822334117678425


In [200]:
# Mean gamma deviance (MGD) and mean Poisson deviance (MPD) for each split,
# computed on the original price scale.
RF_MGD_train, RF_MGD_test = (
    mean_gamma_deviance(actual, predicted)
    for actual, predicted in (
        (RF_original_ytrain, RF_train_predict),
        (RF_original_ytest, RF_test_predict),
    )
)
RF_MPD_train, RF_MPD_test = (
    mean_poisson_deviance(actual, predicted)
    for actual, predicted in (
        (RF_original_ytrain, RF_train_predict),
        (RF_original_ytest, RF_test_predict),
    )
)

print("Train data MGD: ", RF_MGD_train)
print("Test data MGD: ", RF_MGD_test)
print("----------------------------------------------------------------------")
print("Train data MPD: ", RF_MPD_train)
print("Test data MPD: ",RF_MPD_test)

Train data MGD:  2.5377501430496013e-05
Test data MGD:  0.0007028654423737045
----------------------------------------------------------------------
Train data MPD:  0.5982199138531964
Test data MPD:  12.58740592875556


In [201]:
# Overlay the train and test predictions on the smoothed close series.
from itertools import cycle
import plotly.express as px
print(RF_train_predict.shape)

look_back = time_step

# Train predictions: the first target sits `look_back` rows into the series,
# because each target is the value right after a window of `look_back` rows.
trainPredictPlot = np.full_like(closedf, np.nan)
train_start = look_back
trainPredictPlot[train_start:train_start + len(RF_train_predict), :] = RF_train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# Test predictions: the test split begins at row len(train_data) of the full
# series and its first target is `look_back` rows after that. Deriving the slice
# from these lengths (instead of hard-coded "+1" / "-1" corrections) keeps the
# placement correct even if the number of windows per split changes.
testPredictPlot = np.full_like(closedf, np.nan)
test_start = len(train_data) + look_back
testPredictPlot[test_start:test_start + len(RF_test_predict), :] = RF_test_predict
print("Test predicted data: ", testPredictPlot.shape)

# Legend labels, applied to the traces in the order they are drawn.
names = cycle(['Original close price','Train predicted close price','Test predicted close price'])

# Rows without a prediction stay NaN and therefore leave a gap in their line.
plotdf = pd.DataFrame({'date': close_stock['trunc_time'],
                       'original_close': close_stock['ma_5'],
                       'train_predicted_close': trainPredictPlot[:, 0].tolist(),
                       'test_predicted_close': testPredictPlot[:, 0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

(418, 1)
Train predicted data:  (537, 1)
Test predicted data:  (537, 1)
