In [1]:
import requests
import json

import pandas as pd
from pandas.tseries.frequencies import to_offset

import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import math

from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

import time
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
# from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
segment_size = 100

# 1727 rows
data = pd.read_csv('../Dataset/STB.csv')

# Calculate the number of segments in the dataset
print("len(data)", len(data))
num_segments = int(len(data)/segment_size)
# Đọc vào tập dữ liệu bị phân mảnh ngang
data_chunks = []
for i in range(num_segments):
    chunk = pd.read_csv(f'../Dataset/STB.csv')
    data_chunks.append(chunk)
data = pd.concat(data_chunks, ignore_index=True)
data['date'] = pd.to_datetime(data.trunc_time)
# Áp dụng moving average với cửa sổ 5 và lưu vào cột 'ma_5'
data['ma_5'] = data['close_price'].rolling(window=5,min_periods=1).mean()

# Xóa bỏ các dòng có giá trị null do moving average
data.dropna(inplace=True)
# Chia tập dữ liệu thành tập huấn luyện và tập kiểm tra
# train_data = data.sample(frac=0.8, random_state=42)
# test_data = data.drop(train_data.index)

len(data) 1727


In [3]:
closedf = data[['trunc_time','close_price','ma_5']]
print(closedf)
print("Shape of close dataframe:", closedf.shape)

       trunc_time  close_price          ma_5
0      2016-01-04        12600  12600.000000
1      2016-01-05        12300  12450.000000
2      2016-01-06        12800  12566.666667
3      2016-01-07        12600  12575.000000
4      2016-01-08        12600  12580.000000
...           ...          ...           ...
29354  2022-11-21        16700  16450.000000
29355  2022-11-22        16900  16810.000000
29356  2022-11-23        17500  17080.000000
29357  2022-11-24        18200  17280.000000
29358  2022-11-25        18900  17640.000000

[29359 rows x 3 columns]
Shape of close dataframe: (29359, 3)


In [4]:
import copy

close_stock = closedf.copy()
del closedf['trunc_time']

X_data = copy.deepcopy(closedf)
Y_data = copy.deepcopy(closedf)

del X_data['close_price']
del Y_data['ma_5']

scaler=MinMaxScaler(feature_range=(0,1))

X_data=scaler.fit_transform(np.array(X_data).reshape(-1,1))
Y_data=scaler.fit_transform(np.array(Y_data).reshape(-1,1))

print(X_data, X_data.shape)
print(Y_data, Y_data.shape)

[[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.34010695]
 [0.34723708]
 [0.3600713 ]] (29359, 1)
[[0.18563923]
 [0.17513135]
 [0.19264448]
 ...
 [0.35726795]
 [0.38178634]
 [0.40630473]] (29359, 1)


In [5]:
# training_size=int(len(closedf)*0.8)
# test_size=len(closedf)-training_size
# train_data,test_data=closedf[0:training_size,:],closedf[training_size:len(closedf),:1]
# print("train_data: ", train_data.shape)
# print("test_data: ", test_data.shape)
# train_data

In [6]:
# convert an array of values into a dataset matrix
# def create_dataset(dataset, time_step=1):
#     dataX, dataY = [], []
#     for i in range(len(dataset)-time_step-1):
#         a = dataset[i:(i+time_step), 0]   ###i=0, 0,1,2,3-----99   100 
#         dataX.append(a)
#         dataY.append(dataset[i + time_step, 0])
#     return np.array(dataX), np.array(dataY)
def create_dataset(data_set):
    training_size=int(len(data_set)*0.8)
    # test_size=len(data_set)-training_size
    train_data,test_data=data_set[0:training_size,:],data_set[training_size:len(data_set),:1]
    print("train_data: ", train_data)
    print("test_data: ", test_data)
    return train_data,test_data

In [7]:
# reshape into X=t,t+1,t+2,t+3 and Y=t+4
time_step = 10
# print(type(X_data))
X_train, X_test = create_dataset(X_data)
y_train, y_test = create_dataset(Y_data)

print("X_train: ", X_train)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)
# print("type(X_test)",type(X_test))

train_data:  [[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.13333333]
 [0.13048128]
 [0.12905526]]
test_data:  [[0.1315508 ]
 [0.13368984]
 [0.14438503]
 ...
 [0.34010695]
 [0.34723708]
 [0.3600713 ]]
train_data:  [[0.18563923]
 [0.17513135]
 [0.19264448]
 ...
 [0.12434326]
 [0.13485114]
 [0.13660245]]
test_data:  [[0.15236427]
 [0.15061296]
 [0.17688266]
 ...
 [0.35726795]
 [0.38178634]
 [0.40630473]]
X_train:  [[0.18039216]
 [0.17504456]
 [0.1792038 ]
 ...
 [0.13333333]
 [0.13048128]
 [0.12905526]]
y_train:  (23487, 1)
X_test:  (5872, 1)
y_test (5872, 1)


In [8]:
class Node:
    def __init__(self, split_feature=None, split_value=None, left=None, right=None, prediction=None, is_leaf=False):
        self.split_feature = split_feature
        self.split_value = split_value
        self.left = left
        self.right = right
        self.prediction = prediction
        self.is_leaf = is_leaf

In [9]:
import numpy as np

class DecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _split(self, X, y):
        best_feature = None
        best_value = None
        best_score = float('inf')

        m, p = X.shape

        for feature in range(p):
            values = np.unique(X[:, feature])

            for value in values:
                left_indices = X[:, feature] <= value
                right_indices = X[:, feature] > value

                if len(y[left_indices]) < self.min_samples_leaf or len(y[right_indices]) < self.min_samples_leaf:
                    continue

                score = len(y[left_indices]) * np.var(y[left_indices]) + len(y[right_indices]) * np.var(y[right_indices])

                if score < best_score:
                    best_feature = feature
                    best_value = value
                    best_score = score

        return best_feature, best_value, best_score

    def _build_tree(self, X, y, node=None, depth=0):
        if node is None:
            node = Node()

        if depth == self.max_depth or len(y) < self.min_samples_split:
            node.is_leaf = True
            node.prediction = np.mean(y)
            return node

        split_feature, split_value, split_score = self._split(X, y)

        if split_feature is None:
            node.is_leaf = True
            node.prediction = np.mean(y)
            return node

        left_indices = X[:, split_feature] <= split_value
        right_indices = X[:, split_feature] > split_value

        node.split_feature = split_feature
        node.split_value = split_value

        node.left = self._build_tree(X[left_indices], y[left_indices], Node(), depth + 1)
        node.right = self._build_tree(X[right_indices], y[right_indices], Node(), depth + 1)

        return node
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])
    
    def _traverse_tree(self, x, node):
        if node.is_leaf:
            return node.prediction
        else:
            if x[node.split_feature] <= node.split_value:
                return self._traverse_tree(x, node.left)
            else:
                return self._traverse_tree(x, node.right)

In [10]:
import numpy as np

class RandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.trees = []
        
    def fit(self, X, y):
        for i in range(self.n_estimators):
            bootstrap_indices = np.random.choice(len(X), size=len(X), replace=True)
            bootstrap_X = X[bootstrap_indices]
            bootstrap_y = y[bootstrap_indices]
            
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                          min_samples_leaf=self.min_samples_leaf, max_features=self.max_features)
            tree.fit(bootstrap_X, bootstrap_y)
            self.trees.append(tree)
            
    def predict(self, X):
        return np.mean([tree.predict(X) for tree in self.trees], axis=0)

In [11]:
from sklearn.metrics import mean_squared_error
import optuna
# from sklearn.ensemble import RandomForestRegressor as SklearnRandom

# Define the objective function for Optuna
# def objective(trial):
#     n_estimators = trial.suggest_int('n_estimators', 10, 1000)
#     max_depth = trial.suggest_int('max_depth', 2, 32)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
#     max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
#     print(n_estimators)

#     model = RandomForestRegressor(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         max_features=max_features    
#     )

#     print(n_estimators)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)

#     return mse

# # Use Optuna to optimize hyperparameters
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=2)
# best_params = study.best_params

In [12]:
rf = RandomForestRegressor(max_depth=26, min_samples_leaf=2, min_samples_split=10,n_estimators=160, max_features=1)
# rf.fit(X_train, y_train)

In [13]:
# rf = SklearnRandom(max_depth=26, min_samples_leaf=2, min_samples_split=10,
#                       n_estimators=160)
rf.fit(X_train, y_train)

In [14]:
# Lets Do the prediction 

RF_train_predict=rf.predict(X_train)
RF_test_predict=rf.predict(X_test)
# print("Train data prediction:", train_predict)
# # print("Test data prediction:", test_predict)
RF_train_predict = RF_train_predict.reshape(-1,1)
RF_test_predict = RF_test_predict.reshape(-1,1)

print("Train data prediction:", RF_train_predict.shape)
print("Test data prediction:", RF_test_predict.shape)

Train data prediction: (23487, 1)
Test data prediction: (5872, 1)


In [15]:
# Transform back to original form

RF_train_predict = scaler.inverse_transform(RF_train_predict)
RF_test_predict = scaler.inverse_transform(RF_test_predict)
RF_original_ytrain = scaler.inverse_transform(y_train.reshape(-1,1)) 
RF_original_ytest = scaler.inverse_transform(y_test.reshape(-1,1)) 

In [16]:
# Evaluation metrices RMSE and MAE
RF_RMSE_train = math.sqrt(mean_squared_error(RF_original_ytrain,RF_train_predict))
RF_MSE_train = mean_squared_error(RF_original_ytrain,RF_train_predict)
RF_MAE_train = mean_absolute_error(RF_original_ytrain,RF_train_predict)

RF_RMSE_test = math.sqrt(mean_squared_error(RF_original_ytest,RF_test_predict))
RF_MSE_test = mean_squared_error(RF_original_ytest,RF_test_predict)
RF_MAE_test = mean_absolute_error(RF_original_ytest,RF_test_predict)

print("Train data RMSE: ", RF_RMSE_train)
print("Train data MSE: ", RF_MSE_train)
print("Test data MAE: ", RF_MAE_train)
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", RF_RMSE_test)
print("Test data MSE: ", RF_MSE_test)
print("Test data MAE: ", RF_MAE_test)

Train data RMSE:  254.65358511181233
Train data MSE:  64848.448410299054
Test data MAE:  127.64254871882568
-------------------------------------------------------------------------------------
Test data RMSE:  259.48371160170717
Test data MSE:  67331.79658659793
Test data MAE:  125.80121367665456


In [17]:
RF_EV_train = explained_variance_score(RF_original_ytrain, RF_train_predict)
RF_EV_test = explained_variance_score(RF_original_ytest, RF_test_predict)

print("Train data explained variance regression score:", RF_EV_train)
print("Test data explained variance regression score:", RF_EV_test)

Train data explained variance regression score: 0.9985886083637554
Test data explained variance regression score: 0.998708489581844


In [18]:
RF_r2_train = r2_score(RF_original_ytrain, RF_train_predict)
RF_r2_test = r2_score(RF_original_ytest, RF_test_predict)

print("Train data R2 score:", RF_r2_train)
print("Test data R2 score:", RF_r2_test)

Train data R2 score: 0.9985886082004637
Test data R2 score: 0.9987084383740871


In [19]:
RF_MGD_train = mean_gamma_deviance(RF_original_ytrain, RF_train_predict)
RF_MGD_test = mean_gamma_deviance(RF_original_ytest, RF_test_predict)
RF_MPD_train = mean_poisson_deviance(RF_original_ytrain, RF_train_predict)
RF_MPD_test = mean_poisson_deviance(RF_original_ytest, RF_test_predict)
print("Train data MGD: ", RF_MGD_train)
print("Test data MGD: ", RF_MGD_test)
print("----------------------------------------------------------------------")
print("Train data MPD: ", RF_MPD_train)
print("Test data MPD: ",RF_MPD_test)

Train data MGD:  0.00031349423851573845
Test data MGD:  0.00030491271417433177
----------------------------------------------------------------------
Train data MPD:  4.322252022372447
Test data MPD:  4.32141175239604


In [24]:
# shift train predictions for plotting
from itertools import cycle
import plotly.express as px
look_back=time_step
trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(RF_train_predict)+look_back, :] = RF_train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# shift test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(RF_train_predict)+(look_back*2)+1:len(closedf)-1, :] = RF_test_predict
print("Test predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])


plotdf = pd.DataFrame({'date': close_stock['trunc_time'],
                       'original_close': close_stock['ma_5'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (29359, 2)


ValueError: could not broadcast input array from shape (5872,1) into shape (5850,2)

In [None]:
x_input=test_data[len(test_data)-time_step:].reshape(1,-1)
temp_input=list(x_input)
temp_input=temp_input[0].tolist()

from numpy import array

lst_output=[]
n_steps=time_step
i=0
pred_days = 10
while(i<pred_days):
    
    if(len(temp_input)>time_step):
        
        x_input=np.array(temp_input[1:])
#         print("{} day input {}".format(i,x_input))
        x_input=x_input.reshape(1,-1)
        
        yhat = rf.predict(x_input)
#         print("{} day output {}".format(i,yhat))
        temp_input.extend(yhat.tolist())
        temp_input=temp_input[1:]
       
        lst_output.extend(yhat.tolist())
        i=i+1
        
    else:
        yhat = rf.predict(x_input)
        
        temp_input.extend(yhat.tolist())
        lst_output.extend(yhat.tolist())
        
        i=i+1
        
print("Output of predicted next days: ", len(lst_output))

Output of predicted next days:  10


In [None]:
last_days=np.arange(1,time_step+1)
day_pred=np.arange(time_step+1,time_step+pred_days+1)
print(last_days)
print(day_pred)

[ 1  2  3  4  5  6  7  8  9 10]
[11 12 13 14 15 16 17 18 19 20]


In [None]:
import copy

temp_mat = np.empty((len(last_days)+pred_days+1,1))
temp_mat[:] = np.nan
temp_mat = temp_mat.reshape(1,-1).tolist()[0]

last_original_days_value = copy.deepcopy(temp_mat)
next_predicted_days_value = copy.deepcopy(temp_mat)

last_original_days_value[0:time_step+1] = scaler.inverse_transform(closedf[len(closedf)-time_step:]).reshape(1,-1).tolist()[0]
next_predicted_days_value[time_step+1:] = scaler.inverse_transform(np.array(lst_output).reshape(-1,1)).reshape(1,-1).tolist()[0]
last_original_days_value.append(None)
names = cycle(['Last 15 days close price','Predicted next 10 days close price'])

new_pred_plot = pd.DataFrame({
    'last_original_days_value':last_original_days_value,
    'next_predicted_days_value':next_predicted_days_value
})

fig = px.line(new_pred_plot,x=new_pred_plot.index, y=[new_pred_plot['last_original_days_value'],
                                                      new_pred_plot['next_predicted_days_value']],
              labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(title_text='Compare last 15 days vs next 10 days',
                  plot_bgcolor='white', font_size=15, font_color='black',legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()