In [1]:
import pandas as pd
from pandas.tseries.frequencies import to_offset

import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import math

from sklearn.preprocessing import MinMaxScaler


import time
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
segment_size = 100

# 1727 rows
data = pd.read_csv('./Dataset/HAG.csv')

# Calculate the number of segments in the dataset
print("len(data)", len(data))

data['ma_5'] = data['close_price'].ewm(span=3, adjust=False, min_periods=0).mean()

# Xóa bỏ các dòng có giá trị null do moving average
data.dropna(inplace=True)


len(data) 1720


In [3]:
closedf = data[['trunc_time','close_price','ma_5']]
print(closedf)
print("Shape of close dataframe:", closedf.shape)

      trunc_time  close_price          ma_5
0     2016-01-07        10100  10100.000000
1     2016-01-08        10200  10150.000000
2     2016-01-11        10900  10525.000000
3     2016-01-12        11300  10912.500000
4     2016-01-13        10900  10906.250000
...          ...          ...           ...
1715  2022-11-21         7750   7329.238613
1716  2022-11-22         7800   7564.619307
1717  2022-11-23         7350   7457.309653
1718  2022-11-24         7860   7658.654827
1719  2022-11-25         8410   8034.327413

[1720 rows x 3 columns]
Shape of close dataframe: (1720, 3)


In [4]:
import copy

close_stock = closedf.copy()
del closedf['trunc_time']

X_data = copy.deepcopy(closedf)
Y_data = copy.deepcopy(closedf)

del X_data['close_price']
del Y_data['ma_5']

scaler=MinMaxScaler(feature_range=(0,1))

X_data=scaler.fit_transform(np.array(X_data).reshape(-1,1))
Y_data=scaler.fit_transform(np.array(Y_data).reshape(-1,1))
print(X_data, X_data.shape)
print(Y_data, Y_data.shape)

[[0.59684186]
 [0.60081252]
 [0.63059241]
 ...
 [0.38697773]
 [0.40296717]
 [0.43280048]] (1720, 1)
[[0.57633588]
 [0.58396947]
 [0.63740458]
 ...
 [0.36641221]
 [0.40534351]
 [0.44732824]] (1720, 1)


In [5]:
del closedf['close_price']
closedf=scaler.fit_transform(np.array(closedf).reshape(-1,1))

print(closedf.shape)

(1720, 1)


In [6]:

def create_dataset(data_set):
    training_size=int(len(data_set)*0.8)
    train_data,test_data=data_set[0:training_size,:],data_set[training_size:len(data_set),:1]
    print("train_data: ", train_data)
    print("test_data: ", test_data)
    return train_data,test_data

In [7]:
# reshape into X=t,t+1,t+2,t+3 and Y=t+4
time_step = 10
X_train, X_test = create_dataset(X_data)
y_train, y_test = create_dataset(Y_data)

print("X_train: ", X_train)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)


train_data:  [[0.59684186]
 [0.60081252]
 [0.63059241]
 ...
 [0.23206454]
 [0.21989121]
 [0.21936345]]
test_data:  [[0.21433479]
 [0.19752611]
 [0.19865133]
 [0.19921395]
 [0.19989232]
 [0.19943737]
 [0.19206273]
 [0.19036073]
 [0.18911267]
 [0.19245929]
 [0.18817662]
 [0.1812705 ]
 [0.18655288]
 [0.18919407]
 [0.19210292]
 [0.19236615]
 [0.19368896]
 [0.1963357 ]
 [0.20004146]
 [0.21420336]
 [0.22088725]
 [0.22224386]
 [0.2225251 ]
 [0.22147453]
 [0.21658152]
 [0.2121497 ]
 [0.20834552]
 [0.21081115]
 [0.21124983]
 [0.20591026]
 [0.20324048]
 [0.19753787]
 [0.19468656]
 [0.19326091]
 [0.19334221]
 [0.19457406]
 [0.19518998]
 [0.19748327]
 [0.19982111]
 [0.20297536]
 [0.20137596]
 [0.19819387]
 [0.19779402]
 [0.19402051]
 [0.19689853]
 [0.19476396]
 [0.19369667]
 [0.19276596]
 [0.19825659]
 [0.20298723]
 [0.20257309]
 [0.21626331]
 [0.21357885]
 [0.21223662]
 [0.202433  ]
 [0.19951652]
 [0.19845534]
 [0.19752769]
 [0.19468147]
 [0.19325836]
 [0.19294387]
 [0.19358076]
 [0.19429627]
 [0

In [8]:
from sklearn.model_selection import KFold
import time
import psutil

# cross validation
# evaluate performance of model, can be used to be the target function in optuna
# ref: from sklearn.model_selection import cross_val_score
def cross_val_score(estimator, X, y, cv=5, scoring=None):
    scores = []
    
    # scoring: target function, if not provided it will be r2
    if scoring is None:
        scoring = r2_score
    
    cv_splitter = KFold(n_splits=cv, shuffle=True)
    
    for train_indices, test_indices in cv_splitter.split(X):
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
        
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)
        
        score = scoring(y_test, y_pred)
        scores.append(score)
    
    return np.mean(scores) # return average score of `cv` times run


# def r2_score(y_true, y_pred):
#     numerator = np.sum((y_true - y_pred) ** 2)
#     denominator = np.sum((y_true - np.mean(y_true)) ** 2)
#     r2 = 1 - (numerator / denominator)
#     return r2

def measure_system_metrics():
    cpu_percent = psutil.cpu_percent()
    memory_usage = psutil.virtual_memory().percent
    disk_usage = psutil.disk_usage('/').percent
    network_io = psutil.net_io_counters()
    
    return cpu_percent, memory_usage, disk_usage, network_io

In [9]:
from sklearn.metrics import mean_squared_error
import optuna

# Define the objective function for Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_int("max_features", 1, X_train.shape[1])


    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
    )

    scores = cross_val_score(model, X_train, y_train, cv=2)

    return scores

# Use Optuna to optimize hyperparameters
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=500)
best_params = study.best_params

rf = RandomForestRegressor(**best_params)

[I 2023-06-10 13:30:01,285] A new study created in memory with name: no-name-11577a59-4f42-4435-a664-3e36ae7690b1
[I 2023-06-10 13:30:02,068] Trial 0 finished with value: 0.993607080801167 and parameters: {'n_estimators': 148, 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 1}. Best is trial 0 with value: 0.993607080801167.
[I 2023-06-10 13:30:04,653] Trial 1 finished with value: 0.9939006061324758 and parameters: {'n_estimators': 430, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 1}. Best is trial 1 with value: 0.9939006061324758.
[I 2023-06-10 13:30:08,386] Trial 2 finished with value: 0.9942658965923352 and parameters: {'n_estimators': 795, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 1}. Best is trial 2 with value: 0.9942658965923352.
[I 2023-06-10 13:30:12,798] Trial 3 finished with value: 0.9936663847179144 and parameters: {'n_estimators': 831, 'max_depth': 22, 'min_samples_split': 

In [10]:
# regressor = RandomForestRegressor(max_depth=26, min_samples_leaf=2, min_samples_split=10,n_estimators=160, max_features=1)
rf.fit(X_train, y_train)

In [11]:
# Lets Do the prediction 

RF_train_predict=rf.predict(X_train)
RF_test_predict=rf.predict(X_test)

RF_train_predict = RF_train_predict.reshape(-1,1)
RF_test_predict = RF_test_predict.reshape(-1,1)

print("Train data prediction:", RF_train_predict.shape)
print("Test data prediction:", RF_test_predict.shape)

Train data prediction: (1376, 1)
Test data prediction: (344, 1)


In [12]:
# Transform back to original form

RF_train_predict = scaler.inverse_transform(RF_train_predict)
RF_test_predict = scaler.inverse_transform(RF_test_predict)
RF_original_ytrain = scaler.inverse_transform(y_train.reshape(-1,1)) 
RF_original_ytest = scaler.inverse_transform(y_test.reshape(-1,1)) 



In [13]:
# Evaluation metrices RMSE and MAE
RF_RMSE_train = math.sqrt(mean_squared_error(RF_original_ytrain,RF_train_predict))
RF_MSE_train = mean_squared_error(RF_original_ytrain,RF_train_predict)
RF_MAE_train = mean_absolute_error(RF_original_ytrain,RF_train_predict)

RF_RMSE_test = math.sqrt(mean_squared_error(RF_original_ytest,RF_test_predict))
RF_MSE_test = mean_squared_error(RF_original_ytest,RF_test_predict)
RF_MAE_test = mean_absolute_error(RF_original_ytest,RF_test_predict)

print("Train data RMSE: ", RF_RMSE_train)
print("Train data MSE: ", RF_MSE_train)
print("Test data MAE: ", RF_MAE_train)
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", RF_RMSE_test)
print("Test data MSE: ", RF_MSE_test)
print("Test data MAE: ", RF_MAE_test)

Train data RMSE:  88.72623724067076
Train data MSE:  7872.345174887791
Test data MAE:  61.565864340251146
-------------------------------------------------------------------------------------
Test data RMSE:  1153.8148132013755
Test data MSE:  1331288.6231629248
Test data MAE:  687.9875051849945


In [14]:
RF_EV_train = explained_variance_score(RF_original_ytrain, RF_train_predict)
RF_EV_test = explained_variance_score(RF_original_ytest, RF_test_predict)

print("Train data explained variance regression score:", RF_EV_train)
print("Test data explained variance regression score:", RF_EV_test)

Train data explained variance regression score: 0.9968883055091947
Test data explained variance regression score: 0.8918771839344459


In [15]:
RF_r2_train = r2_score(RF_original_ytrain, RF_train_predict)
RF_r2_test = r2_score(RF_original_ytest, RF_test_predict)

print("Train data R2 score:", RF_r2_train)
print("Test data R2 score:", RF_r2_test)

Train data R2 score: 0.9968883042502484
Test data R2 score: 0.8521220931113069


In [16]:
RF_MGD_train = mean_gamma_deviance(RF_original_ytrain, RF_train_predict)
RF_MGD_test = mean_gamma_deviance(RF_original_ytest, RF_test_predict)
RF_MPD_train = mean_poisson_deviance(RF_original_ytrain, RF_train_predict)
RF_MPD_test = mean_poisson_deviance(RF_original_ytest, RF_test_predict)
print("Train data MGD: ", RF_MGD_train)
print("Test data MGD: ", RF_MGD_test)
print("----------------------------------------------------------------------")
print("Train data MPD: ", RF_MPD_train)
print("Test data MPD: ",RF_MPD_test)

Train data MGD:  0.0002095899806901599
Test data MGD:  0.010095343529957203
----------------------------------------------------------------------
Train data MPD:  1.2424033690283693
Test data MPD:  115.47240227491736


In [17]:
print(closedf)


[[0.59684186]
 [0.60081252]
 [0.63059241]
 ...
 [0.38697773]
 [0.40296717]
 [0.43280048]]


In [18]:
# shift train predictions for plotting
from itertools import cycle
import plotly.express as px
print(RF_train_predict.shape)
print(RF_test_predict.shape)
print(len(closedf))
look_back=time_step

trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[0:len(RF_train_predict), :] = RF_train_predict

print("Train predicted data: ", trainPredictPlot)

# shift test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(RF_train_predict):len(closedf), :] = RF_test_predict

print("Test predicted data: ", testPredictPlot)

close_stock['Predictions']=testPredictPlot
close_stock['Stock']='HAG'
close_stock['Model']='SkLearn'
close_stock['Method']='MV_Opt'
close_stock.to_csv('./output/HAG_SkLearn_MV_Opt.csv', index=False)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])


plotdf = pd.DataFrame({'date': close_stock['trunc_time'],
                       'original_close': close_stock['ma_5'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

(1376, 1)
(344, 1)
1720
Train predicted data:  [[ 9784.92463717]
 [ 9790.83303528]
 [10261.00653963]
 ...
 [           nan]
 [           nan]
 [           nan]]
Test predicted data:  [[          nan]
 [          nan]
 [          nan]
 ...
 [7288.79729845]
 [7499.17021936]
 [7903.74419573]]
