In [1]:
import pandas as pd
from pandas.tseries.frequencies import to_offset

import numpy as np

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor


import time
import datetime
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the HAG price history and give the columns the names the later cells use.
data = pd.read_csv("./Dataset/HAG.csv").rename(
    columns={
        "trunc_time": "Date",
        "open_price": "open",
        "high_price": "high",
        "low_price": "low",
        "close_price": "Close",
    }
)

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Keep an unscaled Date/Close frame for the export and plots further down.
close_stock = data[['Date', 'Close']].copy()
print(close_stock)
print("Shape of close dataframe:", close_stock.shape)

# Scale the close prices into [0, 1]; `closedf` ends up as an (n, 1) ndarray.
# NOTE(review): the scaler is fitted on the whole series, before the
# train/test split performed in the next cell, so the test period's min/max
# influence the training features — consider fitting on the training part only.
scaler = MinMaxScaler(feature_range=(0, 1))
closedf = scaler.fit_transform(close_stock[['Close']].to_numpy())
print(closedf.shape)

# NOTE(review): this value is reassigned to 10 in a later cell before it is used.
time_step = 20

            Date  Close
0     2016-01-07  10100
1     2016-01-08  10200
2     2016-01-11  10900
3     2016-01-12  11300
4     2016-01-13  10900
...          ...    ...
1715  2022-11-21   7750
1716  2022-11-22   7800
1717  2022-11-23   7350
1718  2022-11-24   7860
1719  2022-11-25   8410

[1720 rows x 2 columns]
Shape of close dataframe: (1720, 2)
(1720, 1)


In [4]:
# Chronological split: the first 80% of rows are used for training, the rest for testing.
training_size = int(len(closedf) * 0.8)
test_size = len(closedf) - training_size

train_data = closedf[:training_size, :]
test_data = closedf[training_size:, :1]

print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)
train_data

train_data:  (1376, 1)
test_data:  (344, 1)


array([[0.57633588],
       [0.58396947],
       [0.63740458],
       ...,
       [0.21221374],
       [0.20229008],
       [0.2129771 ]])

In [5]:
# Turn a column of values into (window, next value) supervised-learning pairs.
def create_dataset(dataset, time_step=1):
    """Build sliding-window samples from column 0 of a 2-D array.

    Sample ``i`` is ``dataset[i:i + time_step, 0]`` and its target is
    ``dataset[i + time_step, 0]``, i.e. the value right after the window.

    Parameters
    ----------
    dataset : 2-D array supporting ``dataset[a:b, 0]`` indexing.
    time_step : number of consecutive values in each input window.

    Returns
    -------
    (X, y) : ``np.ndarray`` pair built from the windows and their targets.

    Note: the number of samples is ``len(dataset) - time_step - 1``, so the
    last window that would still have a target is not produced. Later cells
    rely on the resulting lengths when positioning predictions for plotting.
    """
    n_samples = len(dataset) - time_step - 1
    starts = range(n_samples)  # empty when the series is too short
    windows = [dataset[start:start + time_step, 0] for start in starts]
    targets = [dataset[start + time_step, 0] for start in starts]
    return np.array(windows), np.array(targets)


In [6]:
# Window both splits: every sample holds `time_step` consecutive scaled closes
# (X) and the target is the close that immediately follows the window (y).
time_step = 10
X_train, y_train = create_dataset(train_data, time_step=time_step)
X_test, y_test = create_dataset(test_data, time_step=time_step)

print(test_data.shape)
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)

(344, 1)
X_train:  (1365, 10)
y_train:  (1365,)
X_test:  (333, 10)
y_test (333,)


In [7]:
from sklearn.model_selection import KFold
import time
import psutil

# Cross-validation helper: scores a model on shuffled K-fold splits and returns
# the mean score, so it can be used directly as the Optuna objective value.
# Modelled on sklearn.model_selection.cross_val_score (which returns per-fold scores).
def cross_val_score(estimator, X, y, cv=5, scoring=None, random_state=None):
    """Return the mean score of ``estimator`` over ``cv`` shuffled K-fold splits.

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` and ``predict(X)``. It is refitted
        in place on every fold, so it is left fitted on the last fold's
        training part when the function returns.
    X, y : features and targets of equal length. NumPy arrays are used as-is;
        pandas objects and lists/tuples are converted to arrays because the
        folds are selected by positional fancy indexing.
    cv : number of folds passed to ``KFold(n_splits=cv)``.
    scoring : callable ``scoring(y_true, y_pred)``; defaults to ``r2_score``.
    random_state : seed forwarded to ``KFold`` so the shuffled folds (and thus
        the score) can be reproduced. ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    float : average of the per-fold scores.

    Raises
    ------
    ValueError : if ``X`` and ``y`` have different lengths.
    """
    # scoring: target function, r2 when not provided
    if scoring is None:
        scoring = r2_score

    # `X[train_indices]` must select rows by position. A DataFrame would look
    # the indices up as column labels and a list cannot be indexed by an array.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    elif isinstance(X, (list, tuple)):
        X = np.asarray(X)
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()
    elif isinstance(y, (list, tuple)):
        y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # NOTE(review): shuffled folds mix past and future rows; if the samples are
    # overlapping time-series windows this score is likely optimistic — confirm
    # whether an ordered split would be more appropriate.
    cv_splitter = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    scores = []
    for train_indices, test_indices in cv_splitter.split(X):
        estimator.fit(X[train_indices], y[train_indices])
        y_pred = estimator.predict(X[test_indices])
        scores.append(scoring(y[test_indices], y_pred))

    return np.mean(scores)  # average score over the `cv` folds


# def r2_score(y_true, y_pred):
#     numerator = np.sum((y_true - y_pred) ** 2)
#     denominator = np.sum((y_true - np.mean(y_true)) ** 2)
#     r2 = 1 - (numerator / denominator)
#     return r2

def measure_system_metrics():
    """Take a snapshot of host resource usage through psutil.

    Returns
    -------
    tuple : ``(cpu_percent, memory_usage, disk_usage, network_io)`` —
        ``psutil.cpu_percent()``, the ``percent`` field of
        ``psutil.virtual_memory()``, the ``percent`` field of
        ``psutil.disk_usage('/')`` and the object returned by
        ``psutil.net_io_counters()``.

    NOTE(review): ``cpu_percent()`` is called without an interval; check the
    psutil documentation for what the first such call reports.
    """
    return (
        psutil.cpu_percent(),
        psutil.virtual_memory().percent,
        psutil.disk_usage('/').percent,
        psutil.net_io_counters(),
    )

In [8]:
from sklearn.metrics import mean_squared_error
import optuna

# Objective function handed to Optuna
def objective(trial):
    """Score one random-forest configuration sampled from ``trial``.

    The hyperparameters are drawn with ``trial.suggest_int`` and the model is
    evaluated with ``cross_val_score`` (2 folds) on the module-level
    ``X_train`` / ``y_train``.

    Returns
    -------
    The value returned by ``cross_val_score`` for the sampled configuration.
    """
    # Dict entries are evaluated top to bottom, so the parameters are
    # suggested in the same order as they are listed here.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        # at most one candidate feature per column of the window
        'max_features': trial.suggest_int("max_features", 1, X_train.shape[1]),
    }

    model = RandomForestRegressor(**params)
    return cross_val_score(model, X_train, y_train, cv=2)

# Search the hyperparameter space with Optuna's TPE sampler, maximising the
# value returned by `objective`.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(),
    direction='maximize',
)
study.optimize(objective, n_trials=500)

# Build an unfitted regressor from the best trial's parameters.
best_params = study.best_params
rf = RandomForestRegressor(**best_params)

[I 2023-06-10 13:29:54,370] A new study created in memory with name: no-name-1429e240-500a-429b-bea5-21caa0a54c62
[I 2023-06-10 13:29:59,935] Trial 0 finished with value: 0.9873781161709472 and parameters: {'n_estimators': 398, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 10}. Best is trial 0 with value: 0.9873781161709472.
[I 2023-06-10 13:30:04,489] Trial 1 finished with value: 0.986521148622121 and parameters: {'n_estimators': 501, 'max_depth': 28, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 6}. Best is trial 0 with value: 0.9873781161709472.
[I 2023-06-10 13:30:06,623] Trial 2 finished with value: 0.986475264096483 and parameters: {'n_estimators': 247, 'max_depth': 28, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 5}. Best is trial 0 with value: 0.9873781161709472.
[I 2023-06-10 13:30:10,892] Trial 3 finished with value: 0.9834732217931828 and parameters: {'n_estimators': 600, 'max_depth': 10, 'min_samples_spli

In [9]:
start_time = time.time()

# System snapshot before the cross-validation run.
cpu_percent, memory_usage, disk_usage, network_io = measure_system_metrics()
print(
    f"CPU usage: {cpu_percent}%",
    f"Memory usage: {memory_usage}%",
    f"Disk usage: {disk_usage}%",
    f"Network I/O: {network_io}",
    sep="\n",
)

scores = cross_val_score(rf, X_train, y_train, cv=5)
print("Cross-validated my model r2:", scores)
end_time = time.time()

# System snapshot after the cross-validation run.
cpu_percent, memory_usage, disk_usage, network_io = measure_system_metrics()

# The interval starts before the first snapshot, so it also covers that
# snapshot and the prints above.
execution_time = end_time - start_time

print(
    f"Execution time: {execution_time} seconds",
    f"CPU usage: {cpu_percent}%",
    f"Memory usage: {memory_usage}%",
    f"Disk usage: {disk_usage}%",
    f"Network I/O: {network_io}",
    sep="\n",
)

# Final fit on the full training windows.
rf.fit(X_train, y_train)

CPU usage: 63.1%
Memory usage: 90.6%
Disk usage: 79.7%
Network I/O: snetio(bytes_sent=88983330, bytes_recv=1669374383, packets_sent=591089, packets_recv=1115162, errin=0, errout=7, dropin=0, dropout=0)
Cross-validated my model r2: 0.9876924587684368
Execution time: 22.67951798439026 seconds
CPU usage: 63.0%
Memory usage: 91.7%
Disk usage: 79.7%
Network I/O: snetio(bytes_sent=89043913, bytes_recv=1670553828, packets_sent=591702, packets_recv=1116232, errin=0, errout=7, dropin=0, dropout=0)


In [10]:
# Predict on both splits and shape each result as a column vector (n_samples, 1).
RF_train_predict = rf.predict(X_train).reshape(-1, 1)
RF_test_predict = rf.predict(X_test).reshape(-1, 1)

print("Train data prediction:", RF_train_predict.shape)
print("Test data prediction:", RF_test_predict.shape)

Train data prediction: (1365, 1)
Test data prediction: (333, 1)


In [11]:
# Map scaled predictions and targets back to the original price scale.
# NOTE(review): the prediction arrays are overwritten in place, so running
# this cell a second time would inverse-transform them twice.
RF_train_predict, RF_test_predict = map(
    scaler.inverse_transform,
    (RF_train_predict, RF_test_predict),
)
RF_original_ytrain, RF_original_ytest = map(
    scaler.inverse_transform,
    (y_train.reshape(-1, 1), y_test.reshape(-1, 1)),
)

In [12]:
# Evaluation metrics (RMSE, MSE and MAE) on the original price scale
RF_MSE_train = mean_squared_error(RF_original_ytrain, RF_train_predict)
RF_RMSE_train = math.sqrt(RF_MSE_train)  # RMSE derived from the MSE above
RF_MAE_train = mean_absolute_error(RF_original_ytrain, RF_train_predict)

RF_MSE_test = mean_squared_error(RF_original_ytest, RF_test_predict)
RF_RMSE_test = math.sqrt(RF_MSE_test)  # RMSE derived from the MSE above
RF_MAE_test = mean_absolute_error(RF_original_ytest, RF_test_predict)

print("Train data RMSE: ", RF_RMSE_train)
print("Train data MSE: ", RF_MSE_train)
# This line reports the train-split MAE, so it is labelled "Train", not "Test".
print("Train data MAE: ", RF_MAE_train)
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", RF_RMSE_test)
print("Test data MSE: ", RF_MSE_test)
print("Test data MAE: ", RF_MAE_test)

Train data RMSE:  129.13165283371518
Train data MSE:  16674.983763567143
Test data MAE:  88.18539424453996
-------------------------------------------------------------------------------------
Test data RMSE:  2169.138858082755
Test data MSE:  4705163.38564456
Test data MAE:  1477.3297012680778


In [13]:
# Explained variance of the predictions on each split (original price scale).
RF_EV_train = explained_variance_score(
    y_true=RF_original_ytrain,
    y_pred=RF_train_predict,
)
RF_EV_test = explained_variance_score(
    y_true=RF_original_ytest,
    y_pred=RF_test_predict,
)

print("Train data explained variance regression score:", RF_EV_train)
print("Test data explained variance regression score:", RF_EV_test)

Train data explained variance regression score: 0.9936524351103257
Test data explained variance regression score: 0.703324160697453


In [14]:
# R2 of the predictions on each split (original price scale).
RF_r2_train = r2_score(y_true=RF_original_ytrain, y_pred=RF_train_predict)
RF_r2_test = r2_score(y_true=RF_original_ytest, y_pred=RF_test_predict)

print("Train data R2 score:", RF_r2_train)
print("Test data R2 score:", RF_r2_test)

Train data R2 score: 0.9936523809365561
Test data R2 score: 0.5032936940332933


In [15]:
# Mean gamma deviance (MGD) on each split
RF_MGD_train = mean_gamma_deviance(y_true=RF_original_ytrain, y_pred=RF_train_predict)
RF_MGD_test = mean_gamma_deviance(y_true=RF_original_ytest, y_pred=RF_test_predict)

# Mean Poisson deviance (MPD) on each split
RF_MPD_train = mean_poisson_deviance(y_true=RF_original_ytrain, y_pred=RF_train_predict)
RF_MPD_test = mean_poisson_deviance(y_true=RF_original_ytest, y_pred=RF_test_predict)

print("Train data MGD: ", RF_MGD_train)
print("Test data MGD: ", RF_MGD_test)
print("----------------------------------------------------------------------")
print("Train data MPD: ", RF_MPD_train)
print("Test data MPD: ", RF_MPD_test)

Train data MGD:  0.00043764792554857614
Test data MGD:  0.04211330620007427
----------------------------------------------------------------------
Train data MPD:  2.599666272366832
Test data MPD:  443.0613468780001


In [16]:
# Position the train/test predictions on the full series, export them, and plot
from itertools import cycle
import plotly.express as px


print(RF_train_predict.shape)
print(RF_test_predict.shape)
print(len(closedf))

look_back = time_step

# Train predictions: sample i predicts row i + look_back of the series, so the
# block starts right after the first window. All other rows stay NaN.
trainPredictPlot = np.full_like(closedf, np.nan)
trainPredictPlot[look_back:look_back + len(RF_train_predict), :] = RF_train_predict
print("Train predicted data: ", trainPredictPlot)

# Test predictions: the train split is len(RF_train_predict) + look_back + 1
# rows long (per the loop bound in create_dataset), and the first test target
# lies look_back rows into the test split.
testPredictPlot = np.full_like(closedf, np.nan)
test_start = len(RF_train_predict) + (look_back * 2) + 1
testPredictPlot[test_start:len(closedf) - 1, :] = RF_test_predict

# Export the dates, closes and test-period predictions with identifying tags.
close_stock['Predictions'] = testPredictPlot
close_stock['Stock'] = 'HAG'
close_stock['Model'] = 'SkLearn'
close_stock['Method'] = 'Opt'
close_stock.to_csv('./output/HAG_SkLearn_Opt.csv', index=False)

# Legend labels, consumed in trace order further down.
names = cycle([
    'Original close price',
    'Train predicted close price',
    'Test predicted close price',
])

plotdf = pd.DataFrame({
    'date': close_stock['Date'],
    'original_close': close_stock['Close'],
    'train_predicted_close': trainPredictPlot.ravel().tolist(),
    'test_predicted_close': testPredictPlot.ravel().tolist(),
})

fig = px.line(
    plotdf,
    x=plotdf['date'],
    y=[
        plotdf['original_close'],
        plotdf['train_predicted_close'],
        plotdf['test_predicted_close'],
    ],
    labels={'value': 'Stock price', 'date': 'Date'},
)
fig.update_layout(
    title_text='Comparision between original close price vs predicted close price',
    plot_bgcolor='white',
    font_size=15,
    font_color='black',
    legend_title_text='Close Price',
)
# Rename each trace with the next label from `names`.
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

(1365, 1)
(333, 1)
1720
Train predicted data:  [[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]]
