In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

import pandas as pd

In [18]:
# Import data
df = pd.read_csv('./source/dataset.csv', index_col=0)
X, y = df[['RNN', 'LSTM', 'CNN', 'GRU']], df['Actual']
y_raw = df['Raw']
# Get 1000 samples 
X_train, y_train = X[:1000], y[:1000]
X_test, y_test = X[1000:], y_raw[1000:]

In [13]:
def iterate_prediction(clf, X_train, y_train, X_test, y_test, step=5):
    # Train the model
    clf.fit(X_train, y_train)
    # Create prediction result dataframe [predicted, ...]
    results = []

    # Iterate prediction and train the model again in every 5 steps
    for i in range(step, len(X_test) + 1, step):
        # Predict the next steps
        y_pred = clf.predict(X_test[:i])
        # Store the prediction result
        results.extend(y_pred[-step:])  # Only store the last 5 predictions
        # Update the model
        X_train = np.concatenate([X_train, X_test[:i]])
        y_train = np.concatenate([y_train, y_test[:i]])
        clf.fit(X_train, y_train)
    
    # Calculate the RMSE
    rmse = calculate_rmse(y_test, results)
    return (results, rmse)


def calculate_rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [14]:
# Base model
clf = RandomForestRegressor(random_state=0)
results, rmse = iterate_prediction(clf, X_train, y_train, X_test, y_test, step=100)
print('Base model RMSE: ', rmse)



Base model RMSE:  0.04966304316013401


In [95]:
# Hyperparameter tuning
# Best Parameters: OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 27), ('max_features', 'auto'), ('max_leaf_nodes', 17), ('min_impurity_decrease', 0.00035269496460264014), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 110)])

clf = RandomForestRegressor(
    bootstrap=True,
    criterion='friedman_mse',
    max_depth=27,
    max_features='auto',
    max_leaf_nodes=13,
    min_impurity_decrease=0.00035269496460264014,
    min_samples_leaf=10,
    min_samples_split=5,
    n_estimators=545,
    random_state=0,
    verbose=0,
)
results, rmse = iterate_prediction(clf, X_train, y_train, X_test, y_test, step=1500)
print('Tuned model RMSE: ', rmse)
# Tuned model RMSE:  0.047063465935930644

  warn(
  warn(


Tuned model RMSE:  0.047063465935930644


In [97]:
# Save the result to CSV
df_pred = pd.DataFrame(results, columns=['Prediction'])
df_pred.to_csv('./source/tuned_random_forest.csv', index=False)

In [None]:
# Find RMSE df_pred with y_test
rmse = calculate_rmse(y_test, results)
print("first 10 y_test: ", y_test[:10])
print("first 10 results: ", results[:10])

first 10 y_test:  Time
2023-09-08 04:02:40    0.113369
2023-09-08 04:02:45    0.125056
2023-09-08 04:02:50    0.140664
2023-09-08 04:02:55    0.146101
2023-09-08 04:03:00    0.138480
2023-09-08 04:03:05    0.128242
2023-09-08 04:03:10    0.125827
2023-09-08 04:03:15    0.132401
2023-09-08 04:03:20    0.143752
2023-09-08 04:03:25    0.155942
Name: Actual, dtype: float64
first 10 results:  [0.11299582887462649, 0.122433467133583, 0.14135958830491693, 0.14719729077633703, 0.15698268474867752, 0.12374000417937901, 0.1310137406698499, 0.13788535977607352, 0.1495079397434335, 0.16082462915146375]
