In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

import pandas as pd

In [42]:
# Import data
df = pd.read_csv('./source/dataset.csv', index_col=0)
X, y = df[['RNN', 'GRU']], df['Actual']

# Get 1000 samples 
X_train, y_train = X[:1000], y[:1000]
X_test, y_test = X[1000:], y[1000:]

In [43]:
def iterate_prediction(clf, X_train, y_train, X_test, y_test, step=5):
    # Train the model
    clf.fit(X_train, y_train)
    # Create prediction result dataframe [predicted, ...]
    results = []

    # Iterate prediction and train the model again in every 5 steps
    for i in range(step, len(X_test) + 1, step):
        # Predict the next steps
        y_pred = clf.predict(X_test[:i])
        # Store the prediction result
        results.extend(y_pred[-step:])  # Only store the last 5 predictions
        # Update the model
        X_train = np.concatenate([X_train, X_test[:i]])
        y_train = np.concatenate([y_train, y_test[:i]])
        clf.fit(X_train, y_train)
    
    # Calculate the RMSE
    rmse = calculate_rmse(y_test, results)
    return (results, rmse)


def calculate_rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [44]:
# Base model
clf = RandomForestRegressor(random_state=0)
results, rmse = iterate_prediction(clf, X_train, y_train, X_test, y_test, step=100)
print('Base model RMSE: ', rmse)



Base model RMSE:  0.02628922376649094


In [63]:
# Hyperparameter tuning
# Best Parameters: OrderedDict([('bootstrap', False), ('criterion', 'friedman_mse'), ('max_depth', 6), ('max_features', 'sqrt'), ('max_leaf_nodes', 14), ('min_impurity_decrease', 0.0002966630673990501), ('min_samples_leaf', 6), ('min_samples_split', 15), ('n_estimators', 963)])

clf = RandomForestRegressor(
    bootstrap=False,
    criterion='friedman_mse',
    max_depth=6,
    max_features='log2',
    max_leaf_nodes=14,
    min_impurity_decrease=0.0002966630673990501,
    min_samples_leaf=6,
    min_samples_split=15,
    n_estimators=963,
    random_state=0
)
results, rmse = iterate_prediction(clf, X_train, y_train, X_test, y_test, step=1500)
print('Tuned model RMSE: ', rmse)

# Save the result to CSV
# df = pd.DataFrame(results, columns=['Prediction'])
# df.to_csv('./source/random_forest.csv', index=False)

Tuned model RMSE:  0.024967464620108824


Best

clf = RandomForestRegressor(
    bootstrap=False,
    criterion='friedman_mse',
    max_depth=6,
    max_features='sqrt',
    max_leaf_nodes=14,
    min_impurity_decrease=0.0002966630673990501,
    min_samples_leaf=6,
    min_samples_split=15,
    n_estimators=963,
    random_state=0
)

Tuned model RMSE:  0.024877229636657863