#### Tuning Linear Regression Hyperparameters 

In [5]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

import pandas as pd

In [6]:
# Import data
df = pd.read_csv('./source/dataset.csv', index_col=0)
X, y = df[['RNN', 'LSTM', 'CNN', 'GRU']], df['Actual']
y_raw = df['Raw']
# Get 1000 samples 
X_train = X[:1000]
y_train = y[:1000]
X_test = X[1000:]
# y_test = y[1000:]
y_test = y_raw[1000:]

In [7]:
search_space = {
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(3, 30),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
    'max_features': Categorical(['auto', 'sqrt', 'log2']),
    'max_leaf_nodes': Integer(10, 1000, "log-uniform"),
    'min_impurity_decrease': Real(0.0, 1e-1),
    'bootstrap': Categorical([True, False]),
    'criterion': Categorical(['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])  # Updated criterion values
}

# Create a RandomForestRegressor instance
rf_model = RandomForestRegressor(random_state=0)

# Custom scorer function to return RMSE
def rmse_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return -np.sqrt(mean_squared_error(y, y_pred))  # Negative RMSE for maximization

# Set up the BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=rf_model,
    search_spaces=search_space,
    n_iter=32,
    scoring=rmse_scorer,  # Using custom RMSE scorer
    n_jobs=-1,
    cv=10,
    random_state=42
)

# Perform the search
bayes_search.fit(X_train, y_train)

# Best parameters and score
best_params = bayes_search.best_params_
best_score = -bayes_search.best_score_  # Converting back to positive RMSE

print("Best Parameters:", best_params)
print("Best Score (RMSE):", best_score)


  warn(


Best Parameters: OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 27), ('max_features', 'auto'), ('max_leaf_nodes', 17), ('min_impurity_decrease', 0.00035269496460264014), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 110)])
Best Score (RMSE): 0.023842113184446394


In [9]:
# Print Score of the base model on the test set RMSE
base_model = RandomForestRegressor(random_state=0)
base_model.fit(X_train, y_train)
y_pred = base_model.predict(X_test)
base_score = np.sqrt(mean_squared_error(y_test, y_pred))
print("Base Model Score (RMSE):", base_score)

# Print Score of the model on the test set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
best_score = np.sqrt(mean_squared_error(y_test, y_pred))
print("Best Model Score (RMSE):", best_score)

Base Model Score (RMSE): 0.04743107288994787
Best Model Score (RMSE): 0.047208229161314404


#### Record

Best Parameters: OrderedDict([('bootstrap', True), ('criterion', 'friedman_mse'), ('max_depth', 27), ('max_features', 'auto'), ('max_leaf_nodes', 17), ('min_impurity_decrease', 0.00035269496460264014), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 110)])

Base Model Score (RMSE): 0.025487677133160157

Best Model Score (RMSE): 0.024724725455989666