<a href="https://colab.research.google.com/github/msbeigi/data_exercise/blob/main/Hyperparameter_Optuna/Hyperparameters_withOptuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install optuna

In [4]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

import optuna

### Loading Healthexp dataset


In [5]:
healthexp=sns.load_dataset('healthexp')
healthexp.dropna(inplace=True)

In [6]:
X = healthexp.drop(columns=['Life_Expectancy'])
y = healthexp['Life_Expectancy']

In [7]:
X = pd.get_dummies(X, drop_first=True)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Trial with optuna

In [15]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
    max_features = trial.suggest_categorical(name='max_features',choices=[ 'sqrt', 'log2',None])  #

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # this is the objective_value to optimize
    return rmse

In [16]:
study = optuna.create_study(direction='minimize')


[I 2024-08-14 14:18:03,028] A new study created in memory with name: no-name-7fbdeada-5b07-4f2e-9bea-82991da39194


In [None]:
study.optimize(objective, n_trials=50)


In [18]:
best_trial = study.best_trial
print(f'Best trial: Value: {best_trial.value}, Params: {best_trial.params}')

Best trial: Value: 0.29221762538451596, Params: {'n_estimators': 921, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}


### RandomForestRegressor with default values

In [19]:
default_model = RandomForestRegressor(random_state=42)
default_model.fit(X_train, y_train)
y_pred_default = default_model.predict(X_test)
rmse_default = np.sqrt(mean_squared_error(y_test, y_pred_default))


### RnadomForestRegressor with optuna bestparams

In [20]:
optimized_model = RandomForestRegressor(
    n_estimators=best_trial.params['n_estimators'],
    max_depth=best_trial.params['max_depth'],
    min_samples_split=best_trial.params['min_samples_split'],
    min_samples_leaf=best_trial.params['min_samples_leaf'],
    max_features=best_trial.params['max_features'],
    random_state=42
)
optimized_model.fit(X_train, y_train)
y_pred_optimized = optimized_model.predict(X_test)
rmse_optimized = np.sqrt(mean_squared_error(y_test, y_pred_optimized))

print(f"Default Model RMSE: {rmse_default}")
print(f"Optimized Model RMSE: {rmse_optimized}")

Default Model RMSE: 0.3953488448078476
Optimized Model RMSE: 0.29221762538451596


### Visualization

In [24]:
fig = optuna.visualization.plot_optimization_history(study)
fig.update_layout(
    width=1000,
    height=600
)
fig.show()

In [25]:
fig=optuna.visualization.plot_parallel_coordinate(study)
fig.update_layout(
    width=1000,
    height=600
)
fig.show()





1.   **max_depth:** *the darker dots are clustered around lower max_depth values, it suggests that shallower trees (trees with fewer levels) are performing better on this task, which is max_depth=10.*
2.   **max_features:** *the better performance (lower rmse) is also clustered on 'sqrt'. *
3.  **n_estimators:** *as it's depicted the higher number of stimators the better the rmse result, so the best param is 921 for this study.*



In [23]:
optuna.visualization.plot_slice(study,params=['n_estimators',  'max_depth','min_samples_split','min_samples_leaf','max_features'])