# Optuna Experiments

This file contains the code for the experiments conducted using Optuna on both the classification and regression datasets.

In [16]:
# Import required modules
import optuna
import time
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, accuracy_score, precision_score, recall_score

In [17]:
# Set random seed
RANDOM_SEED = 3

In [18]:
# Function for calculating elapsed time
def print_elapsed_time(start, end):
    elapsed_time = end - start
    minutes = int(elapsed_time // 60)
    seconds = int(elapsed_time % 60)
    print("Elapsed time: {} minutes, {} seconds".format(minutes, seconds))

## Hospital Readmissions (Classification)

In this section, we run Optuna on our classification dataset. In Optuna, we define an objective we are trying to maximize or minimize (in this example, accuracy score) and then create a study that runs some number of trials (here 100) in order to attempt to optimize based on our specified objective.

In [None]:
# Read in data
readmissions = pd.read_csv('../data/classification/readmissions_clean.csv')

# Split dataset into X and Y
X = readmissions.drop(['readmitted'], axis=1)
y = readmissions.readmitted

# splitting X and Y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=RANDOM_SEED, test_size=0.2)

In [19]:
# Define an objective to maximize or minimize (here, we maximize accuracy)
def objective(trial):
    # Use ranges of parameters equal to the range covered by grid search
    n_estimators = trial.suggest_int('n_estimators', 50, 300, 1)
    max_depth = trial.suggest_int('max_depth', 5, 15, 1)
    max_features = trial.suggest_int('max_features', 3, 10, 1)

    # Train and fit RFC
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features= max_features, random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)

    # Make and score predictions
    pred=rf.predict(X_test)
    score = accuracy_score(y_test,pred)
    
    return score

# Run and time optimization
start = time.time()                                
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
end = time.time()

[32m[I 2023-04-20 13:28:14,701][0m A new study created in memory with name: no-name-7710ea6f-c0ec-481a-a61a-43bb349bc5a4[0m
[32m[I 2023-04-20 13:28:19,184][0m Trial 0 finished with value: 0.6246 and parameters: {'n_estimators': 252, 'max_depth': 13, 'max_features': 7}. Best is trial 0 with value: 0.6246.[0m
[32m[I 2023-04-20 13:28:21,016][0m Trial 1 finished with value: 0.6248 and parameters: {'n_estimators': 121, 'max_depth': 10, 'max_features': 8}. Best is trial 1 with value: 0.6248.[0m
[32m[I 2023-04-20 13:28:24,906][0m Trial 2 finished with value: 0.623 and parameters: {'n_estimators': 219, 'max_depth': 12, 'max_features': 8}. Best is trial 1 with value: 0.6248.[0m
[32m[I 2023-04-20 13:28:28,378][0m Trial 3 finished with value: 0.627 and parameters: {'n_estimators': 177, 'max_depth': 11, 'max_features': 10}. Best is trial 3 with value: 0.627.[0m
[32m[I 2023-04-20 13:28:28,802][0m Trial 4 finished with value: 0.629 and parameters: {'n_estimators': 75, 'max_depth': 6

We see that the Optuna optimization took 2 minutes and 43 seconds

In [20]:
# Display time elapsed
print_elapsed_time(start,end)

Elapsed time: 2 minutes, 43 seconds


We can also can view the optimal parameters that Optuna found across 100 trials:

In [21]:
# Display results of best trial
study.best_trial

FrozenTrial(number=29, state=TrialState.COMPLETE, values=[0.631], datetime_start=datetime.datetime(2023, 4, 20, 13, 29, 4, 200347), datetime_complete=datetime.datetime(2023, 4, 20, 13, 29, 5, 734533), params={'n_estimators': 147, 'max_depth': 8, 'max_features': 6}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=300, log=False, low=50, step=1), 'max_depth': IntDistribution(high=15, log=False, low=5, step=1), 'max_features': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=29, value=None)

The best trial has the following parameters, so we then use this optimized set to apply again to our model and get out the final metric scores for Optuna.

`params={'n_estimators': 147, 'max_depth': 8, 'max_features': 6}`

In [22]:
# Re-fit classifier with optimal parameters
rf = RandomForestClassifier(max_depth=8 ,max_features=6 ,n_estimators=147)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Calculate and print metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))

Accuracy: 0.6274
Precision: 0.6320916905444126
Recall: 0.47461273666092946


## Car Emissions Data (Regression)

For regression, we complete similar steps, instead using one of our regression-specific metrics, mean squared error (MSE). To optimize this metric, we want it to be as small as possible, so instead of maximizing for the objective (like we did with classification), we minimize.

In [None]:
# Read in data
emissions = pd.read_csv("../data/regression/emissions_cleaned.csv")

# Split dataset into X and Y
X = emissions.drop('co2_emissions', axis=1)
y = emissions["co2_emissions"]

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=3, test_size=0.2)

In [12]:
# Define an objective to maximize or minimize (here, we minimize MSE)

def objective(trial):
    # Use ranges of parameters equal to the range covered by grid search
    n_estimators = trial.suggest_int('n_estimators', 50, 300, 1)
    max_depth = trial.suggest_int('max_depth', 5, 15, 1)
    max_features = trial.suggest_int('max_features', 3, 10, 1)
    
    # Train and fit RFR
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features= max_features, random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)

    # Make and score predictions
    pred=rf.predict(X_test)
    score = mean_squared_error(y_test,pred)
    
    return score

# Run and time optimization
start = time.time()                              
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
end = time.time()

[32m[I 2023-04-20 13:23:47,899][0m A new study created in memory with name: no-name-68666d82-200c-418c-9de9-5b5c709f7b57[0m
[32m[I 2023-04-20 13:23:49,788][0m Trial 0 finished with value: 10.24061001682512 and parameters: {'n_estimators': 295, 'max_depth': 14, 'max_features': 9}. Best is trial 0 with value: 10.24061001682512.[0m
[32m[I 2023-04-20 13:23:50,400][0m Trial 1 finished with value: 16.40074208896611 and parameters: {'n_estimators': 229, 'max_depth': 9, 'max_features': 4}. Best is trial 0 with value: 10.24061001682512.[0m
[32m[I 2023-04-20 13:23:50,925][0m Trial 2 finished with value: 14.032112340024373 and parameters: {'n_estimators': 181, 'max_depth': 11, 'max_features': 3}. Best is trial 0 with value: 10.24061001682512.[0m
[32m[I 2023-04-20 13:23:52,246][0m Trial 3 finished with value: 10.083259155266468 and parameters: {'n_estimators': 262, 'max_depth': 12, 'max_features': 8}. Best is trial 3 with value: 10.083259155266468.[0m
[32m[I 2023-04-20 13:23:53,046

We see that the Optuna optimization took 1 minute and 1 second

In [13]:
# Display time elapsed
print_elapsed_time(start,end)

Elapsed time: 1 minutes, 1 seconds


We can also can view the optimal parameters that Optuna found across 100 trials:

In [14]:
# Display results of best trial
study.best_trial

FrozenTrial(number=84, state=TrialState.COMPLETE, values=[9.632951415122635], datetime_start=datetime.datetime(2023, 4, 20, 13, 24, 40, 747572), datetime_complete=datetime.datetime(2023, 4, 20, 13, 24, 41, 200687), params={'n_estimators': 65, 'max_depth': 15, 'max_features': 10}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=300, log=False, low=50, step=1), 'max_depth': IntDistribution(high=15, log=False, low=5, step=1), 'max_features': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=84, value=None)

The best trial has the following parameters, so we then use this optimized set to apply again to our model and get out the final metric scores for Optuna.

`params={'n_estimators': 65, 'max_depth': 15, 'max_features': 10}`

In [15]:
# Re-fit classifier with optimal parameters
rf = RandomForestRegressor(n_estimators=65, max_depth= 15, max_features= 10, random_state=RANDOM_SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Calculate and print metrics
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('Mean Absolute Percentage Error (MAPE):', mean_absolute_percentage_error(y_test, y_pred))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, y_pred))

Mean Absolute Error (MAE): 1.6875127999525794
Mean Absolute Percentage Error (MAPE): 0.007003344008677135
Mean Squared Error (MSE): 9.632951415122635
