<a href="https://colab.research.google.com/github/pickle-lotus0976/Learning/blob/main/PyTorch%20Notebooks/Optuna_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.3-py3-none-any.whl (246 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.3 colorlog-6.9.0 optuna-4.4.0


In [2]:
# Import the necessary libraries
import optuna
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the Pima Indians Diabetes dataset; the 'Outcome' column is the target.
# Remark: the diabetes dataset that ships with sklearn (load_diabetes) is a
# regression dataset, so the Pima CSV is read from this GitHub repository instead.
# Column names are supplied explicitly through `names=`.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, names=columns)
# Preview the first rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# In these columns a value of 0 is treated as a missing measurement:
# mark it as NaN first, then impute with the column mean.
cols_with_missing_values = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_values] = df[cols_with_missing_values].replace(0, np.nan)
# Rebind instead of using `inplace=True` so the imputation is an explicit assignment.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
df = df.fillna(df.mean())
# Sanity check: every column should now report 0 missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# Separate the feature columns from the target column
X = df.drop('Outcome', axis=1)
y = df['Outcome']
# Hold out 30% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training rows only, then apply the same transform to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (537, 8)
Shape of X_test: (231, 8)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(trial):
  """Score one Optuna trial of a random forest.

  Samples `n_estimators` (50-200) and `max_depth` (3-20) from `trial`, builds a
  RandomForestClassifier with random_state=42, and returns its mean accuracy
  over 3-fold cross-validation on the notebook-level X_train / y_train.
  """
  # Search space: the dict literal is evaluated top to bottom, so the trial is
  # asked for n_estimators first and max_depth second.
  sampled = {
      'n_estimators': trial.suggest_int('n_estimators', 50, 200),
      'max_depth': trial.suggest_int('max_depth', 3, 20),
  }
  forest = RandomForestClassifier(random_state=42, **sampled)

  # The mean accuracy across the 3 folds is the value handed back to Optuna
  fold_accuracies = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
  return fold_accuracies.mean()

# Using TPE Sampler

In [7]:
# Create a study that maximizes the objective (accuracy) with the TPE sampler.
# The sampler is seeded so that re-running the notebook proposes the same trials.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

[I 2025-07-10 06:59:30,471] A new study created in memory with name: no-name-8532ad0a-f4b5-4770-b176-f365ad5eb89c
[I 2025-07-10 06:59:31,967] Trial 0 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 200, 'max_depth': 12}. Best is trial 0 with value: 0.7653631284916201.
[I 2025-07-10 06:59:34,532] Trial 1 finished with value: 0.7858472998137803 and parameters: {'n_estimators': 121, 'max_depth': 15}. Best is trial 1 with value: 0.7858472998137803.
[I 2025-07-10 06:59:37,112] Trial 2 finished with value: 0.7765363128491619 and parameters: {'n_estimators': 167, 'max_depth': 7}. Best is trial 1 with value: 0.7858472998137803.
[I 2025-07-10 06:59:39,681] Trial 3 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 190, 'max_depth': 13}. Best is trial 1 with value: 0.7858472998137803.
[I 2025-07-10 06:59:42,728] Trial 4 finished with value: 0.7746741154562384 and parameters: {'n_estimators': 200, 'max_depth': 17}. Best is trial 1 with value: 0.78584

In [8]:
# Print the best accuracy
# (best_value is the highest objective value in the study, i.e. the mean 3-fold
# cross-validation accuracy on the training split - not a test-set score)
print(f"Best trial accuracy: {study.best_value}")
print(f"Best parameters: {study.best_params}")

Best trial accuracy: 0.7858472998137803
Best parameters: {'n_estimators': 121, 'max_depth': 15}


In [9]:
from sklearn.metrics import accuracy_score

# Rebuild the forest from the tuned hyperparameters and train it on the training split
best_model = RandomForestClassifier(random_state=42, **study.best_params).fit(X_train, y_train)

# Predict the held-out test split and compare against the true labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the test accuracy rounded to two decimals
print(f"Test accuracy with the best hyperparameters: {test_accuracy:.2f}")

Test accuracy with the best hyperparameters: 0.76


# Optuna Samplers

In [10]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score

# # Define the objective function
# def objective(trial):
#   # Create the search space for the required hyperparameters
#   n_estimators = trial.suggest_int('n_estimators', 50, 200)
#   max_depth = trial.suggest_int('max_depth', 3, 20)

#   # Create a RandomForestClassifier model with the suggested hyperparameters
#   model = RandomForestClassifier(
#       n_estimators=n_estimators,
#       max_depth=max_depth,
#       random_state=42
#   )

#   # Perform 3-fold cross-validation and calculate accuracy
#   score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
#   return score # Return the accuracy score for Optuna to maximize

# Using Random Sampler

In [11]:
# # Create a study and optimize the objective function using Optuna
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler()) # We are maximizing the accuracy and defining sampler
# study.optimize(objective, n_trials=50)

In [12]:
# # Print the best accuracy
# print(f"Best trial accuracy: {study.best_value}")
# print(f"Best parameters: {study.best_params}")

In [13]:
# from sklearn.metrics import accuracy_score

# # Train a RandomForestClassifier using the best hyperparameters from Optuna
# best_model = RandomForestClassifier(**study.best_params, random_state=42)

# # Fit the model to the training data
# best_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = best_model.predict(X_test)

# # Calculate the accuracy on the test set
# test_accuracy = accuracy_score(y_test, y_pred)

# # Print the test accuracy
# print(f"Test accuracy with the best hyperparameters: {test_accuracy}")

# Using Grid Sampler

In [14]:
# search_space = {
#     'n_estimators': [50, 100, 150, 200],
#     'max_depth': [5, 10, 15, 20]
# }

In [15]:
# # Create a study and optimize the objective function using Optuna
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.GridSampler(search_space)) # We are maximizing the accuracy and defining sampler
# study.optimize(objective)

In [16]:
# # Print the best accuracy
# print(f"Best trial accuracy: {study.best_value}")
# print(f"Best parameters: {study.best_params}")

In [17]:
# from sklearn.metrics import accuracy_score

# # Train a RandomForestClassifier using the best hyperparameters from Optuna
# best_model = RandomForestClassifier(**study.best_params, random_state=42)

# # Fit the model to the training data
# best_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = best_model.predict(X_test)

# # Calculate the accuracy on the test set
# test_accuracy = accuracy_score(y_test, y_pred)

# # Print the test accuracy
# print(f"Test accuracy with the best hyperparameters: {test_accuracy}")

# Optuna Visualizations

In [53]:
# Import the required libraries
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances, plot_intermediate_values

# Plot Optimization Graph

In [19]:
# Plot Optimization History
# `study` is whichever study was created last in the kernel; with the Random/Grid
# sampler cells commented out, that is the TPE study when run top to bottom.
plot_optimization_history(study).show()

In [20]:
# Parallel coordinate plot
# Built from the trials stored in `study`; .show() displays the returned figure.
plot_parallel_coordinate(study).show()

In [21]:
# Slice Plot
# Built from the trials stored in `study`; .show() displays the returned figure.
plot_slice(study).show()

In [22]:
# Contour plot
# Built from the trials stored in `study`; .show() displays the returned figure.
plot_contour(study).show()

In [23]:
# Parameters Importances
# Computed from the trials stored in `study`; .show() displays the returned figure.
plot_param_importances(study).show()

# Define by Run

In [38]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

def objective(trial):
  """Define-by-run objective: pick a classifier, then sample only its hyperparameters.

  The trial first chooses one of 'SVM', 'RandomForest' or 'GradientBoosting';
  the branch for that choice then asks the trial for the hyperparameters that
  belong to it and builds the model.

  Returns:
    Mean accuracy of the model over 3-fold cross-validation on the
    notebook-level X_train / y_train.

  Raises:
    ValueError: if the sampled classifier name has no matching branch.
  """

  classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

  if classifier_name == 'SVM':
    # SVM hyperparameters
    c = trial.suggest_float('C', 0.1, 100, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    model = SVC(C=c, kernel=kernel, gamma=gamma)

  elif classifier_name == 'RandomForest':
    # Random Forest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        random_state=42
    )

  elif classifier_name == 'GradientBoosting':
    # Gradient Boosting hyperparameters
    # (the names shared with the RandomForest branch use the same ranges there)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

  else:
    # Fail loudly if the choice list and the branches ever drift apart,
    # instead of reaching the scoring line with `model` unbound.
    raise ValueError(f"Unknown classifier: {classifier_name}")

  # Mean accuracy over the 3 folds is the value the study optimizes
  score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
  return score

In [39]:
# Create a study and optimize it with the TPE sampler (Optuna's default for
# single-objective studies), seeded so that re-runs propose the same trials.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

[I 2025-07-10 07:44:32,580] A new study created in memory with name: no-name-44e8a302-2efe-4e54-8169-f33d595fd8b0
[I 2025-07-10 07:44:32,776] Trial 0 finished with value: 0.7858472998137801 and parameters: {'classifier': 'SVM', 'C': 52.02659299508869, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.7858472998137801.
[I 2025-07-10 07:44:34,761] Trial 1 finished with value: 0.7728119180633147 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 137, 'learning_rate': 0.014700997776490114, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.7858472998137801.
[I 2025-07-10 07:44:36,282] Trial 2 finished with value: 0.7690875232774674 and parameters: {'classifier': 'RandomForest', 'n_estimators': 233, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 6, 'bootstrap': False}. Best is trial 0 with value: 0.7858472998137801.
[I 2025-07-10 07:44:40,060] Trial 3 finished with value: 0.756052141527002 and param

In [40]:
# Print the best accuracy
# (best_value is the highest mean 3-fold cross-validation accuracy in the study;
# best_params includes the chosen 'classifier' alongside its hyperparameters)
print(f"Best trial accuracy: {study.best_value}")
print(f"Best parameters: {study.best_params}")

Best trial accuracy: 0.7895716945996275
Best parameters: {'classifier': 'SVM', 'C': 0.1140215587286532, 'kernel': 'linear', 'gamma': 'scale'}


In [43]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Pull the winning trial out of the study
best_trial = study.best_trial

# Split its parameters into the classifier choice and that classifier's own hyperparameters
best_classifier_name = best_trial.params['classifier']
best_params = dict(best_trial.params)
del best_params['classifier']

# Look the estimator class up by name; every estimator is built with random_state=42
estimator_classes = {
  'SVM': SVC,
  'RandomForest': RandomForestClassifier,
  'GradientBoosting': GradientBoostingClassifier,
}
if best_classifier_name not in estimator_classes:
  raise ValueError(f"Unknown classifier: {best_classifier_name}")
best_model = estimator_classes[best_classifier_name](**best_params, random_state=42)

# Train on the training split, then predict the held-out test split
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Accuracy on the test split, reported with two decimals
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy with the best hyperparameters: {test_accuracy:.2f}")

Test accuracy with the best hyperparameters: 0.74


In [44]:
# One row per trial; parameters that a trial did not sample are left empty in its row
study.trials_dataframe().head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.785847,2025-07-10 07:44:32.582457,2025-07-10 07:44:32.776595,0 days 00:00:00.194138,52.026593,,SVM,auto,linear,,,,,,COMPLETE
1,1,0.772812,2025-07-10 07:44:32.777504,2025-07-10 07:44:34.761688,0 days 00:00:01.984184,,,GradientBoosting,,,0.014701,13.0,10.0,3.0,137.0,COMPLETE
2,2,0.769088,2025-07-10 07:44:34.764803,2025-07-10 07:44:36.282800,0 days 00:00:01.517997,,False,RandomForest,,,,17.0,6.0,4.0,233.0,COMPLETE
3,3,0.756052,2025-07-10 07:44:36.286256,2025-07-10 07:44:40.060924,0 days 00:00:03.774668,,,GradientBoosting,,,0.093976,9.0,2.0,8.0,242.0,COMPLETE
4,4,0.759777,2025-07-10 07:44:40.061932,2025-07-10 07:44:42.967574,0 days 00:00:02.905642,,,GradientBoosting,,,0.088005,6.0,5.0,2.0,291.0,COMPLETE


In [45]:
# Number of trials that sampled each classifier
study.trials_dataframe()['params_classifier'].value_counts()

Unnamed: 0_level_0,count
params_classifier,Unnamed: 1_level_1
SVM,79
GradientBoosting,11
RandomForest,10


In [47]:
# Mean objective value (3-fold CV accuracy) of the trials for each classifier
study.trials_dataframe().groupby('params_classifier')['value'].mean()

Unnamed: 0_level_0,value
params_classifier,Unnamed: 1_level_1
GradientBoosting,0.755544
RandomForest,0.760894
SVM,0.775711


In [48]:
# Optimization history of the define-by-run study (`study` was rebound when that study was created)
plot_optimization_history(study).show()

In [50]:
# Slice plot for the define-by-run study
plot_slice(study).show()