In [1]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('diabetes.csv')
df.head()
df.shape

(768, 9)

In [3]:
import numpy as np

cols_with_missing_values = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_values] = df[cols_with_missing_values].replace(0, np.nan)

df.fillna(df.mean(), inplace=True)

print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 2, 100)
    max_depth = trial.suggest_int('max_depth', 1, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    return score

In [6]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

[I 2025-01-24 09:20:18,364] A new study created in memory with name: no-name-a57a4d83-06b4-4e97-a69d-c413a0c3255b
[I 2025-01-24 09:20:19,882] Trial 0 finished with value: 0.7720511795281888 and parameters: {'n_estimators': 87, 'max_depth': 15, 'min_samples_split': 12, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7720511795281888.
[I 2025-01-24 09:20:21,071] Trial 1 finished with value: 0.7622417699586832 and parameters: {'n_estimators': 64, 'max_depth': 13, 'min_samples_split': 19, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7720511795281888.
[I 2025-01-24 09:20:21,790] Trial 2 finished with value: 0.7704251632680262 and parameters: {'n_estimators': 45, 'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.7720511795281888.
[I 2025-01-24 09:20:23,227] Trial 3 finished with value: 0.7491936558709849 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 16, 'min_samples_leaf': 18}. Best is trial 0 with val

In [7]:
print(study.best_params)   
print(study.best_value)

{'n_estimators': 69, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 2}
0.7818339330934292


In [8]:
from sklearn.metrics import accuracy_score


best_model = RandomForestClassifier(**study.best_params, random_state=42)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.7597402597402597


# Optuna Visualization

In [19]:
# for visualization

from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour, plot_parallel_coordinate, plot_edf, plot_intermediate_values

In [10]:
plot_optimization_history(study).show()

In [18]:
plot_param_importances(study).show()

NameError: name 'plot_param_importances' is not defined

In [20]:
from optuna.visualization import plot_param_importances

plot_param_importances(study).show()

In [21]:
from optuna.visualization import plot_slice

plot_slice(study).show()

# Dynamic Search Space

In [22]:
#import decession tree, random forest and xgboost,support vector machine, logistic regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Import necessary libraries
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load and prepare data
df = pd.read_csv('diabetes.csv')  # Replace with your dataset path
X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', ['DecisionTree', 'RandomForest', 'XGBoost', 'SVM', 'LogisticRegression'])

    if classifier_name == 'DecisionTree':
        # Define hyperparameters to search
        max_depth = trial.suggest_int('dt_max_depth', 1, 20)
        min_samples_split = trial.suggest_int('dt_min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('dt_min_samples_leaf', 1, 20)
        max_features = trial.suggest_categorical('dt_max_features', [None, 'sqrt', 'log2'])
        criterion = trial.suggest_categorical('dt_criterion', ['gini', 'entropy'])

        # Create Decision Tree model
        model = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            criterion=criterion,
            random_state=42
        )

    elif classifier_name == 'RandomForest':
        # Define hyperparameters to search
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 500)
        max_depth = trial.suggest_int('rf_max_depth', 2, 20)
        min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('rf_min_samples_leaf', 1, 20)
        max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None])
        bootstrap = trial.suggest_categorical('rf_bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == 'XGBoost':
        # Define hyperparameters to search
        n_estimators = trial.suggest_int('xgb_n_estimators', 50, 500)
        max_depth = trial.suggest_int('xgb_max_depth', 2, 20)
        min_child_weight = trial.suggest_int('xgb_min_child_weight', 1, 20)
        subsample = trial.suggest_float('xgb_subsample', 0.5, 1.0)
        colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)
        learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 0.3)
        gamma = trial.suggest_float('xgb_gamma', 0, 10)

        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            min_child_weight=min_child_weight,
            gamma=gamma,
            random_state=42
        )

    elif classifier_name == 'SVM':
        # Define hyperparameters to search
        C = trial.suggest_float('svm_C', 1e-3, 1e3)
        kernel = trial.suggest_categorical('svm_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        
        # Use separate parameter names for categorical and float gamma
        if kernel in ['linear', 'sigmoid']:
            gamma = trial.suggest_categorical('svm_gamma_categorical', ['scale', 'auto'])
        else:
            gamma = trial.suggest_float('svm_gamma_float', 0.001, 10)

        degree = trial.suggest_int('svm_degree', 2, 5) if kernel == 'poly' else 3
        coef0 = trial.suggest_float('svm_coef0', 0.0, 1.0) if kernel in ['poly', 'sigmoid'] else 0.0

        model = SVC(
            C=C,
            kernel=kernel,
            degree=degree,
            gamma=gamma,
            coef0=coef0,
            random_state=42
        )
    elif classifier_name == 'LogisticRegression':
        # Define hyperparameters to search
        C = trial.suggest_float('lr_C', 1e-3, 1e3)
        penalty = trial.suggest_categorical('lr_penalty', ['l1', 'l2'])
        solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

        model = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            random_state=42
        )
    
    score = cross_val_score(model, X_train, y_train, cv=5).mean()

    return score

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-01-24 10:12:07,317] A new study created in memory with name: no-name-718f4572-a4b0-48b5-b80b-e9664bb884da


(614, 8) (154, 8) (614,) (154,)


[I 2025-01-24 10:12:08,687] Trial 0 finished with value: 0.7720111955217913 and parameters: {'classifier': 'XGBoost', 'xgb_n_estimators': 211, 'xgb_max_depth': 13, 'xgb_min_child_weight': 15, 'xgb_subsample': 0.6652962324455786, 'xgb_colsample_bytree': 0.7135539575676481, 'xgb_learning_rate': 0.275834472970287, 'xgb_gamma': 4.020073359456148}. Best is trial 0 with value: 0.7720111955217913.
[I 2025-01-24 10:12:14,073] Trial 1 finished with value: 0.7818072770891643 and parameters: {'classifier': 'RandomForest', 'rf_n_estimators': 455, 'rf_max_depth': 20, 'rf_min_samples_split': 19, 'rf_min_samples_leaf': 13, 'rf_max_features': 'sqrt', 'rf_bootstrap': False}. Best is trial 1 with value: 0.7818072770891643.
[I 2025-01-24 10:12:15,804] Trial 2 finished with value: 0.7687591630014661 and parameters: {'classifier': 'XGBoost', 'xgb_n_estimators': 336, 'xgb_max_depth': 5, 'xgb_min_child_weight': 15, 'xgb_subsample': 0.5855270079274598, 'xgb_colsample_bytree': 0.6964674110655664, 'xgb_learning

In [6]:
best_trail = study.best_trial

print('Best trial:')
print(best_trail.params)
print('Best score:')
print(best_trail.value)

Best trial:
{'classifier': 'RandomForest', 'rf_n_estimators': 435, 'rf_max_depth': 19, 'rf_min_samples_split': 20, 'rf_min_samples_leaf': 14, 'rf_max_features': 'sqrt', 'rf_bootstrap': False}
Best score:
0.7850593096094897
