In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import optuna


In [3]:
# Load the wine dataset and assemble a single frame: one column per entry in
# `data.feature_names`, plus the class label stored under 'target'.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Quick sanity check: first rows of the frame and the number of samples per class.
print("Dataset Head:\n", df.head())
print("Class Distribution:\n", df["target"].value_counts())

Dataset Head:
    alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  targe

In [4]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# 'target' column and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.2,
    stratify=df["target"],
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Baseline: a random forest with library-default hyperparameters (only the
# seed is fixed), evaluated once on the held-out test split.
baseline_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
baseline_preds = baseline_model.predict(X_test)

print("Baseline Accuracy:", accuracy_score(y_test, baseline_preds))
print("Baseline Classification Report:\n", classification_report(y_test, baseline_preds))


Baseline Accuracy: 1.0
Baseline Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [5]:
# Bayesian Optimization with Optuna
def objective(trial):
    """Score one random-forest configuration proposed by an Optuna trial.

    Four integer hyperparameters are drawn from the trial (inclusive ranges):
    ``n_estimators`` in [50, 300], ``max_depth`` in [3, 30],
    ``min_samples_split`` in [2, 20] and ``min_samples_leaf`` in [1, 10].

    Args:
        trial: Object exposing ``suggest_int(name, low, high)``; the study
            passes one in for every evaluation.

    Returns:
        Mean accuracy over 5-fold cross-validation on the notebook-level
        ``X_train`` / ``y_train``.
    """
    # Dict entries are evaluated top to bottom, so the parameters are
    # requested from the trial in a fixed order.
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    # Fixed seed: score differences between trials then reflect the
    # hyperparameters rather than forest randomness.
    forest = RandomForestClassifier(random_state=42, **search_space)

    fold_accuracies = cross_val_score(forest, X_train, y_train, cv=5, scoring='accuracy')
    return fold_accuracies.mean()

# Create a study that maximizes the value returned by `objective`.
# The sampler is seeded explicitly: without a seed the suggested
# hyperparameters (and hence the reported best trial) differ on every
# Restart & Run All. TPESampler is the sampler Optuna uses by default, so the
# only change is that the search sequence is now repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 30 trials sequentially. Sequential execution is what keeps the seeded
# sampler deterministic.
study.optimize(objective, n_trials=30)

[I 2025-02-25 13:32:18,876] A new study created in memory with name: no-name-38bc5b33-5eb8-48dd-9a7e-8c9b5f8cf3ef
[I 2025-02-25 13:32:18,003] Trial 0 finished with value: 0.9790640394088669 and parameters: {'n_estimators': 120, 'max_depth': 19, 'min_samples_split': 20, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.9790640394088669.
[I 2025-02-25 13:32:19,379] Trial 1 finished with value: 0.9790640394088669 and parameters: {'n_estimators': 243, 'max_depth': 8, 'min_samples_split': 13, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9790640394088669.
[I 2025-02-25 13:32:20,909] Trial 2 finished with value: 0.97192118226601 and parameters: {'n_estimators': 279, 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9790640394088669.
[I 2025-02-25 13:32:22,476] Trial 3 finished with value: 0.9790640394088669 and parameters: {'n_estimators': 267, 'max_depth': 21, 'min_samples_split': 12, 'min_samples_leaf': 1}. Best is trial 0 with value