In [1]:
# Import modules

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
import time
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and prepare data

df = pd.read_csv("titanic-lg.csv")

label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex']) # category encoding
df['Embarked'] = label_encoder.fit_transform(df['Embarked']) # category encoding
df.drop(columns=["Cabin","Name","Ticket"],inplace=True) # deleting unnecessary columns
df["Age"].fillna(df["Age"].mean(), inplace=True) # filling null values
df["Fare"].fillna(df["Fare"].mean(),inplace=True) # filling null values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True) # filling null values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Fare"].fillna(df["Fare"].mean(),inplace=True) # filling null values


In [3]:
# Split data to train and test sets

x=df.drop(columns=["Survived"])
y=df["Survived"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [4]:
# Defining objective function

def objective(trial: optuna.Trial):
    global x_train, x_test, y_train, y_test

    md = trial.suggest_int('max_depth', 2, 64)
    mi = trial.suggest_int('min_inst', 1, 32)
    
    clf =  DecisionTreeClassifier(max_depth=md, min_samples_leaf=mi, random_state=1, )
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    
    return f1_score(y_test, y_pred)


In [6]:
study = optuna.create_study(study_name="DecisionTreeClassifier", direction='maximize')

optimization_time_in_seconds = -time.time()

study.optimize(objective, n_trials=10)

optimization_time_in_seconds += time.time()

[I 2024-08-10 10:10:03,445] A new study created in memory with name: DecisionTreeClassifier
[I 2024-08-10 10:10:20,046] Trial 0 finished with value: 0.818681567894627 and parameters: {'max_depth': 31, 'min_inst': 13}. Best is trial 0 with value: 0.818681567894627.
[I 2024-08-10 10:10:37,311] Trial 1 finished with value: 0.7750344556014963 and parameters: {'max_depth': 49, 'min_inst': 1}. Best is trial 0 with value: 0.818681567894627.
[I 2024-08-10 10:10:51,695] Trial 2 finished with value: 0.8250301891246515 and parameters: {'max_depth': 38, 'min_inst': 29}. Best is trial 2 with value: 0.8250301891246515.
[I 2024-08-10 10:11:08,211] Trial 3 finished with value: 0.8117315376713413 and parameters: {'max_depth': 45, 'min_inst': 10}. Best is trial 2 with value: 0.8250301891246515.
[I 2024-08-10 10:11:23,365] Trial 4 finished with value: 0.8190984902163841 and parameters: {'max_depth': 28, 'min_inst': 13}. Best is trial 2 with value: 0.8250301891246515.
[I 2024-08-10 10:11:34,236] Trial 5 f

In [7]:
print("Best params", study.best_params)
print("Best value", study.best_value)
print("Optimization time in seconds", optimization_time_in_seconds)

Best params {'max_depth': 15, 'min_inst': 19}
Best value 0.8301263925479432
Optimization time in seconds 155.22390222549438
