In [17]:
# Import modules

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
import time
import optuna

In [18]:
optimization_time_in_seconds = -time.time() # start the timer

In [19]:
# Load and prepare data

df = pd.read_csv("titanic-lg.csv")

label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex']) # category encoding
df['Embarked'] = label_encoder.fit_transform(df['Embarked']) # category encoding
df.drop(columns=["Cabin","Name","Ticket"],inplace=True) # deleting unnecessary columns
df["Age"].fillna(df["Age"].mean(), inplace=True) # filling null values
df["Fare"].fillna(df["Fare"].mean(),inplace=True) # filling null values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True) # filling null values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Fare"].fillna(df["Fare"].mean(),inplace=True) # filling null values


In [20]:
# Split data to train and test sets

x=df.drop(columns=["Survived"])
y=df["Survived"]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [21]:
# Defining objective function

def objective(trial: optuna.Trial):
    md = trial.suggest_int('max_depth', 2, 64)
    mi = trial.suggest_int('min_inst', 1, 32)
    
    clf =  DecisionTreeClassifier(max_depth=md, min_samples_leaf=mi, random_state=1, )
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    
    return f1_score(y_test, y_pred)


In [22]:
study = optuna.create_study(study_name="DecisionTreeClassifier", direction='maximize')

study.optimize(objective, n_trials=100)

[I 2024-08-10 16:31:25,127] A new study created in memory with name: DecisionTreeClassifier
[I 2024-08-10 16:31:25,777] Trial 0 finished with value: 0.631186135336059 and parameters: {'max_depth': 2, 'min_inst': 13}. Best is trial 0 with value: 0.631186135336059.
[I 2024-08-10 16:31:29,868] Trial 1 finished with value: 0.7936816198679942 and parameters: {'max_depth': 29, 'min_inst': 1}. Best is trial 1 with value: 0.7936816198679942.
[I 2024-08-10 16:31:30,740] Trial 2 finished with value: 0.713070000572538 and parameters: {'max_depth': 3, 'min_inst': 23}. Best is trial 1 with value: 0.7936816198679942.
[I 2024-08-10 16:31:34,444] Trial 3 finished with value: 0.8238418439393249 and parameters: {'max_depth': 25, 'min_inst': 20}. Best is trial 3 with value: 0.8238418439393249.
[I 2024-08-10 16:31:38,252] Trial 4 finished with value: 0.8203758941766321 and parameters: {'max_depth': 50, 'min_inst': 17}. Best is trial 3 with value: 0.8238418439393249.
[I 2024-08-10 16:31:41,012] Trial 5 fin

In [23]:
optimization_time_in_seconds += time.time() # stop the timer

In [24]:
print("Best params:", study.best_params)
print("Best value:", study.best_value)
minutes = int(optimization_time_in_seconds) // 60
seconds = int(optimization_time_in_seconds)
seconds = seconds % 60 + (optimization_time_in_seconds - int(optimization_time_in_seconds))    
print("Optimization time: {} m {} s".format(minutes, seconds))

Best params: {'max_depth': 13, 'min_inst': 9}
Best value: 0.8326488377310459
Optimization time: 4 m 46.18763518333435 s
