In [1]:
from DataLoader import getPreprocessedData

In [2]:
# Unpack the four values returned by the project's getPreprocessedData helper.
# Later cells index trainX/testX as [:, :, 0] and trainY/testY as [:, 0].
trainX, trainY, testX, testY = getPreprocessedData()

In [5]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score,matthews_corrcoef
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.metrics import f1_score,make_scorer
from imblearn.metrics import geometric_mean_score

In [6]:
# Shared seed, passed as random_state to every DecisionTreeClassifier in the cells below.
random_seed = 42

# Before hyperparameter tuning

In [7]:
# Baseline tree: entropy criterion, all other hyperparameters left at their defaults.
dt = DecisionTreeClassifier(criterion="entropy", random_state=random_seed)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 77.46694657527955
Test f1 :  77.4432118330692
Test MMC :  54.93415932046256


In [8]:
# Untuned tree with criterion='log_loss'; splitter and max_depth are NOT set in this cell
# (they keep their defaults), so this is a baseline rather than a tuned configuration.
# scikit-learn documents 'log_loss' and 'entropy' as the same Shannon information gain criterion.
dt = DecisionTreeClassifier(criterion="log_loss", random_state=random_seed)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 77.46694657527955
Test f1 :  77.4432118330692
Test MMC :  54.93415932046256


In [9]:
# Baseline tree: gini criterion, all other hyperparameters left at their defaults.
dt = DecisionTreeClassifier(criterion="gini", random_state=random_seed)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 77.8761492410653
Test f1 :  78.19979188345474
Test MMC :  55.801024614628545


# After hyperparameter tuning

In [10]:
# Tuned configuration: entropy criterion, random splitter, max_depth fixed at 52.
dt = DecisionTreeClassifier(
    criterion="entropy",
    splitter="random",
    max_depth=52,
    random_state=random_seed,
)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 79.38647091318931
Test f1 :  79.60302951162184
Test MMC :  58.79873327883816


In [11]:
# Tuned configuration: log_loss criterion, random splitter, max_depth fixed at 52.
dt = DecisionTreeClassifier(
    criterion="log_loss",
    splitter="random",
    max_depth=52,
    random_state=random_seed,
)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 79.38647091318931
Test f1 :  79.60302951162184
Test MMC :  58.79873327883816


In [12]:
# Tuned configuration: gini criterion, random splitter, max_depth fixed at 37.
dt = DecisionTreeClassifier(
    criterion="gini",
    splitter="random",
    max_depth=37,
    random_state=random_seed,
)
dt.fit(trainX[:, :, 0], trainY[:, 0])
y_pred = dt.predict(testX[:, :, 0])

# Score the test predictions; every metric is scaled by 100 before printing.
# Note: the "MMC" label prints the Matthews correlation coefficient (matthews_corrcoef).
y_true = testY[:, 0]
for label, metric in (
    ("Test Gmean :", geometric_mean_score),
    ("Test f1 : ", f1_score),
    ("Test MMC : ", matthews_corrcoef),
):
    print(label, metric(y_true, y_pred) * 100)

Test Gmean : 79.39215334910892
Test f1 :  79.47437582128778
Test MMC :  58.788121432532826


# Bayesian hyperparameter optimization (HPO) of decision trees

In [1]:
from HPO import objective_DT
import optuna
from optuna.visualization import plot_optimization_history, plot_slice, plot_contour

In [2]:
# Build an in-memory Optuna study that maximizes whatever objective_DT returns,
# then run it for 150 trials.
# NOTE(review): no sampler or seed is passed to create_study, so the sequence of
# trials is presumably not reproducible between runs — confirm, and consider
# passing a seeded sampler if repeatable results are needed.
study = optuna.create_study(direction="maximize")
study.optimize(objective_DT, n_trials=150)

# Keep a handle on the best-scoring trial for the reporting cell that follows.
trial = study.best_trial

[I 2024-04-30 14:15:19,928] A new study created in memory with name: no-name-661750e6-ba13-4678-a0ec-b2f17c4bfd55
[I 2024-04-30 14:15:20,100] Trial 0 finished with value: 0.787673021676678 and parameters: {'max_depth': 316.1329494032017, 'criterion': 'gini', 'splitter': 'random'}. Best is trial 0 with value: 0.787673021676678.
[I 2024-04-30 14:15:20,198] Trial 1 finished with value: 0.7098614506927465 and parameters: {'max_depth': 3.55429762211608, 'criterion': 'gini', 'splitter': 'random'}. Best is trial 0 with value: 0.787673021676678.
[I 2024-04-30 14:15:20,544] Trial 2 finished with value: 0.7753373908441387 and parameters: {'max_depth': 395.10684120621323, 'criterion': 'entropy', 'splitter': 'best'}. Best is trial 0 with value: 0.787673021676678.
[I 2024-04-30 14:15:20,741] Trial 3 finished with value: 0.7386666666666667 and parameters: {'max_depth': 8.169407710234893, 'criterion': 'entropy', 'splitter': 'best'}. Best is trial 0 with value: 0.787673021676678.
[I 2024-04-30 14:15:2

In [3]:
# Report the best trial's objective value scaled by 100, followed by its parameter dict.
best_f1_pct = trial.value * 100
print(f"F1 score: {best_f1_pct}")
print(f"Best hyperparameters: {trial.params}")

F1 score: 80.12519561815337
Best hyperparameters: {'max_depth': 43.56982947220013, 'criterion': 'log_loss', 'splitter': 'random'}
