### Decision Tree - Label Classification

In [1]:
import sys
sys.path.append('../../../../')

# Import libraries
from utils.score import fold_cross_validate
from utils.print import print_cross_validation_scores
from constant.columns import FEATURES, LABEL, SIZE, TARGET_Y, MODE

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Import data
# Dataset directory name is "<SIZE>_<TARGET_Y>", with a "_normalize" suffix
# appended when MODE == 1.
WINDOW_SIZE = f"{SIZE}_{TARGET_Y}_normalize" if MODE == 1 else f"{SIZE}_{TARGET_Y}"
print(f'WINDOW_SIZE: {WINDOW_SIZE}')

# Expand each base feature into one column name per window position
expanded_features = [
    f'{feature}_{step}'
    for feature in FEATURES
    for step in range(SIZE)
]

# Read the dataset, then separate the feature columns from the label column
df = pd.read_csv(f'./source/{WINDOW_SIZE}/dataset.csv')
X = df[expanded_features]
y = df[LABEL]

# 50/50 split; shuffle=False means rows are not shuffled before splitting
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
)

WINDOW_SIZE: 16_8_normalize


#### Cross Validation

In [65]:
# Define the model.
# A fixed random_state removes the estimator's own run-to-run variation
# (scikit-learn permutes candidate features at every split), so the scores
# printed below can be reproduced after a kernel restart.
clf_cv = DecisionTreeClassifier(random_state=42)

# NOTE(review): the baseline is cross-validated on the X_test/y_test half,
# while the tuning cell fits on X_train/y_train — confirm this is intended.
scores = fold_cross_validate(clf_cv, X_test, y_test)

# Print scores
print_cross_validation_scores(scores)

MACRO:
Precision: 0.6742 (0.1484)
Recall: 0.7179 (0.1323)
F1 score: 0.6783 (0.1437)
---------------------------------
MICRO:
Precision: 0.9057 (0.0478)
Recall: 0.9057 (0.0478)
F1 score: 0.9057 (0.0478)
---------------------------------


#### Hyperparameter Tuning

In [4]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.tree import DecisionTreeClassifier

# Define the hyperparameter space
search_space = {
    'max_depth': Integer(3, 100),  # Depth of the tree
    'min_samples_split': Integer(2, 100),  # Minimum number of samples required to split a node
    'min_samples_leaf': Integer(1, 100),  # Minimum number of samples required at a leaf node
    'criterion': Categorical(['gini', 'entropy']),  # Function to measure the quality of a split
    # 'auto' is intentionally not listed: it was deprecated in scikit-learn 1.1
    # and removed in 1.3 (passing it now raises an invalid-parameter error),
    # and for classifiers it was only an alias of 'sqrt'.
    'max_features': Categorical(['sqrt', 'log2', None]),  # Number of features to consider for best split
}

# Create a DecisionTreeClassifier instance (seeded so every candidate fit is repeatable)
dt = DecisionTreeClassifier(random_state=42)

# Set up the BayesSearchCV.
# Cost note: n_iter=100 parameter settings x cv=10 folds = 1000 model fits,
# plus the final refit on the full training data, so this cell is slow.
bayes_search = BayesSearchCV(
    estimator=dt,
    search_spaces=search_space,
    n_iter=100,  # Number of iterations
    scoring='f1_macro',  # Use F1 macro score
    n_jobs=-1,  # Use all CPU cores
    cv=10,       # 10-fold cross-validation
    random_state=42,  # Seed the optimizer so the sampled settings are reproducible
)

# Perform the search
bayes_search.fit(X_train, y_train)

# Best parameters and score
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

# Print the best parameters and score
print("Best parameters:", best_params)
print("Best score:", best_score)

KeyboardInterrupt: 