### Decision Tree - Label Classification

In [79]:
import sys
sys.path.append('../../../../')

# Import libraries
from utils.score import fold_cross_validate
from utils.print import print_cross_validation_scores
from constant.columns import FEATURES, LABEL, SIZE, TARGET_Y, MODE, IS_SINGLE_INPUT

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [89]:
# Import data
# Choose the feature columns. In single-input mode the base FEATURES are used
# unchanged; otherwise each feature is expanded into SIZE columns named
# "<feature>_<i>" for i in 0..SIZE-1.
feat = FEATURES
if not IS_SINGLE_INPUT:
    # Tag describing the dataset variant: "<SIZE>_<TARGET_Y>" plus an optional
    # suffix for MODE 1 / MODE 2.
    # NOTE(review): WINDOW_SIZE is only printed here; it does not select which
    # CSV files are read below -- confirm that is intended.
    mode_suffix = {1: "_normalize", 2: "_normalize_std"}.get(MODE, "")
    WINDOW_SIZE = f"{SIZE}_{TARGET_Y}{mode_suffix}"
    print(f'WINDOW_SIZE: {WINDOW_SIZE}')

    # Get features with window size (feature-major order: a_0, a_1, ..., b_0, ...)
    expanded_features = [
        f'{feature}_{i}' for feature in FEATURES for i in range(SIZE)
    ]
    feat = expanded_features

# Read dataset
df = pd.read_csv('./source/dataset_smote.csv')
X_train, y_train = df[feat], df[LABEL]

# Read test dataset
df_test = pd.read_csv('./source/dataset_test.csv')
X_test, y_test = df_test[feat], df_test[LABEL]

WINDOW_SIZE: 10_10


#### Test with Actual Data

In [90]:
from sklearn.metrics import f1_score

# Define the model: a decision tree with default hyperparameters as baseline.
# random_state is fixed so the fitted tree, and therefore the score printed
# below, is the same on every run (scikit-learn shuffles the candidate features
# at each split, so an unseeded tree can differ between runs).
clf_cv = DecisionTreeClassifier(random_state=42)

# Train the model with SMOTE data
clf_cv.fit(X_train, y_train)

# Use the trained model to make predictions on the test set
y_pred = clf_cv.predict(X_test)

# Calculate the macro-averaged F1 score (unweighted mean of per-class F1)
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print the macro-averaged F1 score
print(f"Macro-averaged F1 score: {f1_macro}")

Macro-averaged F1 score: 0.7045857436088823


### Hyperparameter Tuning

In [93]:
# Recorded output of a hyperparameter search run:
# Best parameters: OrderedDict([('criterion', 'entropy'), ('max_depth', 166), ('max_features', None), ('max_leaf_nodes', 687), ('min_impurity_decrease', 0), ('min_samples_leaf', 1), ('min_samples_split', 2)])
# Best score: 0.898899854454981
# NOTE(review): the values used below (criterion='gini', max_depth=129,
# max_leaf_nodes=637) do not match the recorded best parameters above --
# confirm which set is intended and update either the comment or the code.
clf_cv = DecisionTreeClassifier(
    criterion='gini',
    max_depth=129,
    max_features=None,
    max_leaf_nodes=637,
    min_impurity_decrease=0,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,  # fixed seed so the reported score is reproducible
)

# Train the tuned model on the training data
clf_cv.fit(X_train, y_train)

# Use the trained model to make predictions on the test set
y_pred = clf_cv.predict(X_test)

# Calculate the macro-averaged F1 score
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print the macro-averaged F1 score
print(f"Macro-averaged F1 score: {f1_macro}")

Macro-averaged F1 score: 0.695304831497437


In [48]:
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

# Define the hyperparameter space
search_space = {
    'max_depth': Integer(3, 500),  # Depth of the tree
    'min_samples_split': Integer(2, 100),  # Minimum number of samples required to split a node
    'min_samples_leaf': Integer(1, 100),  # Minimum number of samples required at a leaf node
    'criterion': Categorical(['gini', 'entropy']),  # Function to measure the quality of a split
    # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and it is
    # rejected by current scikit-learn releases.
    'max_features': Categorical(['sqrt', 'log2', None]),  # Number of features to consider for best split
    'max_leaf_nodes': Integer(10, 1000, prior='uniform'),  # Maximum number of leaf nodes
    # min_impurity_decrease is a float threshold; searching integers 0..5 made
    # almost every candidate a degenerate tree, so a continuous range is used.
    'min_impurity_decrease': Real(0.0, 0.5, prior='uniform'),  # Minimum impurity decrease required for a split to happen
}

# Create a DecisionTreeClassifier instance (seeded so each candidate is reproducible)
dt = DecisionTreeClassifier(random_state=42)

# Set up the BayesSearchCV
# NOTE(review): X_train appears to be SMOTE-resampled before cross-validation;
# if so, synthetic samples can leak across folds and inflate the CV score
# relative to the held-out test score -- confirm.
bayes_search = BayesSearchCV(
    estimator=dt,
    search_spaces=search_space,
    n_iter=150,  # Number of iterations
    scoring='f1_macro',  # Use F1 macro score
    n_jobs=-1,  # Use all CPU cores
    cv=10,       # 10-fold cross-validation
    random_state=42,  # Seed the optimizer so the search is repeatable
)

# Perform the search (150 iterations x 10 folds = 1500 fits; this is slow)
bayes_search.fit(X_train, y_train)

# Best parameters and score
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

# Print the best parameters and score
print("Best parameters:", best_params)
print("Best score:", best_score)

KeyboardInterrupt: 