In [13]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, make_scorer
import time


In [2]:
X_train = pd.read_csv('Dataset/X_train_100.csv')
y_train = pd.read_csv('Dataset/y_train_100.csv').iloc[:, 0]
X_test = pd.read_csv('Dataset/X_test.csv')
y_test = pd.read_csv('Dataset/y_test.csv').iloc[:, 0]

In [28]:
param_grid= {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 30, 40],
    'max_features': [None, 0.8, 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

In [29]:
cart = DecisionTreeClassifier(random_state=42)

In [30]:
grid_search = GridSearchCV(
    estimator=cart,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

In [31]:
start_time = time.time()

In [32]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


KeyboardInterrupt: 

In [21]:
search_time = time.time() - start_time

In [22]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score (F1-weighted): {grid_search.best_score_:.4f}")

Best parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV score (F1-weighted): 0.9994


In [23]:
best_cart = grid_search.best_estimator_

In [24]:
y_pred_best = best_cart.predict(X_test)

In [25]:
test_accuracy = accuracy_score(y_test, y_pred_best)

In [26]:
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_best, average='weighted')

In [27]:
print(f"\nTEST SET PERFORMANCE WITH BEST PARAMETERS:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Tree depth: {best_cart.get_depth()}")
print(f"Number of leaves: {best_cart.get_n_leaves()}")

print(f"\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_best))


TEST SET PERFORMANCE WITH BEST PARAMETERS:
Accuracy: 0.9988
Precision: 0.9988
Recall: 0.9988
F1-Score: 0.9988
Tree depth: 21
Number of leaves: 225

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13778
           1       1.00      1.00      1.00      3497
           2       0.96      0.98      0.97       298
           3       0.72      0.81      0.76        16
           4       1.00      1.00      1.00     20203

    accuracy                           1.00     37792
   macro avg       0.94      0.96      0.95     37792
weighted avg       1.00      1.00      1.00     37792

