<h1> 2. Classification Model - Decision Tree </h1>

In [61]:
import sys
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from pathlib import Path

### Get the absolute path of the current file

In [62]:
# Resolve the notebook's file name against the current working directory.
# This only points at the real notebook when the kernel is started in the
# notebook's own directory.
current_file_path = Path('decision_tree.ipynb').resolve()

# Directory that holds the notebook
project_dir = current_file_path.parents[0]

# Put that directory at the front of the import search path so that
# project-local modules can be imported by the following cells
sys.path[:0] = [str(project_dir)]

### Step 1: Import Utility methods

In [63]:
from utility import (load_random_generator)

### Step 2: Preprocessing: Split the data into features X and target variable y

In [64]:
# Load the dataset through the project utility; the returned object is
# indexed by the "X" and "Y" keys below
data = load_random_generator()

X = data["X"]  # features
y = data["Y"]  # target variable

### Step 3: Split the data into training and testing sets

In [65]:

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Step 4: Declare different hyperparameters

In [66]:
# Candidate values explored by the grid search below. The order matters:
# on ties the search keeps the first combination it encounters.

# Tree depth limit (None = unlimited)
max_depths: list = [
    None,
    5,
    10,
]
# Minimum number of samples required to split an internal node
min_samples_splits: list = [
    2,
    5,
    10,
]
# Minimum number of samples required at a leaf
min_samples_leafs: list = [
    1,
    2,
    5,
]
# Cap on the number of leaves (None = unlimited)
max_leaf_nodes_list: list = [
    None,
    10,
    20,
]
# Number of features considered per split (None = all features)
max_features_list: list = [
    None,
    'sqrt',
    'log2',
]

### Step 5: Define a function to train and evaluate decision tree classifiers with different hyperparameters

In [67]:
def evaluate_decision_tree(max_depth=None, min_samples_split=None, min_samples_leaf=None, max_leaf_nodes=None, max_features=None, X_train= None, y_train=None, random_state=42):
    """Return the mean 5-fold cross-validation score of a decision tree.

    Parameters
    ----------
    max_depth, max_leaf_nodes, max_features
        Forwarded unchanged to ``DecisionTreeClassifier``.
    min_samples_split : int, float or None
        Forwarded to ``DecisionTreeClassifier``; ``None`` is mapped to the
        scikit-learn default of 2.
    min_samples_leaf : int, float or None
        Forwarded to ``DecisionTreeClassifier``; ``None`` is mapped to the
        scikit-learn default of 1.
    X_train, y_train
        Training features and labels used for cross-validation. Required.
    random_state : int or None
        Seed for the tree, so repeated evaluations of the same combination
        give the same score. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score`` (the
        estimator's default scorer, i.e. accuracy for a classifier).

    Raises
    ------
    ValueError
        If ``X_train`` or ``y_train`` is not provided.
    """
    if X_train is None or y_train is None:
        raise ValueError("X_train and y_train must be provided")

    # DecisionTreeClassifier rejects None for these two parameters, so the
    # signature's None defaults are translated to the library defaults.
    if min_samples_split is None:
        min_samples_split = 2
    if min_samples_leaf is None:
        min_samples_leaf = 1

    clf = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes,
                                 max_features=max_features, random_state=random_state)

    # No explicit clf.fit() here: cross_val_score clones and fits the
    # estimator on each fold itself, so a prior fit would be wasted work.
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    return cv_scores.mean()

### Step 6: Train decision tree classifiers with different hyperparameters

In [68]:

# Exhaustive search over every combination of the candidate lists.
# The comparison is strict, so on ties the first combination seen wins.
best_accuracy = -1
best_hyperparameters = {}

for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        for min_samples_leaf in min_samples_leafs:
            for max_leaf_nodes in max_leaf_nodes_list:
                for max_features in max_features_list:
                    # Bundle the current combination once; it is both the
                    # argument set and the record kept if it turns out best
                    candidate = dict(
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        max_leaf_nodes=max_leaf_nodes,
                        max_features=max_features,
                    )
                    # Mean cross-validation accuracy on the training set
                    accuracy = evaluate_decision_tree(X_train=X_train, y_train=y_train, **candidate)

                    if accuracy > best_accuracy:
                        best_accuracy, best_hyperparameters = accuracy, candidate
print(best_hyperparameters)

{'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': None}


### Step 7: Train the best model on the full training set


In [69]:
# Refit the winning configuration on the whole training split.
# The tree is seeded (same seed as train_test_split) because scikit-learn
# decision trees break ties between equally good splits at random; without
# a seed the fitted model, and every metric reported below, can change
# from run to run.
best_clf = DecisionTreeClassifier(**best_hyperparameters, random_state=42)
best_clf = best_clf.fit(X_train, y_train)

### Step 8: Make Predictions with test set

In [70]:
# Hard class predictions for the held-out test split
y_pred = best_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# Accuracy on the data the model was fitted on. This is distinct from
# best_accuracy, which is the mean cross-validation score from the search.
train_accuracy = accuracy_score(y_train, best_clf.predict(X_train))
print("Best Model Hyperparameters:", best_hyperparameters)
print("Test Accuracy:", test_accuracy)
print("Train Accuracy:", train_accuracy)
print("Best Mean CV Accuracy:", best_accuracy)
print(f"Test Predictions: {y_pred}")

Best Model Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': None}
Test Accuracy: 0.95
Train Accuracy: 1.0
Test Predictions: [False False  True  True False False False  True False False False  True
  True  True  True  True  True  True False  True  True False False  True
  True False  True False  True  True  True  True  True False False False
 False False False  True  True False False  True False  True  True  True
  True  True  True False False  True False  True  True  True False  True]


### Step 9: Evaluate the best model on the testing set

In [71]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score


In [72]:
# --- Threshold metrics: computed from the hard class predictions ---
# Confusion Matrix
conf_matrix_test = confusion_matrix(y_test, y_pred)
# Precision
precision_test = precision_score(y_test, y_pred)
# Recall
recall_test = recall_score(y_test, y_pred)
# F1 Score
f1_test = f1_score(y_test, y_pred)

# --- Ranking metrics: need continuous scores, not hard labels ---
# Feeding 0/1 predictions to these functions yields a single operating
# point, so the "AUC" degenerates to (TPR + TNR) / 2. Use the predicted
# probability of the positive class (second column of predict_proba).
y_score = best_clf.predict_proba(X_test)[:, 1]
# Precision-Recall Curve
precision_test_curve, recall_test_curve, _ = precision_recall_curve(y_test, y_score)
# ROC Curve and AUC
fpr_test, tpr_test, _ = roc_curve(y_test, y_score)
auc_test = roc_auc_score(y_test, y_score)
# Display metrics
print("Confusion Matrix - Test:\n", conf_matrix_test)
print("Precision - Test:", precision_test)
print("Recall - Test:", recall_test)
print("F1 Score - Test:", f1_test)
print("AUC - Test:", auc_test)


Confusion Matrix - Test:
 [[25  2]
 [ 1 32]]
Precision - Test: 0.9411764705882353
Recall - Test: 0.9696969696969697
F1 Score - Test: 0.955223880597015
AUC - Test: 0.9478114478114478


### Overall, based on these metrics, the decision tree classifier shows strong performance on the test set, with high precision, recall, F1 score, and AUC. Collectively, these metrics indicate that the model is effective at its classification task.