# Import libraries

In [16]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt

# Generate synthetic dataset

In [17]:
# Build a reproducible synthetic binary-classification problem:
# 1000 samples, 20 features, 2 classes; the fixed seed makes every rerun identical.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    random_state=42,
)

In [18]:
# Hold out 30% of the samples for evaluation and train on the remaining 70%;
# the fixed seed keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Train the model

In [19]:
# Configure a shallow, regularised Decision Tree Classifier:
#   criterion='gini'     - split quality is measured with Gini impurity
#   max_depth=5          - at most five levels of splits
#   min_samples_leaf=10  - every leaf must keep at least 10 training samples
#   ccp_alpha=0.01       - strength of minimal cost-complexity pruning
#   random_state=42      - scikit-learn permutes the candidate features at every split,
#                          so equally good splits can be picked differently between runs;
#                          fixing the seed makes the fitted tree (and every metric derived
#                          from it) reproducible, like the seeded dataset and split.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=10,
    ccp_alpha=0.01,
    random_state=42,
)

In [None]:
# Fit the model on the training data only; the test split is not used here
# and is reserved for the evaluation cells.
model.fit(X_train, y_train)

In [None]:
# Predict the hard class labels for the held-out test set
y_pred = model.predict(X_test)
# Preview the first five predicted labels
y_pred[:5]

In [None]:
# Predict the class probabilities for the test set. predict_proba returns one column per
# class, so we keep only column 1: the estimated probability of the positive class (label 1).
# These scores (rather than the hard labels) are what the ROC curve is built from.
y_proba = model.predict_proba(X_test)[:, 1]
# Preview the first five positive-class probabilities
y_proba[:5]

# Evaluation Metrics

In [None]:
# Compute the confusion matrix and extract true negatives, false positives,
# false negatives, and true positives.
# labels=[0, 1] pins the matrix to 2x2: without it, if only one class appears across
# y_test and y_pred the matrix collapses to 1x1 and the four-way unpacking below
# raises ValueError.
# ravel() flattens [[tn, fp], [fn, tp]] row by row, which gives the order unpacked here.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
print(f'TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}')

In [None]:
# Score the hard predictions against the true test labels, reporting each metric
# right after it is computed.

# Accuracy: share of all test samples that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Precision (positive predictive value): of the predicted positives, how many are real
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall (sensitivity): of the real positives, how many were found
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

# F1 score: harmonic mean of precision and recall
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

In [None]:
# List the distinct predicted probabilities (np.unique returns them sorted ascending).
# Each distinct score is a candidate decision threshold for the ROC curve built from
# y_proba in the cells that follow.
import numpy as np
unique_scores = np.unique(y_proba)
unique_scores

In [None]:
# Compute the ROC curve from the positive-class probabilities.
# roc_curve returns three aligned arrays: fpr[i] and tpr[i] are the false and true
# positive rates obtained when predicting positive for scores >= thresholds[i].
fpr, tpr, thresholds = roc_curve(y_test, y_proba)  # False Positive Rate and True Positive Rate
print(f"The thresholds are: {thresholds}")
print(f"The false positive rate is: {fpr}")
print(f"The true positive rate is: {tpr}")

In [None]:
# AUC (Area Under the Curve): area under the (fpr, tpr) points of the ROC curve.
# 0.5 corresponds to the chance diagonal; 1.0 is a perfect ranking of the test samples.
roc_auc = auc(fpr, tpr)  # Calculate the AUC
roc_auc

In [None]:
# Draw the ROC curve together with its operating points and the chance diagonal.

# Axis labels and title
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# The curve itself (its legend entry carries the AUC) and the dashed reference diagonal
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')

# Mark every operating point in red and tag it with the threshold that produces it;
# fpr, tpr and thresholds are aligned arrays, so they can be walked in lockstep.
plt.scatter(fpr, tpr, color='red')
for x_pos, y_pos, cutoff in zip(fpr, tpr, thresholds):
    plt.annotate(f'Threshold: {cutoff:.2f}', (x_pos, y_pos), fontsize=8)

plt.legend()  # Show the legend with the AUC entry
plt.show()  # Render the figure