In [1]:
# Notebook: Decision Trees

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, classification_report

# Notebook-wide configuration.
# RANDOM_STATE seeds both the CV splitter and the decision tree so reruns are repeatable.
RANDOM_STATE = 42
# Folder that receives every figure saved by the plotting cells below (relative to the
# notebook's working directory).
OUT_DIR = "../capstone-project_visualization"
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUT_DIR, exist_ok=True)


In [3]:
# 1) Load preprocessed data (matching other notebooks) and split features from the target
TARGET = "at_risk"
train_path = "../data/train_processed.csv"
test_path = "../data/test_processed.csv"

# Read train first, then test, into separate frames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Features are every column except TARGET; the label is the TARGET column itself.
X_train, y_train = train_df.drop(columns=[TARGET]), train_df[TARGET]
X_test, y_test = test_df.drop(columns=[TARGET]), test_df[TARGET]

print("Loaded data:", X_train.shape, X_test.shape)


Loaded data: (768, 20) (195, 20)


In [4]:
# 2) Hyperparameter search: exhaustive grid over tree shape and split criterion,
#    scored by F1 with stratified 5-fold cross-validation on the training set.
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

# Shuffled, seeded folds so the search is repeatable across runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
base_dt = DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE)
grid = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,  # use every available core
)

print("Starting GridSearch for Decision Tree...")
grid.fit(X_train, y_train)

# Keep the winning fitted estimator for all later cells.
best_dt = grid.best_estimator_
print("Best params:", grid.best_params_)


Starting GridSearch for Decision Tree...
Best params: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [5]:
# 3) Evaluate on the test set
# Produces: y_pred (hard labels), y_proba (positive-class scores or None) and
# `metrics` (Accuracy, F1-Score, AUC). Later cells reuse y_pred and y_proba.
y_pred = best_dt.predict(X_test)

# Column 1 of predict_proba is the score for the second entry of best_dt.classes_
# (label 1 here, assuming the target is encoded 0/1 -- TODO confirm).
# Only the failures that mean "no usable positive-class column" are tolerated;
# anything else is a real error and should surface instead of being hidden.
try:
    y_proba = best_dt.predict_proba(X_test)[:, 1]
except (AttributeError, IndexError) as err:
    print(f"Warning: positive-class probabilities unavailable ({err!r}); AUC/ROC will be skipped")
    y_proba = None

metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred),
    'AUC': np.nan,  # default when AUC cannot be computed
}
if y_proba is not None:
    try:
        metrics['AUC'] = roc_auc_score(y_test, y_proba)
    except ValueError as err:
        # e.g. AUC is undefined when y_test contains a single class; keep NaN but say why.
        print(f"Warning: AUC could not be computed ({err})")

print(pd.Series(metrics))
print('\nClassification Report:\n', classification_report(y_test, y_pred, digits=4))


Accuracy    0.723077
F1-Score    0.181818
AUC         0.538788
dtype: float64

Classification Report:
               precision    recall  f1-score   support

           0     0.8491    0.8182    0.8333       165
           1     0.1667    0.2000    0.1818        30

    accuracy                         0.7231       195
   macro avg     0.5079    0.5091    0.5076       195
weighted avg     0.7441    0.7231    0.7331       195



In [6]:
# 4) Confusion matrix
# Pin the label order so the matrix is always 2x2 and row/column i really corresponds
# to class_names[i], even if one class is absent from y_test/y_pred.
# Assumes the target is encoded 0 = Passed, 1 = At-Risk, as the classification report shows.
class_ids = [0, 1]
class_names = ['Passed', 'At-Risk']
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells hold integer counts
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Decision Tree: Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()

cm_path = os.path.join(OUT_DIR, 'dt_confusion_matrix.png')
fig.savefig(cm_path)
plt.close(fig)  # the figure is only written to disk, not shown inline
print('Saved', cm_path)


Saved ../capstone-project_visualization\dt_confusion_matrix.png


In [7]:
# 5) ROC curve, drawn only when positive-class scores (y_proba) are available
if y_proba is None:
    print('Model has no predict_proba; skipping ROC')
else:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    roc_path = os.path.join(OUT_DIR, 'dt_roc_curve.png')

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(fpr, tpr, label=f'Decision Tree (AUC={roc_auc:.3f})')
    # Diagonal reference line: the curve of a classifier that guesses at random.
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Decision Tree: ROC Curve')
    ax.legend(loc='lower right')
    fig.tight_layout()

    fig.savefig(roc_path)
    plt.close(fig)
    print('Saved', roc_path)


Saved ../capstone-project_visualization\dt_roc_curve.png


In [8]:
# 6) Feature importances
# Rank features by the fitted tree's importance scores and save a bar chart of the top ones.
if hasattr(best_dt, 'feature_importances_'):
    top_n = 20  # number of bars to draw
    fi = best_dt.feature_importances_
    fi_df = pd.DataFrame({'feature': X_train.columns, 'importance': fi}).sort_values('importance', ascending=False)
    top_fi = fi_df.head(top_n)

    # Size the figure from the rows actually plotted, not from every feature,
    # so a wide dataset does not produce a mostly empty, oversized image.
    fig, ax = plt.subplots(figsize=(8, max(4, int(0.25 * len(top_fi)))))
    # seaborn deprecated `palette` without `hue`; mapping hue to the same variable as y
    # and hiding the legend keeps the per-bar colouring without the warning.
    sns.barplot(
        x='importance',
        y='feature',
        hue='feature',
        data=top_fi,
        palette='viridis',
        legend=False,
        ax=ax,
    )
    ax.set_title('Decision Tree: Top feature importances')
    fig.tight_layout()

    fi_path = os.path.join(OUT_DIR, 'dt_feature_importances.png')
    fig.savefig(fi_path)
    plt.close(fig)
    print('Saved', fi_path)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='importance', y='feature', data=fi_df.head(20), palette='viridis')


Saved ../capstone-project_visualization\dt_feature_importances.png


In [9]:
# 7) Tree plot (top levels): render the fitted tree, truncated by max_depth, and save it
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    best_dt,
    feature_names=list(X_train.columns),
    filled=True,
    max_depth=3,  # limits how deep the drawing goes; the model itself is unchanged
    fontsize=8,
    ax=ax,
)
ax.set_title('Decision Tree (top 3 levels)')
fig.tight_layout()

plot_path = os.path.join(OUT_DIR, 'dt_tree_plot.png')
fig.savefig(plot_path)
plt.close(fig)
print('Saved', plot_path)


Saved ../capstone-project_visualization\dt_tree_plot.png


In [10]:
# End of Decision Tree notebook: print a completion message as the last cell's output.
print('Decision Tree notebook finished.')


Decision Tree notebook finished.
