In [1]:
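# Notebook: K-Nearest Neighbors (KNN) classifier for the binary `at_risk` target.
# Tunes a scaled KNN pipeline with grid search, evaluates it on the test split,
# and saves a confusion matrix, ROC curve and learning curve as PNG files.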

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, classification_report

# --- Notebook-wide configuration ---
# Seed handed to the cross-validation splitter so fold assignment is repeatable.
RANDOM_STATE = 42

# Folder that receives every figure this notebook saves. It is created here if
# missing and left untouched if it already exists.
OUT_DIR = "../capstone-project_visualization"
os.makedirs(name=OUT_DIR, exist_ok=True)


In [3]:
# 1) Load preprocessed data
# Each CSV must contain the label column named by TARGET next to the features.
TARGET = 'at_risk'
train_path = "../data/train_processed.csv"
test_path = "../data/test_processed.csv"

# Read the training split first, then the test split.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Separate the feature matrix (X) from the label vector (y) for each split.
X_train, y_train = train_df.drop(columns=[TARGET]), train_df[TARGET]
X_test, y_test = test_df.drop(columns=[TARGET]), test_df[TARGET]

print('Loaded data:', X_train.shape, X_test.shape)


Loaded data: (768, 20) (195, 20)


In [4]:
# 2) Pipeline and GridSearch
# The scaler lives inside the pipeline, so during cross-validation it is
# re-fitted on each set of training folds together with the classifier.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Candidate hyper-parameters, addressed as <step name>__<parameter name>.
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 9, 11],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],  # Minkowski power: 1 = Manhattan distance, 2 = Euclidean
)

# Stratified, shuffled 5-fold split; seeded so the folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

print('Starting GridSearch for KNN...')
grid.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline re-fitted on the whole training set.
best_knn = grid.best_estimator_
print('Best params:', grid.best_params_)


Starting GridSearch for KNN...
Best params: {'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'distance'}


In [5]:
# 3) Evaluate on the test set
y_pred = best_knn.predict(X_test)

# Probability of the positive class (second column of predict_proba). It feeds
# the AUC below and the ROC cell later on. The capability is checked explicitly
# instead of wrapping the call in a catch-all `except`, so a genuine failure
# inside predict_proba surfaces as an error rather than silently becoming
# "no probabilities available".
if hasattr(best_knn, 'predict_proba'):
    y_proba = best_knn.predict_proba(X_test)[:, 1]
else:
    y_proba = None

# AUC starts as NaN and is only overwritten when it can actually be computed.
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred),
    'AUC': np.nan,
}
if y_proba is not None:
    try:
        metrics['AUC'] = roc_auc_score(y_test, y_proba)
    except ValueError as err:
        # roc_auc_score can raise ValueError when AUC is undefined, e.g. when
        # y_test contains a single class. Keep the NaN placeholder but report
        # the reason instead of hiding it.
        print('AUC could not be computed:', err)

print(pd.Series(metrics))
print('\nClassification Report:\n', classification_report(y_test, y_pred, digits=4))


Accuracy    0.774359
F1-Score    0.388889
AUC         0.741616
dtype: float64

Classification Report:
               precision    recall  f1-score   support

           0     0.8954    0.8303    0.8616       165
           1     0.3333    0.4667    0.3889        30

    accuracy                         0.7744       195
   macro avg     0.6144    0.6485    0.6253       195
weighted avg     0.8089    0.7744    0.7889       195



In [6]:
# 4) Confusion matrix
# Pin the class order so rows and columns always line up with the hard-coded
# tick labels (0 -> 'Passed', 1 -> 'At-Risk'). Without `labels`, sklearn derives
# the classes from the values present in y_test / y_pred, so data containing
# only one class would produce a 1x1 matrix under two tick labels.
class_names = ['Passed', 'At-Risk']
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('KNN: Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.tight_layout()

cm_path = os.path.join(OUT_DIR, 'knn_confusion_matrix.png')
fig.savefig(cm_path)
# The figure is written to disk only; closing it releases the memory.
plt.close(fig)
print('Saved', cm_path)


Saved ../capstone-project_visualization\knn_confusion_matrix.png


In [7]:
# 5) ROC Curve
# The curve needs positive-class scores; without them there is nothing to draw.
if y_proba is None:
    print('Model has no predict_proba; skipping ROC')
else:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)  # area under the curve computed just above
    roc_path = os.path.join(OUT_DIR, 'knn_roc_curve.png')

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(fpr, tpr, label=f'KNN (AUC={roc_auc:.3f})')
    # Dashed diagonal as a visual reference line.
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('KNN: ROC Curve')
    ax.legend(loc='lower right')

    fig.tight_layout()
    fig.savefig(roc_path)
    plt.close(fig)
    print('Saved', roc_path)


Saved ../capstone-project_visualization\knn_roc_curve.png


In [8]:
# 6) Learning curve
# Five training-set fractions from 10% to 100% are requested; learning_curve
# returns the absolute sample counts it used, which become the x-axis values.
train_sizes, train_scores, test_scores = learning_curve(
    best_knn,
    X_train,
    y_train,
    cv=cv,
    scoring='f1',
    train_sizes=np.linspace(0.1, 1.0, 5),
    n_jobs=-1,
)

# Average over the CV folds (axis 1) to get one score per training size.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(train_sizes, train_mean, 'o-', color='r', label='Train F1')
ax.plot(train_sizes, test_mean, 'o-', color='g', label='CV F1')
ax.set_xlabel('Training examples')
ax.set_ylabel('F1 Score')
ax.set_title('KNN: Learning Curve')
ax.legend(loc='best')
ax.grid(True)
fig.tight_layout()

lc_path = os.path.join(OUT_DIR, 'knn_learning_curve.png')
fig.savefig(lc_path)
plt.close(fig)
print('Saved', lc_path)


Saved ../capstone-project_visualization\knn_learning_curve.png


In [9]:
# Final cell: completion marker, printed once every cell above has run.
print("KNN notebook finished.")


KNN notebook finished.
