In [33]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

import joblib

In [34]:
X_train = pd.read_csv("X_train_scaled.csv")
X_test = pd.read_csv("X_test_scaled.csv")

y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

y_train = y_train.squeeze().to_numpy()
y_test = y_test.squeeze().to_numpy()

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5625, 40)
(1407, 40)
(5625,)
(1407,)


In [35]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

knn_acc = accuracy_score(y_test, knn_y_pred)
print(f"Default KNN accuracy: {knn_acc}")

Default KNN accuracy: 0.7547974413646056


In [36]:
parameter_grid = {
    'n_neighbors': [3, 5, 7, 9],            
    'weights': ['uniform', 'distance'],      
    'p': [1, 2],                             
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] 
}

grid_search = GridSearchCV(knn, param_grid=parameter_grid, cv=5, scoring="accuracy", n_jobs=-1)

grid_search.fit(X_train, y_train)

print(f"Best model: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_}")

Best model: {'algorithm': 'kd_tree', 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Best CV Score: 0.7767111111111111


In [37]:
best_model = grid_search.best_estimator_

best_y_pred = best_model.predict(X_test)

best_acc = accuracy_score(y_test, best_y_pred)
print(f"Best KNN model test data accuracy: {best_acc}")

Best KNN model test data accuracy: 0.7555081734186212


In [38]:
print("\nClassification Report (Best Model):")
print(classification_report(y_test, best_y_pred))


Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1033
           1       0.54      0.54      0.54       374

    accuracy                           0.76      1407
   macro avg       0.69      0.69      0.69      1407
weighted avg       0.76      0.76      0.76      1407



In [39]:
print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test, best_y_pred))


Confusion Matrix (Best Model):
[[860 173]
 [171 203]]


In [40]:
joblib.dump(best_model, 'knearest_neigbors_model.pkl')

['knearest_neigbors_model.pkl']