# KNN

In [3]:
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Display settings for the notebook: never truncate the column list and
# allow up to 500 characters per printed row.
for option_name, option_value in {
    "display.max_columns": None,
    "display.width": 500,
}.items():
    pd.set_option(option_name, option_value)

In [4]:
# Read the diabetes dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="datasets/diabetes.csv")

In [5]:
# Split the frame into the target column and the feature columns.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Standardise every feature column before it is handed to the
# distance-based k-NN model below.
# NOTE(review): the scaler is fitted on all rows, so any cross-validation run
# on X afterwards reuses statistics computed from the held-out folds.
# Wrapping scaler + model in a Pipeline would avoid that -- TODO confirm
# whether this matters for the analysis.
X_scaled = StandardScaler().fit_transform(X)

# fit_transform returns a bare array; wrap it back into a DataFrame, keeping
# both the column names and the original row index so X stays aligned with y.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

### Model

In [7]:
# Baseline k-NN classifier with default hyperparameters, fitted on every row.
knn_model = KNeighborsClassifier()
knn_model.fit(X, y)

# Draw a single row (fixed seed, so the same row is picked on every run).
random_user = X.sample(n=1, random_state=45)

# Predicted class for that one row; shown as the cell output.
knn_model.predict(X=random_user)

array([1])

### Model Evaluation

In [9]:
# Predicted labels (y_pred) for the confusion-matrix-style report.
# These are predictions on the same X the model was fitted on (in-sample).
y_pred = knn_model.predict(X=X)

In [10]:
# Predicted probabilities (y_prob) for the AUC computation: take the column
# at index 1 of predict_proba's output, again on the training rows.
class_probabilities = knn_model.predict_proba(X)
y_prob = class_probabilities[:, 1]

In [11]:
# Per-class precision / recall / F1 of y_pred against the true labels y.
report_text = classification_report(y, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       500
           1       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768



In [12]:
# Area under the ROC curve, computed from the predicted scores in y_prob.
roc_auc_score(y_true=y, y_score=y_prob)

0.9017686567164179

In [13]:
# 5-fold cross-validation of the baseline model, recording accuracy, F1 and
# ROC AUC for every fold.
cv_results = cross_validate(
    estimator=knn_model,
    X=X,
    y=y,
    cv=5,
    scoring=["accuracy", "f1", "roc_auc"],
)

In [14]:
# Mean accuracy across the cross-validation folds.
accuracy_per_fold = cv_results["test_accuracy"]
accuracy_per_fold.mean()

0.733112638994992

In [15]:
# Mean F1 score across the cross-validation folds.
f1_per_fold = cv_results["test_f1"]
f1_per_fold.mean()

0.5905780011534191

In [16]:
# Mean ROC AUC across the cross-validation folds.
roc_auc_per_fold = cv_results["test_roc_auc"]
roc_auc_per_fold.mean()

0.7805279524807827

In [17]:
# Inspect the hyperparameters the baseline model is currently using.
knn_model.get_params(deep=True)

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

### Hyperparameter Optimization

In [19]:
# Fresh, unfitted estimator used as the template for the search.
# (The former mid-cell `knn_model.get_params()` call was removed: its result
# was neither displayed nor stored.)
knn_model = KNeighborsClassifier()

# Candidate neighbourhood sizes: k = 2, 3, ..., 49 (the range end is exclusive).
knn_params = {"n_neighbors": range(2, 50)}

# Exhaustive search over the grid with 5-fold cross-validation, using all
# available cores and printing a one-line progress summary. No `scoring` is
# passed, so candidates are ranked by the estimator's own default score.
knn_gs_best = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [20]:
# Parameter setting that gave the best cross-validated result in the grid search.
knn_gs_best.best_params_

{'n_neighbors': 17}

### Final Model

In [23]:
# Build a brand-new estimator from the tuned parameters rather than calling
# set_params() on knn_model: that would mutate the base model in place and
# make knn_final merely another name for the same object, so re-running
# earlier cells would silently use the tuned settings.
knn_final = KNeighborsClassifier(**knn_gs_best.best_params_).fit(X, y)

# 5-fold cross-validation of the tuned model with the same three metrics as
# the baseline, so the two sets of results are directly comparable.
cv_results = cross_validate(
    knn_final,
    X,
    y,
    cv=5,
    scoring=["accuracy", "f1", "roc_auc"],
)

In [24]:
# Mean accuracy of the tuned model across the cross-validation folds.
accuracy_per_fold = cv_results["test_accuracy"]
accuracy_per_fold.mean()

0.7669892199303965

In [25]:
# Mean F1 score of the tuned model across the cross-validation folds.
f1_per_fold = cv_results["test_f1"]
f1_per_fold.mean()

0.6170909049720137

In [26]:
# Mean ROC AUC of the tuned model across the cross-validation folds.
roc_auc_per_fold = cv_results["test_roc_auc"]
roc_auc_per_fold.mean()

0.8127938504542278

In [27]:
# Draw one row to demonstrate a prediction with the final model. The seed is
# fixed (same value as the earlier sample cell) so a Restart & Run All always
# picks the same row; the previously saved output may differ after re-running.
random_user = X.sample(1, random_state=45)

In [28]:
# Predicted class for the sampled row, using the tuned model.
knn_final.predict(X=random_user)

array([0])