In [68]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_absolute_error, roc_auc_score

%run "util copy.ipynb"

# knn

In [48]:
# https://github.com/AmirAli5/Machine-Learning/blob/main/Supervised%20Machine%20Learning/Classification/3.%20k-Nearest%20Neighbors/k-Nearest%20Neighbors.ipynb

# Load Data

In [49]:
X, Y, df_lab = get_data_lab()

In [50]:
df_lab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vitaminD           319 non-null    float64
 1   reactiveProtein    319 non-null    float64
 2   hemoglobin         319 non-null    float64
 3   HDL                319 non-null    float64
 4   gender             319 non-null    int64  
 5   ALP                319 non-null    float64
 6   creatinine         319 non-null    float64
 7   diabetes           319 non-null    int64  
 8   hyperlipiidemia    319 non-null    int64  
 9   AST                319 non-null    float64
 10  LDL                319 non-null    float64
 11  coroaryArtDisease  319 non-null    int64  
 12  gallstoneStatus    319 non-null    int64  
dtypes: float64(8), int64(5)
memory usage: 32.5 KB


# Define attributes

In [53]:
X = df_lab.iloc[:, :-1].to_numpy()
Y = df_lab.iloc[:, -1].to_numpy()

In [None]:
# Feature scaling

sc_X = StandardScaler()
X = sc_X.fit_transform(X) 

In [57]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=0)

In [58]:
print("Xtrain", Xtrain.shape)
print("Ytrain", Ytrain.shape)

Xtrain (223, 12)
Ytrain (223,)


# Build model

In [59]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1-x2) ** 2))

class KNN: 
    def __init__(self, k=3):
        self.k = k
    def fit(self, X, Y):
        self.Xtrain = X
        self.Ytrain = Y
    def predict(self, X):
        Ypred = [self._predict(x) for x in X]
        return np.array(Ypred)
    def _predict(self, x):
        distances = [euclidean_distance(x, xtrain) for xtrain in self.Xtrain]
        k_idx = np.argsort(distances)[: self.k]
        k_n_labels = [self.Ytrain[i] for i in k_idx]
        most_common = Counter(k_n_labels).most_common(1)
        return most_common[0][0]

In [60]:
knn = KNN(k=5)
knn.fit(Xtrain, Ytrain)

In [61]:
Ypredict = knn.predict(Xtest)
Ypredict

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0])

In [62]:
accuracy_score(Ypredict, Ytest)

0.7291666666666666

In [63]:
print(classification_report(Ypredict, Ytest))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74        48
           1       0.75      0.69      0.72        48

    accuracy                           0.73        96
   macro avg       0.73      0.73      0.73        96
weighted avg       0.73      0.73      0.73        96



In [71]:
print(mean_absolute_error(Ypredict, Ytest))

0.2708333333333333


In [72]:
print(roc_auc_score(Ypredict, Ytest))

0.7291666666666666
