In [1]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# function to calculate distance

def euclidean_dist(x1, x2):
  distance = np.sqrt(np.sum((x1 - x2)**2))
  return distance

In [3]:
# knn classification class from scratch

class Knn:
    def __init__(self, k=3):
        self.k = k

    # fit the dataset with the values and the target
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    # predict method to take in the new test file to output the target values
    def predict(self, X):
        predicted_labels = [self._dist_pred(x) for x in X]
        return np.array(predicted_labels)

    def _dist_pred(self, x):
        # distance between x and x_values of the training set
        distances = [euclidean_dist(x, x_train) for x_train in self.X_train]
        # sort by distance and return the indices of the first k neighbors
        k_idx = np.argsort(distances)[:self.k]
        # extract the labels of the k nearest neighbor training samples
        k_neighbors_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label
        most_common = Counter(k_neighbors_labels).most_common(1)
        return most_common[0][0]


In [4]:
from google.colab import files
uploaded = files.upload()

Saving heart_disease.csv to heart_disease (8).csv


In [5]:
df = pd.read_csv('heart_disease.csv')
print(df.shape)
df.head()

(303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [6]:
# split the data into train and test

train, test = train_test_split(df, test_size=0.2,
                               stratify=df['target'], random_state=42)

# X values
X_train = train[train.columns[0:13]]
X_test = test[test.columns[0:13]]

# y target values
y_train = train[['target']]
y_test = test[['target']]

In [7]:
from sklearn.preprocessing import StandardScaler

pre = StandardScaler()
pre.fit(X_train)
X_train = pre.transform(X_train)
X_test = pre.transform(X_test)

In [10]:
# changing to numpy array from pandas dataframe

y_train = y_train['target'].values
y_test = y_test['target'].values

In [11]:
# fit the knn model built from scratch to the datasets

# instantiate the class
knn_scratch = Knn(k = 3) # k value 3 by default

# fit the model to the datasets
knn_scratch.fit(X_train, y_train)

# make prediction for test data
predictions = knn_scratch.predict(X_test)

predictions

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1])

In [12]:
# checking the accuracy of the model build from scratch

accuracy = np.sum(predictions == y_test) / len(y_test)
print(f'Accuracy score for the model built from scratch is: {accuracy}')

Accuracy score for the model built from scratch is: 0.8032786885245902


## Now, we will import KNeighborsClassifiers from Sklearn neighbors and compute the accuracy score of the prediction based on what was learned during the train part

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3) # using 3 again

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [14]:

y_pred = knn.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1])

In [15]:
# accuracy score with the actual library

acc = knn.score(X_test, y_test)
acc

0.8032786885245902

## The accuracy score looks the same. Now, we will compute time complexity for each models.

In [22]:
import time

# fit the knn model built from scratch to the datasets
start_time = time.time()
# instantiate the class
knn_scratch = Knn(k = 7) # k value 3 by default, but we are using 7 this time

# fit the model to the datasets
knn_scratch.fit(X_train, y_train)

# make prediction for test data
predictions = knn_scratch.predict(X_test)

# checking the accuracy of the model build from scratch

accuracy = np.sum(predictions == y_test) / len(y_test)

end_time = time.time()

print(f'Accuracy score for the model built from scratch is: {accuracy}')
print(f'It took {start_time - end_time} seconds')

Accuracy score for the model built from scratch is: 0.819672131147541
It took -0.1412825584411621 seconds


In [23]:
start_time = time.time()

knn = KNeighborsClassifier(n_neighbors=7) # using 7 nearest neighbor 

knn.fit(X_train, y_train)

knn.score(X_test, y_test)

end_time = time.time()

print(f'Accuracy score for sklearn kneighborsclassifier model: {accuracy}')
print(f'It took {start_time - end_time} seconds')

Accuracy score for sklearn kneighborsclassifier model: 0.819672131147541
It took -0.009079933166503906 seconds
