# Comparison between a custom KNN implementation and the built-in sklearn algorithm

In [147]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [171]:
# Fraction of the shuffled rows used for training; the remainder is the test set.
# Defined here so the cell runs on a fresh kernel (it was previously undefined).
train_size = 0.8

dataset = pd.read_csv('dataset/breast-cancer-wisconsin.data')
# Replace '?' entries (presumably the file's missing-value marker) with a large
# sentinel so every column can be cast to float, then drop the id column,
# which carries no predictive information.
dataset = dataset.replace('?', -99999).drop(['id'], axis=1)

# Format data as float rows; the last column is the class label.
data = dataset.astype(float).values.tolist()
# Shuffle the rows before splitting.
# NOTE: the shuffle is unseeded, so the split differs between runs.
np.random.shuffle(data)
# Final dataset in matrix form
data = np.array(data)

# Full training and testing data, class label included:
# the first `train_size` share of the rows trains, the rest tests.
split_index = int(train_size * len(data))
train_data = data[:split_index]
test_data = data[split_index:]

In [172]:
# Define function computing K-Nearest Neighbors using L2
# default K = 5
def k_nearest_neighbors(train_data, test_data, k=5):
    """Classify each test row by majority vote among its k nearest training rows.

    Distances are Euclidean (L2) over the feature columns. In both inputs the
    last column is treated as the class label: it is excluded from the
    distance computation, and for ``test_data`` it is otherwise ignored.

    Parameters
    ----------
    train_data : array-like of shape (n_train, n_features + 1)
        Training rows; the last column holds the class label (cast to int).
    test_data : array-like of shape (n_test, n_features + 1)
        Rows to classify, laid out like ``train_data``.
    k : int, default 5
        Number of neighbours that vote. If ``k`` exceeds the number of
        training rows, every training row votes.

    Returns
    -------
    list
        One predicted integer label per test row, in input order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if ``test_data`` is non-empty but not
        2-D, or if ``train_data`` is not a non-empty 2-D array while there
        are rows to classify.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    test = np.asarray(test_data, dtype=float)
    if test.size == 0:
        # Nothing to classify.
        return []
    if test.ndim != 2:
        raise ValueError("test_data must be a 2-D array of rows")

    train = np.asarray(train_data, dtype=float)
    if train.ndim != 2 or train.shape[0] == 0:
        raise ValueError("train_data must be a non-empty 2-D array of rows")

    # Split features and labels once instead of once per (test, train) pair.
    train_features = train[:, :-1]
    train_labels = train[:, -1].astype(int)

    y_predict = []
    for test_features in test[:, :-1]:
        # Distance from this test row to every training row in one call.
        distances = np.linalg.norm(train_features - test_features, axis=1)
        # Order by distance, breaking distance ties by label value. lexsort
        # uses its LAST key as the primary key.
        nearest = np.lexsort((train_labels, distances))[:k]
        # Majority vote; on a count tie the label seen first (i.e. belonging
        # to the closest neighbour) wins.
        label = Counter(train_labels[nearest]).most_common(1)[0][0]
        y_predict.append(label)
    return y_predict

## Testing using the defined function

In [170]:
# Predict a label for every test row with the hand-written KNN (k = 3) and
# collect the predictions in an array.
y_predict = np.array(k_nearest_neighbors(train_data, test_data, k=3))

# Ground-truth labels: the last column of the test rows, as a 1-D copy.
y_test = test_data[:, -1].copy()

# Share of test rows whose predicted label matches the true label.
print("Testing accuracy using defined function: {}".format(accuracy_score(y_test, y_predict)))

Testing accuracy using defined function: 0.9785714285714285


## Testing using sklearn function 

In [169]:
# Build the reference model with the same number of neighbours as the
# hand-written implementation above.
model = neighbors.KNeighborsClassifier(n_neighbors=3)
# Train the model. Labels are passed as a 1-D vector (column -1, not the
# slice -1:), because scikit-learn expects y of shape (n_samples,); a column
# vector makes it emit a DataConversionWarning.
model.fit(train_data[:, :-1], train_data[:, -1])
# Test the model: score() returns the mean accuracy on the given data.
accuracy = model.score(test_data[:, :-1], test_data[:, -1])
# Report the accuracy
print("Testing accuracy using in-built function: {}".format(accuracy))

Testing accuracy using in-built function: 0.9785714285714285
