In [0]:
# K Nearest Neighbors classifies a random (query) point by looking at the K data points that are nearest to it
# K Nearest Neighbors uses the voting principle: the class that is most common among those K neighbors becomes the predicted class
# If K  = 2, that means check the 2 nearest neighbor from the random points
# If K  = 3, that means check the 3 nearest neighbor from the random points
# To calculate the distance between the random point and each data point, use the EUCLIDEAN DISTANCE
# Euclidean distance = sqrt(sum for i = 1 to n of (Qi - Pi)^2), where Q is the coordinate of a data point and P is the coordinate of the random point
# Basically it is sqrt((Q1 - P1)^2 + (Q2 - P2)^2 + ... + (Qn - Pn)^2)

In [0]:
import numpy as np
from math import sqrt
import pandas as pd
import random

import warnings

from collections import Counter

In [0]:
def k_nearest_neighbors(data, predict, k = 3):
  """Classify `predict` by majority vote among its k nearest training points.

  Args:
    data: mapping of class label -> list of feature vectors.
    predict: feature vector of the point to classify.
    k: number of nearest neighbors that get a vote (must be >= 1).

  Returns:
    A tuple (vote_result, confidence): the winning class label and the
    fraction of the votes actually cast that went to that label.

  Raises:
    ValueError: if k < 1, or if `data` holds no feature vectors at all.

  Warns:
    UserWarning when the number of groups in `data` is >= k.
  """
  # A non-positive k would make the [:k] slice below select the wrong
  # neighbors (negative k drops from the tail) instead of failing loudly.
  if k < 1:
    raise ValueError('k must be at least 1')

  if len(data) >= k:
    warnings.warn('K is set to a value less than total voting groups!')

  # Convert the query point once instead of once per training point.
  target = np.array(predict)
  distances = [
    [np.linalg.norm(np.array(features) - target), group]
    for group in data
    for features in data[group]
  ]

  if not distances:
    raise ValueError('data contains no training points')

  # Sorting the [distance, group] pairs orders by distance first; the k
  # closest pairs contribute their group label as a vote.
  votes = [group for _, group in sorted(distances)[:k]]
  vote_result, vote_count = Counter(votes).most_common(1)[0]

  # Divide by the number of votes cast, which is smaller than k when there
  # are fewer than k training points.
  confidence = vote_count / len(votes)

  return vote_result, confidence

In [0]:
# Load the raw dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer = 'breast-cancer-wisconsin.data.txt')

In [0]:
# Swap every '?' cell for a numeric sentinel (presumably '?' is the dataset's
# missing-value marker -- confirm against the data description).
# NOTE(review): the classifier uses Euclidean distance, so a value like -99999
# places these rows very far from all others -- confirm this is intended.
df.replace(to_replace = '?', value = -99999, inplace = True)

In [0]:
# Remove the 'id' column in place so it is not treated as a feature later on.
df.drop(columns = ['id'], inplace = True)

In [37]:
# Preview the first five rows of the cleaned frame.
df.head(n = 5)

Unnamed: 0,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [38]:
# Cast every column to float and flatten the frame into a plain list of rows
# (each row is a list of floats with the class label as its last element).
full_data = df.astype(float).to_numpy().tolist()

# Peek at the first five rows.
full_data[0:5]

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [5.0, 4.0, 4.0, 5.0, 7.0, 10.0, 3.0, 2.0, 1.0, 2.0],
 [3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 2.0],
 [6.0, 8.0, 8.0, 1.0, 3.0, 4.0, 3.0, 7.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0]]

In [39]:
# Shuffle the rows in place before splitting into train/test.
# A dedicated, seeded generator makes the split (and therefore the reported
# accuracy) reproducible on Restart & Run All without touching the global
# `random` state. Change the seed to draw a different split.
random.Random(42).shuffle(full_data)

full_data[:5]

[[3.0, 1.0, 1.0, 1.0, 2.0, 5.0, 5.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [6.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0]]

In [0]:
# Fraction of the rows held out for testing.
test_size = 0.4
# One empty bucket of feature vectors per class label (2 and 4), built
# separately for the training and the test split.
train_set = {label: [] for label in (2, 4)}
test_set = {label: [] for label in (2, 4)}

In [0]:
# Split on an explicit index rather than negative slices: when
# int(test_size * len(full_data)) is 0, full_data[:-0] is empty and
# full_data[-0:] is the whole list, which would swap train and test.
split_index = len(full_data) - int(test_size * len(full_data))
train_data = full_data[:split_index]  # leading rows -> training
test_data = full_data[split_index:]   # trailing rows -> testing

In [0]:
# Distribute each row into its class bucket: the last element of a row is the
# class label (used as the dict key), everything before it is the feature
# vector. Training rows are processed first, then test rows.
for rows, buckets in ((train_data, train_set), (test_data, test_set)):
  for row in rows:
    buckets[row[-1]].append(row[:-1])

In [0]:
# Tallies for the evaluation: correctly classified points and points seen.
correct = total = 0

In [50]:
# Reset the tallies here so that re-running this cell does not accumulate
# onto counts left over from a previous run.
correct = 0
total = 0

# Classify every held-out point against the training set and compare the
# predicted label with the bucket (true label) the point came from.
for group in test_set:
  for data in test_set[group]:
    vote, confidence = k_nearest_neighbors(train_set, data, k = 5)

    if group == vote:
      correct += 1
    else:
      # Misclassified: show how strong the (wrong) vote was.
      print(confidence)

    total += 1

0.6
0.8


In [51]:
# Share of test points whose predicted label matched the true label.
accuracy = correct / total
print('Accuracy: ', accuracy)

Accuracy:  0.992831541218638
