## KNN Algorithm from Scratch in Python Using the Ball Dataset

In [2]:
import pandas as pd
import numpy as np
import os
import operator
import random
import math
from scipy.spatial import distance

In [3]:
# Climb three directory levels from the current working directory; the
# CSV is read from the `Datasets` folder found at that level.
path = os.getcwd()
for i in range(3):
    path = os.path.dirname(path)

csv_location = path + '/Datasets/Ball_Dataset.csv'
data = pd.read_csv(csv_location, names=['Weight', 'Surface', 'Label'])

# Encode the textual surface values as numbers so they can take part in
# distance arithmetic: "Rough" -> 0, "Smooth" -> 1.
data = data.replace({"Rough": 0, "Smooth": 1})

In [4]:
# Display the loaded table; Surface has been encoded as 0 (Rough) / 1 (Smooth).
data

Unnamed: 0,Weight,Surface,Label
0,30,0,Tennis
1,31,0,Tennis
2,70,1,Cricket
3,77,1,Cricket
4,46,0,Tennis
5,78,1,Cricket
6,47,0,Tennis
7,79,1,Cricket
8,48,0,Tennis
9,49,0,Tennis


In [9]:
def train_test_split(dataset, train_size=100):
    """Split a ball dataset into training rows, test features and test labels.

    The split is positional (no shuffling): the first ``train_size`` rows
    become the training set and every remaining row becomes test data.

    Args:
        dataset: pandas DataFrame with ``Weight``, ``Surface`` and ``Label``
            columns.
        train_size: number of leading rows to use for training. Defaults to
            100, the split point that was previously hard-coded.

    Returns:
        A tuple ``(trainingSet, test_data, test_classes)`` where
        ``trainingSet`` is a list of ``[Weight, Surface, Label]`` rows,
        ``test_data`` is a list of ``[Weight, Surface]`` rows and
        ``test_classes`` is the list of labels matching ``test_data``.
    """
    training_rows = dataset.iloc[:train_size]
    testing_rows = dataset.iloc[train_size:]

    # Select columns by name and convert in bulk instead of walking the
    # frame row by row with iterrows().
    trainingSet = training_rows[['Weight', 'Surface', 'Label']].values.tolist()
    test_data = testing_rows[['Weight', 'Surface']].values.tolist()
    test_classes = testing_rows['Label'].tolist()
    return trainingSet, test_data, test_classes


In [10]:
# Split the dataset into labelled training rows, test feature rows, and the held-out test labels.
training_data, testing_data, test_classes =  train_test_split(data)

In [11]:
# Inspect the training rows; each one is [Weight, Surface, Label].
training_data

[[30, 0, 'Tennis'],
 [31, 0, 'Tennis'],
 [70, 1, 'Cricket'],
 [77, 1, 'Cricket'],
 [46, 0, 'Tennis'],
 [78, 1, 'Cricket'],
 [47, 0, 'Tennis'],
 [79, 1, 'Cricket'],
 [48, 0, 'Tennis'],
 [49, 0, 'Tennis'],
 [80, 1, 'Cricket'],
 [81, 1, 'Cricket'],
 [50, 0, 'Tennis'],
 [51, 0, 'Tennis'],
 [82, 1, 'Cricket'],
 [52, 0, 'Tennis'],
 [83, 1, 'Cricket'],
 [53, 0, 'Tennis'],
 [84, 1, 'Cricket'],
 [54, 0, 'Tennis'],
 [85, 1, 'Cricket'],
 [52, 0, 'Tennis'],
 [83, 1, 'Cricket'],
 [53, 0, 'Tennis'],
 [84, 1, 'Cricket'],
 [54, 0, 'Tennis'],
 [85, 1, 'Cricket'],
 [102, 1, 'Cricket'],
 [37, 0, 'Tennis'],
 [38, 0, 'Tennis'],
 [103, 1, 'Cricket'],
 [29, 0, 'Tennis'],
 [110, 1, 'Cricket'],
 [104, 1, 'Cricket'],
 [61, 0, 'Tennis'],
 [91, 1, 'Cricket'],
 [92, 1, 'Cricket'],
 [62, 0, 'Tennis'],
 [99, 1, 'Cricket'],
 [33, 0, 'Tennis'],
 [72, 1, 'Cricket'],
 [37, 0, 'Tennis'],
 [73, 1, 'Cricket'],
 [38, 0, 'Tennis'],
 [39, 0, 'Tennis'],
 [74, 1, 'Cricket'],
 [64, 0, 'Tennis'],
 [94, 1, 'Cricket'],
 [65, 0, 'Te

In [12]:
# Inspect the test feature rows; each one is [Weight, Surface] with the label left out.
testing_data

[[81, 1],
 [35, 0],
 [101, 1],
 [36, 0],
 [102, 1],
 [37, 0],
 [38, 0],
 [103, 1],
 [29, 0],
 [27, 0],
 [66, 0],
 [63, 0],
 [86, 1],
 [82, 1],
 [25, 0],
 [32, 0],
 [33, 0],
 [71, 1],
 [34, 0],
 [35, 0],
 [36, 0],
 [72, 1],
 [37, 0],
 [73, 1],
 [87, 1],
 [56, 0],
 [34, 0],
 [35, 0],
 [36, 0]]

In [13]:
# Inspect the true labels of the test rows, in the same order as testing_data.
test_classes

['Cricket',
 'Tennis',
 'Cricket',
 'Tennis',
 'Cricket',
 'Tennis',
 'Tennis',
 'Cricket',
 'Tennis',
 'Tennis',
 'Tennis',
 'Tennis',
 'Cricket',
 'Cricket',
 'Tennis',
 'Tennis',
 'Tennis',
 'Cricket',
 'Tennis',
 'Tennis',
 'Tennis',
 'Cricket',
 'Tennis',
 'Cricket',
 'Cricket',
 'Tennis',
 'Tennis',
 'Tennis',
 'Tennis']

In [15]:
def euclidean_distance(data1, data2, length):
    """Return the Euclidean distance over the first ``length`` components.

    Only indices ``0 .. length - 1`` of ``data1`` and ``data2`` are read, so
    either sequence may carry extra trailing items (such as a class label).
    """
    squared_total = sum(
        (data1[axis] - data2[axis]) ** 2 for axis in range(length)
    )
    return squared_total ** 0.5

In [16]:
def getKNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distance is measured with ``euclidean_distance`` over the first
    ``len(testInstance)`` components of each training row, so anything a
    row stores beyond that (e.g. its label) is ignored. Rows are returned
    nearest first; rows at equal distance keep their training-set order.
    """
    feature_count = len(testInstance)
    scored = [
        (candidate, euclidean_distance(testInstance, candidate, feature_count))
        for candidate in trainingSet
    ]
    scored.sort(key=lambda pair: pair[1])
    # Index explicitly so that asking for more neighbours than there are
    # training rows still raises IndexError.
    return [scored[rank][0] for rank in range(k)]

In [17]:
def fit(train_data):
    """Return ``train_data`` unchanged.

    No processing is performed here: the training rows handed in are the
    value handed back.
    """
    return train_data

In [18]:
def predict(neighbors):
    """Return the majority class among ``neighbors``.

    The last element of each neighbour row is taken as its class label.
    When several labels share the top vote count, the one that appeared
    first in ``neighbors`` wins.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1

    # A stable descending sort keeps first-seen order among tied labels.
    ranked = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)
    return ranked[0][0]

In [19]:
def getAccuracy(testSet, predictions):
    """Return the percentage of predictions that match the true labels.

    Args:
        testSet: sequence of true class labels.
        predictions: sequence of predicted labels, aligned with ``testSet``
            by position.

    Returns:
        Accuracy as a float in the range 0.0 to 100.0.

    Raises:
        ZeroDivisionError: if ``testSet`` is empty.
        IndexError: if ``predictions`` is shorter than ``testSet``.
    """
    correct = 0
    for position, actual in enumerate(testSet):
        # Compare by value. An identity check (`is`) only succeeds when both
        # sides are the very same object, so equal labels stored in
        # different string objects would be counted as misses.
        if actual == predictions[position]:
            correct += 1
    return correct / float(len(testSet)) * 100.0

In [20]:
# "Training" the model: fit() returns the training rows as-is, so trainSet
# is simply the labelled examples that neighbour searches run against.
trainSet = fit(training_data)

In [26]:
# Classify every test row by majority vote among its k nearest training
# rows, then score the predictions against the held-out labels.
k = 3
predictions = [
    predict(getKNeighbors(trainSet, sample, k)) for sample in testing_data
]

accuracy = getAccuracy(test_classes, predictions)

In [28]:
# Report the accuracy (a percentage) computed over the test rows.
print("\nAccuracy = ", accuracy, "%")


Accuracy =  100.0 %


In [29]:
# Classify a single hand-written sample: weight 95, surface 1 (Smooth).
testInstance = [95,1]
# Reuses the k chosen for the evaluation run above.
neighbors = getKNeighbors(trainSet, testInstance, k)
print("Nearest Neighbors = ")
for neighbor in neighbors:
	print(neighbor)

# Majority vote over the neighbours' labels gives the predicted class.
prediction = predict(neighbors)
print("\nPrediction for test input : ",prediction)

Nearest Neighbors = 
[95, 1, 'Cricket']
[94, 1, 'Cricket']
[96, 1, 'Cricket']

Prediction for test input :  Cricket
