In [48]:
'''
Created on Jan 15, 2019

Parts of this code are based on the following tutorial:
Tutorial To Implement k-Nearest Neighbors in Python From Scratch
(https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/)

@author: Diego Sanz Villafruela
'''
import csv
import random
import math
import operator

In [49]:
"""
Gets samples from CSV file.
"""                
def getSamples(filename, samples=None, numericColumns=4):
    """Load samples from a comma-separated text file.

    Each non-blank line becomes one sample (a list). The first
    ``numericColumns`` comma-separated fields are converted to ``float``;
    every remaining field (e.g. the class label) is kept as a string.

    Args:
        filename: Path of the file to read.
        samples: Ignored. Kept only so existing calls that pass a second
            argument keep working; a new list is always returned.
        numericColumns: Number of leading fields parsed as floats
            (defaults to 4, the value previously hard-coded).

    Returns:
        A list of samples, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If one of the leading fields is not a valid float.
    """
    loaded = []
    with open(filename) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            line = line.rstrip("\n\r")
            if not line.strip():
                # Blank lines (for example a trailing empty line) carry no
                # sample; without this guard float('') would raise.
                continue
            sample = []
            for index, value in enumerate(line.split(",")):
                if index < numericColumns:  # numeric values
                    sample.append(float(value))
                else:  # remaining fields, e.g. the type of iris
                    sample.append(value)
            loaded.append(sample)
    return loaded

In [56]:
"""
Calculates the Euclidean distance between 2 samples.
"""
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two samples.

    Only the first ``length`` positions of each sample are compared, so any
    trailing elements (such as a class label) are left out of the distance.

    Args:
        instance1: First sample, indexable by position.
        instance2: Second sample, indexable by position.
        length: Number of leading positions to include.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squaredSum = 0
    for i in range(length):
        delta = instance1[i] - instance2[i]
        squaredSum += delta ** 2
    return math.sqrt(squaredSum)

"""
Gets the k closest neighbors from the trainingSet. 
"""
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training samples closest to ``testInstance``.

    Distances are computed with ``euclideanDistance`` over every position of
    ``testInstance`` except the last one. Training samples are ordered by
    increasing distance; samples at equal distance keep their original
    relative order because the sort is stable.

    Args:
        trainingSet: Sequence of training samples.
        testInstance: The sample whose neighbors are wanted.
        k: Maximum number of neighbors to return.

    Returns:
        A list with at most ``k`` samples taken from ``trainingSet``.
    """
    featureCount = len(testInstance) - 1
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, featureCount))
        for candidate in trainingSet
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [candidate for candidate, _ in ranked[:k]]
   
"""
Classifies the sample as the type that is most common among its neighbors.
"""
def getResponse(neighbors):
    """Return the label that occurs most often among ``neighbors``.

    The label of a neighbor is its last element. When several labels share
    the highest count, the one that was encountered first wins.

    Args:
        neighbors: Non-empty iterable of samples.

    Returns:
        The winning label.

    Raises:
        ValueError: If ``neighbors`` is empty (raised by ``max``).
    """
    tally = {}
    for row in neighbors:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # max() keeps the first maximal entry, and the dict preserves the order
    # in which labels were first seen.
    winner, _ = max(tally.items(), key=lambda item: item[1])
    return winner

"""
Gets the global accuracy taking into account all the predictions.
"""
def getAccuracy(testSet, predictions):
    """Return the percentage of test samples that were predicted correctly.

    A prediction counts as correct when it equals the last element of the
    test sample at the same position.

    Args:
        testSet: Sequence of samples whose last element is the true label.
        predictions: Sequence of predicted labels, aligned with ``testSet``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
    """
    hits = sum(
        1
        for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    return (hits / float(len(testSet))) * 100.0

"""
Calculates the accuracy of a kNearestNeighbors classifier.
"""
def kNearestNeighbors(trainingData, testData, k = 1):
    """Classify every test sample with k-nearest-neighbors and score the result.

    For each sample in ``testData`` the ``k`` closest training samples are
    looked up with ``getNeighbors`` and their majority label (via
    ``getResponse``) becomes the prediction.

    Args:
        trainingData: Samples used as the neighbor pool.
        testData: Samples to classify; their last element is the true label.
        k: Number of neighbors consulted per prediction.

    Returns:
        The accuracy reported by ``getAccuracy`` for all predictions.
    """
    predictedLabels = [
        getResponse(getNeighbors(trainingData, row, k))
        for row in testData
    ]
    return getAccuracy(testData, predictedLabels)
    
    

In [69]:
"""
Evaluation of the predictionPerformance using crossValidation.

samples: independent data set
"""
def crossValidation(samples, foldSize, predictionMethod, neighborNumber = 1):
    """Average the accuracy of ``predictionMethod`` over consecutive folds.

    ``samples`` is cut into ``len(samples) // foldSize`` consecutive folds of
    ``foldSize`` items. Each fold is used once as test data while everything
    else (including any leftover samples past the last full fold) is used as
    training data.

    Args:
        samples: Sliceable sequence of samples that supports ``+``.
        foldSize: Number of samples in each test fold.
        predictionMethod: Callable invoked as
            ``predictionMethod(trainingData, testData, neighborNumber)`` that
            returns the accuracy for one fold.
        neighborNumber: Passed through unchanged to ``predictionMethod``.

    Returns:
        The sum of the per-fold accuracies divided by the number of folds.

    Raises:
        ZeroDivisionError: If ``foldSize`` is 0 or no complete fold fits
            into ``samples``.
    """
    foldCount = len(samples) // foldSize
    accuracyTotal = 0

    for foldIndex in range(foldCount):
        start = foldIndex * foldSize
        stop = start + foldSize

        heldOut = samples[start:stop]
        remainder = samples[:start] + samples[stop:]

        # accuracy of the prediction for this held-out fold
        accuracyTotal += predictionMethod(remainder, heldOut, neighborNumber)

    return accuracyTotal / foldCount

In [75]:
# Load the data set once; the cells below read the `samples` global.
samples = getSamples(r"iris.data.txt")
# With a fold size of 1 every test fold holds a single sample.
foldSize = 1
# Number of neighbors consulted for each prediction.
neighborNumber = 1
# Last expression of the cell: its value is what the notebook displays.
crossValidation(samples, foldSize, kNearestNeighbors, neighborNumber)

96.0

In [73]:
"""
Finds which cross-validation fold size gives the best accuracy.
"""
# Try every fold size from 1 up to len(samples) - 2 with the default single
# neighbor and remember the best one. The comparison is strict, so on a tie
# the smallest fold size is kept.
maxAccuracy = maxFold = -1
for fold in range(1, len(samples) - 1):
    accuracy = crossValidation(samples, fold, kNearestNeighbors)
    if accuracy > maxAccuracy:
        maxFold, maxAccuracy = fold, accuracy

print("MaxFold: " + str(maxFold) + " = " + str(maxAccuracy))

MaxFold: 39 = 96.58119658119658


In [76]:
"""
Finds which number of neighbors (k) gives the best accuracy.
"""
# Keep the fold size at 1 and try every neighbor count from 1 up to
# len(samples) - 2. The comparison is strict, so on a tie the smallest
# neighbor count is kept.
maxAccuracy = maxNeighbor = -1
foldSize = 1

for neighbor in range(1, len(samples) - 1):
    accuracy = crossValidation(samples, foldSize, kNearestNeighbors, neighbor)
    if accuracy > maxAccuracy:
        maxNeighbor, maxAccuracy = neighbor, accuracy

print("NeighborNumber: " + str(maxNeighbor) + " = " + str(maxAccuracy))

NeighborNumber: 19 = 98.0
