In [98]:
# -*- coding: utf-8 -*-
"""
K-nearest neighbors algorithm
Jason Brownlee, http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
Implemented by Emma Anderson
Written for the Iris dataset
ReWritten by Maxwel Gregg for the datatraining.txt
"""
import csv
import random
import math
import operator

def euclideanDistance(item1, item2, attributes):
    """Return the Euclidean distance between two records.

    Only positions ``0 .. attributes - 1`` of ``item1`` and ``item2`` are
    compared, so anything stored after them (for example a class label) is
    ignored.
    """
    squared_total = 0
    for idx in range(attributes):
        gap = item1[idx] - item2[idx]
        squared_total = squared_total + gap ** 2
    return math.sqrt(squared_total)

def getNeighbors(trainingSet, test, k):
    """Return the ``k`` training records closest to ``test``.

    The last element of ``test`` is treated as its label, so only the first
    ``len(test) - 1`` positions take part in the distance calculation.
    Records at equal distance keep their order from ``trainingSet`` because
    the sort is stable.

    If ``k`` is larger than the number of training records, every record is
    returned (nearest first) instead of raising ``IndexError``. A ``k`` of
    zero or less yields an empty list.
    """
    feature_count = len(test) - 1
    ranked = sorted(
        (
            (record, euclideanDistance(test, record, feature_count))
            for record in trainingSet
        ),
        # sort on the distance, not the data point
        key=operator.itemgetter(1),
    )
    # max() keeps a negative k from slicing from the end of the list.
    return [record for record, _ in ranked[:max(k, 0)]]

def getResponse(neighbors):
    """Return the class label that occurs most often among ``neighbors``.

    The label of each neighbor is its last element. When several labels share
    the top count, the stable sort keeps them in the order the tally dict
    yields them (first-seen order on Python 3.7+), so the earliest one wins.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner = ranked[0]
    return winner[0]

def getAccuracy(testSet, predictions):
    """Return the percentage of test records whose label was predicted.

    The true label is the last element of each record in ``testSet``;
    ``predictions[i]`` is compared against the label of ``testSet[i]``.
    """
    hits = sum(
        1
        for idx, record in enumerate(testSet)
        if record[-1] == predictions[idx]
    )
    total = float(len(testSet))
    return (hits / total) * 100.0

def loadDataset(filename, split, trainingSet, testSet):
    """Read a CSV file and randomly divide its rows into two lists.

    The first row is treated as a header and skipped. For every other
    non-blank row, the first four columns and then the third of the remaining
    columns are dropped, and the first three surviving values are converted
    to ``float``. Each row is appended to ``trainingSet`` with probability
    ``split`` and to ``testSet`` otherwise. Both lists are modified in place
    and nothing is returned.

    Every data row after the header is processed, including the last one,
    and blank rows are skipped rather than raising ``IndexError``.
    """
    # newline='' is what the csv module documentation asks for on open().
    with open(filename, 'r', newline='') as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # discard the header row
        for row in rows:
            if not row:
                # csv.reader yields [] for a blank line; nothing to load.
                continue
            # this needs changing for other datasets
            # NOTE(review): presumably this leaves light, CO2 and the
            # occupancy label for datatraining.txt; confirm column order.
            for _ in range(4):
                del row[0]
            del row[2]
            for y in range(0, 3):
                row[y] = float(row[y])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)

def main():
    """Run the k-NN experiment on 'datatraining.txt' and print the accuracy.

    The file is split at random into training and test rows, every test row
    is classified by a majority vote of its nearest training rows, and the
    share of correct predictions is printed as a percentage.
    """
    split = 0.95
    k = 15

    trainingSet, testSet = [], []
    loadDataset('datatraining.txt', split, trainingSet, testSet)

    predictions = [
        getResponse(getNeighbors(trainingSet, sample, k))
        for sample in testSet
    ]
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + str(accuracy) + '%')

# Run the experiment as soon as this cell/script is executed.
main()


#testSet = [[1, 1, 1, 'a'], [2, 2, 2, 'b'], [3, 3, 3, 'a']]
#predictions= ['a', 'a', 'a']
#print(getAccuracy(testSet, predictions))

#trainingSet = []
#testSet = []
#loadDataset('iris.txt', 0.9, trainingSet, testSet)
#print(len(trainingSet))
#print(len(testSet))

#data1 = [2, 2, 2, 'a']
#data2 =[4, 4, 4, 'b']
#distance = euclideanDistance(data1, data2, 3)
#print(distance)

#trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
#test = [5, 5, 5]
#k=1
#print(getNeighbors(trainSet, test, k))

#neighbors = [[1, 1, 1, 'a'], [2, 2, 2, 'a'], [3, 3, 3, 'b']]
#response = getResponse(neighbors)
#print(response)



Accuracy: 99.03614457831326%


To personalize this function for the 'datatraining' dataset I had to change a couple of things.  First, I changed the main function from loading the 'iris' dataset to loading 'datatraining'.  I then deleted the first two columns from the dataset because they didn't contribute to the lab and were not defined by integers.  I then changed the range from 0-8 to 0-6 so that my list index was in range.  The rest of the manipulations I made are described below, as part of my testing.

1. After testing various combinations of the different attributes, I found that carbon dioxide level and light in the room are the only factors that are very important in this dataset for the program to correctly predict whether the room is occupied.  To find this, I systematically eliminated columns while recording the accuracy rates.  To gather the accuracy rates, I ran the function ten times, collected the accuracy rates, and averaged those ten numbers.  While I was eliminating columns, I noticed that the accuracy rate did not change much after eliminating them... that is, until I got to light.  When I got rid of light, the average went from around 98% to 70%.  I knew at that point that the light in the room was very important in predicting whether there is somebody in the room.  I then kept light in the data and started working from the back.  I took out humidity ratio and the average stayed around 98%.  Then, when I tried to take out carbon dioxide, the accuracy went down to near 70% again.  So I found after these tests that CO2 and light levels are the only two things the function needs in order to predict, with around a 98% accuracy rate, whether somebody is inside the room.  This makes sense to me in a real-world sense because when someone enters a room they will normally turn on a light, open the blinds, or do something of the sort to increase the light in the room so that they can see better.  The carbon dioxide factor seemed obvious because humans exhale carbon dioxide, so if someone was in a room the CO2 levels should spike, making it easy for the function to predict very accurately.  So, in conclusion, carbon dioxide and light are the two most important factors in determining whether someone is in a room.

2. I decided to test the different K values after the first part because, if the only things that are important are CO2 and light, then why would it be important to use any of the other factors, since they didn't add any accuracy?  So I found the best value of K in the same sort of way I found the important factors.  I started by testing k = 1 ten times.  I then averaged these ten values to get a value that represents the accuracy of k = 1.  I then repeated this same process up to k = 15.  K=1 turned out to be 98.83801% accuracy, k=2: 98.12066, k=3: 98.73376, k=4: 98.40909, k=5: 98.49624, k=6: 99.46666, k=7: 97.75561, k=8: 99.03846, k=9: 98.84526, k=10: 99.21052, k=11: 98.49624, k=12: 99.71671, k=13: 98.05352, k=14: 99.03147, k=15: 99.25558.  After finding these values, it turns out that the function predicts most accurately when K=12.  But even though this is true, I don't think it is enough evidence to make a claim about this, because the results were so varied.  What I mean by this is that values of k like 6, 8, 10, 12, and 14 were all in the 99% accuracy range, but these values are not really correlated at all, nor do they differ enough from the rest of the data to be significant.