# k-Nearest Neighbors Example and Extensions

In [189]:
%matplotlib inline 

from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from scipy.spatial import distance
from operator import itemgetter
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt


In [122]:
# Ronald Fisher's Iris Data Set
# Only the `datasets` module is imported at the top of the file, so the
# loader has to be reached through it (a bare `load_iris` is undefined).
iris = datasets.load_iris()
iris_X = iris.data    # feature matrix, one row per sample
iris_Y = iris.target  # class label for each row of iris_X

In [248]:
# Shuffle the sample indices, then hold out the last 10 shuffled samples
# as the test set and train on everything else.
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]

iris_data_train, iris_target_train = iris_X[train_idx], iris_Y[train_idx]
iris_data_test, iris_target_test = iris_X[test_idx], iris_Y[test_idx]

## scikit-learn's Built-in k-NN Classifier

In [249]:
# Basic configuration of the sklearn k-NN classifier: 5 neighbors, uniform
# (unweighted) votes, brute-force search, Minkowski metric with p=2.
# Only the `neighbors` module is imported at the top of the file, so the
# classifier has to be reached through it (a bare `KNeighborsClassifier`
# is undefined).
knn = neighbors.KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='brute',
    p=2,
    metric='minkowski',
)
knn.fit(iris_data_train, iris_target_train)

# Predicted labels first, true labels second, for a side-by-side comparison.
print(knn.predict(iris_data_test))
print(iris_target_test)

[2 0 1 0 1 1 2 1 0 2]
[2 0 1 0 1 1 2 1 0 1]


# Basic Implementation

In [252]:
class knearestbasic():
    """
    Basic implementation of the k-Nearest Neighbor classification algorithm
    as described in https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

    Distances are Euclidean and every neighbor casts one unweighted vote.
    """

    def __init__(self, training_data, training_targets):
        """
        Initializes the classifier with the given training data and
        classes.

        training_data is a non-empty sequence of feature vectors and
        training_targets holds one class label per training vector. The
        expected number of features is taken from the first training vector.

        Raises ValueError if the training data is empty or if the number of
        targets differs from the number of training vectors.
        """
        # len() rather than truthiness: numpy arrays have no unambiguous
        # boolean value.
        if len(training_data) == 0:
            raise ValueError("training data must not be empty")
        if len(training_data) != len(training_targets):
            raise ValueError(
                "training data and training targets differ in length"
            )

        self.training_data = training_data
        self.training_targets = training_targets

        self._classes = set(training_targets)
        self._num_classes = len(self._classes)

        self._features = len(training_data[0])

    def classify(self, cases, k=5):
        """
        Classify a set of cases using training data.

        Returns a list with one predicted class label per case, in the
        order the cases were given.

        Raises ValueError if k is smaller than 1 or if a case does not have
        the same number of features as the training data.
        """
        # k == 0 would leave nothing to vote on, and a negative k would
        # silently be treated as a slice offset from the end.
        if k < 1:
            raise ValueError("k must be a positive integer")

        return [
            self.__count_votes(self.__get_neighbors(case, k))
            for case in cases
        ]

    def __get_neighbors(self, case, k):
        """
        Finds the k training samples closest to a single case.

        Returns a list of (class label, distance) pairs sorted by
        ascending Euclidean distance; it is shorter than k when fewer
        training samples are available.
        """
        if len(case) != self._features:
            raise ValueError("invalid case")

        distances = [
            (target, distance.euclidean(case, sample))
            for sample, target in zip(self.training_data,
                                      self.training_targets)
        ]

        # list.sort is stable, so equidistant samples keep training order.
        distances.sort(key=itemgetter(1))
        return distances[:k]

    def __count_votes(self, neighbors):
        """
        Counts votes for classes and handles ties.

        neighbors is a list of (class label, distance) pairs ordered from
        nearest to farthest. Returns the label with the most votes.

        Raises ValueError if neighbors is empty.
        """
        remaining = len(neighbors)
        while remaining > 0:
            votes = {}
            for label, _ in neighbors[:remaining]:
                votes[label] = votes.get(label, 0) + 1

            ranked = sorted(votes.values(), reverse=True)
            if len(ranked) == 1 or ranked[0] != ranked[1]:
                return max(votes, key=votes.get)

            # Break ties by counting fewer neighbors: drop the farthest
            # one and recount. A single neighbor can never tie, so this
            # always terminates.
            remaining -= 1

        raise ValueError("cannot count votes without neighbors")
        
        
        

In [253]:
# Classify the held-out samples with the hand-written classifier using the
# 10 nearest neighbors, then print the true labels for comparison.
knn = knearestbasic(iris_data_train, iris_target_train)
print(knn.classify(iris_data_test, k=10))
print(iris_target_test)

[2, 0, 1, 0, 1, 1, 2, 1, 0, 2]
[2 0 1 0 1 1 2 1 0 1]
