In [1]:
# Import Lib
import numpy as np
import math

#### Similarity
In order to make predictions we need to calculate the similarity between any two given data instances. This is needed so that we can locate the k most similar data instances in the training dataset for a given member of the test dataset and in turn make a prediction.

In [17]:
def distance(instance1, instance2):
    # just in case, if the instances are lists or tuples:
    instance1 = np.array(instance1) 
    instance2 = np.array(instance2)
    diff = instance1 - instance2
    return np.linalg.norm(diff)

In [18]:
trainSet = [(1, 1, 1)]
testSet = [(4, 4, 4)]
distance(testSet, trainSet)

5.196152422706632

#### Neighbors
Now that we have a similarity measure, we can use it collect the k most similar instances for a given unseen instance.

In [19]:
def get_neighbors(training_set, 
                  labels, 
                  test_instance, 
                  k, 
                  distance=distance):
    """
    get_neighors calculates a list of the k nearest neighbors
    of an instance 'test_instance'.
    The list neighbors contains 3-tuples with  
    (index, dist, label)
    where 
    index    is the index from the training_set, 
    dist     is the distance between the test_instance and the 
             instance training_set[index]
    distance is a reference to a function used to calculate the 
             distances
    """
    distances = []
    for index in range(len(training_set)):
        dist = distance(test_instance, training_set)
        #print(dist)
        distances.append((training_set[index], dist, labels[index]))
    #print('distances', distances)
    distances.sort(key=lambda x: x[1])
    #print('distances', distances)
    #neighbors = distances[:k]
    neighbors = distances[:k]
    return neighbors

In [20]:
train_set = [(1, 1, 1),
             (3, 3, 3),
             (8, 8, 8),
            ]

labels = ['apple',  'banana', 'apple']
k = 1
for test_instance in [(4, 4, 4)]:
    neighbors = get_neighbors(train_set, 
                              labels, 
                              test_instance, 
                              k)
    print(neighbors)

[((1, 1, 1), 8.831760866327848, 'apple')]


#### Get vote
Once we have located the most similar neighbors for a test instance, the next task is to devise a predicted response based on those neighbors.
We can do this by allowing each neighbor to vote for their class attribute, and take the majority vote as the prediction.

In [21]:
from collections import Counter
def vote(neighbors):
    class_counter = Counter()
    for neighbor in neighbors:
        #print('neighbors', neighbors)
        #print(class_counter[neighbor[2]])
        class_counter[neighbor[2]] += 1
    #print(class_counter[neighbor[2]])
    #print(class_counter)
    return class_counter.most_common(1)[0][0]

In [22]:
train_set = [(1, 1, 1),
             (3, 3, 3),
             (8, 8, 8),
            ]

labels = ['apple',  'banana', 'apple']
k = 1
for test_instance in [(4, 4, 4)]:
    neighbors = get_neighbors(train_set, 
                              labels, 
                              test_instance, 
                              3)
    print("vote distance weights: ", vote(neighbors))

vote distance weights:  apple


'vote_prob' is a function like 'vote' but returns the class name and the probability for this class:

In [23]:
def vote_prob(neighbors):
    class_counter = Counter()
    for neighbor in neighbors:
        class_counter[neighbor[2]] += 1
    labels, votes = zip(*class_counter.most_common())
    winner = class_counter.most_common(1)[0][0]
    votes4winner = class_counter.most_common(1)[0][1]
    return winner, votes4winner/sum(votes)

In [27]:
train_set = [(1, 1, 1),
             (3, 3, 3),
             (8, 8, 8),
            ]

labels = ['apple',  'banana', 'apple']
k = 1
predictions=[]
for test_instance in [(4, 4, 4)]:
    neighbors = get_neighbors(train_set, 
                              labels, 
                              test_instance, 
                              3)
    result = vote(neighbors)
    predictions.append(result)
    print(predictions)

['apple']


In [None]:
#def getAccuracy(testSet, predictions):
#    correct = 0
#    for x in range(len(testSet)):
#        if testSet[x][-1] is predictions[x]:
#        correct += 1
#    return (correct/float(len(testSet))) * 100.0

In [39]:
train_set = [(1, 1, 1),
             (3, 3, 3),
             (8, 8, 8),
            ]

labels = ['apple',  'banana', 'apple']

test_set = [(2, 3, 8),
             (1, 4, 6),
            ]
k = 1
predictions=[]
for x in range(len(test_set)):
    neighbors = get_neighbors(train_set, 
                              labels, 
                              test_set[x], 
                              3)
    result = vote(neighbors)
    predictions.append(result)
    print('> predicted=' + repr(result))

> predicted='apple'
> predicted='apple'
