# Programming assignment 1: k-Nearest Neighbors classification

In [4]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  

In [22]:
def load_dataset(path='01_homework_dataset.csv'):
    """Load a comma-separated dataset and split it into features and labels.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the homework dataset, which is
        expected in the current working directory.

    Returns
    -------
    X : np.ndarray
        All columns except the last one, for every row after the first.
    y : np.ndarray
        The last column, for every row after the first.
    """
    dataset = np.genfromtxt(path, delimiter=',')
    # The first row is discarded (presumably the column-header line --
    # confirm against the CSV). The last column holds the labels.
    X = dataset[1:, 0:-1]
    y = dataset[1:, -1]
    return X, y

In [23]:
# Prepare data: load the feature matrix and the label vector from the
# homework CSV. Both are used as the training set by the cells below.
X_train, y_train = load_dataset()

## Task 1: Euclidean distance
Compute the Euclidean distance between two data points.

In [14]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two data points.

    Parameters
    ----------
    x1, x2 : array_like
        Coordinates of the two points. Lists, tuples and numpy arrays are
        accepted; both must have the same shape (or be broadcastable).

    Returns
    -------
    float
        The norm of ``x1 - x2``.
    """
    # Convert to float arrays first: plain Python lists do not support
    # subtraction, and unsigned-integer arrays would wrap around on negative
    # differences instead of producing the true offset.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.linalg.norm(diff)

## Task 2: get k nearest neighbors' labels
Get the labels of the *k* nearest neighbors of the datapoint *x_new*.

In [28]:
def get_neighbors_labels(X_train, y_train, x_new, k):
    """Return the labels and distances of the k training points nearest to x_new.

    Parameters
    ----------
    X_train : iterable of array_like
        Training points, one per row.
    y_train : sequence
        Label of each training point, indexed like the rows of ``X_train``.
    x_new : array_like
        Query point.
    k : int
        Number of neighbors to return. If ``k`` exceeds the number of training
        points, every training point is returned exactly once. A non-positive
        ``k`` yields two empty lists.

    Returns
    -------
    neighbors_labels : list
        Labels of the selected neighbors, ordered from nearest to farthest.
    neighbors_dist : list
        Distances of the selected neighbors, in the same order.
    """
    # One distance per training row, computed with the Task 1 helper.
    distances = [euclidean_distance(x_new, x) for x in X_train]

    # A stable sort keeps equally distant points in training-set order, so
    # ties are broken by the lowest index and no point can be picked twice.
    # 'mergesort' is numpy's stable sort on every numpy version.
    nearest = np.argsort(distances, kind='mergesort')[:max(k, 0)]

    neighbors_labels = [y_train[i] for i in nearest]
    neighbors_dist = [distances[i] for i in nearest]

    return neighbors_labels, neighbors_dist

## Task 3: get the majority label
For the previously computed labels of the *k* nearest neighbors, compute the actual response, i.e. return the class to which the majority of the nearest neighbors belong. Think about how a tie is handled by your solution.

In [37]:
def get_classification_response(neighbors, num_classes=3):
    """Return the majority class among the neighbors' labels.

    Parameters
    ----------
    neighbors : iterable
        Class labels of the nearest neighbors. Each label is converted with
        ``int`` and must be a non-negative class index.
    num_classes : int, optional
        Minimum number of classes to count votes for. Labels greater than or
        equal to this value are still counted; the vote table grows to fit.

    Returns
    -------
    int
        Index of the class with the most votes. On a tie, the lowest class
        index among the tied classes wins, because ``np.argmax`` returns the
        first maximum. An empty ``neighbors`` therefore yields class 0.

    Raises
    ------
    ValueError
        If any label is negative.
    """
    labels = np.asarray([int(c) for c in neighbors], dtype=np.intp)

    # A negative index would otherwise be counted for a class at the end of
    # the vote table instead of being reported as invalid.
    if labels.size and labels.min() < 0:
        raise ValueError('class labels must be non-negative integers')

    class_votes = np.bincount(labels, minlength=num_classes)

    # Get the class index with the highest number of votes.
    return np.argmax(class_votes)

In [34]:
def get_regression_response(neighbors, distances):
    """Return the inverse-distance-weighted average of the neighbors' labels.

    Each label is weighted by ``1 / distance`` and the weighted sum is divided
    by the sum of the weights.

    Parameters
    ----------
    neighbors : sequence of numbers
        Labels (responses) of the nearest neighbors.
    distances : sequence of numbers
        Distance of each neighbor to the query point, in the same order.

    Returns
    -------
    float
        The weighted average. If one or more distances are exactly zero (the
        query coincides with a training point), the plain mean of the labels
        of those exact matches is returned instead, since their weight would
        be infinite.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty or the two inputs differ in length.
    """
    labels = np.asarray(neighbors, dtype=float)
    dists = np.asarray(distances, dtype=float)

    if labels.size == 0:
        raise ValueError('at least one neighbor is required')
    if labels.shape != dists.shape:
        raise ValueError('neighbors and distances must have the same length')

    # Exact matches would divide by zero below; they dominate the estimate.
    exact = dists == 0
    if exact.any():
        return labels[exact].mean()

    weights = 1.0 / dists
    return np.sum(labels * weights) / np.sum(weights)

## Task 4: compute accuracy
Compute the accuracy of the generated predictions.

In [11]:
def compute_accuracy(y_pred, y_test):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels; its length defines how many samples are compared.
    y_test : sequence
        True labels, indexed like ``y_pred``.

    Returns
    -------
    float
        Number of positions where both labels agree, divided by
        ``len(y_pred)``.
    """
    total = len(y_pred)
    # Count the positions where the prediction matches the ground truth.
    hits = sum(1 for idx in range(total) if y_pred[idx] == y_test[idx])
    return hits / total

In [35]:
# This function is given, nothing to do here.
def predict(X_train, y_train, x, k):
    """Predict the class and the regression response for one query point.

    Looks up the ``k`` nearest neighbors of ``x`` in the training data, prints
    their labels and distances, and returns a tuple with the majority-vote
    class and the distance-weighted regression response.
    """
    labels, dists = get_neighbors_labels(X_train, y_train, x, k)
    # Show the selected neighbors so the prediction can be checked by eye.
    print(labels, dists)
    predicted_class = get_classification_response(labels)
    predicted_value = get_regression_response(labels, dists)
    return predicted_class, predicted_value

## Testing

In [38]:
# Report how many rows the training set contains.
print('Training set: {0} samples'.format(X_train.shape[0]))

# generate prediction
# k is the number of neighbors consulted; xa and xb are the two query points.
k = 3
xa = [4.1, -0.1, 2.2]
xb = [6.1, 0.4, 1.3]

# predict() prints the neighbors' labels and distances for each query and
# returns (majority-vote class, distance-weighted response).
ya, y_a = predict(X_train, y_train, xa, k)
yb, y_b = predict(X_train, y_train, xb, k)
print(ya, y_a)
print(yb, y_b)

Training set: 15 samples
[0.0, 2.0, 1.0] [0.67082039324993714, 2.1840329667841556, 2.4738633753705961]
[2.0, 0.0, 2.0] [1.1747340124470735, 1.7464249196572976, 2.1189620100417086]
0 0.561016425974
2 1.39592451329
