In [6]:
import numpy as np
from scipy import linalg as la
from scipy.spatial import KDTree
import scipy.stats as stat
from matplotlib import pyplot as plt
# Problem 1
def exhaustive_search(X, z):
    """Find the nearest neighbor of z in X by checking every row of X.

    Parameters:
        X ((m,k) ndarray): a training set of m k-dimensional points.
        z ((k, ) ndarray): a k-dimensional target point.

    Returns:
        ((k,) ndarray) the element (row) of X that is nearest to z.
        (float) The Euclidean distance from the nearest neighbor to z.
    """
    # z broadcasts across the rows of X, so row j of (X - z) is X[j] - z;
    # taking the norm along axis 1 gives one Euclidean distance per row.
    distances = la.norm(X - z, axis=1)

    # The position of the smallest distance identifies both the nearest row
    # and the distance itself (ties resolve to the first such row).
    nearest_index = np.argmin(distances)
    return X[nearest_index], distances[nearest_index]

In [7]:
# Problem 2: Write a KDTNode class.
class KDTNode:
    """A single node of a K-D tree.

    Attributes:
        value ((k,) ndarray): the k-dimensional point stored in this node.
        left (KDTNode): this node's left child (None until one is attached).
        right (KDTNode): this node's right child (None until one is attached).
        pivot (int): the dimension of the value to make comparisons on
            (None until it is assigned).
    """
    def __init__(self, x):
        """Store x as the node's value and start with no children and no pivot.

        Parameters:
            x (np.ndarray): the value of the KDT node.

        Raises:
            TypeError: if x is not exactly of type np.ndarray.
        """
        # Reject anything that is not a plain NumPy array before touching state.
        if type(x) is not np.ndarray:
            raise TypeError("input must be a numpy array")

        self.value = x
        # Children and pivot are filled in later, when the node is linked into a tree.
        self.left = self.right = self.pivot = None

In [8]:
# Problems 3 and 4
class KDT:
    """A k-dimensional binary tree for solving the nearest neighbor problem.

    Attributes:
        root (KDTNode): the root node of the tree. Like all other nodes in
            the tree, the root has a NumPy array of shape (k,) as its value.
        k (int): the dimension of the data in the tree.
    """
    def __init__(self):
        """Initialize the root and k attributes."""
        self.root = None
        self.k = None

    def find(self, data):
        """Return the node containing the data.

        Parameters:
            data ((k,) ndarray): the point to look for.

        Returns:
            (KDTNode) the node whose value matches data (per np.allclose).

        Raises:
            ValueError: if there is no such node in the tree, or if the tree
                is empty.
        """
        current = self.root
        while current is not None:
            if np.allclose(data, current.value):
                return current                      # Data found.
            # Follow the insertion rule: strictly smaller on the pivot goes
            # left, everything else (including ties) goes right.
            if data[current.pivot] < current.value[current.pivot]:
                current = current.left
            else:
                current = current.right

        # Walked off the bottom of the tree (or the tree was empty).
        raise ValueError(str(data) + " is not in the tree")

    # Problem 3
    def insert(self, data):
        """Insert a new node containing the specified data.

        The first point inserted becomes the root (pivot 0) and fixes the
        dimension k of the tree. Every later node gets the pivot following
        its parent's pivot, wrapping back to 0 after k - 1.

        Parameters:
            data ((k,) ndarray): a k-dimensional point to insert into the tree.

        Raises:
            ValueError: if data does not have the same dimensions as other
                values in the tree.
            ValueError: if data is already in the tree
            TypeError: (from KDTNode) if data is not a NumPy array.
        """
        if self.root is None:
            # Empty tree: the new node becomes the root and defines k.
            new_root = KDTNode(data)
            new_root.pivot = 0
            self.root = new_root
            self.k = len(new_root.value)
            return

        if len(data) != self.k:
            raise ValueError("Data to be inserted is not in R^" + str(self.k))

        new_node = KDTNode(data)

        # Walk down from the root until the side we need to descend into is empty.
        parent = self.root
        while True:
            if np.allclose(data, parent.value):
                raise ValueError("there is already a node in the tree containing this data")
            goes_left = data[parent.pivot] < parent.value[parent.pivot]
            child = parent.left if goes_left else parent.right
            if child is None:
                break
            parent = child

        # Pivots cycle through the dimensions 0, 1, ..., k-1, 0, ...
        new_node.pivot = (parent.pivot + 1) % self.k
        if goes_left:
            parent.left = new_node
        else:
            parent.right = new_node

    # Problem 4
    def query(self, z):
        """Find the value in the tree that is nearest to z.

        Parameters:
            z ((k,) ndarray): a k-dimensional target point.

        Returns:
            ((k,) ndarray) the value in the tree that is nearest to z.
            (float) The Euclidean distance from the nearest neighbor to z.

        Raises:
            ValueError: if the tree is empty.
        """
        if self.root is None:
            # Without this guard the search would fail on self.root.value
            # with an unhelpful AttributeError.
            raise ValueError("cannot query an empty tree")

        def KD_search(current, nearest, d_star):
            """Recursively search the subtree rooted at current, returning the
            closest node found so far and its distance to z.
            """
            if current is None:
                return nearest, d_star            # Fell off the tree: nothing new to report.

            x = current.value
            i = current.pivot

            distance = la.norm(x - z)
            if distance < d_star:                 # The current node beats the best seen so far.
                nearest = current
                d_star = distance

            if z[i] < x[i]:
                # z lies on the left side of the splitting plane: search there first.
                nearest, d_star = KD_search(current.left, nearest, d_star)
                # Only cross the plane if the ball of radius d_star around z reaches it.
                if z[i] + d_star >= x[i]:
                    nearest, d_star = KD_search(current.right, nearest, d_star)
            else:
                # z lies on the right side (or on the plane): search right first.
                nearest, d_star = KD_search(current.right, nearest, d_star)
                if (z[i] - d_star) <= x[i]:
                    nearest, d_star = KD_search(current.left, nearest, d_star)

            return nearest, d_star

        # Start at the root, using the root itself as the initial best candidate.
        node, d_star = KD_search(self.root, self.root, la.norm(self.root.value - z))
        return node.value, d_star

    def __str__(self):
        r"""String representation: a hierarchical list of nodes and their axes.

        Nodes are listed in breadth-first order, one per line.

        Example:                           'KDT(k=2)
                    [5,5]                   [5 5]   pivot = 0
                    /   \                   [3 2]   pivot = 1
                [3,2]   [8,4]               [8 4]   pivot = 1
                    \       \               [2 6]   pivot = 0
                    [2,6]   [7,5]           [7 5]   pivot = 0'
        """
        if self.root is None:
            return "Empty KDT"
        nodes, strs = [self.root], []
        while nodes:
            current = nodes.pop(0)
            strs.append("{}\tpivot = {}".format(current.value, current.pivot))
            for child in [current.left, current.right]:
                if child:
                    nodes.append(child)
        return "KDT(k={})\n".format(self.k) + "\n".join(strs)

In [9]:
# Problem 5: Write a KNeighborsClassifier class.
class KNeighborsClassifier:
    """A k-nearest neighbors classifier that uses SciPy's KDTree to solve
    the nearest neighbor problem efficiently.

    Attributes (set by fit):
        labels: the training labels, one per row of the training set.
        tree (KDTree): a SciPy KDTree built from the training set.
    """

    def __init__(self, n_neighbors):
        """Store the number of neighbors that take part in each vote.

        Parameters:
            n_neighbors (int): the number of closest neighbors to consult.
        """
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Accept a training set and labels. Build a KDTree from the set and
        save the labels.

        Parameters:
            X ((m,k) ndarray): m training points of dimension k.
            y ((m,) array-like): the label of each row of X.
        """
        self.labels = y
        self.tree = KDTree(X)

    def predict(self, z):
        """Predict the label of a single point by majority vote among its
        n_neighbors nearest training points.

        Parameters:
            z ((k,) ndarray): the point to classify.

        Returns:
            The most common label among the nearest neighbors. If several
            labels are tied for most common, the smallest one is returned.
        """
        distances, indices = self.tree.query(z, self.n_neighbors)

        # With n_neighbors == 1 the query returns scalars rather than arrays;
        # normalize so the rest of the method can treat both cases alike.
        distances = np.atleast_1d(distances)
        indices = np.atleast_1d(indices)

        # If the tree holds fewer than n_neighbors points, the unfilled slots
        # come back with an infinite distance and an out-of-range index.
        indices = indices[np.isfinite(distances)]

        neighbor_labels = np.asarray(self.labels)[indices]

        # np.unique returns the distinct labels sorted, so argmax (first
        # maximum) picks the smallest label among those tied for most common.
        values, counts = np.unique(neighbor_labels, return_counts=True)
        return values[np.argmax(counts)]

In [10]:
# Problem 6
def prob6(n_neighbors, filename="mnist_subset.npz"):
    """Extract the data from the given file. Load a KNeighborsClassifier with
    the training data and the corresponding labels. Use the classifier to
    predict labels for the test data. Return the classification accuracy, the
    fraction of predictions that match the test labels.

    Parameters:
        n_neighbors (int): the number of neighbors to use for classification.
        filename (str): the name of the data file. Should be an npz file with
            keys 'X_train', 'y_train', 'X_test', and 'y_test'.

    Returns:
        (float): the classification accuracy, between 0 and 1.
    """
    # np.load on an .npz archive returns a file-backed object that should be
    # closed; read every array inside the context manager so the file is
    # released as soon as the data is in memory.
    with np.load(filename) as data:
        X_train = data["X_train"].astype(float)
        y_train = data["y_train"]
        X_test = data["X_test"].astype(float)
        y_test = data["y_test"]

    classifier = KNeighborsClassifier(n_neighbors)
    classifier.fit(X_train, y_train)

    # Count the test points whose predicted label matches the true label.
    n_tests = X_test.shape[0]
    n_correct = sum(
        1 for i in range(n_tests)
        if classifier.predict(X_test[i]) == y_test[i]
    )

    return n_correct / n_tests

In [11]:
# Run the Problem 6 experiment with 4 neighbors on the default data file
# ("mnist_subset.npz"); the cell's output is the classification accuracy.
prob6(n_neighbors=4)

0.91