# Exercise: code your own KNN classifier

Now it's your turn! Go ahead and complete the KNN classifier class below.

In [107]:
import numpy as np
import pandas as pd

class Knn:
    """
    K-nearest-neighbors classifier.

    Neighbors are ranked by squared Euclidean distance (which orders points
    exactly like the Euclidean distance) and the predicted label is the most
    frequent label among the k nearest training observations.
    """

    def __init__(self):
        """
        Initialize the Knn class
        self.x_train: training data
        self.y_train: training labels
        """
        # Start without any training data; fit() replaces these placeholders
        self.x_train = []
        self.y_train = []

    def fit(self, x, y):
        """
        Save the training data to properties of this class
        Parameters
        ----------
        x: training data, one observation per row
        y: training labels, one per observation; shape (n,) or (n, 1)

        Returns
        -------
        None
        """
        self.x_train = x
        self.y_train = y

    @staticmethod
    def _squared_distances(point, X):
        """Return the squared Euclidean distance from `point` to every row of X."""
        diff = X - point
        return np.sum(diff ** 2, axis=1)

    @staticmethod
    def _majority_vote(labels):
        """Return the most frequent label; ties are broken uniformly at random."""
        counts = pd.Series(labels).value_counts()
        winners = counts[counts == counts.max()].index.to_numpy()
        # With a single winner this simply returns it; with several it picks one
        # using NumPy's global random state (seed with np.random.seed to reproduce)
        return np.random.choice(winners)

    def predict(self, x, k):
        """
        Predict the class labels for the provided data
        Parameters
        ----------
        x: data to classify, an iterable of observations
        k: number of neighbors to use (must be >= 1). If k exceeds the number
           of training observations, every training observation votes.

        Returns
        -------
        np.array(y_hat): array of predicted class labels

        Raises
        ------
        ValueError: if the model has no training data, if the number of labels
                    does not match the number of training observations, or if
                    k is smaller than 1
        """
        X = np.asarray(self.x_train)
        # ravel (not squeeze) keeps the labels 1-D even when there is only one
        # training observation; squeeze would collapse that case to a 0-d array
        labels = np.asarray(self.y_train).ravel()

        if X.ndim == 0 or X.shape[0] == 0:
            raise ValueError("No training data: call fit(x, y) before predict()")
        if labels.shape[0] != X.shape[0]:
            raise ValueError(
                "Expected one label per training observation, got "
                f"{labels.shape[0]} labels for {X.shape[0]} observations"
            )
        # A negative k would otherwise be read as a slice end ("all but the last
        # |k|" neighbors) and k == 0 would leave nothing to vote on
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")

        y_hat = []  # Variable to store the estimated class label
        for v in x:
            distance = self._squared_distances(np.asarray(v), X)
            # Stable sort: among equally distant points the earlier training
            # rows win, so the neighbor set is deterministic
            nearest = np.argsort(distance, kind="stable")[:k]
            y_hat.append(self._majority_vote(labels[nearest]))

        # Return the estimated targets
        return np.array(y_hat)

To test your classifier, apply it to the Iris data that we split previously. Let's start by loading our training and test data:

In [108]:
# Solution

import pandas as pd
# Load the previously split training and test sets
# (relative paths: resolved against the notebook's working directory)
data_train = pd.read_csv("data/train.csv")
data_test = pd.read_csv("data/test.csv")

In [109]:
# Use two of the measurements as features; .values turns the selection into a NumPy array
x_train = data_train[["SepalWidthCm","PetalWidthCm"]].values

In [110]:
# Double brackets select a one-column DataFrame, so this array has shape (n, 1) rather than (n,)
y_train = data_train[["Species"]].values

In [116]:
# Feature columns shared by the training and test splits
feature_cols = ["SepalWidthCm", "PetalWidthCm"]

# Build the classifier
myknn = Knn()

# Extract features and labels from the training split and hand them to the model
x_train = data_train[feature_cols].values
y_train = data_train[["Species"]].values
myknn.fit(x_train, y_train)

# Classify every held-out observation using its 3 nearest training neighbors
k = 3
x_test = data_test[feature_cols].values
y_hat = myknn.predict(x_test, k)

# Element-wise check of the predictions against the true test labels;
# the labels are squeezed from (n, 1) to (n,) so the shapes line up
y_test = data_test[["Species"]].values

print(y_hat == np.squeeze(y_test))

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [112]:
# Shape of the predictions returned by predict()
y_hat.shape

(45,)

In [113]:
# Shape of the true labels; they come from a one-column selection, so a column axis is kept
y_test.shape

(45, 1)

In [114]:
# Inspect the predicted species for each test observation
y_hat

array(['Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa'], dtype='<U15')

In [115]:
# Scratch check of the tie-breaking helper used inside Knn.predict (left commented out):
# def get_most_frequent_class(labels):
#     label_series = pd.Series(labels)
#     df = label_series.value_counts()
#     max_value = df.max()
#     options = df[df==max_value].index.values
#     return np.random.choice(options) # If there's one option, return it; else, pick one at random
    
# labels = ['this','that','that','other','other']
# out = get_most_frequent_class(labels)
# out