# **K-Nearest Neighbors**

1. Supervised Learning Model
2. Both Classification and Regression
3. Can be used for non linear data
4. Predicts from the K nearest neighbors; K is usually chosen as an odd number to avoid tied votes in binary classification

### Advantages
- Works well on smaller datasets with fewer features
- Can be used for both Classification and Regression
- Easy to implement for Multi-class classification
- Different distance criteria can be used (e.g., Euclidean, Manhattan)

### Disadvantages
- Choosing optimum K value
- Less efficient with high-dimensional data
- Doesn't perform well on imbalanced datasets
- Sensitive to outliers

#### Euclidean Distance and Manhattan Distance

Let $A(x_1,y_1)$ and $B(x_2,y_2)$ be two points; then the Euclidean distance between them is:
$$
AB = \sqrt{(x_2-x_1)^2+(y_2-y_1)^2}
$$
whereas the Manhattan distance is:
$$
AB = |x_2-x_1| + |y_2-y_1|
$$


In [1]:
import numpy as np

In [2]:
# two sample points in the 2D plane, each written as an (x, y) tuple
p1, p2 = (1, 1), (2, 2)

In [3]:
# Euclidean distance between p1 and p2:
# sum the squared differences along the x and y axes, then take the square root
difference = sum((p2[axis] - p1[axis]) ** 2 for axis in (0, 1))

euclidean_distance = np.sqrt(difference)
euclidean_distance

np.float64(1.4142135623730951)

In [4]:
# two sample points in 3D space, each written as an (x, y, z) tuple
p1, p2 = (1, 1, 1), (2, 2, 2)

# Euclidean distance: the same formula as in 2D, with one more squared
# difference for the third coordinate
difference = sum((p2[axis] - p1[axis]) ** 2 for axis in (0, 1, 2))

euclidean_distance = np.sqrt(difference)
euclidean_distance

np.float64(1.7320508075688772)

In [5]:
def euclidean_dist(a,b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Coordinates are paired by index over ``len(a)`` positions, so the points
    may have any number of dimensions as long as ``b`` has at least as many
    coordinates as ``a``.
    """
    # accumulate the squared per-coordinate gaps, then take the root once
    squared_total = sum((a[pos] - b[pos]) ** 2 for pos in range(len(a)))
    return np.sqrt(squared_total)

In [6]:
euclidean_dist((1,1,1,1),(2,2,2,2))

np.float64(2.0)

In [7]:
def manhatten_dist(a,b):
    """Return the Manhattan (city-block) distance between points ``a`` and ``b``.

    The result is the sum of absolute per-coordinate differences, paired by
    index over ``len(a)`` positions.
    """
    return sum(abs(a[pos] - b[pos]) for pos in range(len(a)))

In [8]:
manhatten_dist((1,1,1,1),(2,2,2,2))

4

## K-Nearest Neighbors Classifier

In [9]:
import statistics

In [10]:
class KNN_Classifier():
    """K-Nearest Neighbors classifier that votes among the k closest training rows.

    Every training row is expected to hold its feature values followed by the
    class label in the LAST position. The label is left out of the distance
    computation and is read back when the neighbors vote.

    Parameters
    ----------
    distance_metric : str
        ``'euclidean'`` or ``'manhattan'``. The legacy spelling ``'manhatten'``
        is still accepted so existing callers keep working.
    """

    def __init__(self, distance_metric):
        self.distance_metric = distance_metric

    def get_distance_metric(self, training_data_points, test_data_point):
        """Return the distance between one training row and one test point.

        Parameters
        ----------
        training_data_points : sequence
            A single training row: feature values followed by the label.
        test_data_point : sequence
            Feature values only (no label).

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric name.
        """
        # the last entry of a training row is its label, not a feature
        n_features = len(training_data_points) - 1

        if self.distance_metric == 'euclidean':
            squared_total = sum(
                (training_data_points[i] - test_data_point[i]) ** 2
                for i in range(n_features)
            )
            return np.sqrt(squared_total)

        if self.distance_metric in ('manhattan', 'manhatten'):
            return sum(
                abs(training_data_points[i] - test_data_point[i])
                for i in range(n_features)
            )

        # fail loudly here instead of returning None and breaking the sort later
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'"
        )

    # getting the nearest neighbors
    def nearest_neighbors(self, X_train, test_data, k):
        """Return the ``k`` training rows closest to ``test_data``, nearest first.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or larger than the number of training rows.
        """
        distance_list = [
            (training_data, self.get_distance_metric(training_data, test_data))
            for training_data in X_train
        ]

        if not 1 <= k <= len(distance_list):
            raise ValueError(
                f"k must be between 1 and the number of training rows "
                f"({len(distance_list)}), got {k}"
            )

        # sort on the distance only; the rows themselves are never compared
        distance_list.sort(key=lambda pair: pair[1])
        return [row for row, _ in distance_list[:k]]

    def predict(self, X_train, test_data, k):
        """Predict the label of ``test_data`` by majority vote of its ``k`` nearest neighbors.

        ``X_train`` rows must carry the label as their last element. The
        returned value is that label exactly as stored in the training rows.
        """
        neighbors = self.nearest_neighbors(X_train, test_data, k)

        labels = [row[-1] for row in neighbors]
        # neighbors are ordered nearest-first; on Python 3.8+ statistics.mode
        # resolves a tie to the label it encounters first
        return statistics.mode(labels)


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
diabetes_dataset = pd.read_csv('diabetes.csv')

In [17]:
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [18]:
diabetes_dataset.shape

(768, 9)

In [20]:
# features: every column except the label; target: the 'Outcome' label column
# (`columns=` already selects the axis, so no separate `axis=1` is needed)
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

In [21]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [22]:
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [23]:
X = X.to_numpy()
Y = Y.to_numpy()

In [24]:
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]],
      shape=(768, 8))

In [25]:
Y

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

In [26]:
# Hold out 20% of the rows as the test set (test_size=0.2). The split is
# stratified on the labels Y, and random_state is fixed so a re-run produces
# the same partition.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2, stratify=Y, random_state=2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [28]:
# Append the labels as a final column so every training row carries its own
# class, which is the row layout KNN_Classifier reads (features first, label last).
# Slicing back to the original feature count first keeps this cell idempotent:
# re-running it replaces the label column instead of inserting a second one.
n_features = X.shape[1]
X_train = np.column_stack((X_train[:, :n_features], Y_train))

In [29]:
X_train[:,8]

array([1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1.,
       0., 0., 1., 1., 0.

In [30]:
classifier = KNN_Classifier(distance_metric='euclidean')

In [31]:
prediction = classifier.predict(X_train,X_test[0],k=5)

In [32]:
Y_test[0]

np.int64(0)

In [33]:
prediction

np.float64(0.0)

In [34]:
X_test_size = X_test.shape[0]

In [35]:
# classify every held-out row against the labelled training rows (k = 5)
y_pred = []
for test_point in X_test:
    prediction = classifier.predict(X_train, test_point, k=5)
    y_pred.append(prediction)

In [36]:
y_pred

[np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float64(0.0),
 np.float64(1.0),
 np.float6

In [37]:
accuracy_score(Y_test, y_pred)

0.7272727272727273