In [5]:
# Imports grouped at the top of the cell so every dependency is visible up front.
import numpy as np
import pandas as pd

# Read the training and test splits from their CSV files.
data_test = pd.read_csv('sonar.data/sonar_test.csv')
data_train = pd.read_csv('sonar.data/sonar_train.csv')

In [6]:
# Preview the training DataFrame (a bare last expression is rendered as the cell output).
data_train

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A52,A53,A54,A55,A56,A57,A58,A59,A60,Class
0,0.0079,0.0086,0.0055,0.0250,0.0344,0.0546,0.0528,0.0958,0.1009,0.1240,...,0.0176,0.0127,0.0088,0.0098,0.0019,0.0059,0.0058,0.0059,0.0032,R
1,0.0599,0.0474,0.0498,0.0387,0.1026,0.0773,0.0853,0.0447,0.1094,0.0351,...,0.0013,0.0005,0.0227,0.0209,0.0081,0.0117,0.0114,0.0112,0.0100,M
2,0.0093,0.0269,0.0217,0.0339,0.0305,0.1172,0.1450,0.0638,0.0740,0.1360,...,0.0212,0.0091,0.0056,0.0086,0.0092,0.0070,0.0116,0.0060,0.0110,R
3,0.0151,0.0320,0.0599,0.1050,0.1163,0.1734,0.1679,0.1119,0.0889,0.1205,...,0.0061,0.0015,0.0084,0.0128,0.0054,0.0011,0.0019,0.0023,0.0062,R
4,0.0317,0.0956,0.1321,0.1408,0.1674,0.1710,0.0731,0.1401,0.2083,0.3513,...,0.0201,0.0248,0.0131,0.0070,0.0138,0.0092,0.0143,0.0036,0.0103,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,0.0731,0.1249,0.1665,0.1496,0.1443,0.2770,0.2555,0.1712,0.0466,0.1114,...,0.0444,0.0230,0.0290,0.0141,0.0161,0.0177,0.0194,0.0207,0.0057,M
135,0.0516,0.0944,0.0622,0.0415,0.0995,0.2431,0.1777,0.2018,0.2611,0.1294,...,0.0432,0.0274,0.0152,0.0120,0.0129,0.0020,0.0109,0.0074,0.0078,M
136,0.0015,0.0186,0.0289,0.0195,0.0515,0.0817,0.1005,0.0124,0.1168,0.1476,...,0.0108,0.0075,0.0089,0.0036,0.0029,0.0013,0.0010,0.0032,0.0047,M
137,0.0411,0.0277,0.0604,0.0525,0.0489,0.0385,0.0611,0.1117,0.1237,0.2300,...,0.0217,0.0038,0.0019,0.0065,0.0132,0.0108,0.0050,0.0085,0.0044,M


In [7]:
# Labels: the 'Class' column of each split, kept as pandas Series.
y_train = data_train['Class']
y_test = data_test['Class']

# Features: every remaining column, converted to plain NumPy arrays.
X_train = data_train.drop(columns='Class').to_numpy()
X_test = data_test.drop(columns='Class').to_numpy()

In [8]:
# Inspect the training feature matrix (all columns of data_train except 'Class').
X_train

array([[0.0079, 0.0086, 0.0055, ..., 0.0058, 0.0059, 0.0032],
       [0.0599, 0.0474, 0.0498, ..., 0.0114, 0.0112, 0.01  ],
       [0.0093, 0.0269, 0.0217, ..., 0.0116, 0.006 , 0.011 ],
       ...,
       [0.0015, 0.0186, 0.0289, ..., 0.001 , 0.0032, 0.0047],
       [0.0411, 0.0277, 0.0604, ..., 0.005 , 0.0085, 0.0044],
       [0.027 , 0.0163, 0.0341, ..., 0.0094, 0.0105, 0.0093]])

In [9]:
# Inspect the training labels (the 'Class' column of data_train).
y_train

0      R
1      M
2      R
3      R
4      R
      ..
134    M
135    M
136    M
137    M
138    M
Name: Class, Length: 139, dtype: object

In [10]:
def euclideanDistance(in1,in2):
    '''
    Given two vectors in1 and in2, returns the euclidean (L2) distance between them.

    in1, in2: NumPy arrays or array-like sequences (lists, tuples) of numbers
              with matching (or broadcastable) shapes.
    Returns the norm of the element-wise difference as a float.
    '''
    # Coerce to arrays first: '-' is not defined between plain Python lists/tuples,
    # so without this only ndarray inputs would work.
    diff = np.asarray(in1) - np.asarray(in2)
    return np.linalg.norm(diff)


def manhattanDistance(in1, in2):
    '''
    Given two vectors in1 and in2, returns the manhattan (L1) distance between them,
    i.e. the sum of the absolute element-wise differences.

    in1, in2: NumPy arrays or array-like sequences (lists, tuples) of numbers
              with identical shapes.
    Returns the distance as a float.
    Raises ValueError if the two inputs do not have the same shape.
    '''
    a = np.asarray(in1)
    b = np.asarray(in2)
    # A distance between vectors of different sizes is undefined; fail loudly
    # instead of silently ignoring trailing elements of the longer input.
    if a.shape != b.shape:
        raise ValueError(
            "manhattanDistance expects inputs of the same shape, got %s and %s"
            % (a.shape, b.shape)
        )
    # dtype=float keeps the result a float even for integer inputs.
    return np.abs(a - b).sum(dtype=float)

In [11]:

def getNeighbour(x_,X,dist_f):
    '''
    getNeighbour takes in a test data point (x_), training data (X), and distance function (dist_f).
    It calls dist_f(x_, row) once for every row of X and returns the index of the row
    with the smallest distance to the test data point.

    x_:     a single test data point.
    X:      indexable, non-empty collection of training rows (supports len() and X[i]).
    dist_f: callable taking (x_, row) and returning a comparable distance.

    Returns the integer index of the nearest row. When several rows are tied for the
    smallest distance, the one with the highest index is returned.
    Raises ValueError if X contains no rows.
    '''
    n_rows = len(X)
    if n_rows == 0:
        raise ValueError("getNeighbour needs at least one training row, got an empty X")

    # Seed the search with row 0, then scan the remaining rows only, so each
    # distance is evaluated exactly once.
    index = 0
    smallest = dist_f(x_, X[0])
    for i in range(1, n_rows):
        dist = dist_f(x_, X[i])
        # '<=' (not '<') means a later row wins a tie.
        if dist <= smallest:
            smallest = dist
            index = i

    return index # index of nearest neighbour in training data

In [12]:

def nearest_neighbour(X,y,X_,dist_function):
    '''
    nearest_neighbour function takes in training data X, training labels y, test data X_ and distance function
    dist_function. It loops through all test data points, calls getNeighbour to find the nearest training
    row of each point, and assigns the label of that row to the test point.

    X:             training rows (indexable by row position).
    y:             training labels, positionally aligned with X (list, NumPy array or pandas Series).
    X_:            iterable of test data points.
    dist_function: callable taking two data points and returning their distance.

    The function returns a list of predictions (one predicted label per test point, in order).
    '''
    # getNeighbour returns a row *position*. Indexing a pandas Series with y[i] looks up
    # the index *label* instead, which is only the same thing for a default 0..n-1 index,
    # so pandas objects are unwrapped to their underlying array first.
    labels = y.to_numpy() if hasattr(y, 'to_numpy') else y

    predictions = []
    for x_ in X_: # for all test points
        # position of the nearest training row to x_
        neighbour = getNeighbour(x_, X, dist_function)
        # x_ inherits the label of its nearest neighbour
        predictions.append(labels[neighbour])

    return predictions

def accuracy(predictions,y_test):
    '''
    Given a list of predicted and actual target values, returns prediction accuracy
    as a percentage in the range 0-100.

    predictions: sequence of predicted labels.
    y_test:      actual labels in the same order (list, NumPy array or pandas Series).

    Raises ValueError if y_test is empty or the two inputs differ in length.
    '''
    # Materialise the actual labels in positional order; iterating a pandas Series
    # yields its values, so a non-default index cannot misalign the comparison.
    actual = list(y_test)
    total = len(actual)
    if total == 0:
        raise ValueError("y_test is empty; accuracy is undefined")
    if len(predictions) != total:
        raise ValueError(
            "predictions and y_test differ in length: %d vs %d" % (len(predictions), total)
        )

    correct = sum(1 for predicted, expected in zip(predictions, actual) if predicted == expected)
    return (correct/float(total))*100

In [14]:
# Predict a label for every test row from its nearest training row (euclidean distance)
y_pred = nearest_neighbour(
    X=X_train,
    y=y_train,
    X_=X_test,
    dist_function=euclideanDistance,
)

In [17]:
# Calculate accuracy of using euclidean distance measure
euclidean_accuracy = accuracy(y_pred,y_test)
# accuracy() returns a percentage, so the message carries the "%" unit
# (this also matches the output recorded for this cell).
print("ACCURACY :",euclidean_accuracy,"% using distance function: ", euclideanDistance.__name__)

ACCURACY : 89.85507246376811 % using distance function:  euclideanDistance


In [18]:
# Predict a label for every test row from its nearest training row (manhattan distance).
# NOTE: this rebinds y_pred, replacing the predictions of the euclidean run.
y_pred = nearest_neighbour(
    X=X_train,
    y=y_train,
    X_=X_test,
    dist_function=manhattanDistance,
)

In [19]:
# Calculate accuracy of using manhattan distance measure
manhattan_accuracy = accuracy(y_pred,y_test)
# accuracy() returns a percentage, so the message carries the "%" unit
# (this also matches the output recorded for this cell).
print("ACCURACY :",manhattan_accuracy,"% using distance function: ", manhattanDistance.__name__)

ACCURACY : 88.40579710144928 % using distance function:  manhattanDistance
