In [3]:
import sklearn
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier # import the K-Neighbors Algorithm
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import preprocessing # this is required for feature normalisation

In [4]:
# Read the car data from the downloaded file
data = pd.read_csv("car.data")
print(data.head())  # To check if our data is loaded correctly

  buying  maint door persons lug_boot safety  class
0  vhigh  vhigh    2       2    small    low  unacc
1  vhigh  vhigh    2       2    small    med  unacc
2  vhigh  vhigh    2       2    small   high  unacc
3  vhigh  vhigh    2       2      med    low  unacc
4  vhigh  vhigh    2       2      med    med  unacc


### Note that some of the feature entries in our data are not numeric. However, the classifying algorithm requires us to use numeric features. Therefore, we encode the non-numeric feature values using the LabelEncoder() class from the sklearn.preprocessing package.

In [5]:
# Create the encoder object
le = preprocessing.LabelEncoder()

# Encode each of the feature lists
buying = le.fit_transform(list(data["buying"]))
maint = le.fit_transform(list(data["maint"]))
door = le.fit_transform(list(data["door"]))
persons = le.fit_transform(list(data["persons"]))
lug_boot = le.fit_transform(list(data["lug_boot"]))
safety = le.fit_transform(list(data["safety"]))
clss = le.fit_transform(list(data["class"]))

In [6]:
# Combine the features into a feature list using the zip() function
X = list(zip(buying, maint, door, persons, lug_boot, safety))  # features

# Create the list of labels
y = list(clss)  # labels

# Split the data into training (90%) and testing (10%) datasets
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X,
                                                                            y,
                                                                            test_size = 0.1)

The K-Neighbors classifying algorithm is a supervised learning algortihm which works as follows:
 - The model memorises the positions of each of the datapoints from the training dataset in the high-dimensional feature vector space they span (note that this can be VERY computationally heavy).
 - When the model makes a prediction for a given datapoint from the testing dataset, it loads its position in the same high-dimensional dataset.
 - It then measures the distances between this test point and each of the datapoints in our training dataset.
 - The model picks those K points with the smallest distances, i.e., the K closest 'neighbors' to the test point. (K is a hyperparameter of the model.)
 - The model counts how many times each of the possible classes appears in the chosen K points.
 - Finally, it predicts that the test datapoint belongs to that class which appears the most in the set of its K closest neighbors.

In [11]:
# Import the algoithm from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Determine the K hyperparameter
K = 9

# Define and train the model
model = KNeighborsClassifier(n_neighbors=K)
model.fit(x_train, y_train)

# Test the model on the training data and print its accuracy
acc = model.score(x_test, y_test)
print("Accuracy: " + str('{:.2f}%'.format(acc*100)))

Accuracy: 81.50%


In [12]:
# We inspect how exactly our model is performing on the test 
predicted = model.predict(x_test)
names = ["unacc", "acc", "good", "vgood"]

for x in range(len(predicted)):
    print("Predicted: ", names[predicted[x]], "Data: ", x_test[x], "Actual: ", names[y_test[x]])

Predicted:  good Data:  (2, 0, 2, 0, 2, 0) Actual:  good
Predicted:  good Data:  (1, 3, 1, 0, 2, 2) Actual:  good
Predicted:  good Data:  (3, 3, 3, 2, 2, 0) Actual:  good
Predicted:  acc Data:  (2, 1, 2, 2, 0, 2) Actual:  acc
Predicted:  unacc Data:  (3, 1, 1, 2, 2, 0) Actual:  unacc
Predicted:  good Data:  (3, 1, 1, 1, 0, 1) Actual:  good
Predicted:  good Data:  (3, 3, 1, 1, 2, 2) Actual:  good
Predicted:  good Data:  (0, 0, 2, 0, 1, 2) Actual:  good
Predicted:  good Data:  (1, 3, 0, 0, 2, 1) Actual:  good
Predicted:  good Data:  (2, 1, 0, 0, 0, 2) Actual:  good
Predicted:  good Data:  (0, 3, 2, 0, 0, 0) Actual:  good
Predicted:  unacc Data:  (3, 1, 1, 1, 1, 2) Actual:  good
Predicted:  good Data:  (3, 2, 0, 0, 2, 2) Actual:  good
Predicted:  good Data:  (3, 3, 0, 1, 0, 2) Actual:  good
Predicted:  good Data:  (0, 3, 1, 1, 2, 0) Actual:  good
Predicted:  good Data:  (2, 1, 1, 1, 1, 2) Actual:  unacc
Predicted:  unacc Data:  (1, 3, 2, 0, 2, 0) Actual:  good
Predicted:  good Data:  (3, 

In [13]:
# For each datapoint in our testing dataset, find the K closest neighbors, and 
# the 'distances' to them

for x in range(len(predicted)):
    # Now we will we see the neighbors of each point in our testing data
    distances, indeces = model.kneighbors([x_test[x]], n_neighbors=K, return_distance=True)
#     print("N: ", n)
    print("Test data point: ", x_test[x])
    print(indeces)
    print(distances)
    print('\n')

Test data point:  (2, 0, 2, 0, 2, 0)
[[121]]
[[1.]]


Test data point:  (1, 3, 1, 0, 2, 2)
[[822]]
[[1.]]


Test data point:  (3, 3, 3, 2, 2, 0)
[[624]]
[[1.]]


Test data point:  (2, 1, 2, 2, 0, 2)
[[891]]
[[1.]]


Test data point:  (3, 1, 1, 2, 2, 0)
[[999]]
[[1.]]


Test data point:  (3, 1, 1, 1, 0, 1)
[[1474]]
[[1.]]


Test data point:  (3, 3, 1, 1, 2, 2)
[[76]]
[[1.]]


Test data point:  (0, 0, 2, 0, 1, 2)
[[909]]
[[1.]]


Test data point:  (1, 3, 0, 0, 2, 1)
[[294]]
[[1.]]


Test data point:  (2, 1, 0, 0, 0, 2)
[[468]]
[[1.]]


Test data point:  (0, 3, 2, 0, 0, 0)
[[682]]
[[1.]]


Test data point:  (3, 1, 1, 1, 1, 2)
[[874]]
[[1.]]


Test data point:  (3, 2, 0, 0, 2, 2)
[[296]]
[[1.]]


Test data point:  (3, 3, 0, 1, 0, 2)
[[581]]
[[1.]]


Test data point:  (0, 3, 1, 1, 2, 0)
[[379]]
[[1.]]


Test data point:  (2, 1, 1, 1, 1, 2)
[[1226]]
[[1.]]


Test data point:  (1, 3, 2, 0, 2, 0)
[[3]]
[[1.]]


Test data point:  (3, 2, 1, 0, 1, 0)
[[109]]
[[1.]]


Test data point:  (3, 2, 0, 1

In [16]:
# Try to determine which is the best K hyperparameter to use:

best_model = None
best_accuracy = 0
best_model_k = None

for k in range(15):
    temp_model = KNeighborsClassifier(n_neighbors=k+1)
    temp_model.fit(x_train, y_train)
    
    temp_accuracy = temp_model.score(x_test, y_test)
    print("Accuracy for " + str(k + 1) + " neighbors: " + str('{:.2f}%'.format(temp_accuracy*100)))
    
    if temp_accuracy > best_accuracy:
        best_accuracy = temp_accuracy
        best_model = temp_model
        best_model_k = k + 1

print("The best model has " + str(best_model_k) +
      " neighbors and accuracy " + str('{:.2f}%'.format(best_accuracy*100)))

Accuracy for 1 neighbors: 81.50%
Accuracy for 2 neighbors: 79.77%
Accuracy for 3 neighbors: 87.28%
Accuracy for 4 neighbors: 90.75%
Accuracy for 5 neighbors: 92.49%
Accuracy for 6 neighbors: 94.22%
Accuracy for 7 neighbors: 94.80%
Accuracy for 8 neighbors: 93.64%
Accuracy for 9 neighbors: 95.95%
Accuracy for 10 neighbors: 93.06%
Accuracy for 11 neighbors: 91.91%
Accuracy for 12 neighbors: 90.17%
Accuracy for 13 neighbors: 88.44%
Accuracy for 14 neighbors: 86.71%
Accuracy for 15 neighbors: 86.13%
The best model has 9 neighbors and accuracy 95.95%


### It seems that for this dataset the optimal number of neighbors to use is 9.