In this project, the Iris dataset (https://archive.ics.uci.edu/ml/datasets/Iris/) is used to build and train a k-nearest-neighbor (kNN) classifier with the provided knnClassify Python class. The classifier is trained for several values of K, and the best value of K for the model is then recommended.

In [None]:
# Import libraries and tools
import numpy as np
np.random.seed(0) # fix NumPy's global random seed (presumably so the later shuffle/split is reproducible -- confirm mltools draws from np.random)
import mltools as ml
import matplotlib.pyplot as plt

In [None]:
# Read the Iris table from disk; delimiter=None splits fields on whitespace.
iris = np.genfromtxt("iris.txt", delimiter=None)
# The last column holds the target value (iris species); all preceding
# columns are the features.
X, Y = iris[:, :-1], iris[:, -1]

In [None]:
# Data dimension info: rows of X are data points, columns are features.
m, n = X.shape
print('Number of features:', n)
print('Number of data points:', m)

# Plot one histogram per feature.
# The subplot grid is sized from n rather than hard-coded to 4, so the cell
# keeps working if the number of feature columns changes; for n == 4 the
# layout and figure size (20 x 3) are unchanged.
fig, ax = plt.subplots(1, n, figsize=(5 * n, 3))
ax = np.atleast_1d(ax)  # subplots returns a bare Axes (not an array) when n == 1
for panel, feature in zip(ax, X.T):
    panel.hist(feature, bins=20)
plt.show()

In [None]:
# Per-feature summary statistics, each reduced along axis 0 (down the rows of X)
for label, stat in (
    ('Mean:', np.mean),
    ('Variance:', np.var),
    ('Standard Deviation:', np.std),
):
    print(label)
    print(stat(X, axis=0))

In [None]:
# Using k-nearest-neighbor for predictions
X, Y = ml.shuffleData(X, Y)  # shuffle data randomly
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)  # split data into 75/25 train/validation

# Visualize the classification boundary for K = 1, 5, 10 and 50 and compute
# the error rates, using only the first two features so the boundary can be
# drawn in 2-D.
fig, ax = plt.subplots(1, 4, figsize=(15, 3.5))
for i, k in enumerate([1, 5, 10, 50]):
    # The loop body must be indented: without it the cell fails to parse.
    knn = ml.knn.knnClassify()
    knn.train(Xtr[:, :2], Ytr, K=k)
    # Print: K, training error, validation error
    print(knn.K, knn.err(Xtr[:, :2], Ytr), knn.err(Xva[:, :2], Yva))
    # Draw this model's decision regions on its own subplot
    ml.plotClassify2D(knn, Xtr[:, :2], Ytr, axis=ax[i])
# Show once, after all four panels have been drawn
plt.show()



In [None]:
# Calculate and plot training and validation error rates for more values of K,
# using only the first two features.
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = [None] * len(K)  # training error, one entry per value in K
errVa = [None] * len(K)  # validation error, one entry per value in K
for i, k in enumerate(K):
    # Train a fresh classifier with the scalar k (not the whole list K).
    learner = ml.knn.knnClassify()
    learner.train(Xtr[:, :2], Ytr, K=k)
    # Score the model that was just trained. Using any other, previously
    # fitted classifier here would give the same error for every k.
    errTrain[i] = learner.err(Xtr[:, :2], Ytr)
    errVa[i] = learner.err(Xva[:, :2], Yva)

# K spans two orders of magnitude, so use a logarithmic x-axis.
ax.semilogx(K, errTrain, 'r-', lw=3, label='Training')
ax.semilogx(K, errVa, 'g-', lw=3, label='Validation')
ax.legend()
ax.set_xlim(9e-1, 250)
ax.set_ylim(0, 1)

plt.show()

In [None]:
# Find the best value of K using all features.
fig, ax = plt.subplots(1, 1, figsize=(6, 5))
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = [None] * len(K)  # training error, one entry per value in K
errVa = [None] * len(K)  # validation error, one entry per value in K
for i, k in enumerate(K):
    # Train a fresh classifier on every feature column with the scalar k
    # (not the whole list K).
    learner = ml.knn.knnClassify()
    learner.train(Xtr, Ytr, K=k)
    # Evaluate the model just trained, on the same full feature set it was
    # trained on; slicing to two columns would not match the training data.
    errTrain[i] = learner.err(Xtr, Ytr)
    errVa[i] = learner.err(Xva, Yva)

# K spans two orders of magnitude, so use a logarithmic x-axis.
ax.semilogx(K, errTrain, 'r-', lw=3, label='Training')
ax.semilogx(K, errVa, 'g-', lw=3, label='Validation')
ax.legend()
ax.set_xlim(9e-1, 250)
ax.set_ylim(0, 1)
plt.show()