# Goals for this workbook:
1. import the data
2. store the data in an array
3. group the data by class and split into groups (basically array operations)
4. compute distance between vectors of parameters
5. randomize the order of lists
6. develop a nearest neighbor algorithm
        -- Pick starting vertex
        -- Travel to the neighbor with the cheapest cost
        -- Continue traveling to new neighbor with least cost
        -- Complete cycle after visiting all cities

In [37]:
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
import seaborn as sb
from pandas import Series, DataFrame

In [74]:
# read in file as a data frame using pandas and convert to numpy array
irisData = pd.read_csv('practice/iris.data', names=['sepLen', 'sepWdth', 'petLen', 'petWdth', 'class'])
irisArray = irisData.to_numpy()
irisData

Unnamed: 0,sepLen,sepWdth,petLen,petWdth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [39]:
# group by class
irisByClass = irisData.groupby("class")


In [44]:
# find disatance
u = [1,2,3]
v = [4,5,6]
from scipy.spatial import distance
distance.euclidean(u,v)

5.196152422706632

In [41]:
# randomize the order of lists
import random
random.shuffle(x)

In [45]:
# nearest neighbor algorithm

# split data into attributes and labels
attributes = irisData.iloc[:, :-1].values  
labels = irisData.iloc[:, -1].values 

# split data into testing/training groups
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(attributes, labels, test_size=0.10)

# scale data for uniform evaluation of distance functions
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)  

# train KNN algorithm
from sklearn.neighbors import KNeighborsClassifier  
classifier = KNeighborsClassifier(n_neighbors=5)     # first parameter arbitrary; adjust with testing/eval
classifier.fit(X_train, y_train)  
# can do this part manually by defining a distance function of choice and a get neighbors function

# test the model
y_pred = classifier.predict(X_test)  

# evaluate the algorithm
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[5 0 0]
 [0 3 0]
 [0 0 7]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         5
Iris-versicolor       1.00      1.00      1.00         3
 Iris-virginica       1.00      1.00      1.00         7

      micro avg       1.00      1.00      1.00        15
      macro avg       1.00      1.00      1.00        15
   weighted avg       1.00      1.00      1.00        15



In [58]:
# adjust the value of K to minimize error
error = []

# Calculating error for K values between 1 and 40
for i in range(1, 40):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))
    
# plot the calculated error for K values
# plt.figure(figsize=(12, 6))  
# plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',  
#          markerfacecolor='blue', markersize=10)
# plt.title('Error Rate K Value')  
# plt.xlabel('K Value')  
# plt.ylabel('Mean Error')  

AttributeError: module 'matplotlib' has no attribute 'plot'

## Next steps

* randomize how data is split
* key to nearest neighbor algorithm: how the data is clustered (i.e. the distance function)
    * preprocessing to improve
        * look at data; do any data contribute more to skew
        * want to normalize the data for each column (subtract mean; divide by std dev)
    * use different distance functions
        * change weights of parameters in the distance function
* organize code into classes, functions

* euclidean distance formula
* 2 more distance formulas
* split data into known and unknown groups
* loop over unknown to find the closest known and guess class
* compare guess with true value to get % correct