In [10]:
import numpy as np
import h5py as h5
import scipy as sp
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from numpy import linalg as LA

  from ._conv import register_converters as _register_converters


## Prove 01 code

In [11]:
iris = datasets.load_iris()

# Prepare Training/ Test Se
x_tr, x_test, y_tr, y_test = train_test_split(iris.data, iris.target, train_size= 0.7)

# Use an Existing Algorithm to Create a Model
classifier = GaussianNB()
classifier.fit(x_tr, y_tr)

# Use That Model to Make Predictions
targets_predicted = classifier.predict(x_test)

correct = targets_predicted == y_test
#print( sum(targets_predicted == y_test) )
#print("Percent correct: %.2f " % (100* np.sum(correct)/float(len(correct))) )

# Prove 02

## Part 1 - Implement basic kNN algorithm

In [77]:
class kNN:
    
    def __init__(self, k=3):
        self.k = k
    
    def fit(self, train_data, target_data):
        self.train_data  = train_data
        self.target_data = target_data
    
    # Returns the predictions for kNN
    #     In the event of a tie for a data point k is increased until the tie is broken. 
    def predict(self, t_dat):
        predictions = []
        for row in t_dat:
            # kp - is the k counter that goes up by 1 if there's a tie
            # TIE - Keeps track if there's a tie
            kp, kp_ERR_CK, TIE = 0, True, True
            while TIE == True:
                # get's the distance from the row to every other data point
                dis = np.array( self.get_distance(row) )
                
                # gets the index of the k+kp closest data points
                dis_index = np.array( [ np.argsort( dis )[x] for x in np.arange(self.k+kp) ] )
                
                # Sorts the Nearest Neighbors into an dictionary with the keys being the target classes and 
                #      the value in each key is the number of NN of that class.
                NNs_count = { key: np.sum( key*np.ones(len(dis_index)) == y_tr[dis_index] ) for key in y_tr }

                # Find the max key
                max_key   = max(NNs_count.keys(), key=lambda k: NNs_count[k])
                TIE       = False 
                # Check for ties
                for key in NNs_count:
                    if key != max_key and NNs_count[key] == NNs_count[max_key]:
                        TIE = True
                        kp += 1
                # Add the most frequent value to the prediction list
                if not(TIE): predictions.append(max_key)    
                    
                if kp>20 and kp_ERR_CK: 
                    kp_ERR_CK = False
                    print("Too many ties - there's probably an error.")
                    
        return predictions               
            
    def get_distance(self, point):
        dis = []
        for train_d in self.train_data:
            dis.append( np.sum( (np.array(point) - np.array(train_d) )**2 ) )
        return dis

## Part 2 - Be able to load and use the Iris dataset

In [20]:
clfr = kNN(k = 4)
clfr.fit(x_tr, y_tr)
clfr_trg_predicted = clfr.predict(x_test)
clfr_correct = clfr_trg_predicted == y_test
print("Percent correct: %.2f " % (100* np.sum(clfr_correct)/float(len(clfr_correct))) )

Percent correct: 93.33 


## Part 3 - Basic experimentation:

1. Play with different values of K

2. Compare to an existing implementation

In [92]:
from sklearn.neighbors import KNeighborsClassifier

for k in np.arange(12)+1:
    clfr = kNN(k = k)
    clfr.fit(x_tr, y_tr)
    clfr_trg_predicted = clfr.predict(x_test)
    clfr_correct = clfr_trg_predicted == y_test
    print("My kNN Percent correct with      k = %i: %.2f" % (k,100* np.sum(clfr_correct)/float(len(clfr_correct))))
        
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(x_tr, y_tr)
    predictions = classifier.predict(x_test)
    skl_cor = predictions == y_test
    print("sklearn kNN Percent correct with k = %i: %.2f" % (k,100*np.sum(skl_cor)/float(len(skl_cor))),"\n")

My kNN Percent correct with      k = 1: 97.78
sklearn kNN Percent correct with k = 1: 97.78 

My kNN Percent correct with      k = 2: 95.56
sklearn kNN Percent correct with k = 2: 93.33 

My kNN Percent correct with      k = 3: 95.56
sklearn kNN Percent correct with k = 3: 95.56 

My kNN Percent correct with      k = 4: 93.33
sklearn kNN Percent correct with k = 4: 91.11 

My kNN Percent correct with      k = 5: 93.33
sklearn kNN Percent correct with k = 5: 93.33 

My kNN Percent correct with      k = 6: 97.78
sklearn kNN Percent correct with k = 6: 93.33 

My kNN Percent correct with      k = 7: 97.78
sklearn kNN Percent correct with k = 7: 97.78 

My kNN Percent correct with      k = 8: 97.78
sklearn kNN Percent correct with k = 8: 93.33 

My kNN Percent correct with      k = 9: 97.78
sklearn kNN Percent correct with k = 9: 97.78 

My kNN Percent correct with      k = 10: 95.56
sklearn kNN Percent correct with k = 10: 95.56 

My kNN Percent correct with      k = 11: 95.56
sklearn kNN

# Above and Beyond

## Experiment with more datasets

### Read in Balance Data From File

1. Title: Balance Scale Weight & Distance Database
2. Relevant Information: 
	This data set was generated to model psychological
	experimental results.  Each example is classified as having the
	balance scale tip to the right, tip to the left, or be
	balanced.  The attributes are the left weight, the left
	distance, the right weight, and the right distance.  The
	correct way to find the class is the greater of 
	(left-distance * left-weight) and (right-distance *
	right-weight).  If they are equal, it is balanced.
3. Number of Instances: 625 (49 balanced, 288 left, 288 right)

4. Number of Attributes: 4 (numeric) + class name = 5

5. Attribute Information:
	1. Class Name: 3 (L, B, R)
	2. Left-Weight: 5 (1, 2, 3, 4, 5)
	3. Left-Distance: 5 (1, 2, 3, 4, 5)
	4. Right-Weight: 5 (1, 2, 3, 4, 5)
	5. Right-Distance: 5 (1, 2, 3, 4, 5)

6. Missing Attribute Values: 
	none

7. Class Distribution: 
   1. 46.08 percent are L
   2. 07.84 percent are B
   3. 46.08 percent are R

In [9]:
data = open("balance_scale.txt","rt")
contents = data.readlines()
data.close()
#print( contents )  # it's ugly. 

# Turns the string version of the data into floats in a 150x4 array
bal_data = [[float(x.split(',')[1]),
             float(x.split(',')[2]),
             float(x.split(',')[3]),
             float((x.split(',')[4]).split('\n')[0])] for x in contents]
#print(bal_data)
## gets the iris names in stings 'setosa', 'versicolor', and 'virginica'
bal_target = [ x.split(',')[0] for x in contents ]
## dictionary to make recasting iris names intointegers easy
targ_vals = {'L':0, 'B':1, 'R':2}
## loop to recasting iris names into integers
for i,name in enumerate(bal_target):
    bal_target[i] = targ_vals[name]
#print( bal_target ) # it worked

In [13]:
x_tr, x_test, y_tr, y_test = train_test_split(bal_data, bal_target, train_size= 0.7)



In [22]:
clfr = kNN(k = 7)
clfr.fit(x_tr, y_tr)
clfr_trg_predicted = clfr.predict(x_test)
clfr_correct = clfr_trg_predicted == y_test
print("Percent correct: %.2f " % (100* np.sum(clfr_correct)/float(len(clfr_correct))) )

Percent correct: 93.33 


In [23]:
from sklearn.neighbors import KNeighborsClassifier

for k in np.arange(12)+1:
    clfr = kNN(k = k)
    clfr.fit(x_tr, y_tr)
    clfr_trg_predicted = clfr.predict(x_test)
    clfr_correct = clfr_trg_predicted == y_test
    print("My kNN Percent correct with      k = %i: %.2f" % (k,100* np.sum(clfr_correct)/float(len(clfr_correct))))
        
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(x_tr, y_tr)
    predictions = classifier.predict(x_test)
    skl_cor = predictions == y_test
    print("sklearn kNN Percent correct with k = %i: %.2f" % (k,100*np.sum(skl_cor)/float(len(skl_cor))),"\n")

My kNN Percent correct with      k = 1: 95.56
sklearn kNN Percent correct with k = 1: 95.56 

My kNN Percent correct with      k = 2: 95.56
sklearn kNN Percent correct with k = 2: 91.11 

My kNN Percent correct with      k = 3: 95.56
sklearn kNN Percent correct with k = 3: 95.56 

My kNN Percent correct with      k = 4: 93.33
sklearn kNN Percent correct with k = 4: 91.11 

My kNN Percent correct with      k = 5: 93.33
sklearn kNN Percent correct with k = 5: 93.33 

My kNN Percent correct with      k = 6: 93.33
sklearn kNN Percent correct with k = 6: 93.33 

My kNN Percent correct with      k = 7: 93.33
sklearn kNN Percent correct with k = 7: 93.33 

My kNN Percent correct with      k = 8: 93.33
sklearn kNN Percent correct with k = 8: 93.33 

My kNN Percent correct with      k = 9: 93.33
sklearn kNN Percent correct with k = 9: 93.33 

My kNN Percent correct with      k = 10: 95.56
sklearn kNN Percent correct with k = 10: 93.33 

My kNN Percent correct with      k = 11: 95.56
sklearn kNN

### Read in ionosphere Data From File

1. Title: Johns Hopkins University Ionosphere database

2. Relevant Information:
   This radar data was collected by a system in Goose Bay, Labrador.  This
   system consists of a phased array of 16 high-frequency antennas with a
   total transmitted power on the order of 6.4 kilowatts.  See the paper
   for more details.  The targets were free electrons in the ionosphere.
   "Good" radar returns are those showing evidence of some type of structure 
   in the ionosphere.  "Bad" returns are those that do not; their signals pass
   through the ionosphere.  

   Received signals were processed using an autocorrelation function whose
   arguments are the time of a pulse and the pulse number.  There were 17
   pulse numbers for the Goose Bay system.  Instances in this databse are
   described by 2 attributes per pulse number, corresponding to the complex
   values returned by the function resulting from the complex electromagnetic
   signal.

3. Number of Instances: 351

4. Number of Attributes: 34 plus the class attribute
   -- All 34 predictor attributes are continuous

5. Attribute Information:     
   -- All 34 are continuous, as described above
   -- The 35th attribute is either "good" or "bad" according to the definition
      summarized above.  This is a binary classification task.

6. Missing Values: None



In [81]:
import pandas as pd
from copy import deepcopy
data = pd.read_csv("ionosphere_data.txt", header=None)


ion_train  = np.asarray( data[np.arange(34)] )
ion_target = np.asarray( data[34] )

targ_vals = {'g':0, 'b':1}
## loop to recasting iris names into integers
for i,name in enumerate(ion_target):
    ion_target[i] = targ_vals[name]

#print( ion_train )
#print( data[34])
x_tr, x_test, y_tr, y_test = train_test_split(ion_train, ion_target, train_size= 0.7)

clfr = kNN(k = 12)
clfr.fit(x_tr, y_tr)
clfr_trg_predicted = clfr.predict(x_test)
clfr_correct = clfr_trg_predicted == y_test
print("Percent correct: %.2f " % (100* np.sum(clfr_correct)/float(len(clfr_correct))) )



Percent correct: 83.02 


In [86]:
from sklearn.neighbors import KNeighborsClassifier

#for some reason KNeighborsClassifier() needed this to work...?
y_tr = y_tr.astype('int')
for k in np.arange(7,20)+1:
    clfr = kNN(k = k)
    clfr.fit(x_tr, y_tr)
    clfr_trg_predicted = clfr.predict(x_test)
    clfr_correct = clfr_trg_predicted == y_test
    print("My kNN Percent correct with      k = %i: %.2f" % (k,100* np.sum(clfr_correct)/float(len(clfr_correct))))
        
    
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(x_tr, y_tr)
    predictions = classifier.predict(x_test)
    skl_cor = predictions == y_test
    print("sklearn kNN Percent correct with k = %i: %.2f" % (k,100*np.sum(skl_cor)/float(len(skl_cor))),"\n")

My kNN Percent correct with      k = 8: 83.02
sklearn kNN Percent correct with k = 8: 79.25 

My kNN Percent correct with      k = 9: 83.02
sklearn kNN Percent correct with k = 9: 83.02 

My kNN Percent correct with      k = 10: 82.08
sklearn kNN Percent correct with k = 10: 79.25 

My kNN Percent correct with      k = 11: 82.08
sklearn kNN Percent correct with k = 11: 82.08 

My kNN Percent correct with      k = 12: 83.02
sklearn kNN Percent correct with k = 12: 80.19 

My kNN Percent correct with      k = 13: 83.02
sklearn kNN Percent correct with k = 13: 83.02 

My kNN Percent correct with      k = 14: 81.13
sklearn kNN Percent correct with k = 14: 77.36 

My kNN Percent correct with      k = 15: 81.13
sklearn kNN Percent correct with k = 15: 81.13 

My kNN Percent correct with      k = 16: 81.13
sklearn kNN Percent correct with k = 16: 80.19 

My kNN Percent correct with      k = 17: 81.13
sklearn kNN Percent correct with k = 17: 81.13 

My kNN Percent correct with      k = 18: 80.

## KD-Tree

For the record I spent about ~1 hour reading about KD-Trees, ~1.5 hours looking up how to make them, and ~1 hour trying to impliment it. So I understand what's going on, but I didn't get it working. 