In [5]:
# KNN Classification
# Read in from CSV files (training, testing)
# Need some euclidean distance (import scipy.spatial.distance as ssd)
# Given a new item from testing...
#     compute euclidean distance to every item from training (lazy)
#     sort results, keep the K smallest
#     determine which category is majority in that K

# Some code that might be useful
# for plotting... np.where(condition, if true, if false)

# sorting a 2d list: by default sublists are compared starting at position 0 (ties fall through to later positions)
# to sort by a different position instead: lst.sort(key = lambda sub: sub[pos])

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.spatial.distance as ssd

In [7]:
# load the training and testing sets from their CSV files
training, testing = (pd.read_csv(path) for path in ("training.csv", "testing.csv"))
training.head()

Unnamed: 0,name,friendliness,intelligence,label
0,Choux,3,8,cat
1,Garfield,2,4,cat
2,Lion-o,5,5,cat
3,Pink Pather,3,5,cat
4,McGruff,6,5,dog


In [8]:
# sanity-check euclidean distance using the first row of training and of testing
train_ex = training.loc[:, ["friendliness", "intelligence"]].iloc[0]
test_ex = testing.loc[:, ["friendliness", "intelligence"]].iloc[0]
ssd.euclidean(train_ex, test_ex)

7.0710678118654755

In [9]:
# what's in the test_ex?
# (test_ex holds the friendliness/intelligence values taken from row 0 of testing)
test_ex

friendliness    8
intelligence    3
Name: 0, dtype: int64

In [10]:
# measure how far that one testing item (test_ex) is from every training row;
# each entry is (name, label, distance)
train_features = training[["friendliness", "intelligence"]]
distances = [
    (
        training["name"].iloc[row],
        training["label"].iloc[row],
        ssd.euclidean(train_features.iloc[row], test_ex),
    )
    for row in range(len(training))
]
distances

[('Choux', 'cat', 7.0710678118654755),
 ('Garfield', 'cat', 6.082762530298219),
 ('Lion-o', 'cat', 3.605551275463989),
 ('Pink Pather', 'cat', 5.385164807134504),
 ('McGruff', 'dog', 2.8284271247461903),
 ('Scooby Doo', 'dog', 0.0),
 ('Odie', 'dog', 1.4142135623730951),
 ('Grizz', 'dog', 3.0),
 ('Carol', 'dog', 4.47213595499958),
 ('Clifford', 'dog', 1.0)]

In [11]:
# order the (name, label, distance) tuples from nearest to farthest;
# the distance lives at position 2 of each tuple
distances = sorted(distances, key=lambda neighbor: neighbor[2])
distances

[('Scooby Doo', 'dog', 0.0),
 ('Clifford', 'dog', 1.0),
 ('Odie', 'dog', 1.4142135623730951),
 ('McGruff', 'dog', 2.8284271247461903),
 ('Grizz', 'dog', 3.0),
 ('Lion-o', 'cat', 3.605551275463989),
 ('Carol', 'dog', 4.47213595499958),
 ('Pink Pather', 'cat', 5.385164807134504),
 ('Garfield', 'cat', 6.082762530298219),
 ('Choux', 'cat', 7.0710678118654755)]

In [12]:
# K is the number of nearest neighbors to keep
K = 3
# distances is already sorted by ascending distance, so the first K entries are the K nearest
distances[:K]

[('Scooby Doo', 'dog', 0.0),
 ('Clifford', 'dog', 1.0),
 ('Odie', 'dog', 1.4142135623730951)]

In [13]:
# now that we have the three closest neighbors, find the most common class
# PPP5
def find_majority_class(classes, tups):
    ''' Return the class name that shows up in the most tuples.

        classes -- iterable of candidate class names
        tups    -- list of tuples; each tuple is expected to contain its
                   class name exactly once (at any position)

        A tuple counts toward a class when the class name is equal to any
        element of that tuple. If several classes share the top count, the
        one listed earliest in classes is returned.
    '''
    tally = dict.fromkeys(classes, 0)
    for label in tally:
        # membership test: the label may sit at any position in the tuple
        tally[label] = sum(1 for entry in tups if label in entry)
    return max(tally, key=lambda label: tally[label])

In [14]:
# for EVERY testing point, record its distance to each training point;
# each inner list holds (name, distance, label) tuples for one testing row
all_distances = []
for test_idx in range(len(testing)):
    test_point = testing[["friendliness", "intelligence"]].iloc[test_idx]
    all_distances.append([
        (
            training["name"].iloc[train_idx],
            ssd.euclidean(
                test_point,
                training[["friendliness", "intelligence"]].iloc[train_idx],
            ),
            training["label"].iloc[train_idx],
        )
        for train_idx in range(len(training))
    ])
all_distances

[[('Choux', 7.0710678118654755, 'cat'),
  ('Garfield', 6.082762530298219, 'cat'),
  ('Lion-o', 3.605551275463989, 'cat'),
  ('Pink Pather', 5.385164807134504, 'cat'),
  ('McGruff', 2.8284271247461903, 'dog'),
  ('Scooby Doo', 0.0, 'dog'),
  ('Odie', 1.4142135623730951, 'dog'),
  ('Grizz', 3.0, 'dog'),
  ('Carol', 4.47213595499958, 'dog'),
  ('Clifford', 1.0, 'dog')],
 [('Choux', 1.0, 'cat'),
  ('Garfield', 3.1622776601683795, 'cat'),
  ('Lion-o', 2.8284271247461903, 'cat'),
  ('Pink Pather', 2.0, 'cat'),
  ('McGruff', 3.605551275463989, 'dog'),
  ('Scooby Doo', 6.4031242374328485, 'dog'),
  ('Odie', 6.4031242374328485, 'dog'),
  ('Grizz', 5.0990195135927845, 'dog'),
  ('Carol', 3.0, 'dog'),
  ('Clifford', 7.0710678118654755, 'dog')],
 [('Choux', 8.48528137423857, 'cat'),
  ('Garfield', 7.280109889280518, 'cat'),
  ('Lion-o', 5.0, 'cat'),
  ('Pink Pather', 6.708203932499369, 'cat'),
  ('McGruff', 4.242640687119285, 'dog'),
  ('Scooby Doo', 1.4142135623730951, 'dog'),
  ('Odie', 2.0, 'do

In [15]:
# for each testing point, order its neighbors by distance (position 1 of each
# tuple) and keep only the K closest
all_distances = [
    sorted(neighbors, key=lambda entry: entry[1])[:K]
    for neighbors in all_distances
]
all_distances

[[('Scooby Doo', 0.0, 'dog'),
  ('Clifford', 1.0, 'dog'),
  ('Odie', 1.4142135623730951, 'dog')],
 [('Choux', 1.0, 'cat'),
  ('Pink Pather', 2.0, 'cat'),
  ('Lion-o', 2.8284271247461903, 'cat')],
 [('Clifford', 1.0, 'dog'),
  ('Scooby Doo', 1.4142135623730951, 'dog'),
  ('Odie', 2.0, 'dog')],
 [('Lion-o', 1.0, 'cat'),
  ('McGruff', 1.4142135623730951, 'dog'),
  ('Pink Pather', 2.23606797749979, 'cat')],
 [('McGruff', 1.0, 'dog'),
  ('Carol', 1.0, 'dog'),
  ('Lion-o', 1.4142135623730951, 'cat')],
 [('Carol', 1.0, 'dog'),
  ('Lion-o', 2.0, 'cat'),
  ('Choux', 2.23606797749979, 'cat')],
 [('Scooby Doo', 1.0, 'dog'),
  ('Clifford', 1.4142135623730951, 'dog'),
  ('Odie', 2.23606797749979, 'dog')],
 [('Choux', 1.0, 'cat'),
  ('Pink Pather', 2.0, 'cat'),
  ('Lion-o', 2.8284271247461903, 'cat')],
 [('Choux', 1.0, 'cat'),
  ('Pink Pather', 2.0, 'cat'),
  ('Lion-o', 2.8284271247461903, 'cat')]]

In [16]:
# vote: one predicted class per testing point, taken from its K nearest neighbors
predictions = [
    find_majority_class(["cat", "dog"], neighbors)
    for neighbors in all_distances
]
predictions

['dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'dog', 'cat', 'cat']

In [17]:
# put the predicted class next to the true label for side-by-side comparison
testing = testing.assign(predicted=predictions)
testing

Unnamed: 0,name,friendliness,intelligence,label,predicted
0,Sven,8,3,dog,dog
1,Asteroid,3,7,cat,cat
2,Zeke,9,2,dog,dog
3,Melvin,5,4,dog,cat
4,Remy,6,6,dog,dog
5,Lady,5,7,dog,cat
6,Cam,9,3,dog,dog
7,Tito's,3,7,cat,cat
8,Bailey,3,7,cat,cat


In [None]:
#