In [1]:
import random
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Math
from matplotlib import style
from sklearn import cross_validation, neighbors

%matplotlib inline
style.use('fivethirtyeight')

# Number of independent shuffle/split/score repetitions that each experiment
# cell averages over.
test_runs = 25

# Seed the RNGs once so that "Restart Kernel -> Run All" reproduces the same
# averaged accuracies. `random` drives random.shuffle; NumPy's global RNG is
# what scikit-learn's splitter falls back to when no random_state is passed.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)



In [2]:
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest neighbours.

    Parameters
    ----------
    data : dict
        Maps each class label to a list of feature vectors.
    predict : sequence of numbers
        Feature vector to classify; must have the same length as the
        vectors stored in ``data``.
    k : int, optional
        Number of nearest neighbours that are allowed to vote (default 3).

    Returns
    -------
    tuple
        ``(vote_result, confidence)`` where ``vote_result`` is the label with
        the most votes and ``confidence`` is that label's vote count divided
        by ``k``.

    Raises
    ------
    IndexError
        If ``data`` holds no feature vectors at all (there is nothing to vote).
    """
    # Fires whenever k does not exceed the number of groups (equality included).
    if len(data) >= k:
        warnings.warn("K is set to a value less than total voting groups!")

    # Convert the query point once instead of once per training row.
    target = np.array(predict)

    distances = []
    for group in data:
        for features in data[group]:
            euclidian_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidian_distance, group])

    # Sorting [distance, label] pairs orders by distance first, label second.
    votes = [i[1] for i in sorted(distances)[:k]]
    vote_result, vote_count = Counter(votes).most_common(1)[0]
    confidence = vote_count / k
    return vote_result, confidence


# Load and clean the dataset once; only the shuffle/split/score part needs to
# be repeated for every run.
df = pd.read_csv('dataset/breast-cancer-wisconsin.data')
# '?' entries are replaced by a large negative sentinel so the column can be
# cast to float below.
df = df.replace('?', -99999)
df = df.drop(['id'], axis=1)
# Cast to float to prevent certain columns from being treated as strings.
all_rows = df.astype(float).values.tolist()

test_size = 0.4
accuracies = []
for _ in range(test_runs):
    # Randomise a copy so every run starts from the same cleaned rows.
    full_data = list(all_rows)
    random.shuffle(full_data)

    # Split by explicit index: slicing with a negative count breaks when the
    # count is 0 (`x[:-0]` is empty and `x[-0:]` is the whole list).
    split_at = len(full_data) - int(test_size * len(full_data))
    train_data = full_data[:split_at]
    test_data = full_data[split_at:]

    # Group feature vectors by class label (last column). The labels are
    # floats after the cast (2.0 / 4.0) but hash and compare equal to the
    # integer keys used here.
    train_set = {2: [], 4: []}
    test_set = {2: [], 4: []}
    for i in train_data:
        train_set[i[-1]].append(i[:-1])
    for i in test_data:
        test_set[i[-1]].append(i[:-1])

    # Score this run: fraction of test rows whose vote matches their label.
    correct = 0
    total = 0
    for group in test_set:
        for data in test_set[group]:
            vote, confidence = k_nearest_neighbors(train_set, data, k=5)
            if group == vote:
                correct += 1
            total += 1
    accuracies.append(correct/total)

# Mean accuracy over all runs.
print(sum(accuracies)/len(accuracies))

0.9687455197132618


In [3]:
# Load and clean the dataset once; it does not change between runs, so only
# the random split, fit and score are repeated inside the loop.
df = pd.read_csv('dataset/breast-cancer-wisconsin.data')

# Clean up the dataset as described in names point 8: '?' entries are
# replaced by a large negative sentinel value.
df = df.replace('?', -99999)

# Drop columns that are useless for classification.
df = df.drop(['id'], axis=1)

# Feature matrix: every remaining column except the label.
X = np.array(df.drop(['class'], axis=1))

# Label vector.
y = np.array(df['class'])

accuracies = []
for _ in range(test_runs):
    # NOTE(review): `sklearn.cross_validation` only exists in older
    # scikit-learn releases; newer ones expose train_test_split from
    # `sklearn.model_selection` -- confirm against the installed version.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

    clf = neighbors.KNeighborsClassifier(n_jobs=8)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    accuracies.append(accuracy)

# Mean accuracy over all runs.
print(sum(accuracies)/len(accuracies))

0.969428571429
