In [1]:
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors
import pandas as pd

## Load and clean data

In [4]:
# Load the data set; the file is expected to start with a header row, because
# the columns are addressed by name ('id', 'class') below.
df = pd.read_csv('data/breast-cancer-wisconsin.data')

# Swap the '?' placeholder (presumably marking a missing value -- confirm
# against the data set description) for the sentinel -99999 so that no cell is
# left holding a non-numeric marker.
df = df.replace('?', -99999)

# Drop the `id` column. `axis` is passed by keyword: pandas 2.0 no longer
# accepts it positionally, and the keyword form also works on older releases.
df = df.drop(['id'], axis=1)

# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


## Features & Labels

In [5]:
# Features: every column except the label. `axis=1` is passed by keyword
# because pandas 2.0 no longer accepts it positionally in DataFrame.drop.
X = np.array(df.drop(['class'], axis=1))

# Labels: the `class` column on its own.
y = np.array(df['class'])

The features `X` are every column except `class`. `df.drop` returns a new DataFrame with the chosen column(s) removed, leaving `df` itself unchanged. The labels `y` are just the `class` column.

## Create training and test samples

In [6]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old module only on installations that predate 0.18.
# NOTE: the notebook's import cell still names `cross_validation`, which must be
# dropped there as well before this notebook can run on scikit-learn >= 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing. Fixing `random_state` makes the split,
# and therefore the accuracy reported further down, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Train classifier

In [7]:
# Build a k-nearest-neighbours classifier with scikit-learn's default settings
# and train it on the training split. `fit` returns the estimator itself, so
# the fitted model is bound to `Classifier` and then echoed as the cell output.
Classifier = neighbors.KNeighborsClassifier().fit(X_train, y_train)
Classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Test classifier

In [8]:
# Evaluate the fitted model on the held-out split; for a classifier, `score`
# reports the mean accuracy of its predictions against the true labels.
accuracy = Classifier.score(
    X=X_test,
    y=y_test,
)
print(accuracy)

0.971428571429


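In this run the classifier gets about 97% of the held-out test samples right, which is already very accurate for a model with default settings.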

Next, we'll make up two samples that resemble the real data and predict their classes.

In [25]:
# Two made-up samples, one row each, with nine feature values per row
# (assumed to follow the same column order as X).
example_measures = np.array([[4,2,1,1,1,2,3,2,1], [7,10,10,8,7,10,9,7,1]])

# The classifier expects a 2-D array of shape (n_samples, n_features).
# np.atleast_2d leaves this already-2-D array as it is and would turn a single
# 1-D sample into one row. The earlier reshape(len(...), -1) was a no-op here
# and would have turned a 1-D sample of nine values into nine one-feature rows.
example_measures = np.atleast_2d(example_measures)

In [26]:
# Ask the fitted classifier for a class label for each example row and print
# the resulting array of labels.
prediction = Classifier.predict(
    X=example_measures,
)
print(prediction)

[2 4]
