In [33]:
# http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# class 2 benign, class 4 malignant

In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier


In [35]:
breast_cancer  = pd.read_csv("C:/Users/boyerm/Documents/Thinkful/breast_cancer.csv")

In [36]:
breast_cancer.head()

Unnamed: 0,sample_no,clumb_thickness,unif_cell_size,unif_cell_shape,marginal_adhesion,sing_ep_cell_size,bare_nuclei,bland_chromatin,normal_nuclei,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [37]:
breast_cancer.describe()

Unnamed: 0,sample_no,clumb_thickness,unif_cell_size,unif_cell_shape,marginal_adhesion,sing_ep_cell_size,bland_chromatin,normal_nuclei,mitoses,class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [38]:
# Notice that "bare nuclei" is misisng from describe, must have a bad character somewhere

In [39]:
print(breast_cancer.bare_nuclei.unique())
table = pd.crosstab(breast_cancer['bare_nuclei'], breast_cancer['class']) 
print(table)

['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
class          2    4
bare_nuclei          
1            387   15
10             3  129
2             21    9
3             14   14
4              6   13
5             10   20
6              0    4
7              1    7
8              2   19
9              0    9
?             14    2


In [40]:
# there is a "?" in 16 rows, so need to remove those rows

In [41]:
## Clean data

breast_cancer = breast_cancer.drop(breast_cancer[breast_cancer.bare_nuclei == '?'].index)
print(breast_cancer.bare_nuclei.unique())

['1' '10' '2' '4' '3' '9' '7' '5' '8' '6']


In [42]:
## fix col names and "class"

In [43]:
breast_cancer=breast_cancer.rename(columns = {'unif_cell_size':'unit_cell_size'})
breast_cancer=breast_cancer.rename(columns = {'unif_cell_shape':'unit_cell_shape'})
breast_cancer=breast_cancer.rename(columns = {'class':'outcome'})

In [44]:
# drop sample column
breast_cancer = breast_cancer.drop('sample_no',1)

In [45]:
breast_cancer.head()

Unnamed: 0,clumb_thickness,unit_cell_size,unit_cell_shape,marginal_adhesion,sing_ep_cell_size,bare_nuclei,bland_chromatin,normal_nuclei,mitoses,outcome
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [47]:
# Define the training and test sizes.
trainsize = int(breast_cancer.shape[0] / 2)
bc_train = breast_cancer.iloc[:trainsize, :].copy()
bc_test = breast_cancer.iloc[trainsize:, :].copy()

bc_train_X = bc_train.drop(['outcome'],1) 
bc_test_X = bc_test.drop(['outcome'],1) 

bc_train_Y = bc_train.outcome
bc_test_Y = bc_test.outcome


## KNN Classifier

In [53]:
neighbors = KNeighborsClassifier(n_neighbors=5)

neighbors.fit(bc_train_X,bc_train_Y)

neighbors.score(bc_train_X,bc_train_Y)


# Classify, storing the result in a new variable.
y_pred_train = neighbors.predict(bc_train_X)
y_pred_test = neighbors.predict(bc_test_X)


# Display our results.
print("Train:Number of mislabeled points out of a total {} points : {}".format(
    bc_train.shape[0],
    (bc_train_Y != y_pred_train).sum()
))
print("Test: Number of mislabeled points out of a total {} points : {}".format(
    bc_test.shape[0],
    (bc_test_Y != y_pred_test).sum()
))


Train:Number of mislabeled points out of a total 341 points : 10
Test: Number of mislabeled points out of a total 342 points : 5


In [57]:
# PRetty good :) Let's see the type of error
pd.crosstab(bc_train_Y, y_pred_train)

col_0,2,4
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
2,175,8
4,2,156


In [58]:
pd.crosstab(bc_test_Y, y_pred_test)

col_0,2,4
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
2,258,3
4,2,79


## Random Forest

In [62]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
rfc.fit(bc_train_X,bc_train_Y)
cross_val_score(rfc, bc_train_X,bc_train_Y, cv=10)

array([ 0.88571429,  0.88571429,  0.94285714,  0.97058824,  0.97058824,
        0.94117647,  0.97058824,  0.94117647,  0.93939394,  0.93939394])

In [63]:
rfc.score(bc_train_X,bc_train_Y)

1.0

In [65]:
rfc.score(bc_test_X,bc_test_Y)

0.98830409356725146

In [66]:
# Classify, storing the result in a new variable.
y_pred_train = rfc.predict(bc_train_X)
y_pred_test = rfc.predict(bc_test_X)


# Display our results.
print("Train:Number of mislabeled points out of a total {} points : {}".format(
    bc_train.shape[0],
    (bc_train_Y != y_pred_train).sum()
))
print("Test: Number of mislabeled points out of a total {} points : {}".format(
    bc_test.shape[0],
    (bc_test_Y != y_pred_test).sum()
))

Train:Number of mislabeled points out of a total 341 points : 0
Test: Number of mislabeled points out of a total 342 points : 4


In [67]:
# WOW :)