In [85]:
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split

In [104]:
from sklearn.metrics import accuracy_score

In [86]:
import pandas as pd

In [188]:
# See the 'breast-cancer-wisconsin.names' file for the notes that describe this dataset

In [87]:
# Load the breast-cancer table into a DataFrame. The path is relative, so the
# data file has to be reachable from the notebook's working directory.
DATA_FILE = 'breast-cancer-wisconsin.data'
df = pd.read_csv(DATA_FILE)

In [88]:
# Preview the first five rows to check the column names and a few raw values.
df.head(n=5)

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [189]:
# Class = 2 means benign
# Class = 4 means malignant

In [89]:
# Encode missing values (marked '?' in the raw file) as an extreme outlier.
#
# Notes on handling missing data:
#   * Missing entries can either be turned into an outlier (done here) or dropped.
#   * To drop them instead, '?' must first be mapped to NaN; df.dropna() on its
#     own does nothing while the marker is still the string '?'.
#   * Which option works better depends on the algorithm and on the dataset.
#   * The sentinel chosen as the outlier value matters as well.
MISSING_SENTINEL = -99999

# replace() swaps the marker for the sentinel. A column that contained '?' is
# expected to have been read as strings (object dtype), so every column is then
# coerced to a numeric dtype. Without this the feature array built later would
# hold a mix of str and int objects instead of numbers.
# Rebinding df (rather than inplace=True) keeps the cell safe to re-run.
df = df.replace('?', MISSING_SENTINEL).apply(pd.to_numeric)

In [90]:
# The 'id' column is only a record identifier and says nothing about the diagnosis, so it is dropped

In [91]:
# Remove the 'id' column from df in place.
# Note: re-running this cell raises KeyError because the column is already gone.
df.drop('id', axis='columns', inplace=True)

In [92]:
# Preview the first five rows again to confirm that 'id' is no longer a column.
df.head(n=5)

Unnamed: 0,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epi_cell_size,bare_nuclei,bland_chromation,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [93]:
# Split the frame into model inputs and target:
#   X - every column except 'class' (the features)
#   Y - the 'class' column (the label to predict)
feature_frame = df.drop('class', axis='columns')
label_series = df['class']
X = np.array(feature_frame)
Y = np.array(label_series)

In [164]:
# Sanity check: show (rows, columns) of the feature matrix X.
print(X.shape)

(699, 9)


In [94]:
# For reproducibility, fix random_state. Use stratify so that the train and test splits keep the
# same class proportions as Y.

In [95]:
# Hold out 20 % of the rows for testing.
#   stratify=Y      - keep the class proportions of Y in both splits
#   random_state=31 - fix the shuffle so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=31,
)

In [96]:
# Check the effect of stratifying on the training split:
# it keeps roughly 65 % benign to 35 % malignant.
# We can now move on to the model.

In [97]:
# Tally the labels in the training split (2 = benign, 4 = malignant).
# Comparing the whole array at once replaces the element-by-element loop;
# int() keeps the counters plain Python integers.
benign = int(np.count_nonzero(Y_train == 2))
malignant = int(np.count_nonzero(Y_train == 4))

print(benign)
print(malignant)

366
193


In [181]:
# Using the model KNN

In [172]:
# k-nearest-neighbours classifier settings:
#   n_neighbors - how many neighbours vote on each prediction
#   weights     - 'uniform' gives every neighbour an equal vote
#   n_jobs      - -1 asks scikit-learn to use all available processors
knn_params = {
    'n_neighbors': 7,
    'weights': 'uniform',
    'n_jobs': -1,
}
model = neighbors.KNeighborsClassifier(**knn_params)

In [173]:
# Fit the classifier on the training split. fit() returns the estimator itself,
# which is why the notebook shows its repr as this cell's output.
model.fit(X=X_train, y=Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=7, p=2,
           weights='uniform')

In [174]:
# Mean accuracy on the TRAINING split, i.e. the same rows the model was fit on.
# Treat it as a fit check only, not as an estimate of performance on unseen data.
accuracy = model.score(X=X_train, y=Y_train)

In [175]:
# Show the value last stored in `accuracy` (set from model.score on the training split in the cell above).
print(accuracy)

0.9749552772808586


In [179]:
# Making predictions on X_test to validate our claim

In [176]:
# Predict a class label for every row of the held-out test split.
predictions = model.predict(X=X_test)

In [177]:
# Accuracy on the held-out test split.
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground-truth labels
# first, predictions second. Accuracy is symmetric, so the value is the same
# either way, but the documented order matters as soon as an asymmetric metric
# (precision, recall, ...) is called the same way. Keywords make the roles explicit.
accuracy = accuracy_score(y_true=Y_test, y_pred=predictions)

In [178]:
# Show the value last stored in `accuracy` (set from accuracy_score on the test split in the cell above).
print(accuracy)

0.9857142857142858


In [182]:
# Make predictions with a single np array to check the output
# Use a new sample that does not appear anywhere in the training data

In [228]:
# One hand-written sample holding nine feature values. The outer list makes the
# array two-dimensional with shape (1, 9), i.e. one row per sample, which is the
# layout predict() is given elsewhere in this notebook.
example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])

In [229]:
# Classify the hand-written sample; the result is an array with one label per row.
prediction = model.predict(X=example_measures)

In [230]:
# Show the predicted class label(s): 2 = benign, 4 = malignant.
print(prediction)

[2]


In [231]:
# Making more predictions

In [232]:
# Predict for two samples in one call. The nested list already produces a
# (2, 9) array, so no reshape is needed before predict().
# Both rows here are the same sample, so the two labels are expected to match.
batch_rows = [
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
]
example_measures = np.array(batch_rows)
prediction = model.predict(example_measures)
print(prediction)

[2 2]


In [233]:
# Let's make it generic

In [244]:
# Same flow for any number of samples: the row count is read from the array
# itself instead of being typed in by hand.
example_measures = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [2, 7, 4, 6, 3, 2, 3, 4, 1],
])
n_samples = example_measures.shape[0]
example_measures = example_measures.reshape(n_samples, -1)
prediction = model.predict(example_measures)
print(prediction)

[2 2 4]
