In [2]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
%matplotlib inline

In [3]:
# Importing the dataset

pc_df = pd.read_csv('prostate_cancer.csv')

# Get a preview of the dataset


pc_df.head(10)

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059
5,6,B,25,25,83,477,0.128,0.17,0.209,0.076
6,7,M,16,26,120,1040,0.095,0.109,0.179,0.057
7,8,M,15,18,90,578,0.119,0.165,0.22,0.075
8,9,M,19,24,88,520,0.127,0.193,0.235,0.074
9,10,M,25,11,84,476,0.119,0.24,0.203,0.082


In [10]:
# Split dataset into attributes and labels

x = pc_df.iloc[:, 2:].values  
y = pc_df.iloc[:, 1].values  

In [9]:
y

array(['M', 'B', 'M', 'M', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'M', 'B',
       'M', 'M', 'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'M',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'B',
       'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M'], dtype=object)

In [11]:
x

array([[2.300e+01, 1.200e+01, 1.510e+02, 9.540e+02, 1.430e-01, 2.780e-01,
        2.420e-01, 7.900e-02],
       [9.000e+00, 1.300e+01, 1.330e+02, 1.326e+03, 1.430e-01, 7.900e-02,
        1.810e-01, 5.700e-02],
       [2.100e+01, 2.700e+01, 1.300e+02, 1.203e+03, 1.250e-01, 1.600e-01,
        2.070e-01, 6.000e-02],
       [1.400e+01, 1.600e+01, 7.800e+01, 3.860e+02, 7.000e-02, 2.840e-01,
        2.600e-01, 9.700e-02],
       [9.000e+00, 1.900e+01, 1.350e+02, 1.297e+03, 1.410e-01, 1.330e-01,
        1.810e-01, 5.900e-02],
       [2.500e+01, 2.500e+01, 8.300e+01, 4.770e+02, 1.280e-01, 1.700e-01,
        2.090e-01, 7.600e-02],
       [1.600e+01, 2.600e+01, 1.200e+02, 1.040e+03, 9.500e-02, 1.090e-01,
        1.790e-01, 5.700e-02],
       [1.500e+01, 1.800e+01, 9.000e+01, 5.780e+02, 1.190e-01, 1.650e-01,
        2.200e-01, 7.500e-02],
       [1.900e+01, 2.400e+01, 8.800e+01, 5.200e+02, 1.270e-01, 1.930e-01,
        2.350e-01, 7.400e-02],
       [2.500e+01, 1.100e+01, 8.400e+01, 4.760e+02, 1.1

In [12]:
# Split into training and testing datasets

from sklearn.model_selection import train_test_split  
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20) 

In [13]:
# Feature scaling

from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
scaler.fit(x_train)

x_train = scaler.transform(x_train)  
x_test = scaler.transform(x_test) 

In [15]:
# Training the model

from sklearn.neighbors import KNeighborsClassifier  
classifier = KNeighborsClassifier(n_neighbors=5)  
classifier.fit(x_train, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [17]:
# Making predictions

y_pred = classifier.predict(x_test)  
y_pred

array(['M', 'M', 'M', 'M', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'B', 'M', 'B', 'B'], dtype=object)

In [18]:
# Evaluating the algorithm

from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[ 5  3]
 [ 0 12]]
             precision    recall  f1-score   support

          B       1.00      0.62      0.77         8
          M       0.80      1.00      0.89        12

avg / total       0.88      0.85      0.84        20



In [19]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.85
