In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Classification metrics 

* In spam detection, a model that labels every email as real could still be 99% accurate (if only about 1% of emails are spam), so accuracy alone can be misleading
* When one class is more frequent this is called class imbalance
* We can generate a confusion matrix: counts of true/false positives and true/false negatives (tp, fp, tn, fn)
* Class of interest is called the positive class
* Accuracy = (tp + tn) / (tp + fp + tn + fn)
* Precision = tp / (tp + fp)
* Sensitivity / Recall = tp / (tp + fn)
* F1 score = 2 * (precision * recall) / (precision + recall)
* High precision - not many real emails predicted as spam
* High sensitivity - predict most spam emails correctly


In [14]:
# Load the bundled breast-cancer dataset. Note that `df` is a dict-like
# dataset container rather than a pandas DataFrame, so we list its keys
# and then the names of the feature columns.
df = datasets.load_breast_cancer()
for field in (df.keys(), df['feature_names']):
    print(field)

dict_keys(['target', 'DESCR', 'feature_names', 'data', 'target_names'])
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [17]:
# Pull the feature matrix and the label vector out of the dataset container,
# then show the dimensions of each.
X, y = df.data, df.target
for arr in (X, y):
    print(arr.shape)

(569, 30)
(569,)


In [22]:
# Fraction (0-1, not a percent) of samples whose target label is 1.
# NOTE(review): in scikit-learn's breast-cancer dataset label 1 is 'benign'
# and label 0 is 'malignant' (check df.target_names), so this value is the
# benign share of the data, not the cancer share.
np.mean(y)

0.62741652021089633

In [18]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# Build an 8-nearest-neighbours classifier and train it on the training split.
# The fit call stays the final expression so the notebook echoes the estimator.
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [23]:
# Predicted labels for the held-out samples (y_pred is reused by the
# confusion-matrix cell further down).
y_pred = knn.predict(X_test)

# The estimator's score method reports accuracy on the data it is given.
test_accuracy = knn.score(X_test, y_test)
print(test_accuracy)

0.964912280702


In [25]:
# Compare true vs. predicted test labels: first the confusion matrix,
# then the per-class precision / recall / F1 report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, y_pred))

[[ 59   4]
 [  2 106]]
             precision    recall  f1-score   support

          0       0.97      0.94      0.95        63
          1       0.96      0.98      0.97       108

avg / total       0.96      0.96      0.96       171

