# $k$-Nearest Neighbors (kNN): Classification - Breast Cancer

In [None]:
#!pip3 install mglearn

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


import matplotlib.pyplot as plt
import mglearn.plots
# from mglearn.datasets import make_wave
import numpy as np

# Seed NumPy's legacy global RNG. scikit-learn routines that are called without
# an explicit random_state draw from this global state, so this makes a fresh
# top-to-bottom run of the notebook repeatable.
np.random.seed(1)


## Load data

Let us start by loading the data.

In [None]:
# Load the breast cancer dataset; later cells read its .data, .target,
# .feature_names and .target_names attributes.
cancer = load_breast_cancer()
# Print the dataset's bundled textual description.
print(cancer.DESCR)

In [None]:
# Names of the feature columns; the plotting cells use them as axis labels.
cancer.feature_names

In [None]:
# Human-readable class names, indexed by the integer labels in cancer.target.
cancer.target_names

Let us split the data into a training set and a test set
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [None]:
# Split into train / test subsets (default proportions of train_test_split).
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,  # preserve the class ratio in both subsets
    shuffle=True,
    # Pin the split explicitly so that re-running this cell on its own yields
    # the same partition; without it the result depends on how much of the
    # global NumPy random stream has already been consumed. The value matches
    # the global seed set in the imports cell.
    random_state=1,
)
print(f"Train shape: {X_train.shape} \nTest shape: {X_test.shape}")

# Train and test

Prepare and train the classifier
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Build a 3-nearest-neighbour classifier. p is the power parameter of the
# (default) Minkowski metric; p=1 corresponds to the Manhattan (L1) distance.
knc = KNeighborsClassifier(n_neighbors=3, p=1)
# fit() returns the estimator itself, so rebinding keeps the cell output empty.
knc = knc.fit(X_train, y_train)

Now predict the classes of the test data.

In [None]:
# Predict a class label for every row of the held-out test set.
predicted = knc.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
predicted

Which predictions were correct?

In [None]:
# Element-wise comparison: True where the predicted label equals the true label.
predicted == y_test

We can check the model's accuracy (mean accuracy = (TP+TN)/n), i.e., the fraction of samples in the test dataset for which the model predicted the class correctly.
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.score)

In [None]:
# Score the fitted classifier on the held-out test data (mean accuracy).
knc.score(X_test, y_test)

I.e., the accuracy (fraction of correct predictions) is

In [None]:
# Count the matching labels with a vectorised sum and divide by the number of
# predictions to get the accuracy by hand.
(predicted == y_test).sum() / len(predicted)

And the predicted probabilities are

In [None]:
# Per-class probability estimates for each test sample (one column per class).
knc.predict_proba(X_test)

## Metrics

We can also run a full set of metrics

In [None]:
# Metric helpers used by the cells below (each name imported exactly once).
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Alias matching the (y_true, y_pred) argument naming of the metric functions.
y_pred = predicted

In [None]:
# Fraction of test labels that were predicted correctly.
accuracy_score(y_test, y_pred)

In [None]:
# F1 score for the default positive label (pos_label=1).
# NOTE(review): label 1 appears to be 'benign' per the plotting-cell comments;
# confirm this is the class of interest for the analysis.
f1_score(y_test, y_pred)

In [None]:
# Rows correspond to true classes, columns to predicted classes.
confusion_matrix(y_test, y_pred)

In [None]:
# Precision for the default positive label (pos_label=1): TP / (TP + FP).
precision_score(y_test, y_pred)

In [None]:
# Recall for the default positive label (pos_label=1): TP / (TP + FN).
recall_score(y_test, y_pred)

In [None]:
# The report is a formatted string, so print() it to keep the table layout.
print(classification_report(y_test, y_pred))

# Plot some projections
We can also do some plots, but this data has more than 2 dimensions... let's plot some projections.

In [None]:
# Scatter the test samples on every pair (i, j), i < j, of the first
# `n_components` features, highlighting the samples that were misclassified.
n_components = 6

# Boolean masks over the test samples. They do not depend on the feature pair,
# so they are computed once instead of on every loop iteration. Indexing with a
# boolean mask also yields plain 1-D coordinate arrays for scatter().
is_label_0 = y_test == 0        # 'malignant' samples
is_label_1 = y_test == 1        # 'benign' samples
is_wrong = y_test != predicted  # samples whose prediction was incorrect

fig = plt.figure(figsize=(30, 30))

for i in range(n_components):
    for j in range(i + 1, n_components):
        # Place the panel at row i, column j of the grid (upper triangle only).
        ax = fig.add_subplot(n_components, n_components, (i * n_components) + j + 1)

        ax.scatter(X_test[is_label_0, i], X_test[is_label_0, j], marker='8', s=50, label='class 1')
        ax.scatter(X_test[is_label_1, i], X_test[is_label_1, j], marker='8', s=50, label='class 2')
        # Drawn last so the red stars sit on top of the class markers.
        ax.scatter(X_test[is_wrong, i], X_test[is_wrong, j], marker='*', s=50, c='r', label='wrong!')

        ax.legend()
        ax.set_xlabel(cancer.feature_names[i])
        ax.set_ylabel(cancer.feature_names[j])

plt.show()

# kNN analysis using part of the features

In [None]:
# For every pair (i, j), i < j, of the first `n_components` features: fit a
# 5-NN classifier on just those two features, draw its decision regions, and
# overlay the test samples with the misclassified ones highlighted.
n_components = 6

# Class membership of the test samples does not depend on the feature pair.
is_label_0 = y_test == 0  # 'malignant' samples
is_label_1 = y_test == 1  # 'benign' samples

fig = plt.figure(figsize=(30, 30))

for i in range(n_components):
    for j in range(i + 1, n_components):
        # Place the panel at row i, column j of the grid (upper triangle only).
        ax = fig.add_subplot(n_components, n_components, (i * n_components) + j + 1)

        # Restrict both subsets to the two selected features.
        pair = [i, j]
        partial_x_train = X_train[:, pair]
        partial_x_test = X_test[:, pair]

        # Use dedicated names here: rebinding the notebook-wide `knc` and
        # `predicted` would leave a 2-feature model behind and silently break
        # the earlier cells if they are re-run.
        knc_pair = KNeighborsClassifier(n_neighbors=5).fit(partial_x_train, y_train)

        # Draw the decision regions (extent taken from the partial train data)
        # on this panel explicitly rather than on the implicit current axes.
        mglearn.plots.plot_2d_separator(knc_pair, partial_x_train, fill=True, ax=ax, eps=0.5, alpha=.4)

        # Predict the unseen data and flag the wrong predictions.
        predicted_pair = knc_pair.predict(partial_x_test)
        is_wrong = y_test != predicted_pair

        ax.scatter(X_test[is_label_0, i], X_test[is_label_0, j], marker='8', s=50, label='class 1')
        ax.scatter(X_test[is_label_1, i], X_test[is_label_1, j], marker='8', s=50, label='class 2')
        ax.scatter(X_test[is_wrong, i], X_test[is_wrong, j], marker='*', s=50, c='r', label='wrong!')

        ax.legend()
        ax.set_xlabel(cancer.feature_names[i])
        ax.set_ylabel(cancer.feature_names[j])

plt.show()


# Test the number of neighbors effect 

Let’s investigate whether we can confirm the connection between model complexity and generalization that we discussed above.
We will do this on the real-world breast cancer dataset.
We reuse the training/test split created earlier in this notebook and evaluate training and test set performance with different numbers of neighbors.

In [None]:
# Fit one classifier per candidate k and record its accuracy on both the
# training data and the held-out test data.
list_ks = range(1, 30)

scores_train = []
scores_test = []

for k in list_ks:
    # Loop-local name: rebinding the notebook-wide `knc` here would leave the
    # k=29 model behind and change the results of earlier cells on re-run.
    knc_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scores_train.append(knc_k.score(X_train, y_train))
    scores_test.append(knc_k.score(X_test, y_test))

# Plot both curves on one explicit Axes (no reliance on pyplot's current axes).
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(list_ks, scores_train, label='train')
ax.plot(list_ks, scores_test, label='test')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('accuracy (TP + TN) / n')
ax.legend()

plt.show()