In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the scikit-learn breast-cancer dataset and wrap its feature matrix in a
# DataFrame whose columns carry the dataset's feature names.
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)

In [None]:
def kMeans_init_centroids(X, K, random_state=None):
    """
    This function initializes K centroids that are to be
    used in K-Means on the dataset X, by picking K distinct rows of X
    at random.

    Args:
        X (ndarray): (m, n) Data points, one example per row
        K (int):     number of centroids/clusters, 1 <= K <= m
        random_state (None, int or np.random.Generator): source of randomness.
            None (the default) draws from NumPy's global random state, so
            ``np.random.seed(...)`` keeps working as before. An int or a
            Generator makes the call reproducible on its own.

    Returns:
        centroids (ndarray): (K, n) Initialized centroids (rows taken from X)

    Raises:
        ValueError: if K is smaller than 1 or larger than the number of
            examples (previously a too-large K silently produced fewer
            than K centroids).
    """

    m = X.shape[0]
    if not 1 <= K <= m:
        raise ValueError(
            f"K must be between 1 and the number of examples ({m}), got {K}"
        )

    # Randomly reorder the indices of examples
    if random_state is None:
        randidx = np.random.permutation(m)
    else:
        # default_rng accepts a seed or an existing Generator
        randidx = np.random.default_rng(random_state).permutation(m)

    # Take the first K examples as centroids
    centroids = X[randidx[:K]]

    return centroids

In [None]:
def find_closest_centroids(X, centroids):
    """
    Computes the centroid memberships for every example

    Each example is assigned to the centroid with the smallest Euclidean
    distance; ties go to the centroid with the lowest index.

    Args:
        X (ndarray): (m, n) Input values
        centroids (ndarray): (K, n) centroids

    Returns:
        idx (array_like): (m,) integer index of the closest centroid
                          for each row of X

    Raises:
        ValueError: if there are no centroids to choose from
    """

    X = np.asarray(X)
    centroids = np.asarray(centroids)
    m = X.shape[0]
    K = centroids.shape[0]
    if K == 0:
        raise ValueError("centroids must contain at least one centroid")

    # One vectorized pass over all examples per centroid instead of an
    # m x K Python-level loop; memory stays at O(m * n) per pass.
    distances = np.empty((m, K))
    for j in range(K):
        distances[:, j] = np.linalg.norm(X - centroids[j], axis=1)

    # astype(int) keeps the dtype the loop-based version produced
    idx = np.argmin(distances, axis=1).astype(int)

    return idx

In [None]:
def compute_centroids(X, idx, K):
    """
    Returns the new centroids by computing the means of the
    data points assigned to each centroid.

    A cluster that received no points has no mean. Instead of returning a
    NaN row (which would make every later distance to that centroid NaN),
    each empty cluster is re-seeded with one of the data points that lies
    farthest from the mean of the cluster it is currently assigned to.
    The choice is deterministic: farthest point first, ties broken by the
    lowest row index.

    Args:
        X (ndarray):   (m, n) Data points
        idx (ndarray): (m,) Array containing index of closest centroid for each
                       example in X. Concretely, idx[i] contains the index of
                       the centroid closest to example i
        K (int):       number of centroids

    Returns:
        centroids (ndarray): (K, n) New centroids computed

    Raises:
        ValueError: if a cluster is empty and X has no rows to re-seed it from
    """

    m, n = X.shape
    labels = np.asarray(idx).astype(int)
    centroids = np.zeros((K, n))
    empty = []

    for k in range(K):
        points = X[labels == k]
        if points.shape[0] == 0:
            empty.append(k)
        else:
            centroids[k] = np.mean(points, axis=0)

    if empty:
        if m == 0:
            raise ValueError("cannot compute centroids from an empty dataset")

        # Distance of every point to the mean of its own cluster. Points whose
        # label is outside [0, K) belong to no cluster and are preferred donors.
        in_range = (labels >= 0) & (labels < K)
        spread = np.full(m, np.inf)
        spread[in_range] = np.linalg.norm(
            X[in_range] - centroids[labels[in_range]], axis=1
        )

        # Farthest first; the stable sort keeps the lowest index on ties.
        farthest_first = np.argsort(-spread, kind="stable")
        # Wrap around in the unusual case of more empty clusters than points.
        donors = farthest_first[np.arange(len(empty)) % m]
        centroids[empty] = X[donors]

    return centroids

In [None]:
def compute_cost(X, centroids, idx):
    """
    Computes the K-Means cost: the mean squared Euclidean distance between
    each example and the centroid it is assigned to.

    Args:
        X (ndarray):         (m, n) Data points
        centroids (ndarray): (K, n) centroids
        idx (ndarray):       (m,) integer index of the centroid assigned to
                             each example

    Returns:
        cost (float): sum over examples of ||X[i] - centroids[idx[i]]||^2,
                      divided by m

    Raises:
        ZeroDivisionError: if X has no rows (the mean over zero examples is
            undefined; this is the error the loop-based version raised)
    """
    m = X.shape[0]
    if m == 0:
        raise ZeroDivisionError("cannot compute the mean cost of an empty dataset")

    # Gather each example's centroid in one indexing step, then sum the
    # squared differences directly (no sqrt-then-square round trip).
    residuals = X - centroids[np.asarray(idx)]
    cost = np.sum(residuals ** 2) / m

    return cost

In [None]:
def run_kMeans(X, initial_centroids, max_iters=10, cost_epsilon=0.01):
    """
    Runs the K-Means algorithm on data matrix X, where each row of X
    is a single example

    Each iteration assigns every example to its closest centroid, moves the
    centroids to the mean of their assigned examples, and records the cost.
    The loop stops early once the cost changes by less than cost_epsilon
    between two consecutive iterations.

    Args:
        X (ndarray):                 (m, n) Data points
        initial_centroids (ndarray): (K, n) starting centroids
        max_iters (int):             maximum number of iterations
        cost_epsilon (float):        convergence threshold on the absolute
                                     change in cost

    Returns:
        centroids (ndarray): (K, n) centroids after the last update
        idx (ndarray):       (m,) integer assignments computed in the last
                             iteration, i.e. against the centroids as they
                             were before the final update; all zeros if no
                             iteration ran
        cost_history (list): cost recorded after each iteration
    """

    m = X.shape[0]
    K = initial_centroids.shape[0]
    centroids = initial_centroids
    # Integer placeholder: matches the dtype of real assignments, so the
    # return value is consistent even when max_iters is 0.
    idx = np.zeros(m, dtype=int)
    cost_history = []
    previous_cost = float('inf')

    for i in range(max_iters):
        print("K-Means iteration %d/%d" % (i, max_iters-1))

        # For each example in X, assign it to the closest centroid
        idx = find_closest_centroids(X, centroids)

        # Given the memberships, compute new centroids
        centroids = compute_centroids(X, idx, K)

        # Compute cost of new centroids
        cost = compute_cost(X, centroids, idx)
        cost_history.append(cost)
        if abs(cost - previous_cost) < cost_epsilon:
            print(f"K-means converged with cost {cost}")
            break

        previous_cost = cost

    return centroids, idx, cost_history

In [None]:
# Plain NumPy feature matrix (one row per sample) for the K-Means functions.
X = cancer_df.to_numpy()

In [None]:
K = 2 # the goal is to cluster the samples to distinguish between malignant and benign cases.
max_iters = 10
SEED = 42
# The initial centroids are drawn from NumPy's global random state; seeding it
# here makes the clustering identical on every Restart & Run All.
np.random.seed(SEED)
initial_centroids = kMeans_init_centroids(X, K)
centroids, idx, cost_history = run_kMeans(X, initial_centroids, max_iters)

In [None]:
# Cost recorded after each K-Means iteration, labelled so the figure can be
# read on its own; plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(cost_history, marker="o")
ax.set(
    title="K-Means cost per iteration",
    xlabel="Iteration",
    ylabel="Mean squared distance to assigned centroid",
)
plt.show()

In [None]:
# With K = 2 the cluster labels are 0/1, so their sum is the size of cluster 1.
np.sum(idx)

In [None]:
# Sum of the ground-truth labels, to compare against the sum of the cluster labels.
np.sum(cancer.target)

In [None]:
# K-Means cluster ids are arbitrary: cluster 0 is not guaranteed to line up with
# target 0. With K = 2 there are only two possible mappings, so keep whichever
# orientation (as-is or flipped) agrees better with the ground truth. Without
# this step the accuracy and confusion-matrix metrics can come out inverted.
agreement = np.mean(idx == cancer.target)
fx = idx if agreement >= 0.5 else 1 - idx

In [None]:
# Fraction of samples whose predicted label in fx equals the ground-truth target.
print(f"Accuracy: {np.mean(cancer.target == fx)}")

In [None]:
# Majority-class baseline: the accuracy obtained by always predicting the most
# frequent target label. The clustering accuracy should be read against it.
unique, counts = np.unique(cancer.target, return_counts=True)
print({label: count for label, count in zip(unique, counts)})
baseline_accuracy = np.max(counts) / np.sum(counts)
print("Majority baseline:", baseline_accuracy)

In [None]:
# Rows are the true labels and columns the predicted labels, so with 0/1 labels
# the matrix reads [[TN, FP], [FN, TP]] with label 1 treated as "positive".
cm = confusion_matrix(y_true=cancer.target, y_pred=fx)
print(f'TN: {cm[0, 0]}, FP: {cm[0, 1]},')
print(f'FN: {cm[1, 0]}, TP: {cm[1, 1]}')

In [None]:
# Unpack the 2x2 confusion matrix row by row: (TN, FP) then (FN, TP).
(tn, fp), (fn, tp) = cm
true_positive_rate = tp / (fn + tp)  # share of actual positives that were found
true_negative_rate = tn / (fp + tn)  # share of actual negatives that were found
precision = tp / (fp + tp)           # share of predicted positives that are correct

In [None]:
# Report the three rates with three decimals. The blank right after the colon
# in each format spec is the sign flag, so non-negative values are printed
# with a leading space.
print(f"True Positive Rate: {true_positive_rate: 0.3f}")
print(f"Precision: {precision: 0.3f}")
print(f"True Negative Rate: {true_negative_rate: 0.3f}")