# K-Means Parameter Selection

### Description:
In this notebook we look at selecting k for k-means using a few different datasets.

## Imports

In [16]:
import numpy as np
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits

from Toolbox.multi_validity_checker import multi_silhouette
from Toolbox.multi_Cluster import multi_k_means

# NOTE: the exploration parameters (K_VALUES, INITS) and the random seed are
# defined once, in the "Setting the parameters of our exploration" cell below,
# so this cell holds imports only.


## Setting the parameters of our exploration

Here we set some values that will be used in our exploration of k-means.

#### INITS
k-means, as implemented by scikit-learn, has three separate ways to choose the initial centroid locations. Here we compare two of them: `k-means++` and `random`.

In [None]:
# Candidate cluster counts: every integer k from 3 up to and including 12.
K_VALUES = list(range(3, 13))

# Centroid-initialisation strategies to compare against each other.
INITS = [
    'k-means++',
    'random',
]

# Fix NumPy's global random seed so repeated runs of the notebook agree.
np.random.seed(42)

# Digits

## Load the data and standardise it


In [17]:
# Load the digits feature matrix together with its ground-truth labels.
X_digits, y_digits = load_digits(return_X_y=True)

# Standardise the features before clustering.
scaled_inputs = scale(X_digits)

# Dataset dimensions; these are reported in the benchmark header.
n_samples, n_features = X_digits.shape
n_digits = np.unique(y_digits).size

## Define a function that benchmarks k-means

In [21]:
def benchmark_kmeans(inputs, print_=True):
    """Cluster ``inputs`` for every init method and every k, and score the results.

    For each entry of the notebook-level ``INITS`` list, ``multi_k_means`` is
    called once with all of ``K_VALUES``; its result is handed to
    ``multi_silhouette`` and the scores are collected per init method.

    Parameters
    ----------
    inputs : array-like
        Data to cluster. Passed unchanged to ``multi_k_means`` and
        ``multi_silhouette``. It must be 2-D (samples x features) when
        ``print_`` is true, because the header reports its shape.
    print_ : bool, default True
        When true, print a header plus one row of scores per init method.

    Returns
    -------
    dict
        Maps each init name to the object returned by ``multi_silhouette``
        for that init (iterated with ``.values()`` here, so presumably a
        mapping keyed by k -- confirm against Toolbox).
    """
    if print_:
        # Report the shape of the data actually being benchmarked rather than
        # module-level globals, so the header stays correct for any dataset.
        n_samples, n_features = np.shape(inputs)
        # NOTE(review): n_digits is still the notebook-level global because no
        # labels are passed in; it describes the digits dataset only.
        print("n_digits: %d, \t n_samples %d, \t n_features %d"
              % (n_digits, n_samples, n_features))
        print(70 * '_')
        print('init/k-values\t' + ''.join('%-8i' % k for k in K_VALUES))

    init_by_k_scores = {}
    for init in INITS:
        multi_estimated_labels = multi_k_means(inputs, K_VALUES, init=init)
        scores = multi_silhouette(inputs, multi_estimated_labels)
        init_by_k_scores[init] = scores
        if print_:
            # One tab-separated score per k, in the order the scores are stored.
            print('%-9s' % init
                  + ''.join('\t%.3f' % score for score in scores.values()))

    # Hand the scores back so callers can inspect or plot them.
    return init_by_k_scores


In [22]:
# Benchmark the standardised digits data and print the score table.
# The trailing ';' keeps the cell output limited to the printed table.
benchmark_kmeans(scaled_inputs, print_=True);

n_digits: 10, 	 n_samples 1797, 	 n_features 64
______________________________________________________________________
init/k-values	6       7       8       9       10      11      12      
k-means++	0.098	0.127	0.125	0.137	0.141	0.155	0.142
random   	0.112	0.115	0.125	0.124	0.147	0.147	0.153


# Iris