In [1]:
import numpy as np
import pandas as pd
from bokeh.plotting import output_notebook, show
from bokeh.layouts import gridplot
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from utils import scatterplot_bokeh


# Send Bokeh output to the notebook and load the Iris dataset.
output_notebook()
iris = load_iris()

# Fit a 2-component PCA on the raw features; the two projected columns are
# used further down as the x / y coordinates of the scatter plots.
z = PCA(n_components=2).fit_transform(iris.data)
zx = z[:, 0]
zy = z[:, 1]

# Ground-truth class labels, kept for the supervised clustering metrics.
true = iris.target

# Turn feature names such as 'sepal length (cm)' into 'sepal_length':
# drop the trailing ' (cm)' unit (5 characters) and snake_case the rest.
unit_suffix_len = len(' (cm)')
columns = [
    feature[:-unit_suffix_len].replace(' ', '_')
    for feature in iris.feature_names
]

# Feature table with the class label appended as the last column.
iris_df = pd.DataFrame(iris.data, columns=columns).assign(target=iris.target)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit k-means with k=3 on the raw Iris features.
# random_state is fixed so that cluster ids (and therefore plot colours and
# the metrics computed from `labels`) are identical on every
# Restart & Run All. n_init is spelled out because its default changed from
# 10 to 'auto' in scikit-learn 1.4; 10 keeps the historical behaviour.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=30,
                random_state=0, verbose=0)
labels = kmeans.fit_predict(iris.data)
title = 'k-means k=3'
# scatterplot_bokeh comes from the local `utils` module (not visible here);
# the return value is discarded because the plot is shown inline.
_ = scatterplot_bokeh(iris_df, labels=labels, x=zx, y=zy,
    title=title, height=400, width=400, show_inline=True)

In [3]:
# unsupervised
from sklearn.metrics import silhouette_score
# supervised
from sklearn.metrics import mutual_info_score
from sklearn.metrics import homogeneity_completeness_v_measure
from bokeh.models.widgets import Div


def evaluate(X, true, predicted, model):
    """Build a Bokeh ``Div`` summarising the quality of a clustering.

    Parameters
    ----------
    X : array-like exposing ``shape``
        The clustered samples. ``X.shape[0]`` is used as the sample count
        and ``X`` is forwarded to ``silhouette_score``.
    true : array-like
        Ground-truth labels, forwarded to the supervised metrics.
    predicted : array-like
        One cluster label per sample. Labels below zero are not counted as
        clusters and do not contribute to the coverage.
    model : object
        The fitted estimator. Its ``inertia_`` attribute is reported when it
        exists; models without it are reported as ``nan`` instead of
        raising ``AttributeError``.

    Returns
    -------
    Div
        A 300x300 widget whose text is an HTML summary of the metrics.
    """
    # Accept plain lists as well as arrays: `predicted >= 0` needs an ndarray.
    predicted = np.asarray(predicted)
    label_ids = np.unique(predicted)

    n_clusters = int(np.count_nonzero(label_ids >= 0))
    coverage = 100 * np.count_nonzero(predicted >= 0) / X.shape[0]

    # Not every estimator exposes inertia_; fall back to nan so the summary
    # can still be rendered.
    inertia = getattr(model, 'inertia_', float('nan'))

    # silhouette_score raises ValueError unless
    # 2 <= n_labels <= n_samples - 1, so degenerate labelings yield nan.
    if 2 <= label_ids.shape[0] <= predicted.shape[0] - 1:
        sil_score = silhouette_score(X, predicted)
    else:
        sil_score = float('nan')

    homogeneity, completeness, v_measure = \
        homogeneity_completeness_v_measure(true, predicted)
    mi = mutual_info_score(true, predicted)

    # `<br>` is used for line breaks (`</br>` is not a valid HTML end tag).
    div = Div(text=f"""<br><br>
<b>Clustering quality</b><br>
   -------------------------<br>
<b>n clusters    </b>: {n_clusters}<br>
<b>coverage      </b>: {coverage:.4}<br>
<b>inertia       </b>: {inertia:.4}<br>
<b>silhouette    </b>: {sil_score:.4}<br>
<b>homogeneity   </b>: {homogeneity:.4}<br>
<b>completeness  </b>: {completeness:.4}<br>
<b>v_measure     </b>: {v_measure:.4}<br>
<b>M.I.          </b>: {mi:.4}<br>""",
            width=300, height=300)
    return div

# Render the quality summary for the `kmeans` / `labels` currently in scope
# (the k=3 fit when the cells are executed in order).
show(
    evaluate(
        iris.data,
        true,
        labels,
        kmeans,
    )
)

In [4]:
from bokeh.layouts import row, column


# Compare several cluster counts: each k contributes one row made of the
# PCA scatter plot and the matching quality summary.
# NOTE: the loop rebinds the notebook-level `kmeans` and `labels`; once this
# cell has run they refer to the last fit (k=8).
rows = []
for k in [2, 3, 5, 8]:
    # Fixed seed so every k yields the same clustering on each
    # Restart & Run All; n_init is explicit because its default changed
    # from 10 to 'auto' in scikit-learn 1.4.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=30,
                    random_state=0, verbose=0)
    labels = kmeans.fit_predict(iris.data)

    # scatterplot_bokeh lives in the local `utils` module (not visible here);
    # with show_inline=False its return value is placed in the layout.
    title = f'k-means k={k}'
    p = scatterplot_bokeh(iris_df, labels=labels, x=zx, y=zy,
        title=title, height=400, width=400, show_inline=False)
    pre = evaluate(iris.data, true, labels, kmeans)
    rows.append(row(p, pre))

# Stack the per-k rows vertically and display them as a single layout.
layout = column(*rows)
show(layout)