# k-means Clustering Sample (Non-Hierarchical Clustering)

- Assigns each observation to one of the "k" clusters by minimizing the within-cluster sum of squares.
- "k" in k-means represents the number of clusters you want to create, which needs to be specified beforehand.
- Uses iterative refinement to improve cluster assignments.
- Can get trapped in local minima, so often it's good to run the algorithm multiple times with different starting conditions.
- Sensitive to the initial placement of centroids.
  - An unfavorable start can lead to suboptimal clustering.
- Assumes clusters to be spherical and equally sized, which might not always be the case.


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import adjusted_rand_score


In [None]:
# Load the Iris dataset bundled with scikit-learn.
iris = load_iris()

# Dimensions of the feature matrix; shown as the cell output.
iris.data.shape

In [None]:
# Names of the target classes; shown as the cell output.
iris.target_names

In [None]:
# Names of the feature columns; shown as the cell output.
iris.feature_names

In [None]:
"""Preparing Iris data"""
# Wrap the raw feature matrix in a DataFrame labelled with the feature names.
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Show the resulting table.
display(df_iris)

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for each column.
display(df_iris.describe())

In [None]:
# Visualize pairwise feature relationships with a pair plot:
# one panel per pair of columns, with each column's own distribution on the diagonal.
sns.pairplot(df_iris)
plt.show()

Midway Result Discussion
- A __positive correlation__ between petal length and petal width can be seen in the pair plot.

This completes the data preparation step: the data has been loaded, summarized, and visually explored.

In [None]:
"""Apply k-means clustering."""
model = KMeans(
    # Number of clusters to form.
    n_clusters=3,
    # Fixed random seed so the result is the same on every run.
    random_state=0,
    # Initialize the cluster centers from randomly chosen observations.
    init='random',
    # Run 10 different initializations and keep the best result.
    n_init=10
)

# Build the clustering model. Fit on the feature columns only, so this cell
# still trains on the same data when it is re-run after later cells have
# added 'cluster' / 'target' columns to df_iris.
model.fit(df_iris[iris.feature_names])

In [None]:
# Predict the cluster index of every sample. Only the feature columns are
# passed, so the call keeps working after label columns are added to df_iris.
model.predict(df_iris[iris.feature_names])

In [None]:
# Store each sample's cluster index. Predict from the feature columns only:
# once 'cluster' exists in df_iris, passing the whole frame would no longer
# match the columns the model was fitted on, and re-running this cell would fail.
df_iris['cluster'] = model.predict(df_iris[iris.feature_names])

In [None]:
# Visualize clustering results: the pair plot again, with points colored
# by the value in the 'cluster' column.
sns.pairplot(df_iris, hue='cluster')
plt.show()

In [None]:
# Put the fitted model's cluster centers into a DataFrame and show it.
# Columns are still unnamed (0, 1, 2, ...) at this point.
cluster_center = pd.DataFrame(data=model.cluster_centers_)
display(cluster_center)

In [None]:
# Label the center coordinates with the feature names. Taking the names from
# the dataset (rather than slicing the first 4 columns of df_iris) does not
# depend on how many extra columns df_iris has or on their order.
cluster_center.columns = iris.feature_names
display(cluster_center)

In [None]:
# Column names of the two features plotted against each other.
column_name_sepal_length: str = 'sepal length (cm)'
column_name_sepal_width: str = 'sepal width (cm)'
# Backward-compatible alias: this name was previously used for the
# sepal *width* column, which was misleading.
column_name_width_length: str = column_name_sepal_width

fig, ax = plt.subplots()
# Samples, colored by their assigned cluster.
ax.scatter(
    df_iris[column_name_sepal_length],
    df_iris[column_name_sepal_width],
    c=df_iris['cluster']
)
# Overlay each cluster center as a red star.
ax.scatter(
    cluster_center[column_name_sepal_length],
    cluster_center[column_name_sepal_width],
    marker='*',
    color='red',
    label='cluster center'
)
ax.set_xlabel(column_name_sepal_length)
ax.set_ylabel(column_name_sepal_width)
ax.set_title('k-means clusters: sepal length vs. sepal width')
ax.legend()
plt.show()

In [None]:
# Display mean feature values by cluster. Restricting the aggregation to the
# feature columns keeps this cell re-runnable after the string-valued 'target'
# column has been added (averaging a text column fails in recent pandas).
display(
    df_iris.groupby('cluster')[iris.feature_names].mean().round(3)
)

In [None]:
# Add the true species name of each sample. Indexing target_names with the
# integer class ids yields the label text directly, so no hard-coded
# id -> name mapping and no in-place replace() on the whole frame is needed.
df_iris['target'] = iris.target_names[iris.target]

In [None]:
# Display mean feature values by true label. Only the feature columns are
# averaged: df_iris also holds the integer 'cluster' ids at this point, and
# the mean of arbitrary cluster ids is not a meaningful quantity.
display(
    df_iris.groupby('target')[iris.feature_names].mean().round(3)
)

In [None]:
# Evaluate the clustering against the true labels with the
# Adjusted Rand Index (ARI).
ari = adjusted_rand_score(
    labels_true=iris.target,
    labels_pred=df_iris['cluster']
)

f'ARI: {ari:.2f}'

In [None]:
# Evaluate accuracy for comparison with ARI.
# Cluster ids are arbitrary (cluster 0 need not correspond to class 0), so
# comparing them to the class ids directly gives a value that depends only on
# how the ids happen to be numbered. Map each cluster to the true class that
# occurs most often inside it before scoring.
contingency = pd.crosstab(df_iris['cluster'], iris.target)
cluster_to_class = contingency.idxmax(axis=1)
aligned_labels = df_iris['cluster'].map(cluster_to_class)

accuracy = accuracy_score(
    iris.target,
    aligned_labels
)

f'Accuracy: {accuracy:.2f}'

In [None]:
"""Change init cluster center positions from random to k-means++."""
# Start again from a clean feature table.
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)

model = KMeans(
    n_clusters=3,
    random_state=0,
    # Note: 'k-means++' is also what KMeans uses when init is not given.
    init='k-means++',
    n_init=10
)
model.fit(df_iris)
# Attach the predicted cluster ids to a copy, leaving df_iris untouched.
cls_data = df_iris.assign(cluster=model.predict(df_iris))

sns.pairplot(cls_data, hue='cluster')
plt.show()

In [None]:
# Score the new cluster assignment against the true labels (Adjusted Rand Index).
ari = adjusted_rand_score(labels_true=iris.target, labels_pred=cls_data['cluster'])

f'ARI: {ari:.2f}'

- With `init='k-means++'`, the ARI was the same as with `init='random'`; the accuracy also remained the same.

In [None]:
# Repeat the clustering with two clusters instead of three.
model = KMeans(n_clusters=2, random_state=0, n_init=10)
model.fit(df_iris)
# Attach the predicted cluster ids to a copy, leaving df_iris untouched.
cls_data = df_iris.assign(cluster=model.predict(df_iris))

sns.pairplot(cls_data, hue='cluster')
plt.show()

In [None]:
# Score the two-cluster assignment against the true labels (Adjusted Rand Index).
ari = adjusted_rand_score(labels_true=iris.target, labels_pred=cls_data['cluster'])

f'ARI: {ari:.2f}'

- The ARI was higher with `n_clusters=3` than with the `n_clusters=2` setting used here, i.e. three clusters agree better with the true labels.