# Representative-based / Prototype-based Clustering

**Load necessary packages and apply custom configurations**

In [None]:
import warnings; 
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore",category=UserWarning)
warnings.simplefilter(action="ignore",category=FutureWarning)

import matplotlib.pyplot as plt
#plt.style.use('ggplot')
plt.style.use('seaborn-v0_8-muted')
plt.rcParams['figure.figsize'] = (8, 8)
plt.rcParams['grid.linestyle'] = ':'   
plt.rcParams['axes.grid'] = False

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})
#sns.color_palette("RdBu", n_colors=10)

# Interactive plots embedded within the notebook
#%matplotlib notebook 
# Static images of plots embedded within the notebook
# %matplotlib inline   
%config InlineBackend.figure_formats = {'png', 'retina'}

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels as sm
import sklearn as sk

#pd.options.plotting.backend = "plotly" 
# Conflict with options in original matplotlib.

print('Numpy version', np.__version__)
print('Pandas version', pd.__version__)
print('Seaborn version', sns.__version__)
print('Statsmodels version', sm.__version__)
print('Sklearn version', sk.__version__)

In [None]:
# Base font size; axis labels and titles use it directly, tick labels are
# scaled down from it.
font_size = 13

# Matplotlib rcParams overrides applied to every figure created from here on.
params = {
    'legend.fontsize': 'large',
    'figure.figsize': (5, 4),
    'axes.labelsize': font_size,
    'axes.titlesize': font_size,
    # Tick labels on both axes are drawn at 80% of the base font size.
    **{f'{axis}.labelsize': font_size * 0.8 for axis in ('xtick', 'ytick')},
    # Extra padding between the axes and the title.
    'axes.titlepad': 25,
}
plt.rcParams.update(params)

## K-Means clustering

### Load the data

In [None]:
from sklearn.datasets import make_blobs

# Generate 800 synthetic samples around 4 centres. The generator's labels are
# discarded (`_`) because clustering works on the features alone; the fixed
# random_state makes the data set reproducible.
X, _ = make_blobs(n_samples=800, centers=4, random_state=42)

# Plot the first two feature columns of the raw, unlabelled data.
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(X[:, 0], X[:, 1])
ax.set(xlabel='Feature 1', ylabel='Feature 2', title='Synthetic blobs (unlabelled)')
fig.tight_layout();

### Fit K-means model

<font color='blue'>Import the KMeans class, specify the number of clusters K, and fit the K-means model.</font>
    
Run ```pip install --upgrade threadpoolctl==3.1.0``` if fit() generates errors.

<font color='blue'>Get the cluster labels, the sum of squared distances (SSE), and the cluster centroids.</font>

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit; the synthetic data was generated with centers=4.
N_CLUSTERS = 4

# Fit K-means on X. `n_init` is passed explicitly because its default differs
# between scikit-learn releases, and `random_state` pins the centroid
# initialisation so the reported numbers are reproducible.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42).fit(X)

labels = kmeans.labels_              # cluster index assigned to each sample
sse = kmeans.inertia_                # sum of squared distances to the closest centroid
centroids = kmeans.cluster_centers_  # one row of coordinates per cluster centre

np.set_printoptions(precision=3)
print(f"Sum of squared errors : {sse:.3f}")
print(f"\nCentroids : \n {centroids}")

### Visualize the clustering 

In [None]:
# Scatter the samples coloured by their assigned cluster, then overlay the
# fitted centroids as large translucent black markers.
# Relies on `labels` and `kmeans` produced by the K-means fit cell.
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(X[:, 0], X[:, 1], c=labels, s=5, cmap='Accent')

centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

ax.set(xlabel='Feature 1', ylabel='Feature 2', title='K-means clusters and centroids')
fig.tight_layout();

### Elbow method

<font color='blue'>Determine the SSE for different numbers of clusters K.</font>

<font color='blue'>Plot the SSE vs. the number of clusters K.</font>

In [None]:
from sklearn.cluster import KMeans

# Candidate numbers of clusters to compare.
K_VALUES = range(1, 11)

# Fit one K-means model per candidate K and record its SSE (inertia). The
# mapping gets its own name so it does not clash with the scalar `sse` that is
# printed for the single fitted model.
sse_by_k = {
    k: KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_
    for k in K_VALUES
}

# SSE always decreases as K grows; the "elbow" where the curve flattens
# suggests a reasonable K.
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(list(sse_by_k.keys()), list(sse_by_k.values()), marker='o', alpha=0.5, ms=8)
ax.set(xlabel='Number of clusters K', ylabel='SSE', title='Elbow method')
fig.tight_layout();

### Other Clustering Indices

<font color='blue'>Silhouette coefficient</font>

<font color='blue'>Calinski-Harabasz Index</font>

<font color=blue> Davies-Bouldin Index </font>

In [None]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index of the clustering given by `labels` on the data X.
# Lower values indicate better-separated clusters (see the scikit-learn docs).
db_index = davies_bouldin_score(X, labels)
db_index