### Comparing different clustering algorithms

http://scikit-learn.org/stable/modules/clustering.html

http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py

In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist, pdist
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.neighbors import kneighbors_graph
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from sklearn import cluster, mixture
from collections import defaultdict
import time, warnings
%matplotlib inline

In [2]:
# Use a smaller data set to save time
df = pd.read_csv('PHBsample14_sss.csv', low_memory=False)

In [3]:
# drop the column resulted from sampling of the original data set
df.drop('Unnamed: 0', axis=1, inplace=True)
# In order to run K-means, drop all the categoricald data for now.
df = df.select_dtypes(include=['float64', 'int64'])
# Impute missing values with means
df = df.fillna(df.mean())

In [4]:
pca = PCA(2, svd_solver='randomized')
pca.fit(df)
df_reduced = pca.fit_transform(df)
df_reduced = StandardScaler().fit_transform(df_reduced)

In [5]:
X = df_reduced

In [6]:
params = {'quantile': .3,
                'eps': .3,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 7}

In [7]:
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

In [9]:
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
    X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
    
kmeans = cluster.KMeans(n_clusters=params['n_clusters'], random_state=0)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(
    n_clusters=params['n_clusters'], linkage='ward',
    connectivity=connectivity)
spectral = cluster.SpectralClustering(
    n_clusters=params['n_clusters'], eigen_solver='arpack',
    affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(
    n_components=params['n_clusters'], covariance_type='full')

clustering_algorithms = (
        ('KMeans', kmeans),
        ('MiniBatchKMeans', two_means),
        #('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        #('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

In [25]:
res = defaultdict(list)
for name, algorithm in clustering_algorithms:
    t0 = time.time()
    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)
    algorithm.fit(X)
    t1 = time.time()
    
    res[name].append(t1-t0)
    if name == 'KMeans' or name == 'MiniBatchKMeans':
        inertia = algorithm.inertia_
    else:
        inertia = 'N/A'
    res[name].append(inertia)
    if name == 'GaussianMixture':
        labels = algorithm.predict(X)
    else:
        labels = algorithm.labels_
        score = silhouette_score(X, labels, metric='euclidean', sample_size=3000)
    res[name].append(score)
    print(name + " done")

KMeans done
MiniBatchKMeans done
MeanShift done


  affinity='euclidean')


Ward done


  affinity=affinity)


AgglomerativeClustering done
DBSCAN done
Birch done
GaussianMixture done


In [27]:
res

defaultdict(list,
            {'AgglomerativeClustering': [16.376465320587158,
              'N/A',
              0.47501714371998371],
             'Birch': [1.4826855659484863, 'N/A', 0.59386392573767688],
             'DBSCAN': [9.213496685028076, 'N/A', 0.39426293078878188],
             'GaussianMixture': [0.7206833362579346,
              'N/A',
              0.59386392573767688],
             'KMeans': [0.6918013095855713,
              7309.4642493447109,
              0.62467899274492711],
             'MeanShift': [0.49723100662231445, 'N/A', 0.54130706500698167],
             'MiniBatchKMeans': [0.28776049613952637,
              7315.0807751539096,
              0.61821769410051175],
             'Ward': [25.345725297927856, 'N/A', 0.62250415425345351]})

In [32]:
df = pd.DataFrame.from_dict(res, orient='index')
df.columns = ['Run time', 'Within-cluster sum of squares(inertia)', \
                                                         'Silhouette score']

In [33]:
df

Unnamed: 0,Run time,Within-cluster sum of squares(inertia),Silhouette score
KMeans,0.691801,7309.46,0.624679
MiniBatchKMeans,0.28776,7315.08,0.618218
MeanShift,0.497231,,0.541307
Ward,25.345725,,0.622504
AgglomerativeClustering,16.376465,,0.475017
DBSCAN,9.213497,,0.394263
Birch,1.482686,,0.593864
GaussianMixture,0.720683,,0.593864
