<a href="https://colab.research.google.com/github/nitleenk/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from sklearn import datasets
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Iris dataset: only the feature matrix (`iris.data`) is used for clustering.
iris = datasets.load_iris()
X = iris.data

# Cluster counts to try with the algorithms that accept `n_clusters`.
clusters = list(range(3, 6))

# Preprocessing variants, keyed by the name printed in the report.
# A value of None means the raw features are clustered unchanged.
# NOTE(review): 'transform' and 'transform_normalization' both amount to a
# single StandardScaler, so they describe the same preprocessing — confirm
# whether an additional transform step was intended for the latter.
preprocessing_options = dict(
    no_preprocessing=None,
    transform=StandardScaler(),
    pca=PCA(n_components=2),
    transform_normalization=Pipeline(
        steps=[
            ('normalization', StandardScaler()),
        ]
    ),
    transform_normalization_pca=Pipeline(
        steps=[
            ('normalization', StandardScaler()),
            ('pca', PCA(n_components=2)),
        ]
    ),
)

# Clustering estimator classes (not instances), keyed by display name.
clustering_algorithms = dict(
    KMeans=KMeans,
    Hierarchical=AgglomerativeClustering,
    MeanShift=MeanShift,
)

# Evaluate every (algorithm, preprocessing, cluster count) combination and
# print three internal validity metrics for each resulting partition.
for algorithm_name, algorithm_class in clustering_algorithms.items():
    print(f"Clustering Algorithm: {algorithm_name}")

    algorithm_instance = algorithm_class()
    algorithm_params = algorithm_instance.get_params()

    # Seed estimators that expose `random_state` (KMeans) so repeated runs
    # print the same scores; the others here are deterministic already.
    if 'random_state' in algorithm_params:
        algorithm_instance.set_params(random_state=0)

    # Estimators without an `n_clusters` parameter (MeanShift) choose the
    # number of clusters themselves, so they are fitted once per
    # preprocessing option instead of once per requested count. `None`
    # stands for that single "let the algorithm decide" run.
    if 'n_clusters' in algorithm_params:
        cluster_counts = clusters
    else:
        cluster_counts = [None]

    # Iterate over each preprocessing option
    for preprocessing_name, preprocessor in preprocessing_options.items():
        print(f"Preprocessing: {preprocessing_name}")

        # The transformed data does not depend on the cluster count, so it
        # is computed once here rather than inside the innermost loop.
        if preprocessor is None:
            X_processed = X
        else:
            X_processed = preprocessor.fit_transform(X)

        for c in cluster_counts:
            if c is not None:
                algorithm_instance.set_params(n_clusters=c)
            labels = algorithm_instance.fit_predict(X_processed)

            # Report the requested count when there is one, otherwise the
            # count the algorithm actually produced.
            n_found = len(set(labels))
            reported = n_found if c is None else c
            print(f"Number of clusters: {reported}")

            # All three metrics raise ValueError unless the labelling has
            # between 2 and n_samples - 1 distinct clusters.
            if not 2 <= n_found <= len(labels) - 1:
                print(
                    f"Metrics undefined for {n_found} cluster(s); skipping."
                )
                print("-----------------------------")
                continue

            # Calculate evaluation metrics
            silhouette = silhouette_score(X_processed, labels)
            calinski = calinski_harabasz_score(X_processed, labels)
            davies = davies_bouldin_score(X_processed, labels)

            print(f"Silhouette Score: {silhouette}")
            print(f"Calinski-Harabasz Index: {calinski}")
            print(f"Davies-Bouldin Index: {davies}")
            print("-----------------------------")
        print("\n")


Clustering Algorithm: KMeans
Preprocessing: no_preprocessing
Number of clusters: 3
Silhouette Score: 0.5528190123564095
Calinski-Harabasz Index: 561.62775662962
Davies-Bouldin Index: 0.6619715465007465
-----------------------------
Number of clusters: 4
Silhouette Score: 0.49805050499728737
Calinski-Harabasz Index: 530.7658081872851
Davies-Bouldin Index: 0.7803069838811107
-----------------------------
Number of clusters: 5
Silhouette Score: 0.48874888709310566
Calinski-Harabasz Index: 495.54148767768777
Davies-Bouldin Index: 0.8059652120182739
-----------------------------


Preprocessing: transform
Number of clusters: 3
Silhouette Score: 0.45994823920518635
Calinski-Harabasz Index: 241.90440170183157
Davies-Bouldin Index: 0.8335949464754338
-----------------------------
Number of clusters: 4




Silhouette Score: 0.4188923398171004
Calinski-Harabasz Index: 206.55006422171436
Davies-Bouldin Index: 0.9070610785239831
-----------------------------
Number of clusters: 5
Silhouette Score: 0.34551099599809465
Calinski-Harabasz Index: 203.2674193279362
Davies-Bouldin Index: 0.9451729577717423
-----------------------------


Preprocessing: pca
Number of clusters: 3
Silhouette Score: 0.5976764219497545
Calinski-Harabasz Index: 693.7084334188474
Davies-Bouldin Index: 0.5648157434964132
-----------------------------
Number of clusters: 4
Silhouette Score: 0.5591106064458163
Calinski-Harabasz Index: 718.3740550643823
Davies-Bouldin Index: 0.6042417593095712
-----------------------------
Number of clusters: 5
Silhouette Score: 0.5509643081192234
Calinski-Harabasz Index: 684.9866654445937
Davies-Bouldin Index: 0.6363210531659548
-----------------------------


Preprocessing: transform_normalization
Number of clusters: 3
Silhouette Score: 0.45994823920518635
Calinski-Harabasz Index: 241.9044



Silhouette Score: 0.41511334907493763
Calinski-Harabasz Index: 206.2286230970932
Davies-Bouldin Index: 0.9223648795504499
-----------------------------
Number of clusters: 5
Silhouette Score: 0.3459012795948778
Calinski-Harabasz Index: 202.9515253173917
Davies-Bouldin Index: 0.9483171900518057
-----------------------------


Preprocessing: transform_normalization_pca
Number of clusters: 3
Silhouette Score: 0.5091683341503112
Calinski-Harabasz Index: 293.8565155630174
Davies-Bouldin Index: 0.7099307974801774
-----------------------------
Number of clusters: 4
Silhouette Score: 0.4406187797454989
Calinski-Harabasz Index: 264.3921541655833
Davies-Bouldin Index: 0.7571303717287916
-----------------------------
Number of clusters: 5
Silhouette Score: 0.4110850628660997
Calinski-Harabasz Index: 278.4513952839227
Davies-Bouldin Index: 0.7802925992541686
-----------------------------


Clustering Algorithm: Hierarchical
Preprocessing: no_preprocessing
Number of clusters: 3
Silhouette Score: 0.



Silhouette Score: 0.4466890410285909
Calinski-Harabasz Index: 222.71916382215363
Davies-Bouldin Index: 0.8034665302876753
-----------------------------
Number of clusters: 4
Silhouette Score: 0.4006363159855973
Calinski-Harabasz Index: 201.2514543140406
Davies-Bouldin Index: 0.9788214982592804
-----------------------------
Number of clusters: 5
Silhouette Score: 0.33058726295230545
Calinski-Harabasz Index: 192.6812827176413
Davies-Bouldin Index: 0.9742492313121824
-----------------------------


Preprocessing: pca
Number of clusters: 3
Silhouette Score: 0.59847503536569
Calinski-Harabasz Index: 688.617547802037
Davies-Bouldin Index: 0.5604962418488778
-----------------------------
Number of clusters: 4
Silhouette Score: 0.5409772344640317
Calinski-Harabasz Index: 673.9462643465866
Davies-Bouldin Index: 0.6546236191154585
-----------------------------
Number of clusters: 5
Silhouette Score: 0.5487843719726775
Calinski-Harabasz Index: 665.883112063557
Davies-Bouldin Index: 0.652573487517