# K-Means Clustering



In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 1. Load the Iris dataset
iris = datasets.load_iris()
X = iris.data

In [3]:
# 2. Define the preprocessing techniques
preprocessing_techniques = {
    "No data processing": None,
    "Using normalization": Normalizer(),
    "Using standardization": StandardScaler(),
    "Using PCA": PCA(n_components=2),
    "Using T+N": StandardScaler(with_mean=False),
    "Using T + N + PCA": PCA(n_components=2)
}

In [4]:
# 3. Define the parameters
parameters = {
    "c": [4, 5, 6],
}

In [5]:
# 4. Create lists to store scores
silhouette_scores = []
calinski_harabasz_scores = []
davies_bouldin_scores = []

In [6]:
# 5. Iterate over preprocessing techniques
for technique_name, technique in preprocessing_techniques.items():
    # Apply preprocessing technique
    if technique is not None:
        X_processed = technique.fit_transform(X)
    else:
        X_processed = X

    # Iterate over parameters
    for c in parameters["c"]:
        # Perform K-means clustering
        kmeans = KMeans(n_clusters=c, random_state=42)
        kmeans.fit(X_processed)

        # Evaluate clustering performance
        silhouette = silhouette_score(X_processed, kmeans.labels_)
        calinski_harabasz = calinski_harabasz_score(X_processed, kmeans.labels_)
        davies_bouldin = davies_bouldin_score(X_processed, kmeans.labels_)

        # Append scores to the lists
        silhouette_scores.append((technique_name, c, silhouette))
        calinski_harabasz_scores.append((technique_name, c, calinski_harabasz))
        davies_bouldin_scores.append((technique_name, c, davies_bouldin))

In [7]:
# 7. Create DataFrame for silhouette scores
silhouette_df = pd.DataFrame(silhouette_scores, columns=["Preprocessing Technique", "c", "Silhouette Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
silhouette_df["Parameter"] = silhouette_df["Preprocessing Technique"] + " (c=" + silhouette_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
silhouette_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

In [8]:
# 8. Create DataFrame for Calinski-Harabasz scores
calinski_df = pd.DataFrame(calinski_harabasz_scores, columns=["Preprocessing Technique", "c", "Calinski-Harabasz Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
calinski_df["Parameter"] = calinski_df["Preprocessing Technique"] + " (c=" + calinski_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
calinski_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

In [9]:
# 9. Create DataFrame for Davies-Bouldin scores
davies_df = pd.DataFrame(davies_bouldin_scores, columns=["Preprocessing Technique", "c", "Davies-Bouldin Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
davies_df["Parameter"] = davies_df["Preprocessing Technique"] + " (c=" + davies_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
davies_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

In [10]:
# 10. Merge all DataFrames
result_df = silhouette_df.merge(calinski_df, on="Parameter").merge(davies_df, on="Parameter")

# 11. Reorder the columns
result_df = result_df[["Parameter", "Silhouette Score", "Calinski-Harabasz Score", "Davies-Bouldin Score"]]

In [11]:
# 12. Display the DataFrame
print("K-Means Clustering DataFrame:")
result_df

K-Means Clustering DataFrame:


Unnamed: 0,Parameter,Silhouette Score,Calinski-Harabasz Score,Davies-Bouldin Score
0,No data processing (c=4),0.498051,530.765808,0.780307
1,No data processing (c=5),0.488749,495.541488,0.805965
2,No data processing (c=6),0.364834,473.850607,0.914158
3,Using normalization (c=4),0.540241,1157.648904,0.789422
4,Using normalization (c=5),0.356338,1012.196204,1.065944
5,Using normalization (c=6),0.319809,939.665126,1.096092
6,Using standardization (c=4),0.386941,207.265914,0.869814
7,Using standardization (c=5),0.341947,203.268233,0.953046
8,Using standardization (c=6),0.326745,187.100484,1.055439
9,Using PCA (c=4),0.557741,719.123544,0.615069


# Hierarical Clustering





In [12]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data

# Define the preprocessing techniques
preprocessing_techniques = {
    "No data processing": None,
    "Using normalization": Normalizer(),
    "Using standardization": StandardScaler(),
    "Using PCA": PCA(n_components=2),
    "Using T+N": StandardScaler(with_mean=False),
    "Using T + N + PCA": PCA(n_components=2)
}

# Define the parameters
parameters = {
    "n_clusters": [4, 5, 6],
}

# Create lists to store scores
silhouette_scores = []
calinski_harabasz_scores = []
davies_bouldin_scores = []

# Iterate over preprocessing techniques
for technique_name, technique in preprocessing_techniques.items():
    # Apply preprocessing technique
    if technique is not None:
        X_processed = technique.fit_transform(X)
    else:
        X_processed = X

    # Iterate over parameters
    for n_clusters in parameters["n_clusters"]:
        # Perform Hierarchical clustering
        hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
        hierarchical.fit(X_processed)

        # Evaluate clustering performance
        silhouette = silhouette_score(X_processed, hierarchical.labels_)
        calinski_harabasz = calinski_harabasz_score(X_processed, hierarchical.labels_)
        davies_bouldin = davies_bouldin_score(X_processed, hierarchical.labels_)

        # Append scores to the lists
        silhouette_scores.append((technique_name, n_clusters, silhouette))
        calinski_harabasz_scores.append((technique_name, n_clusters, calinski_harabasz))
        davies_bouldin_scores.append((technique_name, n_clusters, davies_bouldin))

In [13]:
# Create DataFrame for silhouette scores
silhouette_df = pd.DataFrame(silhouette_scores, columns=["Preprocessing Technique", "n_clusters", "Silhouette Score"])

# Merge "Preprocessing Technique" and "n_clusters" columns into a single column
silhouette_df["Parameter"] = silhouette_df["Preprocessing Technique"] + " (n_clusters=" + silhouette_df["n_clusters"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "n_clusters" columns
silhouette_df.drop(columns=["Preprocessing Technique", "n_clusters"], inplace=True)

# Create DataFrame for Calinski-Harabasz scores
calinski_df = pd.DataFrame(calinski_harabasz_scores, columns=["Preprocessing Technique", "n_clusters", "Calinski-Harabasz Score"])

# Merge "Preprocessing Technique" and "n_clusters" columns into a single column
calinski_df["Parameter"] = calinski_df["Preprocessing Technique"] + " (n_clusters=" + calinski_df["n_clusters"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "n_clusters" columns
calinski_df.drop(columns=["Preprocessing Technique", "n_clusters"], inplace=True)

# Create DataFrame for Davies-Bouldin scores
davies_df = pd.DataFrame(davies_bouldin_scores, columns=["Preprocessing Technique", "n_clusters", "Davies-Bouldin Score"])

# Merge "Preprocessing Technique" and "n_clusters" columns into a single column
davies_df["Parameter"] = davies_df["Preprocessing Technique"] + " (n_clusters=" + davies_df["n_clusters"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "n_clusters" columns
davies_df.drop(columns=["Preprocessing Technique", "n_clusters"], inplace=True)

# Merge all DataFrames
result_df = silhouette_df.merge(calinski_df, on="Parameter").merge(davies_df, on="Parameter")

# Reorder the columns
result_df = result_df[["Parameter", "Silhouette Score", "Calinski-Harabasz Score", "Davies-Bouldin Score"]]

# Display the DataFrame
print("Hierarcial Clustering DataFrame:")
result_df

Hierarcial Clustering DataFrame:


Unnamed: 0,Parameter,Silhouette Score,Calinski-Harabasz Score,Davies-Bouldin Score
0,No data processing (n_clusters=4),0.488967,515.078906,0.795264
1,No data processing (n_clusters=5),0.484383,488.484904,0.820417
2,No data processing (n_clusters=6),0.359238,464.949392,0.926663
3,Using normalization (n_clusters=4),0.479108,1063.773023,0.868267
4,Using normalization (n_clusters=5),0.465668,917.203228,0.994574
5,Using normalization (n_clusters=6),0.286263,852.335213,1.21629
6,Using standardization (n_clusters=4),0.400636,201.251454,0.978821
7,Using standardization (n_clusters=5),0.330587,192.681283,0.974249
8,Using standardization (n_clusters=6),0.314855,172.123007,0.989479
9,Using PCA (n_clusters=4),0.540977,673.946264,0.654624


# K-means Shift Clustering

In [14]:
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import warnings
from sklearn import datasets

warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data

# Define the preprocessing techniques
preprocessing_techniques = {
    "No data processing": None,
    "Using normalization": Normalizer(),
    "Using standardization": StandardScaler(),
    "Using PCA": PCA(n_components=2),
    "Using T+N": StandardScaler(with_mean=False),
    "Using T + N + PCA": PCA(n_components=2)
}

# Define the bandwidth parameter values
bandwidths = [0.2, 0.5, 1.0, 1.5]

# Create lists to store scores
silhouette_scores = []
calinski_harabasz_scores = []
davies_bouldin_scores = []

# Iterate over preprocessing techniques
for technique_name, technique in preprocessing_techniques.items():
    # Apply preprocessing technique
    if technique is not None:
        X_processed = technique.fit_transform(X)
    else:
        X_processed = X

    # Iterate over bandwidth parameter values
    for bandwidth in bandwidths:
        # Perform MeanShift clustering
        mean_shift = MeanShift(bandwidth=bandwidth)
        labels = mean_shift.fit_predict(X_processed)

        # Check if more than one cluster is identified
        if len(np.unique(labels)) > 1:
            # Evaluate clustering performance
            silhouette = silhouette_score(X_processed, labels)
            calinski_harabasz = calinski_harabasz_score(X_processed, labels)
            davies_bouldin = davies_bouldin_score(X_processed, labels)

            # Append scores to the lists
            silhouette_scores.append((technique_name, bandwidth, silhouette))
            calinski_harabasz_scores.append((technique_name, bandwidth, calinski_harabasz))
            davies_bouldin_scores.append((technique_name, bandwidth, davies_bouldin))

In [15]:
# Create DataFrame for silhouette scores
silhouette_df = pd.DataFrame(silhouette_scores, columns=["Preprocessing Technique", "c", "Silhouette Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
silhouette_df["Parameter"] = silhouette_df["Preprocessing Technique"] + " (c=" + silhouette_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
silhouette_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

# Create DataFrame for Calinski-Harabasz scores
calinski_df = pd.DataFrame(calinski_harabasz_scores, columns=["Preprocessing Technique", "c", "Calinski-Harabasz Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
calinski_df["Parameter"] = calinski_df["Preprocessing Technique"] + " (c=" + calinski_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
calinski_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

# Create DataFrame for Davies-Bouldin scores
davies_df = pd.DataFrame(davies_bouldin_scores, columns=["Preprocessing Technique", "c", "Davies-Bouldin Score"])

# Merge "Preprocessing Technique" and "c" columns into a single column
davies_df["Parameter"] = davies_df["Preprocessing Technique"] + " (c=" + davies_df["c"].astype(str) + ")"

# Drop the "Preprocessing Technique" and "c" columns
davies_df.drop(columns=["Preprocessing Technique", "c"], inplace=True)

# Merge all DataFrames
result_df = silhouette_df.merge(calinski_df, on="Parameter").merge(davies_df, on="Parameter")

# Reorder the columns
result_df = result_df[["Parameter", "Silhouette Score", "Calinski-Harabasz Score", "Davies-Bouldin Score"]]

# Display the DataFrame
print("K-means Shift Clustering DataFrame:")
result_df

K-means Shift Clustering DataFrame:


Unnamed: 0,Parameter,Silhouette Score,Calinski-Harabasz Score,Davies-Bouldin Score
0,No data processing (c=0.2),0.150514,328.43622,0.276689
1,No data processing (c=0.5),0.307012,214.407732,0.696916
2,No data processing (c=1.0),0.685788,509.703427,0.388552
3,No data processing (c=1.5),0.685788,509.703427,0.388552
4,Using normalization (c=0.2),0.819244,1633.998496,0.240689
5,Using standardization (c=0.2),0.114349,319.411235,0.176182
6,Using standardization (c=0.5),0.300961,131.731027,0.689627
7,Using standardization (c=1.0),0.350613,137.348049,0.785001
8,Using standardization (c=1.5),0.58175,251.349339,0.593313
9,Using PCA (c=0.2),0.35154,815.378994,0.535089
