In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


In [28]:

from google.colab import files

# Upload dataset file to Google Colab
uploaded = files.upload()


Saving exasens.zip to exasens (2).zip


In [29]:
# Read the dataset into a Pandas DataFrame
exasens_df = pd.read_csv('exasens.zip')


In [30]:
# Drop columns with no useful information
exasens_df = exasens_df.drop(columns=['Unnamed: 9', 'Unnamed: 10'])

# Display unique values in columns "Unnamed: 11" and "Unnamed: 12"
print("Unique values in Unnamed: 11:", exasens_df["Unnamed: 11"].unique())
print("Unique values in Unnamed: 12:", exasens_df["Unnamed: 12"].unique())

Unique values in Unnamed: 11: [nan 'Gender ' 'Male=1' 'Female=0']
Unique values in Unnamed: 12: [nan 'Somking' 'Non-smoker=1' 'Ex-smoker=2' 'Active-smoker=3']


In [31]:
# Map values in column "Unnamed: 11" to meaningful labels
exasens_df["Gender"] = exasens_df["Unnamed: 11"].map({"Male=1": "Male", "Female=0": "Female"})

# Map values in column "Unnamed: 12" to meaningful labels
exasens_df["Smoking Status"] = exasens_df["Unnamed: 12"].map({"Non-smoker=1": "Non-smoker",
                                                             "Ex-smoker=2": "Ex-smoker",
                                                             "Active-smoker=3": "Active-smoker"})

# Drop the original columns "Unnamed: 11" and "Unnamed: 12"
exasens_df = exasens_df.drop(columns=["Unnamed: 11", "Unnamed: 12"])

# Display the updated DataFrame
print(exasens_df.head())

  Diagnosis     ID Imaginary Part    Unnamed: 3 Real Part    Unnamed: 5  \
0       NaN    NaN            NaN           NaN       NaN           NaN   
1       NaN    NaN           Min          Avg.       Min          Avg.    
2      COPD  301-4        -320.61  -300.5635307   -495.26  -464.1719907   
3      COPD  302-3        -325.39  -314.7503595   -473.73  -469.2631404   
4      COPD  303-3           -323  -317.4360556   -476.12  -471.8976667   

   Gender   Age  Smoking Smoking Status  
0     NaN   NaN      NaN            NaN  
1     NaN   NaN      NaN            NaN  
2    Male  77.0      2.0     Non-smoker  
3  Female  72.0      2.0      Ex-smoker  
4     NaN  73.0      3.0  Active-smoker  


In [32]:
# Drop the original column "Unnamed: 3"
exasens_df = exasens_df.drop(columns=["Unnamed: 3"])

# Handle missing values in the "Gender" and "Smoking Status" columns
exasens_df["Gender"] = exasens_df["Gender"].fillna("Unknown")
exasens_df["Smoking Status"] = exasens_df["Smoking Status"].fillna("Unknown")

# Display the updated DataFrame
print(exasens_df.head())


  Diagnosis     ID Imaginary Part Real Part    Unnamed: 5   Gender   Age  \
0       NaN    NaN            NaN       NaN           NaN  Unknown   NaN   
1       NaN    NaN           Min       Min          Avg.   Unknown   NaN   
2      COPD  301-4        -320.61   -495.26  -464.1719907     Male  77.0   
3      COPD  302-3        -325.39   -473.73  -469.2631404   Female  72.0   
4      COPD  303-3           -323   -476.12  -471.8976667  Unknown  73.0   

   Smoking Smoking Status  
0      NaN        Unknown  
1      NaN        Unknown  
2      2.0     Non-smoker  
3      2.0      Ex-smoker  
4      3.0  Active-smoker  


In [33]:
num_columns = exasens_df.shape[1]
print("Number of columns in the DataFrame:", num_columns)


Number of columns in the DataFrame: 9


In [34]:
# Select relevant columns for clustering
data = exasens_df[['Imaginary Part', 'Real Part', 'Age', 'Smoking']]

# Drop rows with any missing values
data = data.dropna()

# Display the first few rows of the data
print(data.head())

  Imaginary Part Real Part   Age  Smoking
2        -320.61   -495.26  77.0      2.0
3        -325.39   -473.73  72.0      2.0
4           -323   -476.12  73.0      3.0
5        -327.78   -473.73  76.0      2.0
6        -325.39   -478.52  65.0      2.0


In [81]:
from sklearn.cluster import AgglomerativeClustering
#case 1
# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering without data preprocessing
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data)

    # Compute evaluation scores
    silhouette_score_case1 = silhouette_score(data, labels)
    calinski_harabasz_score_case1 = calinski_harabasz_score(data, labels)
    davies_bouldin_score_case1 = davies_bouldin_score(data, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case1)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case1)
    print("Davies-Bouldin Score:", davies_bouldin_score_case1)
    print()


Cluster Size: 3
Silhouette Score: 0.6809951620177085
Calinski-Harabasz Score: 158.22879754321895
Davies-Bouldin Score: 0.3166580344213665

Cluster Size: 4
Silhouette Score: 0.46591514307522336
Calinski-Harabasz Score: 158.86934917865534
Davies-Bouldin Score: 0.7013821812582803

Cluster Size: 5
Silhouette Score: 0.4560543765199609
Calinski-Harabasz Score: 196.48488744981475
Davies-Bouldin Score: 0.644263516101366



In [82]:
from sklearn.preprocessing import StandardScaler
#case2
# Step 1: Normalize the data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)

# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering with data normalization
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data_normalized)

    # Compute evaluation scores
    silhouette_score_case2 = silhouette_score(data_normalized, labels)
    calinski_harabasz_score_case2 = calinski_harabasz_score(data_normalized, labels)
    davies_bouldin_score_case2 = davies_bouldin_score(data_normalized, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case2)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case2)
    print("Davies-Bouldin Score:", davies_bouldin_score_case2)
    print()


Cluster Size: 3
Silhouette Score: 0.48719638809432453
Calinski-Harabasz Score: 63.51533603703061
Davies-Bouldin Score: 0.8108792413231795

Cluster Size: 4
Silhouette Score: 0.5044317585807708
Calinski-Harabasz Score: 82.78756755013343
Davies-Bouldin Score: 0.5770681494530369

Cluster Size: 5
Silhouette Score: 0.4719870873641092
Calinski-Harabasz Score: 92.90322028891832
Davies-Bouldin Score: 0.7222329627112438



In [83]:
from sklearn.manifold import TSNE
#case 3
# Step 1: Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
data_transformed = tsne.fit_transform(data)

# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering with data transformation
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data_transformed)

    # Compute evaluation scores
    silhouette_score_case3 = silhouette_score(data_transformed, labels)
    calinski_harabasz_score_case3 = calinski_harabasz_score(data_transformed, labels)
    davies_bouldin_score_case3 = davies_bouldin_score(data_transformed, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case3)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case3)
    print("Davies-Bouldin Score:", davies_bouldin_score_case3)
    print()


Cluster Size: 3
Silhouette Score: 0.51251394
Calinski-Harabasz Score: 121.90938396024809
Davies-Bouldin Score: 0.5602781393453417

Cluster Size: 4
Silhouette Score: 0.5543718
Calinski-Harabasz Score: 202.72968542813317
Davies-Bouldin Score: 0.669103454237249

Cluster Size: 5
Silhouette Score: 0.59341735
Calinski-Harabasz Score: 263.61786472365606
Davies-Bouldin Score: 0.4965002174765741



In [84]:
from sklearn.decomposition import PCA

# Step 1: Apply PCA for dimensionality reduction
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(data)

# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering with PCA
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data_pca)

    # Compute evaluation scores
    silhouette_score_case4 = silhouette_score(data_pca, labels)
    calinski_harabasz_score_case4 = calinski_harabasz_score(data_pca, labels)
    davies_bouldin_score_case4 = davies_bouldin_score(data_pca, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case4)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case4)
    print("Davies-Bouldin Score:", davies_bouldin_score_case4)
    print()


Cluster Size: 3
Silhouette Score: 0.803880554154936
Calinski-Harabasz Score: 286.0256700742083
Davies-Bouldin Score: 0.16348174949901995

Cluster Size: 4
Silhouette Score: 0.6957180062185246
Calinski-Harabasz Score: 421.7363699368483
Davies-Bouldin Score: 0.4337468311920525

Cluster Size: 5
Silhouette Score: 0.6919304135275811
Calinski-Harabasz Score: 750.363207557014
Davies-Bouldin Score: 0.3172915071443015



In [85]:
# Step 1: Normalize the data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)

# Step 2: Apply PCA for dimensionality reduction
pca = PCA(n_components=2, random_state=42)
data_normalized_pca = pca.fit_transform(data_normalized)

# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering with data normalization and PCA
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data_normalized_pca)

    # Compute evaluation scores
    silhouette_score_case5 = silhouette_score(data_normalized_pca, labels)
    calinski_harabasz_score_case5 = calinski_harabasz_score(data_normalized_pca, labels)
    davies_bouldin_score_case5 = davies_bouldin_score(data_normalized_pca, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case5)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case5)
    print("Davies-Bouldin Score:", davies_bouldin_score_case5)
    print()


Cluster Size: 3
Silhouette Score: 0.6347578334659718
Calinski-Harabasz Score: 145.09367244483911
Davies-Bouldin Score: 0.5867486004984078

Cluster Size: 4
Silhouette Score: 0.6452399656367522
Calinski-Harabasz Score: 172.90961717559352
Davies-Bouldin Score: 0.4407006499776256

Cluster Size: 5
Silhouette Score: 0.6649749983101734
Calinski-Harabasz Score: 182.05243838683546
Davies-Bouldin Score: 0.3159913074119944



In [93]:
# Step 1: Normalize the data
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)

# Step 2: Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
data_transformed = tsne.fit_transform(data_normalized)

# Step 3: Apply PCA for further dimensionality reduction
pca = PCA(n_components=2, random_state=42)
data_normalized_pca = pca.fit_transform(data_transformed)

# Define cluster sizes
cluster_sizes = [3, 4, 5]

# Perform hierarchical clustering and compute evaluation scores for each cluster size
for n_clusters in cluster_sizes:
    # Hierarchical clustering with data normalization, transformation (t-SNE), and PCA
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(data_normalized_pca)

    # Compute evaluation scores
    silhouette_score_case6 = silhouette_score(data_normalized_pca, labels)
    calinski_harabasz_score_case6 = calinski_harabasz_score(data_normalized_pca, labels)
    davies_bouldin_score_case6 = davies_bouldin_score(data_normalized_pca, labels)

    # Print evaluation scores
    print(f"Cluster Size: {n_clusters}")
    print("Silhouette Score:", silhouette_score_case6)
    print("Calinski-Harabasz Score:", calinski_harabasz_score_case6)
    print("Davies-Bouldin Score:", davies_bouldin_score_case6)
    print()


Cluster Size: 3
Silhouette Score: 0.6312269
Calinski-Harabasz Score: 262.50280170466186
Davies-Bouldin Score: 0.42876812694379357

Cluster Size: 4
Silhouette Score: 0.63441366
Calinski-Harabasz Score: 367.84434808619426
Davies-Bouldin Score: 0.54910967568382

Cluster Size: 5
Silhouette Score: 0.67512
Calinski-Harabasz Score: 430.81585919079686
Davies-Bouldin Score: 0.4020903218916816



In [39]:
from tabulate import tabulate
# Define the column names
column_names = [
   "Parameter"
]
# Define the column names
column_names = [
   "","c=3","c=4","c=5","c=3","c=4","c=5","c=3","c=4","c=5","c=3","c=4","c=5","c=3","c=4","c=5","c=3","c=4","c=5"
]

# Create an empty list to store the table data
table_data = []

# Add rows to the table data
# These are just the first few rows as an example, replace with your own values

table_data.append(["Silhouette",  0.68, 0.46, 0.45, 0.48, 0.50,0.47 ,0.51,0.55,0.59, 0.80,0.69,0.69,0.63,0.64, 0.66,0.63,0.63,0.67])
table_data.append(["Calinski-Harabasz", 158.22, 158.86, 196.48,63.51,82.78,92.90,121.90,202.72,263.61,286.02,421.73,750.36,145.09,172.90,182.05,262.50,367.84,430.81 ])
table_data.append(["Davies-Bouldin", 0.31, 0.70, 0.64, 0.81, 0.57, 0.72,0.56,0.66,0.49,0.16, 0.43,0.31,0.58,0.44,0.31,0.42,0.54,0.40 ])
# ... Add more rows with your values here ...


# Print the table with headers and column alignments
print(tabulate(table_data, headers=column_names, tablefmt="grid"))


+-------------------+--------+--------+--------+-------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|                   |    c=3 |    c=4 |    c=5 |   c=3 |   c=4 |   c=5 |    c=3 |    c=4 |    c=5 |    c=3 |    c=4 |    c=5 |    c=3 |    c=4 |    c=5 |    c=3 |    c=4 |    c=5 |
| Silhouette        |   0.68 |   0.46 |   0.45 |  0.48 |  0.5  |  0.47 |   0.51 |   0.55 |   0.59 |   0.8  |   0.69 |   0.69 |   0.63 |   0.64 |   0.66 |   0.63 |   0.63 |   0.67 |
+-------------------+--------+--------+--------+-------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
| Calinski-Harabasz | 158.22 | 158.86 | 196.48 | 63.51 | 82.78 | 92.9  | 121.9  | 202.72 | 263.61 | 286.02 | 421.73 | 750.36 | 145.09 | 172.9  | 182.05 | 262.5  | 367.84 | 430.81 |
+-------------------+--------+--------+--------+-------+-------+-------+--------+--------+-----