# Study Replication

## Section: Configuration and Initial Setup

Description: This section sets up necessary configurations and initializes variables for the study replication.

In [1]:
# Import necessary libraries
import itertools
import networkx as nx
import numpy as np
import pandas as pd
import random
import requests
from itertools import combinations
from io import StringIO
from kmodes.kmodes import KModes
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.impute import SimpleImputer
from sklearn.manifold import SpectralEmbedding, TSNE
from sklearn.metrics import adjusted_rand_score as ARI, normalized_mutual_info_score as NMI, fowlkes_mallows_score as FMI
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import layers, Sequential
import warnings

In [2]:
# Adjust the Pandas display options for better readability
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 300)

# Ignore all warnings
warnings.filterwarnings('ignore')

In [3]:
# List of datasets and their corresponding URLs
dataset_urls = {
    "Soybean (Small)": "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
    "Zoo": "https://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data",
    "Heart Disease": "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
    "Breast Cancer Wisconsin (Original)": "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    "Dermatology": "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data",
    "Letter Recognition (E, F)": "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
    "Molecular Biology (Splice-junction Gene Sequences)": "https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/splice-junction-gene-sequences/splice.data",
    "Mushroom": "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
    "Iris": "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    "ISOLET (5)": "https://mikaelvincent.dev/datasets/isolet/isolet5.data",
    "Optical Recognition of Handwritten Digits": "https://mikaelvincent.dev/datasets/opticaldigits/optdigits.data",
    "Pen-Based Recognition of Handwritten Digits": "https://mikaelvincent.dev/datasets/pendigits/pendigits.data"
}
# COIL20 not implemented; I don't understand how images can be translated into categories; I can't find a pre-processed version either :)

In [4]:
# Set the true number of clusters for each dataset
n_clusters_dict = {
    "Soybean (Small)": 4,
    "Zoo": 7,
    "Heart Disease": 2,
    "Breast Cancer Wisconsin (Original)": 2,
    "Dermatology": 6,
    "Letter Recognition (E, F)": 2,
    "Molecular Biology (Splice-junction Gene Sequences)": 3,
    "Mushroom": 2,
    "Iris": 3,
    "ISOLET (5)": 26,
    "Optical Recognition of Handwritten Digits": 10,
    "Pen-Based Recognition of Handwritten Digits": 10
}

In [5]:
# List of datasets for clustering ensemble task
datasets_for_ensemble = [
    "Iris",
    "ISOLET (5)",
    "Optical Recognition of Handwritten Digits",
    "Pen-Based Recognition of Handwritten Digits"
]

In [6]:
# Set the number of runs for benchmarking
num_runs = 10

In [7]:
n_runs = 60  # Number of k-means runs for clustering ensemble

## Section: Data Preparation

Description: This section fetches, cleans, and prepares datasets for clustering.

In [8]:
dataframes = {}  # Dictionary to store cleaned dataframes

In [9]:
for name, url in dataset_urls.items():
    response = requests.get(url, verify=False)
    data = response.text
    
    # Convert the CSV/Text data into a DataFrame
    data_io = StringIO(data)
    df = pd.read_csv(data_io, header=None)

    # Set targets and features
    if name == "Letter Recognition (E, F)":
        y = df.iloc[:, 0]
        X = df.iloc[:, 1:]
    elif name == "Mushroom":
        y = df.iloc[:, 0]
        X = df.iloc[:, 1:]
    elif name == "Molecular Biology (Splice-junction Gene Sequences)":
        y = df.iloc[:, 0].str.strip()
        X = pd.DataFrame([list(seq.strip()) for seq in df.iloc[:, 2]])
    else:
        X, y = df.iloc[:, :-1], df.iloc[:, -1]

    # Drop columns with only 1 unique value
    for col in X.columns:
        if len(X[col].unique()) <= 1:
            X.drop(columns=[col], inplace=True) # Diregard warning as it is behaving as expected
    
    # Store in the dataframes dictionary
    dataframes[name] = {'features': X, 'targets': y}

In [10]:
def preprocess_datasets(dataframes):
    if 'Zoo' in dataframes:
        zoo_df = dataframes['Zoo']['features']
        zoo_df = zoo_df.drop(columns=[0])
        dataframes['Zoo']['features'] = zoo_df

    if 'Heart Disease' in dataframes:
        hd_df = dataframes['Heart Disease']['features']
        columns_to_drop = [0, 3, 4, 7, 9]
        hd_df = hd_df.drop(columns=hd_df.columns[columns_to_drop])
        dataframes['Heart Disease']['features'] = hd_df
        y_hd = dataframes['Heart Disease']['targets']
        dataframes['Heart Disease']['targets'] = y_hd.apply(lambda x: 0 if x == 0 else 1)
    
    if 'Breast Cancer Wisconsin (Original)' in dataframes:
        bcw_df = dataframes['Breast Cancer Wisconsin (Original)']['features']
        bcw_df = bcw_df.drop(columns=bcw_df.columns[0])
        dataframes['Breast Cancer Wisconsin (Original)']['features'] = bcw_df
    
    if 'Dermatology' in dataframes:
        derm_df = dataframes['Dermatology']['features']
        derm_df = derm_df.drop(columns=derm_df.columns[-1])
        dataframes['Dermatology']['features'] = derm_df

    if 'Letter Recognition (E, F)' in dataframes:
        lr_ef_df = dataframes['Letter Recognition (E, F)']['features']
        lr_ef_targets = dataframes['Letter Recognition (E, F)']['targets']
        mask = lr_ef_targets.isin(['E', 'F'])
        dataframes['Letter Recognition (E, F)']['features'] = lr_ef_df[mask]
        dataframes['Letter Recognition (E, F)']['targets'] = lr_ef_targets[mask]

    return dataframes

In [11]:
# Apply preprocessing to datasets
dataframes = preprocess_datasets(dataframes)

## Section: Clustering Ensemble on Numerical Datasets

In [12]:
def run_multiple_kmeans(features, n_clusters, n_runs):
    all_labels = []
    for i in range(n_runs):
        random_state = random.randint(0, 1000)
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=random_state, n_init=10)
        labels = kmeans.fit_predict(features)
        all_labels.append(labels)
    return np.array(all_labels).T

In [13]:
for dataset_name in datasets_for_ensemble:
    # Extracting features and targets from the preloaded datasets
    features = dataframes[dataset_name]["features"]
    targets = dataframes[dataset_name]["targets"]

    # Converting targets to numerical labels if they aren't already
    if targets.dtype.kind in 'O':  # Check if targets are object type (e.g., strings)
        targets = LabelEncoder().fit_transform(targets)

    # Determine the number of clusters from the unique elements in targets
    n_clusters = len(np.unique(targets))

    # Run multiple k-means and collect results
    ensembled_features = run_multiple_kmeans(features, n_clusters, n_runs)
    
    # Convert numpy array to DataFrame and replace the original data
    ensembled_features_df = pd.DataFrame(ensembled_features, index=features.index)
    dataframes[dataset_name]["features"] = ensembled_features_df

## Section: Clustering Algorithms and Helper Functions

Description: Define functions for different clustering techniques

In [14]:
def perform_kmodes(features, n_clusters):
    """Perform clustering using KModes algorithm."""
    km = KModes(n_clusters=n_clusters, init='random', n_init=5)
    clusters = km.fit_predict(features)
    return clusters

In [15]:
def perform_ordinal_encoding(features, true_labels, n_clusters):
    """Perform Ordinal Encoding followed by clustering."""
    encoder = LabelEncoder()
    features_encoded = features.apply(encoder.fit_transform)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    return kmeans.fit_predict(features_encoded, n_clusters)

In [16]:
def perform_one_hot_encoding(features, true_labels, n_clusters):
    """Perform One-Hot Encoding followed by clustering."""
    encoder = OneHotEncoder()
    features_encoded = encoder.fit_transform(features).toarray()
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    return kmeans.fit_predict(features_encoded, n_clusters)

In [17]:
def perform_link(features, n_clusters):
    encoder = OneHotEncoder()
    features_encoded = encoder.fit_transform(features).toarray()
    # Calculate pairwise dissimilarities (1 - similarity), ensuring non-negative distances
    ochiai_distance = pdist(features_encoded, lambda u, v: max(0, 1 - ochiai_coefficient_for_link(u, v)))
    link_matrix = linkage(ochiai_distance, method='average')
    clusters = fcluster(link_matrix, t=n_clusters, criterion='maxclust')
    return clusters

In [18]:
def ochiai_coefficient_for_link(b1, b2):
    intersection = np.dot(b1, b2)
    norm_b1 = np.sqrt(np.dot(b1, b1))
    norm_b2 = np.sqrt(np.dot(b2, b2))
    denominator = (norm_b1 * norm_b2)
    if denominator == 0:
        return 0  # Return 0 if either or both vectors are all zeros
    return intersection / denominator

In [19]:
def perform_cde(features, n_clusters):
    """Perform Categorical Data Embedding and clustering using t-SNE and k-Means."""
    encoder = OneHotEncoder(sparse_output=False)
    features_encoded = encoder.fit_transform(features)
    tsne_model = TSNE(n_components=2, perplexity=30, learning_rate=200)
    features_embedded = tsne_model.fit_transform(features_encoded)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    clusters = kmeans.fit_predict(features_embedded)
    return clusters

In [20]:
def perform_hierarchical_clustering(features, n_clusters, method='ward'):
    """Perform Hierarchical Clustering with handling for '?' characters and categorical variables."""
    
    # Handle '?' characters and replace them appropriately
    for column in features.columns:
        if features[column].dtype == object:
            # Check if the column contains '?'
            if '?' in features[column].unique():
                if features[column].str.isnumeric().any():
                    # Assume numeric column with missing values represented as '?'
                    # Convert '?' to NaN and then fill with the mean of the column
                    features[column] = pd.to_numeric(features[column], errors='coerce')
                    features[column].fillna(features[column].mean(), inplace=True)
                else:
                    # Categorical column
                    mode_value = features[column].mode()[0]
                    features[column] = features[column].replace('?', mode_value)
            # Encode categorical variables
            le = LabelEncoder()
            features[column] = le.fit_transform(features[column].astype(str))
        else:
            # Directly fill missing values with mean if any
            if features[column].isnull().any():
                features[column].fillna(features[column].mean(), inplace=True)

    # Ensure all data is in float format to avoid conversion errors in linkage
    features = features.astype(float)

    # Create the linkage matrix
    Z = linkage(features, method=method)

    # Create clusters by cutting the dendrogram at the specified number of clusters
    clusters = fcluster(Z, t=n_clusters, criterion='maxclust')
    return clusters

In [21]:
def perform_cdc_dr(features, n_clusters, embedding_method='SE', operation='Joint'):
    """
    Perform CDC_DR algorithm with specified graph embedding method and operation.
    :param features: DataFrame of features
    :param n_clusters: Number of clusters to form
    :param embedding_method: 'NE', 'SE', 'NMF', or 'AE'
    :param operation: 'Joint' or 'Mean'
    :return: clusters - Cluster labels for each sample
    """
    
    # Construct similarity graph from features
    graph = construct_similarity_graph(features)

    # Apply graph embedding technique
    embedded_graph = graph_embedding(graph, method=embedding_method)

    value_to_index = create_value_to_index_mapping(features)
    
    # Ensure integrated_data is 2D before clustering
    if operation == 'Joint':
        integrated_data = joint_operation(embedded_graph, features, value_to_index)
    elif operation == 'Mean':
        integrated_data = mean_operation(embedded_graph, features, value_to_index)
    else:
        raise ValueError("Operation must be either 'Joint' or 'Mean'.")
    
    # Impute missing values in integrated_data
    imputer = SimpleImputer(strategy='mean')
    integrated_data_imputed = imputer.fit_transform(integrated_data)
    
    # Cluster the integrated data
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    clusters = kmeans.fit_predict(integrated_data_imputed)

    return clusters

In [22]:
def construct_similarity_graph(features):
    """
    Construct a similarity graph from features based on categorical values.
    :param features: DataFrame of features, each row is a sample and columns are categorical features
    :return: graph - A NetworkX graph with nodes representing categorical values and weighted edges
    """
    # Step 1: Prepare all unique categorical values and their indices
    unique_values_dict = {}
    for column in features:
        unique_values = np.unique(features[column])
        for val in unique_values:
            unique_values_dict[f"{column}_{val}"] = np.where(features[column] == val)[0]
    
    # Step 2: Calculate similarity between all pairs of unique categorical values
    graph = nx.Graph()
    for (val1, indices1), (val2, indices2) in combinations(unique_values_dict.items(), 2):
        # Calculate similarity (e.g., using Ochiai coefficient)
        sim = ochiai_coefficient(indices1, indices2)  # Define this function based on your chosen similarity metric
        if sim > 0:  # If the similarity is non-zero, add an edge
            graph.add_edge(val1, val2, weight=sim)
    
    # Add all nodes explicitly in case some have no edges
    for val in unique_values_dict.keys():
        if val not in graph:
            graph.add_node(val)

    return graph

In [23]:
def ochiai_coefficient(indices1, indices2):
    """
    Calculate Ochiai coefficient between two sets of indices
    :param indices1: array-like list of indices for the first categorical value
    :param indices2: array-like list of indices for the second categorical value
    :return: Ochiai coefficient as float
    """
    set1 = set(indices1)
    set2 = set(indices2)
    intersection = len(set1.intersection(set2))
    if intersection == 0: return 0  # No overlap
    return intersection / np.sqrt(len(set1) * len(set2))  # Ochiai coefficient formula

In [24]:
def graph_embedding(graph, method='SE', dimensions=2):
    """
    Apply graph embedding method to the constructed graph.
    :param graph: NetworkX graph
    :param method: string representing the graph embedding method: 'NE', 'SE', 'NMF', 'AE'
    :param dimensions: the number of dimensions for the embedding
    :return: embedded_graph - An array-like embedded representation of the graph
    """
    # Convert graph to adjacency matrix and then to numpy ndarray
    adjacency_matrix = nx.to_numpy_matrix(graph)
    adjacency_matrix = np.asarray(adjacency_matrix)
    
    if method == 'NE':
        # Directly use the adjacency matrix as features (no embedding)
        embedded_graph = adjacency_matrix

    elif method == 'SE':
        # Apply Spectral Embedding
        embedding_model = SpectralEmbedding(n_components=dimensions)
        embedded_graph = embedding_model.fit_transform(adjacency_matrix)

    elif method == 'NMF':
        # Apply Non-negative Matrix Factorization for embedding
        model = NMF(n_components=dimensions, init='random', max_iter=10000)
        embedded_graph = model.fit_transform(adjacency_matrix)

    elif method == 'AE':
        # Apply Autoencoder for graph embedding
        n_nodes = adjacency_matrix.shape[0]
        input_dim = n_nodes
        autoencoder = Sequential([
            layers.Dense(64, activation='relu', input_shape=(input_dim,)),
            layers.Dense(dimensions, activation='relu'),  # Embedding layer
            layers.Dense(64, activation='relu'),
            layers.Dense(input_dim, activation='sigmoid')
        ])
        autoencoder.compile(optimizer='adam', loss='mse')
        adjacency_matrix_norm = adjacency_matrix / np.max(adjacency_matrix)  # Normalize adjacency matrix
        autoencoder.fit(adjacency_matrix_norm, adjacency_matrix_norm, epochs=50, verbose=0)
        encoder = Sequential(autoencoder.layers[:2])  # The first two layers are the encoder
        embedded_graph = encoder.predict(adjacency_matrix_norm, verbose=0)

    else:
        raise NotImplementedError(f"Graph embedding method {method} is not implemented.")
    
    return np.array(embedded_graph)

In [25]:
def create_value_to_index_mapping(features):
    """
    Create a mapping from each unique categorical value to a unique index.
    :param features: DataFrame of features, each column is a categorical feature
    :return: Dictionary of value to index mapping
    """
    # Extracting unique values from each feature
    unique_values = set()
    for column in features.columns:
        unique_values.update(features[column].unique())

    # Creating a mapping from unique values to an index
    value_to_index = {value: idx for idx, value in enumerate(unique_values)}
    return value_to_index

In [26]:
def joint_operation(embedded_graph, features, value_to_index):
    # Concatenates the embeddings for each categorical value in each sample
    joint_embedded = []
    for _, row in features.iterrows():
        joint_vector = []
        for value in row:
            index = value_to_index[value]  # Map each categorical value to its index in the embedded graph
            joint_vector.extend(embedded_graph[index])
        joint_embedded.append(joint_vector)
    return np.array(joint_embedded)

In [27]:
def mean_operation(embedded_graph, features, value_to_index):
    # Calculates the mean of the embeddings for each categorical value in each sample
    mean_embedded = []
    for _, row in features.iterrows():
        vectors = [embedded_graph[value_to_index[value]] for value in row]
        mean_vector = np.mean(vectors, axis=0)
        mean_embedded.append(mean_vector)
    return np.array(mean_embedded)

## Section: Main Execution - Running Clustering Algorithms

Description: This section executes the defined clustering algorithms on the prepared datasets and collects the results.

In [28]:
def run_clustering_algorithms(dataframes, n_clusters_dict, num_runs=10):
    results_list = []
    for name, data in dataframes.items():
        print("Processing:", name)
        features = data['features']
        true_labels = data['targets'].squeeze()  # Assuming targets are in a single column
        n_clusters = n_clusters_dict.get(name, 2)  # Default to 2 clusters if not specified

        metrics = {'KModes': [], 'Ordinal': [], 'One-Hot': [], 'Link': [], 'CDE': [], 'Hierarchical': []}  # Initialize a dictionary to store results for each method

        # Include CDC_DR methods in metrics dictionary
        embedding_methods = ['NE', 'SE', 'NMF', 'AE']  # Non-Embedding, Spectral Embedding, Nonnegative Matrix Factorization, Autoencoder
        operations = ['Joint', 'Mean']  # The two types of operations
        
        for em in embedding_methods:
            for op in operations:
                key_name = f"CDC_DR+{em} ({op})"
                metrics[key_name] = []

        for _ in range(num_runs):
            # KModes
            km_clusters = perform_kmodes(features, n_clusters)
            ari, nmi, fmi = calculate_metrics(true_labels, km_clusters)
            metrics['KModes'].append((ari, nmi, fmi))

            # Ordinal Encoding
            ord_clusters = perform_ordinal_encoding(features, true_labels, n_clusters)
            ari, nmi, fmi = calculate_metrics(true_labels, ord_clusters)
            metrics['Ordinal'].append((ari, nmi, fmi))

            # One-Hot Encoding
            oh_clusters = perform_one_hot_encoding(features, true_labels, n_clusters)
            ari, nmi, fmi = calculate_metrics(true_labels, oh_clusters)
            metrics['One-Hot'].append((ari, nmi, fmi))

            # Link with Ochiai Coefficient
            link_clusters = perform_link(features, n_clusters)
            ari, nmi, fmi = calculate_metrics(true_labels, link_clusters)
            metrics['Link'].append((ari, nmi, fmi))

            # CDE with t-SNE and k-Means
            cde_clusters = perform_cde(features, n_clusters)
            ari, nmi, fmi = calculate_metrics(true_labels, cde_clusters)
            metrics['CDE'].append((ari, nmi, fmi))

            # Hierarchical Clustering
            hier_clusters = perform_hierarchical_clustering(features, n_clusters)
            ari, nmi, fmi = ARI(true_labels, hier_clusters), NMI(true_labels, hier_clusters), FMI(true_labels, hier_clusters)
            metrics['Hierarchical'].append((ari, nmi, fmi))

            # CDC_DR with various embedding methods and operations
            for embedding_method in ['NE', 'SE', 'NMF', 'AE']:
                for operation in ['Joint', 'Mean']:
                    cdc_dr_clusters = perform_cdc_dr(features, n_clusters, embedding_method, operation)
                    ari, nmi, fmi = calculate_metrics(true_labels, cdc_dr_clusters)
                    metrics[f"CDC_DR+{embedding_method} ({operation})"].append((ari, nmi, fmi))

        # Calculate mean and standard deviation for each method and append to results list
        for method, values in metrics.items():
            ari_vals, nmi_vals, fmi_vals = zip(*values)
            ari_mean, ari_std = np.mean(ari_vals), np.std(ari_vals)
            nmi_mean, nmi_std = np.mean(nmi_vals), np.std(nmi_vals)
            fmi_mean, fmi_std = np.mean(fmi_vals), np.std(fmi_vals)
            results_list.append({
                "Dataset": name,
                "Method": method,
                "ARI": f"{ari_mean:.4f}±{ari_std:.2f}",
                "NMI": f"{nmi_mean:.4f}±{nmi_std:.2f}",
                "FMI": f"{fmi_mean:.4f}±{fmi_std:.2f}"
            })

    # Convert list of dictionaries to DataFrame for results
    results_df = pd.DataFrame(results_list)
    return results_df

In [29]:
def calculate_metrics(true_labels, predicted_labels):
    """Calculate clustering metrics: Adjusted Rand Index, Normalized Mutual Information, and Folkes-Mallows Index."""
    return ARI(true_labels, predicted_labels), NMI(true_labels, predicted_labels), FMI(true_labels, predicted_labels)

In [30]:
def reformat_results(results_df):
    # Expanding 'ARI', 'NMI', and 'FMI' columns into multiple rows with a new 'Metric' column
    expanded_df = pd.melt(results_df, id_vars=["Dataset", "Method"], value_vars=["ARI", "NMI", "FMI"], var_name="Metric", value_name="Value")
    expanded_df[['Metric_Value', 'Std']] = expanded_df['Value'].str.split('±', expand=True)
    expanded_df.drop(columns=['Value'], inplace=True)  # Removing the original combined column
    
    # Convert the 'Metric_Value' and 'Std' columns to numeric types
    expanded_df['Metric_Value'] = expanded_df['Metric_Value'].astype(float)
    expanded_df['Std'] = expanded_df['Std'].astype(float)

    # Concatenate the metric value and standard deviation back into a single string
    expanded_df['Metric_Value'] = expanded_df['Metric_Value'].map('{:.4f}'.format) + "±" + expanded_df['Std'].map('{:.2f}'.format)
    
    # Ensuring the order of datasets and methods remains consistent with the original DataFrame
    dataset_order = results_df['Dataset'].unique()
    method_order = results_df['Method'].unique()

    # Creating a pivot table to restructure the DataFrame as required
    pivot_df = expanded_df.pivot_table(index=["Dataset", "Metric"], columns="Method", values="Metric_Value", aggfunc='first')
    
    # Reindexing the pivot table to maintain the original order
    pivot_df = pivot_df.reindex(dataset_order, level='Dataset')
    pivot_df = pivot_df.reindex(method_order, axis='columns')

    return pivot_df

In [31]:
# Running all algorithms and storing the results
results = run_clustering_algorithms(dataframes, n_clusters_dict, num_runs)

Processing: Soybean (Small)
Processing: Zoo
Processing: Heart Disease
Processing: Breast Cancer Wisconsin (Original)
Processing: Dermatology
Processing: Letter Recognition (E, F)
Processing: Molecular Biology (Splice-junction Gene Sequences)
Processing: Mushroom
Processing: Iris
Processing: ISOLET (5)
Processing: Optical Recognition of Handwritten Digits
Processing: Pen-Based Recognition of Handwritten Digits


In [32]:
# Use the function to reformat the results
formatted_results = reformat_results(results)

## Section: Presentation of Results

In [33]:
# Prepare data for the table
table_data = []

for name, content in dataframes.items():
    X, y = content['features'], content['targets']
    table_data.append({
        "Name": name,
        "Number of Samples": X.shape[0],
        "Number of Features": X.shape[1],
        "Number of Unique Values in Target": len(pd.unique(y))
    })

# Convert table data into a DataFrame for pretty printing
table_df = pd.DataFrame(table_data)
print(table_df)

print("\n* Number of features for numerical datasets are based on the # of runs of kmeans in the clustering ensemble. For an accurate measure of the original versions of the datasets, run this before running the clustering ensemble.")

                                                 Name  Number of Samples  Number of Features  Number of Unique Values in Target
0                                     Soybean (Small)                 47                  21                                  4
1                                                 Zoo                101                  16                                  7
2                                       Heart Disease                303                   8                                  2
3                  Breast Cancer Wisconsin (Original)                699                   9                                  2
4                                         Dermatology                366                  33                                  6
5                           Letter Recognition (E, F)               1543                  16                                  2
6   Molecular Biology (Splice-junction Gene Sequen...               3190                  60            

In [35]:
# Print the formatted results
print(formatted_results)

Method                                                          KModes      Ordinal      One-Hot          Link          CDE Hierarchical CDC_DR+NE (Joint) CDC_DR+NE (Mean) CDC_DR+SE (Joint) CDC_DR+SE (Mean) CDC_DR+NMF (Joint) CDC_DR+NMF (Mean) CDC_DR+AE (Joint) CDC_DR+AE (Mean)
Dataset                                            Metric                                                                                                                                                                                                                             
Soybean (Small)                                    ARI     0.9097±0.14  0.5458±0.00  1.0000±0.00   1.0000±0.00  0.9420±0.08  1.0000±0.00       1.0000±0.00      0.0120±0.01       0.5562±0.00      0.1079±0.00        1.0000±0.00      -0.0046±0.00       0.7776±0.32      0.0013±0.03
                                                   FMI     0.9319±0.11  0.6573±0.00  1.0000±0.00   1.0000±0.00  0.9565±0.06  1.0000±0.00       1.0000±0.00      0.2

# Questions

The Adjusted Rand Index (ARI), Normalized Mutual Information (NMI), and Fowlkes-Mallows Index (FMI) are all metrics used to evaluate the performance of clustering algorithms by comparing the clustering results with ground truth labels.

## Adjusted Rand Index (ARI)

- **Advantages**: Corrects for chance grouping, making it more reliable in indicating actual similarities between clusters and ground truth. Values range from -1 to 1, where 1 indicates perfect agreement, and values less than 0 indicate independent labelings.
- **Disadvantages**: Can be more computationally intensive than simpler metrics; less intuitive to interpret without statistical background.
- **Use Case**: Best for benchmarking different clustering algorithms when a ground truth is available.

## Normalized Mutual Information (NMI)

- **Advantages**: Also adjusts for chance and is normalized between 0 and 1, where 1 indicates a perfect match between clusters and ground truth. It measures mutual information in a way that accounts for the sizes of different clusters.
- **Disadvantages**: Its normalization can sometimes lead to misleading results, especially if the number of clusters is very different from the number of ground truth categories.
- **Use Case**: Useful in cases where clusters are of varying sizes and a normalized measure is preferred.

## Fowlkes-Mallows Index (FMI)

- **Advantages**: Based on the geometric mean of precision and recall, providing a balance between them. Simple to calculate and understand, with values ranging from 0 to 1, where 1 indicates perfect clustering.
- **Disadvantages**: Does not adjust for chance, which can make it overly optimistic in scenarios where random clustering might appear effective.
- **Use Case**: Effective for quick assessments where computational simplicity is needed and the number of clusters is close to the number of categories in ground truth.

## When to Use Each

- **ARI** is preferred when an accurate and chance-corrected measure is critical.
- **NMI** is suitable for comparing clusters of varying sizes and when normalization of mutual information is beneficial.
- **FMI** is good for quick, intuitive assessments of clustering when the expected number of clusters is not significantly different from the number of ground truth categories.