In [1]:
#!pip3 install umap-learn
#!pip3 install leidenalg

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
import umap.umap_ as umap
import leidenalg
import igraph as ig
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Preprocess data
def preprocess_data(data, dataset_name):
    print(f"Preprocessing {dataset_name} data...")
    # Log transformation to reduce skewness (add small constant to avoid log(0))
    log_data = np.log1p(data)
    # Standardize the data
    scaler = StandardScaler()
    preprocessed_data = scaler.fit_transform(log_data)
    # Filter low variance genes
    print(f"Filtering low variance genes for {dataset_name}...")
    variance_threshold = 0.1
    variances = np.var(preprocessed_data, axis=0)
    high_var_indices = np.where(variances > variance_threshold)[0]
    preprocessed_data = preprocessed_data[:, high_var_indices]
    print(f"{dataset_name} data shape after filtering: {preprocessed_data.shape}")
    return preprocessed_data

In [4]:
# Apply dimensionality reduction
def apply_dim_reduction(preprocessed_data, labels, dataset_name):
    print(f"Applying dimensionality reduction for {dataset_name}...")

    # Apply PCA
    print("Applying PCA...")
    n_components = min(50, preprocessed_data.shape[1], preprocessed_data.shape[0])
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(preprocessed_data)
    print(f"PCA data shape: {pca_data.shape}")

    # Explained variance
    explained_var = pca.explained_variance_ratio_
    cumulative_var = np.cumsum(explained_var)
    print(f"Explained variance by first 10 components: {explained_var[:10]}")
    print(f"Cumulative explained variance by first 10 components: {cumulative_var[:10]}")

    # Plot explained variance
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, len(explained_var) + 1), explained_var, alpha=0.5, align='center')
    plt.step(range(1, len(cumulative_var) + 1), cumulative_var, where='mid')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.title(f'Explained Variance by Principal Components - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f'pca_explained_variance_{dataset_name.lower()}.png')
    plt.close()

    # Apply t-SNE
    print("Applying t-SNE...")
    tsne = TSNE(n_components=2, perplexity=min(30, preprocessed_data.shape[0] - 1), random_state=42)
    tsne_data = tsne.fit_transform(preprocessed_data)
    print(f"t-SNE data shape: {tsne_data.shape}")

    # Apply UMAP
    print("Applying UMAP...")
    n_neighbors = min(15, preprocessed_data.shape[0] - 1)
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=0.1, random_state=42)
    umap_data = reducer.fit_transform(preprocessed_data)
    print(f"UMAP data shape: {umap_data.shape}")

    # Store reduced data in a dictionary for easy access
    reduced_data = {
        'pca': pca_data,
        'tsne': tsne_data,
        'umap': umap_data
    }

    # Visualize dimensionality reduction results with labels
    plt.figure(figsize=(18, 6))

    # Set color palette for the number of unique labels
    unique_labels = np.unique(labels)
    n_labels = len(unique_labels)
    color_palette = sns.color_palette("hls", n_labels)
    label_color_map = {label: color_palette[i] for i, label in enumerate(unique_labels)}

    # PCA Plot
    plt.subplot(1, 3, 1)
    for label in unique_labels:
        idx = labels == label
        plt.scatter(pca_data[idx, 0], pca_data[idx, 1], c=[label_color_map[label]], label=f'Class {label}', alpha=0.7)
    plt.title(f'PCA Projection - {dataset_name}')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.legend()

    # t-SNE Plot
    plt.subplot(1, 3, 2)
    for label in unique_labels:
        idx = labels == label
        plt.scatter(tsne_data[idx, 0], tsne_data[idx, 1], c=[label_color_map[label]], label=f'Class {label}', alpha=0.7)
    plt.title(f't-SNE Projection - {dataset_name}')
    plt.xlabel('t-SNE1')
    plt.ylabel('t-SNE2')
    plt.legend()

    # UMAP Plot
    plt.subplot(1, 3, 3)
    for label in unique_labels:
        idx = labels == label
        plt.scatter(umap_data[idx, 0], umap_data[idx, 1], c=[label_color_map[label]], label=f'Class {label}', alpha=0.7)
    plt.title(f'UMAP Projection - {dataset_name}')
    plt.xlabel('UMAP1')
    plt.ylabel('UMAP2')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'dimensionality_reduction_comparison_{dataset_name.lower()}.png')
    plt.close()

    return reduced_data

In [5]:
# Apply K-means clustering
def apply_kmeans(reduced_data, labels, dataset_name):
    print(f"Applying K-means clustering for {dataset_name}...")
    dim_reduction_methods = list(reduced_data.keys())
    k_range = range(2, min(11, reduced_data['pca'].shape[0]))
    kmeans_results = {}

    for method in dim_reduction_methods:
        # Calculate inertia and silhouette scores for different k values
        inertia = []
        silhouette_scores = []

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(reduced_data[method])
            inertia.append(kmeans.inertia_)

            # Calculate silhouette score if k > 1
            if k > 1:
                silhouette_scores.append(silhouette_score(reduced_data[method], cluster_labels))
            else:
                silhouette_scores.append(0)  # Silhouette score not defined for k=1

        # Find optimal k using silhouette score
        optimal_k = k_range[np.argmax(silhouette_scores)]

        # Rerun K-means with optimal k
        optimal_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        cluster_labels = optimal_kmeans.fit_predict(reduced_data[method])

        # Store results
        kmeans_results[method] = {
            'labels': cluster_labels,
            'optimal_k': optimal_k,
            'inertia': inertia,
            'silhouette_scores': silhouette_scores
        }

        print(f"K-means on {method} {dataset_name} data: Optimal k = {optimal_k}, Silhouette Score = {silhouette_scores[optimal_k-2]:.4f}")

        # Calculate Adjusted Rand Index if true labels are available
        ari = adjusted_rand_score(labels, cluster_labels)
        print(f"K-means on {method} {dataset_name} data: Adjusted Rand Index = {ari:.4f}")

        # Plot K-means results
        plt.figure(figsize=(18, 6))

        # Plot elbow curve
        plt.subplot(1, 3, 1)
        plt.plot(k_range, inertia, 'o-')
        plt.xlabel('Number of clusters (k)')
        plt.ylabel('Inertia')
        plt.title('Elbow Method for Optimal k')
        plt.axvline(x=optimal_k, color='r', linestyle='--')

        # Plot silhouette scores
        plt.subplot(1, 3, 2)
        plt.plot(k_range, silhouette_scores, 'o-')
        plt.xlabel('Number of clusters (k)')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Scores for Different k')
        plt.axvline(x=optimal_k, color='r', linestyle='--')

        # Plot clustering results
        plt.subplot(1, 3, 3)
        scatter = plt.scatter(reduced_data[method][:, 0], reduced_data[method][:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.7)
        plt.colorbar(scatter, label='Cluster')
        plt.title(f'K-means Clustering on {method.upper()} (k={optimal_k})')
        plt.xlabel(f'{method.upper()}1')
        plt.ylabel(f'{method.upper()}2')

        plt.tight_layout()
        plt.savefig(f'kmeans_{method}_results_{dataset_name.lower()}.png')
        plt.close()

    return kmeans_results

In [6]:
# Apply hierarchical clustering
def apply_hierarchical(reduced_data, labels, dataset_name):
    print(f"Applying hierarchical clustering for {dataset_name}...")
    dim_reduction_methods = list(reduced_data.keys())
    k_range = range(2, min(11, reduced_data['pca'].shape[0]))
    hierarchical_results = {}

    for method in dim_reduction_methods:
        # Calculate linkage matrix for dendrogram
        Z = linkage(reduced_data[method], method='ward')

        # Calculate silhouette scores for different numbers of clusters
        silhouette_scores = []

        for k in k_range:
            hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
            cluster_labels = hc.fit_predict(reduced_data[method])

            # Calculate silhouette score if k > 1
            if k > 1:
                silhouette_scores.append(silhouette_score(reduced_data[method], cluster_labels))
            else:
                silhouette_scores.append(0)  # Silhouette score not defined for k=1

        # Find optimal k using silhouette score
        optimal_k = k_range[np.argmax(silhouette_scores)]

        # Rerun hierarchical clustering with optimal k
        optimal_hc = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
        cluster_labels = optimal_hc.fit_predict(reduced_data[method])

        # Store results
        hierarchical_results[method] = {
            'labels': cluster_labels,
            'optimal_k': optimal_k,
            'linkage': Z,
            'silhouette_scores': silhouette_scores
        }

        print(f"Hierarchical on {method} {dataset_name} data: Optimal k = {optimal_k}, Silhouette Score = {silhouette_scores[optimal_k-2]:.4f}")

        # Calculate Adjusted Rand Index if true labels are available
        ari = adjusted_rand_score(labels, cluster_labels)
        print(f"Hierarchical on {method} {dataset_name} data: Adjusted Rand Index = {ari:.4f}")

        # Plot dendrogram
        plt.figure(figsize=(12, 8))
        dendrogram(Z, leaf_rotation=90., leaf_font_size=10., color_threshold=None, truncate_mode='lastp', p=30)
        plt.title(f'Hierarchical Clustering Dendrogram ({method.upper()} - {dataset_name})')
        plt.xlabel('Sample index')
        plt.ylabel('Distance')
        plt.axhline(y=Z[-optimal_k+1, 2], c='k', ls='--', lw=1)  # Add line at cut height for optimal k
        plt.tight_layout()
        plt.savefig(f'hierarchical_{method}_dendrogram_{dataset_name.lower()}.png')
        plt.close()

        # Plot clustering results
        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(reduced_data[method][:, 0], reduced_data[method][:, 1], c=cluster_labels, cmap='viridis', s=50, alpha=0.7)
        plt.colorbar(scatter, label='Cluster')
        plt.title(f'Hierarchical Clustering on {method.upper()} (k={optimal_k} - {dataset_name})')
        plt.xlabel(f'{method.upper()}1')
        plt.ylabel(f'{method.upper()}2')
        plt.tight_layout()
        plt.savefig(f'hierarchical_{method}_clusters_{dataset_name.lower()}.png')
        plt.close()

    return hierarchical_results

In [7]:
# Apply Leiden clustering
def apply_leiden(reduced_data, labels, dataset_name):
    print(f"Applying Leiden clustering for {dataset_name} dataset...")
    dim_reduction_methods = list(reduced_data.keys())
    resolution_range = np.arange(0.1, 2.1, 0.1)
    leiden_results = {}

    for method in dim_reduction_methods:
        # Set k nearest neighbors parameter
        k_nearest_neighbors = min(15, reduced_data[method].shape[0] - 1)

        # Compute distance matrix
        dist_matrix = squareform(pdist(reduced_data[method], metric='euclidean'))

        # Create KNN graph
        knn_graph = np.zeros((reduced_data[method].shape[0], reduced_data[method].shape[0]))
        for i in range(reduced_data[method].shape[0]):
            # Find k nearest neighbors for each sample
            indices = np.argsort(dist_matrix[i])[1:k_nearest_neighbors+1]  # exclude self
            knn_graph[i, indices] = 1

        # Make the graph symmetric
        knn_graph = np.maximum(knn_graph, knn_graph.T)

        # Convert to igraph
        g = ig.Graph.Adjacency((knn_graph > 0).tolist(), mode='undirected')

        # Evaluate different resolutions
        silhouette_scores = []
        cluster_labels_list = []

        for resolution in resolution_range:
            # Apply Leiden algorithm
            partition = leidenalg.find_partition(g, leidenalg.RBConfigurationVertexPartition,
                                                resolution_parameter=resolution)

            cluster_labels = np.array(partition.membership)
            cluster_labels_list.append(cluster_labels)

            # Calculate silhouette score if more than one cluster
            n_clusters = len(set(cluster_labels))
            if n_clusters > 1:
                silhouette_scores.append(silhouette_score(reduced_data[method], cluster_labels))
            else:
                silhouette_scores.append(0)  # Silhouette score not defined for k=1

        # Find optimal resolution parameter
        optimal_idx = np.argmax(silhouette_scores)
        optimal_resolution = resolution_range[optimal_idx]
        optimal_cluster_labels = cluster_labels_list[optimal_idx]
        optimal_n_clusters = len(set(optimal_cluster_labels))

        # Store clustering results
        leiden_results[method] = {
            'labels': optimal_cluster_labels,
            'optimal_resolution': optimal_resolution,
            'silhouette_scores': silhouette_scores,
            'n_clusters': optimal_n_clusters
        }

        print(f"Leiden on {method} data ({dataset_name}): Optimal resolution = {optimal_resolution:.1f}, "
              f"Number of clusters = {optimal_n_clusters}, "
              f"Silhouette Score = {silhouette_scores[optimal_idx]:.4f}")

        # Calculate Adjusted Rand Index if true labels are available
        ari = adjusted_rand_score(labels, optimal_cluster_labels)
        print(f"Leiden on {method} data ({dataset_name}): Adjusted Rand Index = {ari:.4f}")

        # Plot Leiden results
        plt.figure(figsize=(18, 6))

        # Plot silhouette scores for different resolutions
        plt.subplot(1, 3, 1)
        plt.plot(resolution_range, silhouette_scores, 'o-')
        plt.xlabel('Resolution Parameter')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Scores for Different Resolutions')
        plt.axvline(x=optimal_resolution, color='r', linestyle='--')

        # Plot number of clusters for different resolutions
        plt.subplot(1, 3, 2)
        n_clusters_list = [len(set(labels)) for labels in cluster_labels_list]
        plt.plot(resolution_range, n_clusters_list, 'o-')
        plt.xlabel('Resolution Parameter')
        plt.ylabel('Number of Clusters')
        plt.title('Number of Clusters for Different Resolutions')
        plt.axvline(x=optimal_resolution, color='r', linestyle='--')

        # Plot clustering results
        plt.subplot(1, 3, 3)
        scatter = plt.scatter(reduced_data[method][:, 0], reduced_data[method][:, 1],
                            c=optimal_cluster_labels, cmap='viridis', s=50, alpha=0.7)
        plt.colorbar(scatter, label='Cluster')
        plt.title(f'Leiden Clustering on {method.upper()} (res={optimal_resolution:.1f}, k={optimal_n_clusters})')
        plt.xlabel(f'{method.upper()}1')
        plt.ylabel(f'{method.upper()}2')

        plt.tight_layout()
        plt.savefig(f'leiden_{method}_results_{dataset_name.lower()}.png')
        plt.close()

    return leiden_results

In [8]:
# Calculate Jaccard Index
def jaccard_index(labels_true, labels_pred):
    n = len(labels_true)
    pairs_true = set()
    pairs_pred = set()

    for i in range(n):
        for j in range(i+1, n):
            if labels_true[i] == labels_true[j]:
                pairs_true.add((i, j))
            if labels_pred[i] == labels_pred[j]:
                pairs_pred.add((i, j))

    intersection = pairs_true.intersection(pairs_pred)
    union = pairs_true.union(pairs_pred)

    if len(union) == 0:
        return 0

    return len(intersection) / len(union)

In [9]:
# Compare all methods
def compare_methods(kmeans_results, hierarchical_results, leiden_results, labels, dataset_name):
    print(f"Comparing all clustering methods for {dataset_name}...")
    comparison_results = []
    dim_reduction_methods = list(kmeans_results.keys())

    for method in dim_reduction_methods:
        kmeans_labels = kmeans_results[method]['labels']
        hierarchical_labels = hierarchical_results[method]['labels']
        leiden_labels = leiden_results[method]['labels']

        kmeans_ari = adjusted_rand_score(labels, kmeans_labels)
        hierarchical_ari = adjusted_rand_score(labels, hierarchical_labels)
        leiden_ari = adjusted_rand_score(labels, leiden_labels)

        kmeans_ji = jaccard_index(labels, kmeans_labels)
        hierarchical_ji = jaccard_index(labels, hierarchical_labels)
        leiden_ji = jaccard_index(labels, leiden_labels)

        comparison_results.append({
            'Dim Reduction': method.upper(),
            'Method': 'K-means',
            'Clusters': kmeans_results[method]['optimal_k'],
            'ARI': kmeans_ari,
            'Jaccard': kmeans_ji,
            'Silhouette': max(kmeans_results[method]['silhouette_scores'])
        })

        comparison_results.append({
            'Dim Reduction': method.upper(),
            'Method': 'Hierarchical',
            'Clusters': hierarchical_results[method]['optimal_k'],
            'ARI': hierarchical_ari,
            'Jaccard': hierarchical_ji,
            'Silhouette': max(hierarchical_results[method]['silhouette_scores'])
        })

        comparison_results.append({
            'Dim Reduction': method.upper(),
            'Method': 'Leiden',
            'Clusters': leiden_results[method]['n_clusters'],
            'ARI': leiden_ari,
            'Jaccard': leiden_ji,
            'Silhouette': max(leiden_results[method]['silhouette_scores'])
        })

    # Create comparison DataFrame
    comparison_df = pd.DataFrame(comparison_results)
    print(comparison_df)

    # Save comparison results
    comparison_df.to_csv(f'clustering_comparison_results_{dataset_name.lower()}.csv', index=False)

    # Create heatmaps of comparison results
    plt.figure(figsize=(12, 6))
    heatmap_data = comparison_df.pivot_table(
        index='Method',
        columns='Dim Reduction',
        values='ARI'
    )
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.3f')
    plt.title(f'Adjusted Rand Index Comparison - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f'ari_comparison_heatmap_{dataset_name.lower()}.png')
    plt.close()

    # Create a heatmap for Jaccard Index
    plt.figure(figsize=(12, 6))
    heatmap_data = comparison_df.pivot_table(
        index='Method',
        columns='Dim Reduction',
        values='Jaccard'
    )
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.3f')
    plt.title(f'Jaccard Index Comparison - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f'jaccard_comparison_heatmap_{dataset_name.lower()}.png')
    plt.close()

    # Create a heatmap for Silhouette Score
    plt.figure(figsize=(12, 6))
    heatmap_data = comparison_df.pivot_table(
        index='Method',
        columns='Dim Reduction',
        values='Silhouette'
    )
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.3f')
    plt.title(f'Silhouette Score Comparison - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f'silhouette_comparison_heatmap_{dataset_name.lower()}.png')
    plt.close()

    return comparison_df

In [10]:
# Load Arcene dataset
'''
print("Loading Arcene dataset...")
arcene_train_path = r"/Users/leilamichal/Desktop/Genomics/Project/arcene/ARCENE/arcene_train.data"
arcene_labels_path = r"/Users/leilamichal/Desktop/Genomics/Project/arcene/ARCENE/arcene_train.labels"
data_arcene = pd.read_csv(arcene_train_path, delim_whitespace=True, header=None)
labels_arcene = pd.read_csv(arcene_labels_path, delim_whitespace=True, header=None)[0].values
print("Arcene Train Shape:", data_arcene.shape)
'''

'\nprint("Loading Arcene dataset...")\narcene_train_path = r"/Users/leilamichal/Desktop/Genomics/Project/arcene/ARCENE/arcene_train.data"\narcene_labels_path = r"/Users/leilamichal/Desktop/Genomics/Project/arcene/ARCENE/arcene_train.labels"\ndata_arcene = pd.read_csv(arcene_train_path, delim_whitespace=True, header=None)\nlabels_arcene = pd.read_csv(arcene_labels_path, delim_whitespace=True, header=None)[0].values\nprint("Arcene Train Shape:", data_arcene.shape)\n'

In [11]:
# Process Arcene dataset
'''
print("\n====== PROCESSING ARCENE DATASET ======\n")
preprocessed_arcene = preprocess_data(data_arcene, "Arcene")
reduced_data_arcene = apply_dim_reduction(preprocessed_arcene, labels_arcene, "Arcene")
kmeans_results_arcene = apply_kmeans(reduced_data_arcene, labels_arcene, "Arcene")
hierarchical_results_arcene = apply_hierarchical(reduced_data_arcene, labels_arcene, "Arcene")
leiden_results_arcene = apply_leiden(reduced_data_arcene, labels_arcene, "Arcene")
comparison_arcene = compare_methods(kmeans_results_arcene, hierarchical_results_arcene, leiden_results_arcene, labels_arcene, "Arcene")
'''



In [12]:
# Load Gene Expression dataset
print("Loading Gene Expression dataset...")
gene_data_path = r"/Users/leilamichal/Desktop/Genomics/Project/Gene Expression Cancer RNA-Seq/data.csv"
gene_labels_path = r"/Users/leilamichal/Desktop/Genomics/Project/Gene Expression Cancer RNA-Seq/labels.csv"
gene_data = pd.read_csv(gene_data_path)
gene_labels = pd.read_csv(gene_labels_path)
data_gene_exp = gene_data.iloc[:, 1:]  # Skip first column if it's an ID
labels_gene_exp = gene_labels["Class"].values
print("Gene Expression Data Shape:", data_gene_exp.shape)

Loading Gene Expression dataset...
Gene Expression Data Shape: (801, 20531)


In [13]:
# Process Gene Expression dataset
print("\n====== PROCESSING GENE EXPRESSION DATASET ======\n")
preprocessed_gene_exp = preprocess_data(data_gene_exp, "GeneExp")
reduced_data_gene_exp = apply_dim_reduction(preprocessed_gene_exp, labels_gene_exp, "GeneExp")
kmeans_results_gene_exp = apply_kmeans(reduced_data_gene_exp, labels_gene_exp, "GeneExp")
hierarchical_results_gene_exp = apply_hierarchical(reduced_data_gene_exp, labels_gene_exp, "GeneExp")
leiden_results_gene_exp = apply_leiden(reduced_data_gene_exp, labels_gene_exp, "GeneExp")
comparison_gene_exp = compare_methods(kmeans_results_gene_exp, hierarchical_results_gene_exp, leiden_results_gene_exp, labels_gene_exp, "GeneExp")



Preprocessing GeneExp data...
Filtering low variance genes for GeneExp...
GeneExp data shape after filtering: (801, 20264)
Applying dimensionality reduction for GeneExp...
Applying PCA...
PCA data shape: (801, 50)
Explained variance by first 10 components: [0.10444959 0.08560711 0.07638704 0.05198099 0.04065827 0.02890545
 0.0223515  0.02086762 0.01626109 0.01196373]
Cumulative explained variance by first 10 components: [0.10444959 0.1900567  0.26644374 0.31842473 0.35908301 0.38798845
 0.41033995 0.43120757 0.44746866 0.45943239]
Applying t-SNE...
t-SNE data shape: (801, 2)
Applying UMAP...


  warn(


UMAP data shape: (801, 2)
Applying K-means clustering for GeneExp...
K-means on pca GeneExp data: Optimal k = 7, Silhouette Score = 0.2462
K-means on pca GeneExp data: Adjusted Rand Index = 0.7979
K-means on tsne GeneExp data: Optimal k = 5, Silhouette Score = 0.7341
K-means on tsne GeneExp data: Adjusted Rand Index = 0.9925
K-means on umap GeneExp data: Optimal k = 5, Silhouette Score = 0.8891
K-means on umap GeneExp data: Adjusted Rand Index = 0.9925
Applying hierarchical clustering for GeneExp...
Hierarchical on pca GeneExp data: Optimal k = 7, Silhouette Score = 0.2463
Hierarchical on pca GeneExp data: Adjusted Rand Index = 0.8295
Hierarchical on tsne GeneExp data: Optimal k = 5, Silhouette Score = 0.7341
Hierarchical on tsne GeneExp data: Adjusted Rand Index = 0.9925
Hierarchical on umap GeneExp data: Optimal k = 5, Silhouette Score = 0.8891
Hierarchical on umap GeneExp data: Adjusted Rand Index = 0.9925
Applying Leiden clustering for GeneExp dataset...
Leiden on pca data (GeneExp

In [14]:
print("\n====== ANALYSIS COMPLETE ======\n")
#print("Results for Arcene dataset:")
#print(comparison_arcene)
print("\nResults for Gene Expression dataset:")
print(comparison_gene_exp)




Results for Gene Expression dataset:
  Dim Reduction        Method  Clusters       ARI   Jaccard  Silhouette
0           PCA       K-means         7  0.797902  0.724120    0.246198
1           PCA  Hierarchical         7  0.829469  0.763170    0.246290
2           PCA        Leiden         6  0.887511  0.839177    0.239471
3          TSNE       K-means         5  0.992538  0.988756    0.734129
4          TSNE  Hierarchical         5  0.992538  0.988756    0.734129
5          TSNE        Leiden         7  0.756781  0.672646    0.661165
6          UMAP       K-means         5  0.992538  0.988756    0.889073
7          UMAP  Hierarchical         5  0.992538  0.988756    0.889073
8          UMAP        Leiden         7  0.763490  0.680701    0.766656


In [15]:
results = {
    '''
    "arcene": {
        "preprocessed": preprocessed_arcene,
        "reduced_data": reduced_data_arcene,
        "kmeans": kmeans_results_arcene,
        "hierarchical": hierarchical_results_arcene,
        "leiden": leiden_results_arcene,
        "comparison": comparison_arcene
    },
    '''
    "gene_exp": {
        "preprocessed": preprocessed_gene_exp,
        "reduced_data": reduced_data_gene_exp,
        "kmeans": kmeans_results_gene_exp,
        "hierarchical": hierarchical_results_gene_exp,
        "leiden": leiden_results_gene_exp,
        "comparison": comparison_gene_exp
    }
}
print("Analysis complete! Results saved to CSV and visualizations saved as PNG files.")

Analysis complete! Results saved to CSV and visualizations saved as PNG files.
