# 1. Dimensionality Reduction

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
import re
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from pathlib import Path
import plotly.graph_objects as go
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory prefix for saved figures. Later cells build output paths with
# plain string concatenation (save_dir + file stem), so the trailing slash
# is required.
save_dir = "/Users/jen-hung/Desktop/KTH/mnp-liver/results/"

## Data preprocessing

In [7]:
def preprocess_dataframe(df, nan_threshold=0.05):
    """Select, clean and standardise the feature columns of ``df``.

    Feature columns are all columns that do not start with ``Metadata_`` or
    ``Image_`` and do not end with ``_ObjectNumber``. Non-numeric feature
    columns are skipped (``np.isinf`` cannot process them). Columns whose NaN
    count or Inf count reaches ``nan_threshold * n_rows`` are dropped, the
    remaining Inf/NaN entries are imputed with the column median, and the
    result is standardised with ``StandardScaler``.

    Args:
        df: pandas DataFrame containing feature and metadata columns.
        nan_threshold: Fraction of rows; a column is kept only if its NaN
            count and its Inf count are each strictly below
            ``nan_threshold * len(df)``.

    Returns:
        Tuple ``(X_scaled, valid_columns)``: the standardised feature array
        returned by ``StandardScaler.fit_transform`` and the list of column
        names it was built from (same order as the array columns).

    Raises:
        ValueError: If no column survives the filtering, or no rows/columns
            remain after preprocessing.
    """
    # Select feature columns
    feature_columns = [
        col for col in df.columns
        if not col.startswith(('Metadata_', 'Image_')) and not col.endswith('_ObjectNumber')
    ]
    print("Number of feature columns:", len(feature_columns))

    # Keep numeric/boolean features only: np.isinf raises TypeError on
    # object (e.g. string or mixed-type) columns.
    X = df[feature_columns].select_dtypes(include=['number', 'bool'])
    skipped_columns = [col for col in feature_columns if col not in X.columns]
    if skipped_columns:
        print(f"Skipping {len(skipped_columns)} non-numeric feature columns: {skipped_columns}")

    # Count invalid entries once; both the report and the filter reuse them
    nan_counts = X.isna().sum()
    inf_counts = np.isinf(X).sum()

    # Report columns with at least one NaN or Inf
    columns_with_nan_or_inf = nan_counts[nan_counts > 0].index.union(inf_counts[inf_counts > 0].index)
    if len(columns_with_nan_or_inf) > 0:
        print("\nColumns with at least one NaN or Inf value:")
        print("\n{:<60} {:>10} {:>10}".format("Column", "NaN Count", "Inf Count"))
        print("-" * 80)
        for col in columns_with_nan_or_inf:
            print("{:<60} {:>10} {:>10}".format(col, nan_counts[col], inf_counts[col]))
        print(f"Total columns with NaN or Inf: {len(columns_with_nan_or_inf)}")

    # Keep columns whose NaN count and Inf count are both below the threshold
    threshold = X.shape[0] * nan_threshold
    keep = (nan_counts < threshold) & (inf_counts < threshold)
    valid_columns = keep.index[keep.to_numpy()].tolist()
    print("Number of valid columns:", len(valid_columns))

    if not valid_columns:
        raise ValueError("No valid columns remain after filtering.")

    # Treat +/-Inf as missing, then impute every gap with the column median
    X = X[valid_columns].replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    # A column can still hold NaN if its median itself is NaN
    nan_count_after_fill = X.isna().sum().sum()
    print("\nNaN count after filling with median:", nan_count_after_fill)
    if nan_count_after_fill > 0:
        print("Warning: Some NaN values remain. Filling with zero.")
        X = X.fillna(0)

    # Check if data is valid
    if X.shape[0] == 0 or X.shape[1] == 0:
        raise ValueError("No rows/columns remain after preprocessing.")

    # Standardise each column to zero mean and unit variance
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, valid_columns

## 2D & 3D Plot

In [4]:
def plot_dimensionality_reduction(X_scaled, df, valid_columns, metadata_column, method_name, title, tsne_perplexity=30, n_neighbors=15, min_dist=0.1, continuous=True, n_components=2, save_path=None):
    """Reduce ``X_scaled`` to 2 or 3 dimensions and show it as a Plotly scatter.

    Args:
        X_scaled: Feature matrix handed to the reducer's ``fit_transform``.
        df: DataFrame providing ``metadata_column``; assumed to be row-aligned
            with ``X_scaled`` (TODO confirm against callers).
        valid_columns: Feature names; only their count is used (in the title).
        metadata_column: Column of ``df`` used for colouring and, for LDA,
            as the class label.
        method_name: One of ``'PCA'``, ``'t-SNE'``, ``'UMAP'``, ``'LDA'``.
        title: Base figure title; colouring mode and dimensionality are
            appended to it.
        tsne_perplexity: Perplexity passed to ``TSNE``.
        n_neighbors: ``n_neighbors`` passed to ``umap.UMAP``.
        min_dist: ``min_dist`` passed to ``umap.UMAP``.
        continuous: If True, colour points by the value returned by
            ``convert_concentration``; otherwise draw one trace per distinct
            label.
        n_components: Number of output dimensions, 2 or 3.
        save_path: Optional file stem; the figure is written to
            ``{save_path}_2D.png`` or ``{save_path}_3D.png``.

    Returns:
        The Plotly figure that was shown.

    Raises:
        ValueError: If ``method_name`` is unsupported or ``n_components`` is
            neither 2 nor 3.
    """
    # Axis-label prefix per method, e.g. 'PC' -> 'PC1', 'PC2', 'PC3'
    axis_prefixes = {'PCA': 'PC', 't-SNE': 't-SNE ', 'UMAP': 'UMAP ', 'LDA': 'LD'}
    if method_name not in axis_prefixes:
        raise ValueError("Unsupported method")
    # Fail before the (potentially slow) fit rather than after it
    if n_components not in (2, 3):
        raise ValueError(f"n_components must be 2 or 3, got {n_components!r}")

    if method_name == 'PCA':
        # Fixed seed, consistent with the t-SNE and UMAP reducers below
        reducer = PCA(n_components=n_components, random_state=42)
        X_reduced = reducer.fit_transform(X_scaled)
    elif method_name == 't-SNE':
        reducer = TSNE(n_components=n_components, perplexity=tsne_perplexity, learning_rate='auto', random_state=42)
        X_reduced = reducer.fit_transform(X_scaled)
    elif method_name == 'UMAP':
        reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, random_state=42)
        X_reduced = reducer.fit_transform(X_scaled)
    else:
        # LDA is supervised: the metadata values act as class labels
        reducer = LDA(n_components=n_components)
        X_reduced = reducer.fit_transform(X_scaled, df[metadata_column].astype(str))

    fig = go.Figure()
    _add_reduction_traces(fig, X_reduced, df, metadata_column, continuous)

    axis_labels = [f'{axis_prefixes[method_name]}{i}' for i in range(1, n_components + 1)]
    mode_suffix = ' (Continuous)' if continuous else ' (Categorical)'
    full_title = title + mode_suffix + f' ({n_components}D, {len(valid_columns)} features)'
    if n_components == 2:
        fig.update_layout(title=full_title, xaxis_title=axis_labels[0], yaxis_title=axis_labels[1])
    else:
        fig.update_layout(title=full_title,
                          scene=dict(xaxis_title=axis_labels[0], yaxis_title=axis_labels[1], zaxis_title=axis_labels[2]))
    fig.show()

    # Save the plot as PNG if save_path is provided
    if save_path:
        fig.write_image(f"{save_path}_{n_components}D.png", width=1200, height=800)

    return fig


def _add_reduction_traces(fig, coords, df, metadata_column, continuous):
    """Add 2D or 3D marker traces for the embedded points ``coords`` to ``fig``."""
    trace_cls = go.Scatter3d if coords.shape[1] == 3 else go.Scatter

    def axes_of(points):
        # Map embedding columns to the x/y(/z) keyword arguments of the trace
        return {axis: points[:, i] for i, axis in enumerate('xyz'[:points.shape[1]])}

    if continuous:
        concentrations = df[metadata_column].apply(convert_concentration)
        fig.add_trace(trace_cls(
            mode='markers',
            marker=dict(color=concentrations, colorscale='Viridis', showscale=True,
                        colorbar=dict(title=f'{metadata_column} (g)')),
            **axes_of(coords)
        ))
        return

    labels = df[metadata_column].astype(str)
    unique_labels = sorted(labels.unique(), key=convert_concentration, reverse=True)
    # seaborn returns RGB components in [0, 1]; CSS-style rgb() needs 0-255
    palette = sns.color_palette('tab10', len(unique_labels))
    color_map = {label: f'rgb({r * 255:.0f}, {g * 255:.0f}, {b * 255:.0f})'
                 for label, (r, g, b) in zip(unique_labels, palette)}
    for label in unique_labels:
        # Positional boolean mask, independent of the DataFrame index
        mask = (labels == label).to_numpy()
        fig.add_trace(trace_cls(
            mode='markers',
            marker=dict(color=color_map[label], size=3),
            name=str(label),
            showlegend=True,
            **axes_of(coords[mask])
        ))
    fig.update_layout(showlegend=True, legend=dict(itemsizing='constant'))
    
# Function to convert concentration strings to numerical values (grams)
# Leading number (optionally in scientific notation) followed by an optional
# gram-based unit. Matching is anchored at the start only, so trailing text
# such as "/l" is ignored.
_CONCENTRATION_PATTERN = re.compile(r'((?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?)([mnu\u00b5\u03bc]?g)?')

# Conversion factors to grams; both Unicode micro signs are accepted.
_GRAMS_PER_UNIT = {
    '': 1.0,
    'g': 1.0,
    'mg': 1e-3,
    'ug': 1e-6,
    '\u00b5g': 1e-6,
    '\u03bcg': 1e-6,
    'ng': 1e-9,
}


def convert_concentration(value):
    """Convert a concentration label such as ``"1mg"`` to a number in grams.

    The input is lower-cased and stripped of spaces. A leading number,
    optionally in scientific notation, is read together with an optional
    unit (``g``, ``mg``, ``ug``/``µg``, ``ng``); a missing unit means grams.

    Args:
        value: Concentration as a string or number; may be NaN/None/empty.

    Returns:
        The value in grams as a float. Missing, empty or unparseable input
        yields ``0.0``.
    """
    if pd.isna(value) or value == "":
        return 0.0
    text = str(value).lower().replace(" ", "")  # Normalize input
    match = _CONCENTRATION_PATTERN.match(text)
    try:
        if match:
            return float(match.group(1)) * _GRAMS_PER_UNIT[match.group(2) or '']
        # No leading digits (e.g. "-5"): let float() decide
        return float(text)
    except ValueError:
        return 0.0  # Default to 0 for invalid entries
    


## Load Dataset

In [5]:
# Locate the HepG2 single-cell CSV exports
csv_dir = Path("/Users/jen-hung/Desktop/KTH/mnp-liver/csv_data")
hep_path = csv_dir / "hep"
f1, f2, f3 = (hep_path / f"df_SingleCell_AO_HEPG2_{run_id}.csv"
              for run_id in ("102912", "110341", "231222"))

# Only the first file is read at the moment; enable the others as needed
df1 = pd.read_csv(f1, sep=",", header=0)
# df2 = pd.read_csv(f2, sep=",", header=0)
# df3 = pd.read_csv(f3, sep=",", header=0)

# Registry of loaded frames; extend to {'df1': df1, 'df2': df2, 'df3': df3}
# once the additional files are read
dataframes = {'df1': df1}
preprocessed_data = {}

# Preprocess every registered frame and keep the results keyed by its name
for name, df in dataframes.items():
    print(f"Original {name} shape: {df.shape}")
    print(f"Preprocessing {name}...")
    X_scaled, valid_columns = preprocess_dataframe(df)
    preprocessed_data[name] = dict(X_scaled=X_scaled, valid_columns=valid_columns, df=df)
    print(f"Preprocessed {name} with {len(valid_columns)} valid columns.\n")


  df1 = pd.read_csv(f1, sep=",", header=0)


Original df1 shape: (45596, 1639)
Preprocessing df1...
Number of feature columns: 1627

Columns with at least one NaN or Inf value:

Column                                                        NaN Count  Inf Count
--------------------------------------------------------------------------------
Cells_AreaShape_NormalizedMoment_0_0                              45596          0
Cells_AreaShape_NormalizedMoment_0_1                              45596          0
Cells_AreaShape_NormalizedMoment_1_0                              45596          0
Nuclei_AreaShape_NormalizedMoment_0_0                             45596          0
Nuclei_AreaShape_NormalizedMoment_0_1                             45596          0
Nuclei_AreaShape_NormalizedMoment_1_0                             45596          0
Nuclei_Mean_Nucleoli_AreaShape_Area                               20434          0
Nuclei_Mean_Nucleoli_AreaShape_BoundingBoxArea                    20434          0
Nuclei_Mean_Nucleoli_AreaShape_Bounding

In [None]:
# Normalized-moment columns to inspect, for the Cells and Nuclei objects
columns_of_interest = [
    f'{compartment}_AreaShape_NormalizedMoment_{order}'
    for compartment in ('Cells', 'Nuclei')
    for order in ('0_0', '0_1', '1_0')
]

# Show the first 10 values of each column that exists in df1
print("\nFirst 10 values for specified columns:")
for col in columns_of_interest:
    if col not in df1.columns:
        print(f"\n{col}: Not found in DataFrame")
        continue
    print(f"\n{col}:")
    print(df1[col].head(10).to_list())

In [None]:
column_name = 'Metadata_Well'

# Report per-well row counts for every loaded DataFrame. Iterating over the
# `dataframes` registry avoids a NameError for df2/df3, whose loading is
# currently commented out.
for frame_name, frame in dataframes.items():
    value_counts = frame[column_name].value_counts().sort_index()
    print(f"{frame_name} - Unique values and their counts in column '{column_name}':")
    print(value_counts)

In [None]:
# Preview the first rows of df1 (rendered as the cell output)
df1.head()

Unnamed: 0,Image_ImageNumber,Cells_ObjectNumber,Metadata_Plate,Metadata_QCFlag,Metadata_Site,Metadata_Well,Cells_AreaShape_Area,Cells_AreaShape_BoundingBoxArea,Cells_AreaShape_BoundingBoxMaximum_X,Cells_AreaShape_BoundingBoxMaximum_Y,...,Cytoplasm_Texture_Variance_AOPI_5_00_256,Cytoplasm_Texture_Variance_AOPI_5_01_256,Cytoplasm_Texture_Variance_AOPI_5_02_256,Cytoplasm_Texture_Variance_AOPI_5_03_256,Metadata_plate_map_name,Metadata_well_position,Metadata_cell,Metadata_compound,Metadata_concentration_perliter,Metadata_control_type
0,1,1,231223_102912_Plate_1_,0,1,B10,426,525,232,37,...,739.7275,12.4,186.75,530.388889,231223_102912_Plate 1_HEP,B10,HepG2,MPs,1mg,trt
1,1,2,231223_102912_Plate_1_,0,1,B10,2545,5664,658,62,...,116.208066,87.81701,131.425272,136.359616,231223_102912_Plate 1_HEP,B10,HepG2,MPs,1mg,trt
2,1,3,231223_102912_Plate_1_,0,1,B10,2597,4686,422,74,...,4194.475495,4431.097009,4279.63264,4227.895725,231223_102912_Plate 1_HEP,B10,HepG2,MPs,1mg,trt
3,1,4,231223_102912_Plate_1_,0,1,B10,1451,2420,443,87,...,615.933816,497.685663,715.751508,925.54385,231223_102912_Plate 1_HEP,B10,HepG2,MPs,1mg,trt
4,1,5,231223_102912_Plate_1_,0,1,B10,2572,3685,688,91,...,55.162092,60.125388,46.747763,51.916868,231223_102912_Plate 1_HEP,B10,HepG2,MPs,1mg,trt


In [8]:
# List every column whose name contains '_ObjectNumber'
# (substring match, not only a suffix match)
objectnumber_columns = [col for col in df1.columns if '_ObjectNumber' in col]
print(objectnumber_columns)

['Cells_ObjectNumber']


## PCA

In [None]:
# Run PCA on every preprocessed DataFrame and plot the embedding
metadata_column = 'Metadata_concentration_perliter'
method = 'PCA'

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    # Categorical colouring, first in 2D and then in 3D
    # (pass continuous=True instead for a continuous colour scale)
    for dims in (2, 3):
        plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                      method, f"{method} of {name}", continuous=False, n_components=dims)

## t-SNE

In [None]:
# Run t-SNE on every preprocessed DataFrame, plot and save the embedding
metadata_column = 'Metadata_concentration_perliter'
method = 't-SNE'
save_dir = '/Users/jen-hung/Desktop/KTH/mnp-liver/results/'

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    # Categorical colouring, first in 2D and then in 3D
    # (pass continuous=True instead for a continuous colour scale)
    for dims in (2, 3):
        plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                      method, f"{method} of {name}", continuous=False, n_components=dims,
                                      save_path=save_dir + f"{name}_{method}")

## UMAP

In [None]:
# Run UMAP on every preprocessed DataFrame, plot and save the embedding
metadata_column = 'Metadata_concentration_perliter'
method = 'UMAP'
# save_dir = '/Users/jen-hung/Desktop/KTH/mnp-liver/results/'

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    # Categorical colouring, first in 2D and then in 3D
    # (pass continuous=True instead for a continuous colour scale)
    for dims in (2, 3):
        plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                      method, f"{method} of {name}", continuous=False, n_components=dims,
                                      save_path=save_dir + f"{name}_{method}")

## LDA

In [15]:
# Run LDA on every preprocessed DataFrame, plot and save the embedding
metadata_column = 'Metadata_concentration_perliter'
method = 'LDA'
# save_dir = "/home/jen-hungwang/Desktop/"

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    # Categorical colouring, first in 2D and then in 3D
    # (pass continuous=True instead for a continuous colour scale).
    # After the loop fig_original holds the 3D figure.
    for dims in (2, 3):
        fig_original = plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                                     method, f"{method} of {name}", continuous=False, n_components=dims,
                                                     save_path=save_dir + f"{name}_{method}")

# 2. Fine-tune dimensionality reduction methods

## t-SNE

In [None]:
# Sweep the t-SNE perplexity for every preprocessed DataFrame (2D plots only)
metadata_column = 'Metadata_concentration_perliter'
method = 't-SNE'
perplexity = [2, 5, 10, 20, 30, 50, 100]
save_dir = "/home/jen-hungwang/Desktop/t-sne/"

for name, data in preprocessed_data.items():
    has_metadata = metadata_column in data['df'].columns
    for p in perplexity:
        if not has_metadata:
            print(f"Warning: {metadata_column} not found in {name}.")
            continue

        print(f"Hyperparameters: perplexity={p}")
        # Categorical 2D plot; for the 3D variant repeat the call with
        # n_components=3 (continuous=True gives a continuous colour scale)
        plot_dimensionality_reduction(
            data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
            method, f"{method} of {name}, perplexity={p}",
            tsne_perplexity=p, continuous=False, n_components=2,
            save_path=save_dir + f"{name}_{method}_p{p}",
        )

## UMAP

In [None]:
# Sweep UMAP n_neighbors / min_dist for every preprocessed DataFrame (2D plots only)
metadata_column = 'Metadata_concentration_perliter'
method = 'UMAP'
# n_neighbors = [2, 5, 10, 20, 50, 100, 200]
n_neighbors = [200]
min_dist = [0.1, 0.25, 0.5, 0.8, 0.99]
save_dir = "/home/jen-hungwang/Desktop/umap/n200/"

for name, data in preprocessed_data.items():
    has_metadata = metadata_column in data['df'].columns
    for n in n_neighbors:
        for m in min_dist:
            if not has_metadata:
                print(f"Warning: {metadata_column} not found in {name}.")
                continue

            print(f"Hyperparameters: n_neighbors={n}, min_dist={m}")
            # Categorical 2D plot; for the 3D variant repeat the call with
            # n_components=3 (continuous=True gives a continuous colour scale)
            plot_dimensionality_reduction(
                data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                method, f"{method} of {name}, n_neighbors={n}, min_dist={m}",
                n_neighbors=n, min_dist=m, continuous=False, n_components=2,
                save_path=save_dir + f"{name}_{method}_n{n}_m{m}",
            )

# 3. Clustering

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from plotly.subplots import make_subplots

In [None]:
def perform_clustering(X_reduced, df, metadata_column, method_name, reduction_method, n_clusters=3, eps=0.5, min_samples=5, covariance='full', save_path=None):
    """Cluster the data, score the clustering and plot it on ``X_reduced``.

    Args:
        X_reduced: 2-D array of embedded points (indexed as
            ``X_reduced[mask, i]``); used for the metrics and the scatter
            plot. Three or more columns give a 3D plot of the first three,
            otherwise a 2D plot.
        df: Data the clusterer is fitted on. If None, ``X_reduced`` is used.
            NOTE(review): when given, it is passed straight to
            ``fit_predict``, so it presumably has to be a purely numeric
            feature matrix row-aligned with ``X_reduced`` - confirm.
        metadata_column: Currently unused.
        method_name: ``'KMeans'``, ``'DBSCAN'``, ``'Agglomerative'`` or
            ``'GaussianMixture'``.
        reduction_method: Name of the embedding (``'PCA'``, ``'LDA'``,
            ``'t-SNE'``, ``'UMAP'``); only affects axis labels and the title.
        n_clusters: Cluster/component count for KMeans, Agglomerative and
            GaussianMixture.
        eps: DBSCAN ``eps``.
        min_samples: DBSCAN ``min_samples``.
        covariance: GaussianMixture ``covariance_type``.
        save_path: Optional file stem; the figure is written to
            ``{save_path}.png``.

    Returns:
        Tuple ``(labels, fig)``: the label array from ``fit_predict`` and the
        Plotly figure.

    Raises:
        ValueError: If ``method_name`` is unsupported or ``X_reduced`` has
            fewer than two columns.
    """
    if method_name == 'KMeans':
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    elif method_name == 'DBSCAN':
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    elif method_name == 'Agglomerative':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    elif method_name == 'GaussianMixture':
        clusterer = GaussianMixture(n_components=n_clusters, random_state=42, covariance_type=covariance)
    else:
        raise ValueError("Unsupported clustering method")

    # Fail before the expensive fit instead of with an IndexError while plotting
    if X_reduced.shape[1] < 2:
        raise ValueError("X_reduced needs at least 2 columns to be plotted")

    # Fit the clustering model (on df when provided, else on the embedding)
    features = X_reduced if df is None else df
    labels = clusterer.fit_predict(features)
    unique_labels = np.unique(labels)
    n_labels = len(unique_labels)
    # DBSCAN marks noise as -1; noise is not a cluster and must not be counted
    n_found_clusters = int(np.sum(unique_labels != -1))

    # Compute evaluation metrics; -1 is shown as a placeholder when undefined
    metrics = {'Silhouette': -1, 'Davies-Bouldin': -1, 'Calinski-Harabasz': -1}
    if n_labels > 1:  # Metrics require at least 2 distinct labels
        if n_labels < X_reduced.shape[0]:
            metrics['Silhouette'] = silhouette_score(X_reduced, labels)
        metrics['Davies-Bouldin'] = davies_bouldin_score(X_reduced, labels)
        metrics['Calinski-Harabasz'] = calinski_harabasz_score(X_reduced, labels)

    metrics_text = (f"Silhouette: {metrics['Silhouette']:.3f}<br>"
                    f"Davies-Bouldin: {metrics['Davies-Bouldin']:.3f}<br>"
                    f"Calinski-Harabasz: {metrics['Calinski-Harabasz']:.3f}")

    # seaborn yields RGB in [0, 1]; scale to 0-255 for the rgb() colour string
    palette = sns.color_palette('tab10', n_labels)
    color_map = {label: f'rgb({r*255:.0f}, {g*255:.0f}, {b*255:.0f})'
                 for label, (r, g, b) in zip(unique_labels, palette)}

    # Axis labels follow the reduction method; unknown methods get a generic name
    axis_prefix = {'PCA': 'PC', 'LDA': 'LD', 't-SNE': 't-SNE ', 'UMAP': 'UMAP '}.get(reduction_method, 'Component ')
    is_3d = X_reduced.shape[1] >= 3
    n_axes = 3 if is_3d else 2
    axis_labels = [f'{axis_prefix}{i}' for i in range(1, n_axes + 1)]

    fig = go.Figure()
    for label in unique_labels:
        mask = labels == label
        trace_name = f'Cluster {label}' if label != -1 else 'Noise'
        if is_3d:
            fig.add_trace(go.Scatter3d(
                x=X_reduced[mask, 0],
                y=X_reduced[mask, 1],
                z=X_reduced[mask, 2],
                mode='markers',
                marker=dict(size=3, color=color_map[label], opacity=0.8),
                name=trace_name,
                showlegend=True
            ))
        else:
            fig.add_trace(go.Scatter(
                x=X_reduced[mask, 0],
                y=X_reduced[mask, 1],
                mode='markers',
                marker=dict(color=color_map[label], size=3),
                name=trace_name,
                showlegend=True
            ))

    # Layout parts shared by the 2D and 3D figures
    layout = dict(
        title=f'{method_name} Clustering on {reduction_method} ({n_axes}D, {n_found_clusters} clusters)',
        showlegend=True,
        legend=dict(x=0.5, y=0, xanchor="center", yanchor="bottom", orientation="h"),
        annotations=[
            dict(
                x=0.95,
                y=0.95,
                xref="paper",
                yref="paper",
                text=metrics_text,
                showarrow=False,
                align="right",
                font=dict(size=12),
                bgcolor="rgba(255, 255, 255, 0.8)",
                bordercolor="black",
                borderwidth=1
            )
        ]
    )
    if is_3d:
        layout['scene'] = dict(xaxis_title=axis_labels[0], yaxis_title=axis_labels[1], zaxis_title=axis_labels[2])
    else:
        layout['xaxis_title'] = axis_labels[0]
        layout['yaxis_title'] = axis_labels[1]
    fig.update_layout(**layout)

    # Save the plot as PNG if save_path is provided
    if save_path:
        fig.write_image(f"{save_path}.png", width=1200, height=800)

    fig.show()

    return labels, fig

In [None]:
def visualize_cluster_concentration_distribution(df, metadata_column, labels, cluster_method, reduction_method, dim, save_path=None, dataset_name=None):
    """Plot one bar chart per cluster showing the sample count at each concentration level.

    Args:
        df: DataFrame holding the metadata column; rows must be in the same
            order as ``labels``.
        metadata_column: Name of the column in ``df`` with the concentration values.
        labels: Cluster label of every row. The label ``-1`` (DBSCAN noise) is
            excluded from the plot.
        cluster_method: Clustering method name; used in the title and file name.
        reduction_method: Dimensionality-reduction method name; used in the
            title and file name.
        dim: Number of reduced dimensions; used in the title and file name.
        save_path: Directory the PNG is written to. Nothing is saved when None.
        dataset_name: Prefix of the PNG file name. When None, the notebook-level
            variable ``name`` is used, which is what this function always read
            before this parameter existed.

    Returns:
        None. The figure is shown, and written to disk when ``save_path`` is given.
    """
    # Local import: in the visible notebook this helper is only imported by a later cell.
    from plotly.subplots import make_subplots

    labels = np.asarray(labels)

    # Extract unique clusters (excluding noise if present, e.g., -1 in DBSCAN)
    unique_clusters = np.unique(labels[labels != -1])
    n_clusters = len(unique_clusters)
    if n_clusters == 0:
        # Without this guard the grid computation below divides by zero.
        print("No clusters to visualize: labels are empty or contain only noise (-1).")
        return

    # Grid layout: at most 2 columns for readability, as many rows as needed
    cols = min(2, n_clusters)
    rows = (n_clusters + cols - 1) // cols

    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=[f'Cluster {cluster}' for cluster in unique_clusters],
    )

    # Order the concentration levels by their numeric value, highest first
    concentration_values = df[metadata_column].unique()
    converted_values = np.array([convert_concentration(val) for val in concentration_values])
    sort_indices = np.argsort(converted_values)[::-1]
    sorted_concentration_values = concentration_values[sort_indices]

    # Add one bar chart per cluster, filling the grid row by row
    for idx, cluster in enumerate(unique_clusters):
        grid_row, grid_col = divmod(idx, cols)
        cluster_concentrations = df[metadata_column][labels == cluster]
        counts = [
            int((cluster_concentrations == conc).sum())
            for conc in sorted_concentration_values
        ]
        fig.add_trace(
            go.Bar(
                x=sorted_concentration_values,
                y=counts,
                marker_color='rgb(31, 78, 121)'
            ),
            row=grid_row + 1,
            col=grid_col + 1
        )

    fig_width = 800 * cols
    fig_height = 400 * rows
    fig.update_layout(
        title_text=f'Concentration Distribution Across Clusters ({reduction_method}, {cluster_method}, {n_clusters} Clusters, {dim}D)',
        height=fig_height,
        width=fig_width,
        showlegend=False    # No global legend since each subplot is labeled
    )

    # Without row/col selectors these calls label every subplot's axes at once
    fig.update_xaxes(title_text='Concentration Levels')
    fig.update_yaxes(title_text='Count')

    # Save the plot as PNG if save_path is provided
    if save_path is not None:
        file_prefix = name if dataset_name is None else dataset_name
        file_name = f"{file_prefix}_{reduction_method}_{cluster_method}_{dim}D_{n_clusters}clusters_histogram.png"
        fig.write_image(str(Path(save_path) / file_name), width=fig_width, height=fig_height)

    fig.show()

In [None]:
# Apply clustering to reduced data from PCA, LDA, t-SNE, and UMAP, then plot how
# the concentration levels are distributed over the resulting clusters.
metadata_column = 'Metadata_concentration_perliter'
# Other options handled below: 'DBSCAN', 'Agglomerative'
clustering_methods = ['KMeans', 'GaussianMixture']
# Other options handled below: 'PCA', 't-SNE', 'UMAP'
reduction_methods = ['LDA']
n_components = 3
n_clusters = 3
covariance = 'full'  # 'full', 'tied', 'diag', 'spherical'

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    X_scaled = data['X_scaled']

    for reduction_method in reduction_methods:
        # Compute reduced data based on the method
        if reduction_method == 'PCA':
            reducer = PCA(n_components=n_components)
            X_reduced = reducer.fit_transform(X_scaled)
        elif reduction_method == 'LDA':
            # LDA is supervised: the concentration level is the class label.
            class_labels = data['df'][metadata_column].astype(str)
            reducer = LDA(n_components=n_components)
            X_reduced = reducer.fit_transform(X_scaled, class_labels)
        elif reduction_method == 't-SNE':
            reducer = TSNE(n_components=n_components, perplexity=30, learning_rate=200, random_state=42)
            X_reduced = reducer.fit_transform(X_scaled)
        elif reduction_method == 'UMAP':
            reducer = umap.UMAP(n_components=n_components, n_neighbors=15, min_dist=0.1, random_state=42)
            X_reduced = reducer.fit_transform(X_scaled)
        else:
            # Fail loudly instead of silently reusing a stale X_reduced.
            raise ValueError(f"Unknown reduction method: {reduction_method}")

        for cluster_method in clustering_methods:
            # perform_clustering returns (labels, figure); unpack both in every branch.
            if cluster_method in ('KMeans', 'Agglomerative', 'GaussianMixture'):
                labels, fig_reduced = perform_clustering(X_reduced, None, metadata_column, cluster_method, reduction_method, n_clusters=n_clusters, covariance=covariance, save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}_{n_components}D_{n_clusters}clusters")
            elif cluster_method == 'DBSCAN':
                labels, fig_reduced = perform_clustering(X_reduced, None, metadata_column, cluster_method, reduction_method, eps=0.5, min_samples=5, save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}_{n_components}D")
            else:
                raise ValueError(f"Unknown clustering method: {cluster_method}")

            # Visualize concentration distribution across clusters, using the
            # metadata of the dataset that was just clustered.
            visualize_cluster_concentration_distribution(
                data['df'], metadata_column, labels, cluster_method, reduction_method, n_components, save_path=save_dir
            )

## (Optional) Grid Search: Clustering

In [None]:
# Grid search: run every clustering method on every reduced dimensionality and,
# where the method takes one, on every requested cluster count.
metadata_column = 'Metadata_concentration_perliter'
clustering_methods = ['KMeans', 'DBSCAN', 'Agglomerative', 'GaussianMixture']
# Other options handled below: 'PCA', 't-SNE', 'UMAP'
reduction_methods = ['LDA']
n_components_list = [2, 3]
n_clusters_list = [3, 4, 5, 6, 7, 8]
save_dir = "/home/jen-hungwang/Desktop/eval/"

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    X_scaled = data['X_scaled']

    for reduction_method in reduction_methods:
        for n_components in n_components_list:
            # Compute reduced data based on the method
            if reduction_method == 'PCA':
                reducer = PCA(n_components=n_components)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'LDA':
                # LDA is supervised: the concentration level is the class label.
                class_labels = data['df'][metadata_column].astype(str)
                reducer = LDA(n_components=n_components)
                X_reduced = reducer.fit_transform(X_scaled, class_labels)
            elif reduction_method == 't-SNE':
                reducer = TSNE(n_components=n_components, perplexity=30, learning_rate=200, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'UMAP':
                reducer = umap.UMAP(n_components=n_components, n_neighbors=15, min_dist=0.1, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            else:
                # Fail loudly instead of silently reusing a stale X_reduced.
                raise ValueError(f"Unknown reduction method: {reduction_method}")

            for cluster_method in clustering_methods:
                # perform_clustering returns (labels, figure); only the labels
                # are needed here, so the figure is discarded.
                if cluster_method in ('KMeans', 'Agglomerative', 'GaussianMixture'):
                    for n_clusters in n_clusters_list:
                        labels, _ = perform_clustering(
                            X_reduced,
                            None,
                            metadata_column,
                            cluster_method,
                            reduction_method,
                            n_clusters=n_clusters,
                            save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}_{n_components}D_{n_clusters}clusters"
                        )
                        print(f"{cluster_method} clustering labels on {reduction_method} for {name} ({n_components}D, {n_clusters} clusters): {np.unique(labels)}")
                elif cluster_method == 'DBSCAN':
                    labels, _ = perform_clustering(
                        X_reduced,
                        None,
                        metadata_column,
                        cluster_method,
                        reduction_method,
                        eps=0.5,
                        min_samples=5,
                        save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}_{n_components}D"
                    )
                    print(f"{cluster_method} clustering labels on {reduction_method} for {name} ({n_components}D): {np.unique(labels)}")
                else:
                    raise ValueError(f"Unknown clustering method: {cluster_method}")

## (Optional) Subplots

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Source figures (created by earlier cells) and the title of the panel each goes into
figs       = [fig_original, fig_reduced]
fig_labels = ['Original classes',
              'Cluster with dimensionality reduction']
num_plots = len(figs)

# Trace types that must be drawn in a 3-D "scene" subplot instead of on x/y axes.
# Checking the traces is necessary because `hasattr(fig.layout, "scene")` is true
# for every plotly figure, 2-D ones included.
SCENE_TRACE_TYPES = {"scatter3d", "surface", "mesh3d", "cone", "streamtube", "volume", "isosurface"}
panel_is_3d = [
    any(trace.type in SCENE_TRACE_TYPES for trace in fig.data)
    for fig in figs
]

# -- 1. build a one-row subplot skeleton, one cell per source figure --------
combined_fig = make_subplots(
    rows=1, cols=num_plots,
    specs=[[{"type": "scene" if is_3d else "xy"} for is_3d in panel_is_3d]],
    subplot_titles=fig_labels,
    horizontal_spacing=0.07                      # little gap between plots
)

# -- 2. copy every trace into the right cell, tweak the marker size ---------
for col, fig in enumerate(figs, start=1):
    for trace in fig.data:
        combined_fig.add_trace(trace, row=1, col=col)
        # Restyle the trace held by the combined figure (the last one added)
        # so the source figures keep their own marker sizes.
        added_trace = combined_fig.data[-1]
        if hasattr(added_trace, "marker"):          # safety check
            added_trace.marker.size = 3

# -- 3. carry over each panel's axis titles ---------------------------------
for col, (fig, is_3d) in enumerate(zip(figs, panel_is_3d), start=1):
    if is_3d:
        src_scene = fig.layout.scene
        combined_fig.update_scenes(
            xaxis_title_text=src_scene.xaxis.title.text,
            yaxis_title_text=src_scene.yaxis.title.text,
            zaxis_title_text=src_scene.zaxis.title.text,
            row=1, col=col
        )
    else:
        combined_fig.update_xaxes(title_text=fig.layout.xaxis.title.text,
                                  row=1, col=col)
        combined_fig.update_yaxes(title_text=fig.layout.yaxis.title.text,
                                  row=1, col=col)

# -- 4. overall figure cosmetics --------------------------------------------
combined_fig.update_layout(
    title="Combined clustering results",
    height=500, width=1350,
    legend=dict(itemsizing="constant")   # keep legend entry size compact
)

combined_fig.show(renderer="browser")

# 4. Evaluation

In [None]:
from sklearn.metrics import silhouette_score, adjusted_rand_score, calinski_harabasz_score

In [None]:
# Evaluation function for clustering results
def evaluate_clustering(X_reduced, labels, df, metadata_column):
    """Print and return three quality metrics for one clustering result.

    Args:
        X_reduced: Feature matrix the clustering was run on.
        labels: Cluster label of every row of ``X_reduced``.
        df: DataFrame holding the ground-truth column, in the same row order.
        metadata_column: Column of ``df`` whose values (converted to strings)
            serve as the ground-truth classes for the Adjusted Rand score.

    Returns:
        Tuple ``(silhouette_avg, adjusted_rand, ch_score)``. An entry is None
        when the metric cannot be computed for the given labels.
    """
    # Convert concentration to categorical labels as ground truth
    true_labels = df[metadata_column].astype(str)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    has_multiple_clusters = n_labels > 1
    # Silhouette and Calinski-Harabasz are only defined for
    # 2 <= n_labels <= n_samples - 1; scikit-learn raises ValueError otherwise.
    geometry_defined = has_multiple_clusters and n_labels < n_samples

    # Silhouette Score
    silhouette_avg = silhouette_score(X_reduced, labels) if geometry_defined else None

    # Adjusted Rand Score (requires true labels)
    adjusted_rand = adjusted_rand_score(true_labels, labels) if has_multiple_clusters else None

    # Calinski-Harabasz Score
    ch_score = calinski_harabasz_score(X_reduced, labels) if geometry_defined else None

    # Reason shown for the two geometry-based metrics when they are skipped
    na_reason = "single cluster" if not has_multiple_clusters else "every sample is its own cluster"

    print("Evaluation Metrics for Clustering:")
    print(f"Silhouette Score: {silhouette_avg:.4f}" if silhouette_avg is not None else f"Silhouette Score: N/A ({na_reason})")
    print(f"Adjusted Rand Score: {adjusted_rand:.4f}" if adjusted_rand is not None else "Adjusted Rand Score: N/A (single cluster or no true labels)")
    print(f"Calinski-Harabasz Score: {ch_score:.4f}" if ch_score is not None else f"Calinski-Harabasz Score: N/A ({na_reason})")
    return silhouette_avg, adjusted_rand, ch_score

In [None]:
# Evaluate clustering results for each reduction method, clustering method, and dataset
metadata_column = 'Metadata_concentration_perliter'
clustering_methods = ['KMeans', 'DBSCAN', 'Agglomerative']
# Other options handled below: 'PCA', 't-SNE', 'UMAP'
reduction_methods = ['LDA']
n_components = 3
save_dir = "/home/jen-hungwang/Desktop/eval/"

for name, data in preprocessed_data.items():
    if metadata_column not in data['df'].columns:
        print(f"Warning: {metadata_column} not found in {name}.")
        continue

    X_scaled = data['X_scaled']

    for reduction_method in reduction_methods:
        # Compute reduced data based on the method
        if reduction_method == 'PCA':
            reducer = PCA(n_components=n_components)
            X_reduced = reducer.fit_transform(X_scaled)
        elif reduction_method == 'LDA':
            # LDA is supervised: the concentration level is the class label.
            class_labels = data['df'][metadata_column].astype(str)
            reducer = LDA(n_components=n_components)
            X_reduced = reducer.fit_transform(X_scaled, class_labels)
        elif reduction_method == 't-SNE':
            reducer = TSNE(n_components=n_components, perplexity=30, learning_rate=200, random_state=42)
            X_reduced = reducer.fit_transform(X_scaled)
        elif reduction_method == 'UMAP':
            reducer = umap.UMAP(n_components=n_components, n_neighbors=15, min_dist=0.1, random_state=42)
            X_reduced = reducer.fit_transform(X_scaled)
        else:
            # Fail loudly instead of silently reusing a stale X_reduced.
            raise ValueError(f"Unknown reduction method: {reduction_method}")

        for cluster_method in clustering_methods:
            # perform_clustering returns (labels, figure); the metrics only
            # need the labels, so the figure is discarded.
            if cluster_method in ('KMeans', 'Agglomerative'):
                labels, _ = perform_clustering(X_reduced, None, metadata_column, cluster_method, reduction_method, n_clusters=3, save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}")
            elif cluster_method == 'DBSCAN':
                labels, _ = perform_clustering(X_reduced, None, metadata_column, cluster_method, reduction_method, eps=0.5, min_samples=5, save_path=save_dir + f"{name}_{reduction_method}_{cluster_method}")
            else:
                raise ValueError(f"Unknown clustering method: {cluster_method}")
            print(f"\nEvaluating {cluster_method} clustering on {reduction_method} for {name}:")
            evaluate_clustering(X_reduced, labels, data['df'], metadata_column)