# 1. Dimensionality Reduction

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from pathlib import Path
import plotly.graph_objects as go
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


## Data preprocessing

In [None]:
# Preprocessing function
def preprocess_dataframe(df, nan_threshold=0.5):
    feature_columns = [col for col in df.columns if not col.startswith(('Metadata_', 'Image_'))]
    print("Selected feature columns:", feature_columns)
    print("Number of feature columns:", len(feature_columns))
    
    X = df[feature_columns]
    # print("Initial number of rows in X:", X.shape[0])
    # print("NaN count per column:\n", X.isna().sum())
    # print("Inf count per column:\n", np.isinf(X).sum())
    
    threshold = X.shape[0] * nan_threshold
    valid_columns = [col for col in X.columns if X[col].isna().sum() < threshold and np.isinf(X[col]).sum() < threshold]
    print("Valid columns after filtering (>50% valid data):", valid_columns)
    print("Number of valid columns:", len(valid_columns))
    
    if not valid_columns:
        raise ValueError("No valid columns remain after filtering.")
    
    X = X[valid_columns]
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    
    nan_count_after_fill = X.isna().sum().sum()
    print("NaN count after filling with median:", nan_count_after_fill)
    if nan_count_after_fill > 0:
        print("Warning: Some NaN values remain. Filling with zero.")
        X = X.fillna(0)
    
    if X.shape[0] == 0 or X.shape[1] == 0:
        raise ValueError("No rows/columns remain after preprocessing.")
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, valid_columns

## 2D & 3D Plot

In [30]:
def plot_dimensionality_reduction(X_scaled, df, valid_columns, metadata_column, method_name, title, tsne_perplexity=30, n_neighbors=15, min_dist=0.1, continuous=True, n_components=2, save_path=None):
    if method_name == 'PCA':
        reducer = PCA(n_components=n_components)
        x_label = 'PC1'
        y_label = 'PC2'
        z_label = 'PC3' if n_components == 3 else None
    elif method_name == 't-SNE':
        reducer = TSNE(n_components=n_components, perplexity=tsne_perplexity, learning_rate='auto', random_state=42)
        x_label = 't-SNE 1'
        y_label = 't-SNE 2'
        z_label = 't-SNE 3' if n_components == 3 else None
    elif method_name == 'UMAP':
        reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, random_state=42)
        x_label = 'UMAP 1'
        y_label = 'UMAP 2'
        z_label = 'UMAP 3' if n_components == 3 else None
    elif method_name == 'LDA':
        labels = df[metadata_column].astype(str)
        reducer = LDA(n_components=n_components)
        X_reduced = reducer.fit_transform(X_scaled, labels)
        x_label = 'LD1'
        y_label = 'LD2'
        z_label = 'LD3' if n_components == 3 else None
    else:
        raise ValueError("Unsupported method")
    
    if method_name != 'LDA':
        X_reduced = reducer.fit_transform(X_scaled)
    
    # 2D Plot
    if n_components == 2:
        fig_2d = go.Figure()
        if continuous:
            concentrations = df[metadata_column].apply(convert_concentration)
            fig_2d.add_trace(go.Scatter(x=X_reduced[:, 0], y=X_reduced[:, 1], mode='markers',
                                        marker=dict(color=concentrations, colorscale='Viridis', showscale=True, colorbar=dict(title=f'{metadata_column} (g)'))))
            title += ' (Continuous)'
        else:
            labels = df[metadata_column].astype(str)
            unique_labels = sorted(labels.unique(), key=convert_concentration, reverse=True)
            color_map = {label: f'rgb({r}, {g}, {b})' for label, (r, g, b) in zip(unique_labels, sns.color_palette('tab10', len(unique_labels)))}
            for label in unique_labels:
                mask = labels == label
                fig_2d.add_trace(go.Scatter(
                    x=X_reduced[mask, 0], 
                    y=X_reduced[mask, 1], 
                    mode='markers',
                    marker=dict(color=color_map[label]),
                    name=str(label),
                    showlegend=True
                ))
            fig_2d.update_layout(showlegend=True, legend=dict(itemsizing='constant'))
            title += ' (Categorical)'
        fig_2d.update_layout(title=title + f' (2D, {len(valid_columns)} features)', xaxis_title=x_label, yaxis_title=y_label)
        fig_2d.show()
        # Save 2D plot as PNG if save_path is provided
        if save_path:
            fig_2d.write_image(f"{save_path}_2D.png")
    
    # 3D Plot (if n_components = 3)
    if n_components == 3:
        fig_3d = go.Figure()
        if continuous:
            concentrations = df[metadata_column].apply(convert_concentration)
            fig_3d.add_trace(go.Scatter3d(x=X_reduced[:, 0], y=X_reduced[:, 1], z=X_reduced[:, 2], mode='markers',
                                          marker=dict(color=concentrations, colorscale='Viridis', showscale=True, colorbar=dict(title=f'{metadata_column} (g)'))))
            title += ' (Continuous)'
        else:
            labels = df[metadata_column].astype(str)
            unique_labels = sorted(labels.unique(), key=convert_concentration, reverse=True)
            color_map = {label: f'rgb({r}, {g}, {b})' for label, (r, g, b) in zip(unique_labels, sns.color_palette('tab10', len(unique_labels)))}
            for label in unique_labels:
                mask = labels == label
                fig_3d.add_trace(go.Scatter3d(
                    x=X_reduced[mask, 0], 
                    y=X_reduced[mask, 1], 
                    z=X_reduced[mask, 2], 
                    mode='markers',
                    marker=dict(color=color_map[label], size=3),
                    name=str(label),
                    showlegend=True
                ))
            fig_3d.update_layout(showlegend=True, legend=dict(itemsizing='constant'))
            title += ' (Categorical)'
        fig_3d.update_layout(title=title + f' (3D, {len(valid_columns)} features)', 
                             scene=dict(xaxis_title=x_label, yaxis_title=y_label, zaxis_title=z_label))
        fig_3d.show()
        # Save 3D plot as PNG if save_path is provided
        if save_path:
            fig_3d.write_image(f"{save_path}_3D.png")
    
    return fig_2d if n_components == 2 else fig_3d

# Function to convert concentration strings to numerical values (grams)
def convert_concentration(value):
    if pd.isna(value) or value == "":
        return 0.0
    try:
        value = str(value).lower().replace(" ", "")  # Normalize input
        if 'g' in value:
            num = float(value.replace('g', ''))
            return num  # Already in grams
        elif 'mg' in value:
            num = float(value.replace('mg', ''))
            return num * 1e-3  # Convert mg to g
        elif 'ug' in value:
            num = float(value.replace('ug', ''))
            return num * 1e-6  # Convert ug to g
        elif 'ng' in value:
            num = float(value.replace('ng', ''))
            return num * 1e-9  # Convert ng to g
        else:
            return float(value)  # Assume it's already a number (e.g., 0)
    except ValueError:
        return 0.0  # Default to 0 for invalid entries

## Load Dataset

In [3]:
# Load data
csv_dir = Path("/Users/felipecolombelli/phd/liver-plastic/datasets")
hep_path = csv_dir / "hepg2"
f1 = hep_path / "df_SingleCell_AO_HEPG2_102912.csv"
f2 = hep_path / "df_SingleCell_AO_HEPG2_110341.csv"
f3 = hep_path / "df_SingleCell_AO_HEPG2_231222.csv"

df1 = pd.read_csv(f1, sep=",", header=0)
# df2 = pd.read_csv(f2, sep=",", header=0)
# df3 = pd.read_csv(f3, sep=",", header=0)

# Preprocess each DataFrame
# dataframes = {'df1': df1, 'df2': df2, 'df3': df3}
dataframes = {'df1': df1}
preprocessed_data = {}

# print(f"Original df1 shape: {df1.shape}")
# print(f"Original df2 shape: {df2.shape}")
# print(f"Original df3 shape: {df3.shape}")

for name, df in dataframes.items():
    print(f"Original {name} shape: {df.shape}")
    print(f"Preprocessing {name}...")
    X_scaled, valid_columns = preprocess_dataframe(df)
    preprocessed_data[name] = {'X_scaled': X_scaled, 'valid_columns': valid_columns, 'df': df}
    print(f"Preprocessed {name} with {len(valid_columns)} valid columns.\n")


  df1 = pd.read_csv(f1, sep=",", header=0)


Original df1 shape: (45596, 1639)
Preprocessing df1...
Selected feature columns: ['Cells_ObjectNumber', 'Cells_AreaShape_Area', 'Cells_AreaShape_BoundingBoxArea', 'Cells_AreaShape_BoundingBoxMaximum_X', 'Cells_AreaShape_BoundingBoxMaximum_Y', 'Cells_AreaShape_BoundingBoxMinimum_X', 'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_Center_X', 'Cells_AreaShape_Center_Y', 'Cells_AreaShape_CentralMoment_0_0', 'Cells_AreaShape_CentralMoment_0_1', 'Cells_AreaShape_CentralMoment_0_2', 'Cells_AreaShape_CentralMoment_0_3', 'Cells_AreaShape_CentralMoment_1_0', 'Cells_AreaShape_CentralMoment_1_1', 'Cells_AreaShape_CentralMoment_1_2', 'Cells_AreaShape_CentralMoment_1_3', 'Cells_AreaShape_CentralMoment_2_0', 'Cells_AreaShape_CentralMoment_2_1', 'Cells_AreaShape_CentralMoment_2_2', 'Cells_AreaShape_CentralMoment_2_3', 'Cells_AreaShape_Compactness', 'Cells_AreaShape_ConvexArea', 'Cells_AreaShape_Eccentricity', 'Cells_AreaShape_EquivalentDiameter', 'Cells_AreaShape_EulerNumber', 'Cells_AreaShap

## LDA

In [31]:
# Apply dimensionality reduction methods to each DataFrame
metadata_column = 'Metadata_concentration_perliter'
method = 'LDA'

for name, data in preprocessed_data.items():
    if metadata_column in data['df'].columns:
        # 2D plots
        # plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
        #                                 method, f"{method} of {name}", continuous=True, n_components=2)
        plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                        method, f"{method} of {name}", continuous=False, n_components=2)
        
        # 3D plots
        # plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
        #                                 method, f"{method} of {name}", continuous=True, n_components=3)
        plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                        method, f"{method} of {name}", continuous=False, n_components=3)
    else:
        print(f"Warning: {metadata_column} not found in {name}.")


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul




divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul



# 2. Clustering

In [8]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

In [32]:
# Clustering function to apply different methods and visualize results
def perform_clustering(X_reduced, df, metadata_column, method_name, reduction_method, n_clusters=3, eps=0.5, min_samples=5):
    if method_name == 'KMeans':
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    elif method_name == 'DBSCAN':
        clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    elif method_name == 'Agglomerative':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    elif method_name == 'GaussianMixture':
        clusterer = GaussianMixture(n_components=n_clusters, random_state=42)
    else:
        raise ValueError("Unsupported clustering method")
    
    if df is None:
        df = X_reduced

    # Fit the clustering model
    labels = clusterer.fit_predict(df)
    
    # Visualize clustering results
    fig = go.Figure()
    unique_labels = np.unique(labels)
    color_map = {label: f'rgb({r}, {g}, {b})' for label, (r, g, b) in zip(unique_labels, sns.color_palette('tab10', len(unique_labels)))}
    
    # Choose plotting class based on dimensionality
    ScatterCls = go.Scatter3d if X_reduced.shape[1] == 3 else go.Scatter

    for label in unique_labels:
        mask = labels == label
        trace_args = {
        "x": X_reduced[mask, 0],
        "y": X_reduced[mask, 1],
        "mode": "markers",
        "marker": dict(color=color_map[label], size=3),
        "name": f"Cluster {label}" if label != -1 else "Noise",
        "showlegend": True,
        }
        if X_reduced.shape[1] == 3:
            trace_args["z"] = X_reduced[mask, 2]
        fig.add_trace(ScatterCls(**trace_args))
    
    # Set axis labels based on reduction method
    if reduction_method == 'PCA':
        x_label, y_label = 'PC1', 'PC2'
    elif reduction_method == 'LDA':
        x_label, y_label = 'LD1', 'LD2'
    elif reduction_method == 't-SNE':
        x_label, y_label = 't-SNE 1', 't-SNE 2'
    elif reduction_method == 'UMAP':
        x_label, y_label = 'UMAP 1', 'UMAP 2'
    
    fig.update_layout(
        title=f'{method_name} Clustering on {reduction_method} ({len(unique_labels)} clusters)',
        xaxis_title=x_label,
        yaxis_title=y_label,
        showlegend=True,
        legend=dict(itemsizing='constant')
    )
    fig.show()
    
    return labels, fig

In [33]:
# Apply clustering to reduced data from PCA, LDA, t-SNE, and UMAP
metadata_column = 'Metadata_concentration_perliter'
clustering_methods = ['GaussianMixture']
reduction_methods = ['LDA']
# reduction_methods = ['PCA', 'LDA', 't-SNE', 'UMAP']
n_components = 3

for name, data in preprocessed_data.items():
    if metadata_column in data['df'].columns:
        X_scaled = data['X_scaled']
        
        for reduction_method in reduction_methods:
            fig_original = plot_dimensionality_reduction(data['X_scaled'], data['df'], data['valid_columns'], metadata_column,
                                        reduction_method, f"{method} of {name}", continuous=False, n_components=n_components)
            # Compute reduced data based on the method
            if reduction_method == 'PCA':
                reducer = PCA(n_components=n_components)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'LDA':
                labels = data['df'][metadata_column].astype(str)
                reducer = LDA(n_components=n_components)
                X_reduced = reducer.fit_transform(X_scaled, labels)
            elif reduction_method == 't-SNE':
                reducer = TSNE(n_components=n_components, perplexity=30, learning_rate=200, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'UMAP':
                reducer = umap.UMAP(n_components=n_components, n_neighbors=15, min_dist=0.1, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            
            for cluster_method in clustering_methods:
                if cluster_method == 'KMeans' or cluster_method == 'Agglomerative' or cluster_method == 'GaussianMixture':
                    labels, fig_reduced = perform_clustering(X_reduced, None, metadata_column, cluster_method, reduction_method, n_clusters=4)
                    labels, fig_all = perform_clustering(X_reduced, X_scaled, metadata_column, cluster_method, reduction_method, n_clusters=4)
                elif cluster_method == 'DBSCAN':
                    labels = perform_clustering(X_reduced, data['df'], metadata_column, cluster_method, reduction_method, eps=0.5, min_samples=5)
                print(f"{cluster_method} clustering labels on {reduction_method} for {name}: {np.unique(labels)}")
    else:
        print(f"Warning: {metadata_column} not found in {name}.")


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul




divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul




divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul



GaussianMixture clustering labels on LDA for df1: [0 1 2 3]


In [35]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

figs       = [fig_original, fig_reduced, fig_all]
fig_labels = ['Original classes',
              'Cluster with dimensionality reduction',
              'Cluster with all features']

# -- 1. build a 3-column subplot skeleton, each cell is 3-D --------------
combined_fig = make_subplots(
    rows=1, cols=3,
    specs=[[{"type": "scene"}]*3],               # 3× Scatter3d panels
    subplot_titles=fig_labels,
    horizontal_spacing=0.07                      # little gap between plots
)

# -- 2. copy every trace into the right cell, tweak the marker size -------
for col, (fig, label) in enumerate(zip(figs, fig_labels), start=1):
    for trace in fig.data:
        # make dots smaller (overwrite whatever size was there)
        if hasattr(trace, "marker"):          # safety check
            trace.marker.size = 3
        # prefix trace names with panel label so the legend is explicit
        trace.name = f"{trace.name}"
        combined_fig.add_trace(trace, row=1, col=col)

# -- 3. carry over each panel’s axis titles -------------------------------
for col, fig in enumerate(figs, start=1):
    if hasattr(fig.layout, "scene"):  # 3-D source figure
        tgt_scene = "scene" if col == 1 else f"scene{col}"
        combined_fig.update_layout({
            tgt_scene: dict(
                xaxis_title = fig.layout.scene.xaxis.title.text,
                yaxis_title = fig.layout.scene.yaxis.title.text,
                zaxis_title = fig.layout.scene.zaxis.title.text
            )
        })
    else:                             # 2-D source figure (just in case)
        combined_fig.update_xaxes(title_text=fig.layout.xaxis.title.text,
                                  row=1, col=col)
        combined_fig.update_yaxes(title_text=fig.layout.yaxis.title.text,
                                  row=1, col=col)

# -- 4. overall figure cosmetics ------------------------------------------
combined_fig.update_layout(
    title="Combined clustering results",
    height=500, width=1350,
    legend=dict(itemsizing="constant")   # keep legend entry size compact
)

combined_fig.show(renderer="browser")


# 3. Evaluation

In [None]:
from sklearn.metrics import silhouette_score, adjusted_rand_score, calinski_harabasz_score

In [None]:
# Evaluation function for clustering results
def evaluate_clustering(X_reduced, labels, df, metadata_column):
    # Convert concentration to categorical labels as ground truth
    true_labels = df[metadata_column].astype(str)
    
    # Silhouette Score
    silhouette_avg = silhouette_score(X_reduced, labels) if len(np.unique(labels)) > 1 else None
    
    # Adjusted Rand Score (requires true labels)
    adjusted_rand = adjusted_rand_score(true_labels, labels) if len(np.unique(labels)) > 1 else None
    
    # Calinski-Harabasz Score
    ch_score = calinski_harabasz_score(X_reduced, labels) if len(np.unique(labels)) > 1 else None
    
    print(f"Evaluation Metrics for Clustering:")
    print(f"Silhouette Score: {silhouette_avg:.4f}" if silhouette_avg is not None else "Silhouette Score: N/A (single cluster)")
    print(f"Adjusted Rand Score: {adjusted_rand:.4f}" if adjusted_rand is not None else "Adjusted Rand Score: N/A (single cluster or no true labels)")
    print(f"Calinski-Harabasz Score: {ch_score:.4f}" if ch_score is not None else "Calinski-Harabasz Score: N/A (single cluster)")
    return silhouette_avg, adjusted_rand, ch_score

In [None]:
# Evaluate clustering results for each reduction method, clustering method, and dataset
metadata_column = 'Metadata_concentration_perliter'
clustering_methods = ['KMeans', 'DBSCAN', 'Agglomerative']
reduction_methods = ['LDA']
# reduction_methods = ['PCA', 'LDA', 't-SNE', 'UMAP']

for name, data in preprocessed_data.items():
    if metadata_column in data['df'].columns:
        X_scaled = data['X_scaled']
        
        for reduction_method in reduction_methods:
            # Compute reduced data based on the method
            if reduction_method == 'PCA':
                reducer = PCA(n_components=2)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'LDA':
                labels = data['df'][metadata_column].astype(str)
                reducer = LDA(n_components=2)
                X_reduced = reducer.fit_transform(X_scaled, labels)
            elif reduction_method == 't-SNE':
                reducer = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            elif reduction_method == 'UMAP':
                reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
                X_reduced = reducer.fit_transform(X_scaled)
            
            for cluster_method in clustering_methods:
                if cluster_method == 'KMeans' or cluster_method == 'Agglomerative':
                    labels = perform_clustering(X_reduced, data['df'], metadata_column, cluster_method, reduction_method, n_clusters=3)
                elif cluster_method == 'DBSCAN':
                    labels = perform_clustering(X_reduced, data['df'], metadata_column, cluster_method, reduction_method, eps=0.5, min_samples=5)
                print(f"\nEvaluating {cluster_method} clustering on {reduction_method} for {name}:")
                evaluate_clustering(X_reduced, labels, data['df'], metadata_column)
    else:
        print(f"Warning: {metadata_column} not found in {name}.")