In [1]:
import pandas as pd
import numpy as np
import yaml
import os

from sklearn.impute import SimpleImputer
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import SpectralClustering
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import umap
import sys
import seaborn as sns
import matplotlib.pyplot as plt

import argparse
from datetime import datetime

## Loading

In [2]:
def prepare_categorical_variable(clinical_df, var, drop_values=None, verbose=True):
    """
    Integer-encode one categorical column of the clinical table.

    Codes are assigned from 0 upwards in order of first appearance of each
    distinct non-missing value. Rows whose value is missing (or listed in
    ``drop_values``) are removed from the returned series.

    Args:
        clinical_df (pd.DataFrame): clinical data indexed by sample.
        var (str): name of the column to encode.
        drop_values (list): values to treat as missing before encoding.
        verbose (bool): print the number of categories and the mapping.

    Returns:
        tuple: (pd.Series of integer codes for the retained samples,
                dict mapping each integer code back to its original value)
    """
    assert isinstance(clinical_df, pd.DataFrame), "clinical_df must be a DataFrame"
    assert var in clinical_df.columns, f"'{var}' not found in clinical_df"

    raw = clinical_df[var].copy()
    cleaned = raw if drop_values is None else raw.replace(drop_values, pd.NA)

    # Distinct observed categories, in order of first appearance.
    categories = cleaned.dropna().unique()
    value_to_int = dict(zip(categories, range(len(categories))))
    int_to_value = {code: label for label, code in value_to_int.items()}

    # Values without a code (missing / dropped) map to NaN and are discarded.
    encoded = cleaned.map(value_to_int).dropna().astype(int)

    if verbose:
        report = [f"{var} unique values: {len(categories)}"]
        if drop_values:
            report.append(f"Dropped values: {drop_values}")
        report.append("Mapping created:")
        report.extend(f"\t{code}: {repr(label)}" for code, label in int_to_value.items())
        print("\n".join(report))

    return encoded, int_to_value


### Cleaning

In [3]:
def filter_features_by_missingness(X, threshold=0.2, verbose=True):
    """
    Keep only the columns of X whose fraction of missing values is at most
    ``threshold``.

    Args:
        X (pd.DataFrame): samples x proteins.
        threshold (float): largest tolerated missing fraction per column
            (0.2 means a column may be up to 20% missing and still be kept).
        verbose (bool): report how many columns were removed.

    Returns:
        pd.DataFrame: X restricted to the retained columns.
    """
    # Column-wise share of NaN entries.
    na_rate = X.isna().mean()
    retained = na_rate.loc[lambda rate: rate <= threshold].index

    if verbose:
        n_total = len(X.columns)
        n_removed = n_total - len(retained)
        print(f"Filtered out {n_removed}/{n_total} proteins with >{threshold*100:.0f}% missing values.")

    return X[retained]


In [4]:
def impute_missing_values(X, method="mean"):
    """
    Fill the missing entries of X using a per-column statistic or zero.

    Args:
        X (pd.DataFrame): samples x proteins.
        method (str): "mean" or "median" fill each column with that column's
            statistic; "zero" fills every missing entry with 0.

    Returns:
        pd.DataFrame: a filled copy of X.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method not in ("mean", "median", "zero"):
        raise ValueError("Invalid imputation method. Choose from 'mean', 'median', or 'zero'.")

    if method == "zero":
        fill_value = 0
    else:
        # "mean" / "median" are also the names of the DataFrame reducers.
        fill_value = getattr(X, method)()

    return X.fillna(fill_value)


## UMAP

In [5]:
def get_umap_embeddings(X, n_neighbors, min_dist, metric, random_state=42):
    """
    Fit a UMAP model on X and return the resulting embedding.

    Args:
        X: data to embed; passed unchanged to ``UMAP.fit_transform``.
        n_neighbors: forwarded to ``umap.UMAP(n_neighbors=...)``.
        min_dist: forwarded to ``umap.UMAP(min_dist=...)``.
        metric: forwarded to ``umap.UMAP(metric=...)``.
        random_state: seed forwarded to ``umap.UMAP``. Fixing it is what
            triggers the "n_jobs value ... overridden to 1" warning seen in
            the run output.

    Returns:
        The value returned by ``UMAP.fit_transform(X)``.
    """
    umap_settings = dict(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
    )
    model = umap.UMAP(**umap_settings)
    embedding = model.fit_transform(X)
    return embedding


## Clustering

In [6]:
def run_clustering(X_embedded, n_clusters, random_state=42):
    """
    Cluster an embedding with spectral clustering and return the labels.

    Args:
        X_embedded: points to cluster; passed unchanged to ``fit_predict``.
        n_clusters: number of clusters requested from ``SpectralClustering``.
        random_state: seed forwarded to ``SpectralClustering``.

    Returns:
        The label array returned by ``SpectralClustering.fit_predict``.
    """
    model = SpectralClustering(n_clusters=n_clusters, random_state=random_state)
    labels = model.fit_predict(X_embedded)
    return labels


def optimize_clustering(X, param_grid):
    """
    Grid-search UMAP and clustering settings, scoring each combination with
    the Calinski-Harabasz index computed on the embedding.

    Args:
        X: data handed to ``get_umap_embeddings``.
        param_grid (dict): must provide the iterables 'umap_n_neighbors',
            'umap_min_dist', 'umap_metrics' and 'n_clusters_grid'.

    Returns:
        tuple: (best_params, best_embedding, best_labels, results_df) where
            best_params is (n_neighbors, min_dist, metric, n_clusters) of the
            highest-scoring run (the first one on ties), and results_df has
            one row per evaluated combination with its "CH_score". The three
            "best" entries are None if no run scored above -1.
    """
    def _evaluate_grid():
        # One embedding per UMAP setting, reused for every cluster count.
        for n_neighbors in param_grid['umap_n_neighbors']:
            for min_dist in param_grid['umap_min_dist']:
                for metric in param_grid['umap_metrics']:
                    embedding = get_umap_embeddings(X, n_neighbors, min_dist, metric)
                    for n_clusters in param_grid['n_clusters_grid']:
                        labels = run_clustering(embedding, n_clusters)
                        score = calinski_harabasz_score(embedding, labels)
                        yield (n_neighbors, min_dist, metric, n_clusters), embedding, labels, score

    top_score = -1
    top_params = top_embedding = top_labels = None
    score_rows = []

    for params, embedding, labels, score in _evaluate_grid():
        score_rows.append(params + (score,))
        # Strict comparison: an equal score never replaces an earlier winner.
        if score > top_score:
            top_score, top_params, top_labels, top_embedding = score, params, labels, embedding

    results_df = pd.DataFrame(
        score_rows,
        columns=["n_neighbors", "min_dist", "metric", "n_clusters", "CH_score"],
    )
    return top_params, top_embedding, top_labels, results_df

## Chi-squared

In [7]:
def chi_squared_analysis(clinical_df, cluster_label_dict, categorical_vars):
    """
    Test each categorical clinical variable for association with the cluster
    assignment using a chi-squared test of independence.

    Args:
        clinical_df (pd.DataFrame): clinical data indexed by sample.
        cluster_label_dict (dict): maps sample identifier -> cluster label.
        categorical_vars (list): names of the clinical_df columns to test.

    Returns:
        pd.DataFrame: one row per variable with columns "Variable", "Chi2",
            "p_value" and "FDR" (Benjamini-Hochberg adjusted p-value computed
            across all tested variables).
    """
    # The cluster assignment does not depend on the variable under test, so
    # build its frame once rather than on every loop iteration.
    cluster_frame = pd.DataFrame.from_dict(cluster_label_dict, orient="index")
    cluster_frame.columns = ["cluster"]

    results = []
    for var in categorical_vars:
        # "UNKNOWN" entries are treated as missing and excluded from the test.
        cat_var, _ = prepare_categorical_variable(clinical_df, var, drop_values=["UNKNOWN"])

        # Inner join on the sample index: only samples that have both a
        # cluster label and a usable value for this variable are counted.
        merged = pd.merge(cat_var, cluster_frame, left_index=True, right_index=True)

        # Contingency table of cluster against the clinical variable.
        contingency = pd.crosstab(merged['cluster'], merged[var])
        chi2, p, _, _ = chi2_contingency(contingency)
        results.append((var, chi2, p))

    results_df = pd.DataFrame(results, columns=["Variable", "Chi2", "p_value"])
    results_df['FDR'] = multipletests(results_df['p_value'], method='fdr_bh')[1]

    return results_df
    
        
    

## Plot

In [8]:
def plot_umap(X, best_embedding, best_labels, clinical_df, results_df, p_thresh=None, save_dir=None):
    """
    Draw one UMAP scatter plot per clinical variable, colored by that variable.

    Args:
        X (pd.DataFrame): only its index is used, to label the embedding rows.
        best_embedding: embedding coordinates, one row per row of X; it is
            labelled with the two columns "UMAP1" and "UMAP2".
        best_labels: cluster label per row of X (stored in a "cluster" column).
        clinical_df (pd.DataFrame): clinical data indexed by sample.
        results_df (pd.DataFrame): must contain "Variable" and "p_value".
        p_thresh (float or None): if given, only variables with
            p_value < p_thresh are plotted; if None, every variable in
            results_df is plotted.
        save_dir (str or None): if set, each figure is written to
            "<save_dir>/<variable>.png"; otherwise it is shown interactively.
    """
    # `is not None` so that an explicit threshold of 0 is honoured instead of
    # being mistaken for "no threshold".
    if p_thresh is not None:
        selected = results_df.loc[results_df['p_value'] < p_thresh, 'Variable']
    else:
        selected = results_df["Variable"]
    vars_to_plot = list(selected)

    # The coordinates and cluster labels are the same for every variable, so
    # assemble them once outside the loop.
    base_df = pd.DataFrame(best_embedding, index=X.index, columns=["UMAP1", "UMAP2"])
    base_df['cluster'] = best_labels

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for var in vars_to_plot:
        # Left join on the sample index: samples without a clinical record
        # keep their coordinates and get a missing value for the variable.
        umap_df = base_df.join(clinical_df[var])

        fig, ax = plt.subplots()
        sns.scatterplot(x="UMAP1", y="UMAP2", hue=var, data=umap_df, ax=ax)
        ax.set_title(f"UMAP embedding colored by {var}")

        if save_dir:
            fig.savefig(os.path.join(save_dir, f"{var}.png"))
        else:
            plt.show()

        # Release the figure so a long variable list does not accumulate them.
        plt.close(fig)




## Pipeline

In [9]:
def clustering_pipeline(protein_group_csv, clinical_csv, categorical_vars, param_grid, missingness_thresh, imputation_method, fdr_thresh, save_dir=None):
    """
    Run the full analysis: load data, filter and impute, grid-search the
    UMAP + clustering settings, then test clinical variables against the
    best clustering.

    Args:
        protein_group_csv (str): CSV read with its first column as the index;
            used as the samples x proteins matrix.
        clinical_csv (str): CSV read with its first column as the index.
        categorical_vars (list): clinical columns to test against the clusters.
        param_grid (dict): grid passed to ``optimize_clustering``.
        missingness_thresh (float): passed to ``filter_features_by_missingness``.
        imputation_method (str): passed to ``impute_missing_values``.
        fdr_thresh (float): accepted for interface compatibility.
            NOTE(review): this function does not apply it anywhere; plots are
            produced for every tested variable (p_thresh=None).
        save_dir (str or None): if set, the directory is created and receives
            "chi_squared_results.csv", "clustering_grid_scores.csv" and one
            UMAP plot per variable.

    Returns:
        pd.DataFrame: the chi-squared results table.
    """
    protein_group_df = pd.read_csv(protein_group_csv, index_col=0)
    clinical_df = pd.read_csv(clinical_csv, index_col=0)

    X = filter_features_by_missingness(protein_group_df, missingness_thresh)
    X = impute_missing_values(X, imputation_method)

    print("Optimizing clustering...")
    best_params, best_embedding, best_labels, CH_score_df = optimize_clustering(X, param_grid)
    # sort_values returns a new frame; keep it so the ranking is actually used.
    CH_score_df = CH_score_df.sort_values(by="CH_score", ascending=False)
    print(f"Best parameters (n_neighbors, min_dist, metric, n_clusters): {best_params}")

    cluster_label_dict = dict(zip(X.index, best_labels))

    results_df = chi_squared_analysis(clinical_df, cluster_label_dict, categorical_vars)

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

        # Chi-squared results
        results_path = os.path.join(save_dir, "chi_squared_results.csv")
        results_df.to_csv(results_path, index=False)

        # Ranked grid-search scores, so the parameter choice can be audited.
        scores_path = os.path.join(save_dir, "clustering_grid_scores.csv")
        CH_score_df.to_csv(scores_path, index=False)

        plot_umap(X, best_embedding, best_labels, clinical_df, results_df, p_thresh=None, save_dir=save_dir)

    return results_df


## main()

In [10]:
def main(config_path):
    """
    Run the clustering pipeline from a YAML configuration file.

    Results are written to a new timestamped sub-directory of the configured
    ``output_dir``, together with a copy of the configuration that was used.

    Args:
        config_path (str): path to a YAML file. Required keys:
            "protein_group_csv", "clinical_csv", "output_dir",
            "categorical_vars". Optional keys (with defaults):
            "missingness_thresh" (0.2), "imputation_method" ("mean"),
            "fdr_thresh" (0.05), "param_grid".

    Raises:
        KeyError: if any required key is missing from the configuration.
    """
    # Load config; an empty YAML file parses to None, so normalise it to an
    # empty dict and let the required-key check report the problem.
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f) or {}

    # Required: report every missing key at once instead of failing on the first.
    required_keys = ("protein_group_csv", "clinical_csv", "output_dir", "categorical_vars")
    missing_keys = [key for key in required_keys if key not in config]
    if missing_keys:
        raise KeyError(f"Config '{config_path}' is missing required keys: {missing_keys}")

    protein_group_csv = config["protein_group_csv"]
    clinical_csv = config["clinical_csv"]
    output_dir = config["output_dir"]
    categorical_vars = config["categorical_vars"]

    # Optional
    missingness_thresh = config.get("missingness_thresh", 0.2)
    imputation_method = config.get("imputation_method", "mean")
    fdr_thresh = config.get("fdr_thresh", 0.05)
    param_grid = config.get("param_grid", {
        'umap_n_neighbors': [5, 10, 15],
        'umap_min_dist': [0.1, 0.01, 0.001],
        'umap_metrics': ['euclidean', 'cosine'],
        'n_clusters_grid': [2, 3, 4, 5]
    })

    # Create timestamped output directory so repeated runs never overwrite each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    timestamped_dir = os.path.join(output_dir, timestamp)
    os.makedirs(timestamped_dir, exist_ok=True)

    print("Running de novo clustering pipeline...")
    clustering_pipeline(
        protein_group_csv=protein_group_csv,
        clinical_csv=clinical_csv,
        categorical_vars=categorical_vars,
        param_grid=param_grid,
        missingness_thresh=missingness_thresh,
        imputation_method=imputation_method,
        fdr_thresh=fdr_thresh,
        save_dir=timestamped_dir,
    )

    # Save a copy of the config next to the results for provenance
    config_copy_path = os.path.join(timestamped_dir, "config.yaml")
    with open(config_copy_path, "w") as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"Saved config and results to: {timestamped_dir}")


## execution

In [12]:
# Entry points. The presence of 'ipykernel' in sys.modules is used to tell a
# notebook kernel apart from a plain command-line invocation.
if __name__ == "__main__" and 'ipykernel' not in sys.modules:  # CLI mode
    parser = argparse.ArgumentParser(
        description="De novo clustering pipeline (UMAP + spectral clustering + chi-squared tests)"
    )
    parser.add_argument("--config_path", type=str, required=True,
                        help="Path to config file")

    args = parser.parse_args()
    main(args.config_path)

if 'ipykernel' in sys.modules:  # Notebook mode
    config = {
        "protein_group_csv": "protein_group_df.csv",
        "clinical_csv": "clinical_df.csv",
        "output_dir": "clustering_results",

        # Optional settings
        "missingness_thresh": 0.2,
        "imputation_method": "mean",
        "fdr_thresh": 0.05,

        # NOTE: 'Cuatom_Status' is spelled this way on purpose; the run output
        # shows the pipeline finding a column with exactly this name.
        "categorical_vars":  [
                                #'Sample ID',
                                'Cuatom_Status',
                                #'Sample Type',
                                #'Species',
                                'Condition',
                                #'Description',
                                'Custom_Sex',
                                #'Custom_Age',
                                'Diseaseseverity',
                                'Custom_Hypertension',
                                'Custom_Dyslipidemia',
                                'Custom_Active smoker',
                                'Custom_Ever smoker',
                                'Custom_DM'
                            ],
        "param_grid": {
            'umap_n_neighbors': [5, 10, 15, 20],
            'umap_min_dist': [0.1, 0.01, 0.001],
            'umap_metrics': ['euclidean', 'cosine', 'manhattan'],
            'n_clusters_grid': [2, 3, 4, 5, 6]
        }
        }

    # main() takes a path, so the in-notebook config is written to
    # "config.yaml" in the working directory (overwriting any existing file).
    with open("config.yaml", "w") as f:
        yaml.dump(config, f, default_flow_style=False)

    main("config.yaml")

Running de novo clustering pipeline...
Filtered out 2278/8320 proteins with >20% missing values.
Optimizing clustering...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Cuatom_Status unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'ND'
	1: 'D'
Condition unique values: 4
Dropped values: ['UNKNOWN']
Mapping created:
	0: '1-C'
	1: '2-SC'
	2: '3-A'
	3: '4-S'
Custom_Sex unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'M'
	1: 'F'
Diseaseseverity unique values: 6
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'Low'
	1: 'Severe'
	2: 'Moderate-severe'
	3: 'Moderate'
	4: 'Mild-moderate'
	5: 'Mild'
Custom_Hypertension unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
Custom_Dyslipidemia unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: '????'
Custom_Active smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_Ever smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_DM unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: 'YES '
Saved config and re

In [12]:
# Run the pipeline with the settings stored in Imputed_MatrixExport_NP_A.yaml.
main("Imputed_MatrixExport_NP_A.yaml")

Running de novo clustering pipeline...
Filtered out 167/2142 proteins with >20% missing values.
Optimizing clustering...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Cuatom_Status unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'ND'
	1: 'D'
Condition unique values: 4
Dropped values: ['UNKNOWN']
Mapping created:
	0: '1-C'
	1: '2-SC'
	2: '3-A'
	3: '4-S'
Custom_Sex unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'M'
	1: 'F'
Diseaseseverity unique values: 6
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'Low'
	1: 'Severe'
	2: 'Moderate-severe'
	3: 'Moderate'
	4: 'Mild-moderate'
	5: 'Mild'
Custom_Hypertension unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
Custom_Dyslipidemia unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: '????'
Custom_Active smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_Ever smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_DM unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: 'YES '
Saved config and re

In [13]:
# Run the pipeline with the settings stored in Imputed_MatrixExport_NP_B.yaml.
main("Imputed_MatrixExport_NP_B.yaml")

Running de novo clustering pipeline...
Filtered out 631/3788 proteins with >20% missing values.
Optimizing clustering...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Cuatom_Status unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'ND'
	1: 'D'
Condition unique values: 4
Dropped values: ['UNKNOWN']
Mapping created:
	0: '1-C'
	1: '2-SC'
	2: '3-A'
	3: '4-S'
Custom_Sex unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'M'
	1: 'F'
Diseaseseverity unique values: 6
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'Low'
	1: 'Severe'
	2: 'Moderate-severe'
	3: 'Moderate'
	4: 'Mild-moderate'
	5: 'Mild'
Custom_Hypertension unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
Custom_Dyslipidemia unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: '????'
Custom_Active smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_Ever smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_DM unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: 'YES '
Saved config and re

In [14]:
# Run the pipeline with the settings stored in MatrixExport_NP_A.yaml.
main("MatrixExport_NP_A.yaml")

Running de novo clustering pipeline...
Filtered out 0/3054 proteins with >20% missing values.
Optimizing clustering...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Cuatom_Status unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'ND'
	1: 'D'
Condition unique values: 4
Dropped values: ['UNKNOWN']
Mapping created:
	0: '1-C'
	1: '2-SC'
	2: '3-A'
	3: '4-S'
Custom_Sex unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'M'
	1: 'F'
Diseaseseverity unique values: 6
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'Low'
	1: 'Severe'
	2: 'Moderate-severe'
	3: 'Moderate'
	4: 'Mild-moderate'
	5: 'Mild'
Custom_Hypertension unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
Custom_Dyslipidemia unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: '????'
Custom_Active smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_Ever smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_DM unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: 'YES '
Saved config and re

In [15]:
# Run the pipeline with the settings stored in MatrixExport_NP_B.yaml.
main("MatrixExport_NP_B.yaml")



Running de novo clustering pipeline...
Filtered out 0/1649 proteins with >20% missing values.
Optimizing clustering...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.

Cuatom_Status unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'ND'
	1: 'D'
Condition unique values: 4
Dropped values: ['UNKNOWN']
Mapping created:
	0: '1-C'
	1: '2-SC'
	2: '3-A'
	3: '4-S'
Custom_Sex unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'M'
	1: 'F'
Diseaseseverity unique values: 6
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'Low'
	1: 'Severe'
	2: 'Moderate-severe'
	3: 'Moderate'
	4: 'Mild-moderate'
	5: 'Mild'
Custom_Hypertension unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
Custom_Dyslipidemia unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: '????'
Custom_Active smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_Ever smoker unique values: 2
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'NO'
	1: 'YES'
Custom_DM unique values: 3
Dropped values: ['UNKNOWN']
Mapping created:
	0: 'YES'
	1: 'NO'
	2: 'YES '
Saved config and re

## Debug

In [167]:
# Debug-only parameters for stepping through the pipeline stages by hand.
# NOTE(review): these repeat the keys of the notebook-mode config dict, and no
# active cell visible in this notebook reads them (the only consumer is the
# commented-out TODO cell below) — confirm whether this cell is still needed.
protein_group_csv = "protein_group_df.csv"
clinical_csv = "clinical_df.csv"


missingness_thresh = 0.2
imputation_method = "mean"
fdr_thresh = 0.05  


# Clinical columns to test; commented-out entries are deliberately excluded.
categorical_vars = [
    #'Sample ID', 
    'Cuatom_Status', 
    #'Sample Type', 
    #'Species', 
    'Condition',
    #'Description', 
    'Custom_Sex', 
    #'Custom_Age', 
    'Diseaseseverity',
    'Custom_Hypertension', 
    'Custom_Dyslipidemia', 
    'Custom_Active smoker',
    'Custom_Ever smoker', 
    'Custom_DM'
]

# Smaller grid than the one in the notebook-mode config (fewer neighbours,
# metrics and cluster counts), matching the defaults hard-coded in main().
param_grid = {
    'umap_n_neighbors': [5, 10, 15],
    'umap_min_dist': [0.1, 0.01, 0.001], 
    'umap_metrics': ['euclidean', 'cosine'],
    'n_clusters_grid': [2, 3, 4, 5]
}

In [None]:
# Determining the optimal number of clusters... TODO

# import numpy as np

# global_mean_ch_score = np.mean(CH_score_df['CH_score'])
# n_clusters_grid = param_grid['n_clusters_grid']
# n_clusters_grid.sort()


# for n_clusters in n_clusters_grid:
#     mean_ch_score = CH_score_df[CH_score_df['n_clusters'] == n_clusters]['CH_score'].mean()
#     if mean_ch_score > global_mean_ch_score:
#         selected_n_clusters = n_clusters
#         break

# # rerun param_grid with selected_n_clusters
# param_grid['n_clusters_grid'] = [selected_n_clusters]
# best_params, best_embedding, best_labels, CH_score_df = optimize_clustering(X, param_grid)
