In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
from statsmodels.stats.multitest import multipletests
import warnings
import os

# Configuration Constants
CONFIG = {
    'data': {
        'filepath': './results/radiomics.csv',
        'index_col': 0
    },
    'feature_selection': {
        'var_threshold': 0.02,
        'corr_threshold': 0.9
    },
    'dimension_reduction': {
        'n_components': 10,
        'n_neighbors': 10,
        'min_dist': 0.02,
        'metric': 'cosine',
        'random_state': 42,
        'final_components': 2
    },
    'clustering': {
        'max_clusters': 5,
        'covariance_type': 'full',
        'random_state': 42
    },
    'analysis': {
        'top_features': 50
    },
    'visualization': {
        'figsize': (12, 8),
        'cmap': 'jet',
        'heatmap_figsize': (16, 12),
        'dpi': 300
    },
    'output': {
        'results_dir': './results',
        'figures_dir': './figures'
    }
}

# Create output directories if they don't exist
os.makedirs(CONFIG['output']['results_dir'], exist_ok=True)
os.makedirs(CONFIG['output']['figures_dir'], exist_ok=True)

warnings.filterwarnings('ignore')
np.random.seed(CONFIG['clustering']['random_state'])


class DataProcessor:
    @staticmethod
    def load_data():
        df = pd.read_csv(CONFIG['data']['filepath'], index_col=CONFIG['data']['index_col'])
        return df.index.values, df.values, df.columns.tolist()

    @staticmethod
    def select_features(features, feature_names):
        selector = VarianceThreshold(threshold=CONFIG['feature_selection']['var_threshold'])
        features_high_var = selector.fit_transform(features)
        selected_indices = selector.get_support(indices=True)
        selected_features = [feature_names[i] for i in selected_indices]

        df = pd.DataFrame(features_high_var, columns=selected_features)
        corr_matrix = df.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > CONFIG['feature_selection']['corr_threshold'])]

        var_values = df.var()
        to_keep = []
        for col in to_drop:
            correlated_cols = upper.index[upper[col] > CONFIG['feature_selection']['corr_threshold']].tolist()
            correlated_cols.append(col)
            best_feature = var_values[correlated_cols].idxmax()
            if best_feature not in to_keep:
                to_keep.append(best_feature)

        final_features = [f for f in selected_features if f not in to_drop or f in to_keep]
        return df[final_features].values, final_features

    @staticmethod
    def scale_features(features):
        return StandardScaler().fit_transform(features)

    @staticmethod
    def reduce_dimension(features):
        reducer = umap.UMAP(
            n_components=CONFIG['dimension_reduction']['n_components'],
            n_neighbors=CONFIG['dimension_reduction']['n_neighbors'],
            min_dist=CONFIG['dimension_reduction']['min_dist'],
            metric=CONFIG['dimension_reduction']['metric'],
            random_state=CONFIG['dimension_reduction']['random_state']
        )
        return reducer.fit_transform(features)


class ClusterAnalyzer:
    @staticmethod
    def optimize_gmm(features):
        best_score, best_gmm, best_n = -1, None, 2
        results = []
        for n in range(2, CONFIG['clustering']['max_clusters'] + 1):
            gmm = GaussianMixture(
                n_components=n,
                covariance_type=CONFIG['clustering']['covariance_type'],
                random_state=CONFIG['clustering']['random_state']
            )
            clusters = gmm.fit_predict(features)
            metrics = {
                'n_clusters': n,
                'silhouette': round(silhouette_score(features, clusters), 2),
                'calinski_harabasz': round(calinski_harabasz_score(features, clusters), 2),
                'davies_bouldin': round(davies_bouldin_score(features, clusters), 2)
            }
            results.append(metrics)
            if metrics['silhouette'] > best_score:
                best_score, best_gmm, best_n = metrics['silhouette'], gmm, n
        return best_gmm, best_n, pd.DataFrame(results)

    @staticmethod
    def get_top_features(original_features, scaled_features, feature_names, clusters):
        unique_clusters = np.unique(clusters)
        cluster_centers = []
        for cluster in unique_clusters:
            cluster_mask = clusters == cluster
            cluster_mean = np.mean(scaled_features[cluster_mask], axis=0)
            cluster_centers.append(cluster_mean)
        cluster_centers = np.array(cluster_centers)
        feature_importance = np.var(cluster_centers, axis=0)

        # Perform ANOVA test for all features
        pvals = []
        for i in range(scaled_features.shape[1]):
            groups = [scaled_features[clusters == c, i] for c in unique_clusters]
            _, p = f_oneway(*groups)
            pvals.append(p)
        
        # Apply FDR correction
        reject, pvals_corrected, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')
        
        # Get all statistically significant features
        sig_indices = np.where(reject)[0]
        
        # Get top 50 features for heatmap (sorted by importance)
        if len(sig_indices) > 0:
            top_sig = sig_indices[np.argsort(feature_importance[sig_indices])][-CONFIG['analysis']['top_features']:]
            heatmap_indices = top_sig[::-1]
        else:
            heatmap_indices = []
            print("Warning: No statistically significant features found at alpha=0.05")

        # Create summary for ALL statistically significant features
        summary_data = []
        if len(sig_indices) > 0:
            # Sort all significant features by importance
            all_sig_sorted = sig_indices[np.argsort(feature_importance[sig_indices])][::-1]
            
            for i in all_sig_sorted:
                feature = feature_names[i]
                
                value_repr = []
                original_value_repr = []
                for cluster in unique_clusters:
                    scaled_values = scaled_features[clusters == cluster, i]
                    value_repr.append(f"{np.median(scaled_values):.3f}({np.percentile(scaled_values, 25):.3f},{np.percentile(scaled_values, 75):.3f})")
                    original_values = original_features[clusters == cluster, i]
                    original_value_repr.append(f"{np.median(original_values):.3f}({np.percentile(original_values, 25):.3f},{np.percentile(original_values, 75):.3f})")

                summary_data.append({
                    'Feature': feature,
                    'Importance': feature_importance[i],
                    'P_value': pvals[i],
                    'P_value_corrected': pvals_corrected[i],
                    **{f'Cluster{cluster}_Scaled': value_repr[cluster] for cluster in unique_clusters},
                    **{f'Cluster{cluster}_Original': original_value_repr[cluster] for cluster in unique_clusters}
                })

        # Create heatmap data (only top 50 features)
        if len(heatmap_indices) > 0:
            df_heatmap = pd.DataFrame(scaled_features[:, heatmap_indices], 
                                    columns=[feature_names[i] for i in heatmap_indices])
            df_heatmap['Cluster'] = clusters
            df_heatmap = df_heatmap.sort_values('Cluster').drop('Cluster', axis=1)
            heatmap_features = [feature_names[i] for i in heatmap_indices]
        else:
            df_heatmap = pd.DataFrame()
            heatmap_features = []

        return (pd.DataFrame(summary_data).sort_values('Importance', ascending=False),
                df_heatmap,
                heatmap_features)


class Visualizer:
    @staticmethod
    def create_umap_plot(umap_features, clusters, best_silhouette, n_clusters):
        fig, ax = plt.subplots(figsize=CONFIG['visualization']['figsize'])
        scatter = ax.scatter(
            umap_features[:, 0], umap_features[:, 1],
            c=clusters, cmap=CONFIG['visualization']['cmap'], s=30, alpha=0.6
        )
        ax.set_title(f'UMAP Visualization of GMM Clusters (n={n_clusters})', fontsize=14)
        ax.set_xlabel('UMAP 1', fontsize=12)
        ax.set_ylabel('UMAP 2', fontsize=12)
        cbar = plt.colorbar(scatter, ax=ax)
        cbar.set_label('Cluster', fontsize=12)
        ax.annotate(
            f'Silhouette: {best_silhouette:.2f}',
            xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)
        )
        plt.tight_layout()
        return fig

    @staticmethod
    def create_heatmap(top_features_heatmap, clusters):
        if top_features_heatmap.empty:
            return None
            
        fig, ax = plt.subplots(figsize=CONFIG['visualization']['heatmap_figsize'])
        sns.heatmap(
            top_features_heatmap.T,
            cmap=CONFIG['visualization']['cmap'], ax=ax,
            cbar_kws={'label': 'Scaled Value'}
        )
        
        unique_clusters = np.unique(clusters)
        cluster_counts = [np.sum(clusters == c) for c in unique_clusters]
        cluster_boundaries = np.cumsum(cluster_counts)
        
        for i, (cluster, pos) in enumerate(zip(unique_clusters, cluster_boundaries)):
            start = 0 if i == 0 else cluster_boundaries[i-1]
            middle = (start + pos) / 2
            ax.axvline(x=pos, color='black', linewidth=0.5)
            ax.text(middle, -0.5, f'Cluster {cluster}', ha='center', va='center', fontsize=10)
        
        ax.set_title(f'Top {min(CONFIG["analysis"]["top_features"], len(top_features_heatmap.columns))} Features Heatmap', fontsize=14)
        ax.set_xlabel('Patient Index', fontsize=12)
        ax.set_ylabel('Feature', fontsize=12)
        ax.set_yticks(np.arange(len(top_features_heatmap.columns)) + 0.5)
        ax.set_yticklabels(top_features_heatmap.columns, rotation=0, fontsize=8)
        plt.tight_layout()
        return fig


class ResultSaver:
    @staticmethod
    def save_results(cluster_results, metrics_df, feature_summary):
        results_dir = CONFIG['output']['results_dir']
        
        # Save cluster assignments
        cluster_results.to_csv(f'{results_dir}/cluster_assignments.csv', index=False)
        
        # Save clustering metrics
        metrics_df.to_csv(f'{results_dir}/clustering_metrics.csv', index=False)
        
        # Save feature summary as radiomics_distinctive.csv
        if not feature_summary.empty:
            feature_summary.to_csv(f'{results_dir}/radiomics_distinctive.csv', index=False)


def main():
    print("Loading data...")
    patient_ids, original_features, feature_names = DataProcessor.load_data()
    
    print("Selecting features...")
    features_selected, selected_feature_names = DataProcessor.select_features(original_features, feature_names)
    
    print("Scaling features...")
    features_scaled = DataProcessor.scale_features(features_selected)

    print("Reducing dimensionality...")
    features_umap = DataProcessor.reduce_dimension(features_scaled)
    
    print("Optimizing GMM clustering...")
    best_gmm, best_n, metrics_df = ClusterAnalyzer.optimize_gmm(features_umap)
    clusters = best_gmm.predict(features_umap)
    best_silhouette = metrics_df.loc[metrics_df['n_clusters'] == best_n, 'silhouette'].values[0]

    print("Analyzing top features...")
    top_features_summary, top_features_heatmap, top_feature_names = ClusterAnalyzer.get_top_features(
        features_selected, features_scaled, selected_feature_names, clusters
    )

    print("Creating 2D visualization...")
    umap_2d = umap.UMAP(
        n_components=CONFIG['dimension_reduction']['final_components'],
        n_neighbors=CONFIG['dimension_reduction']['n_neighbors'],
        min_dist=CONFIG['dimension_reduction']['min_dist'],
        metric=CONFIG['dimension_reduction']['metric'],
        random_state=CONFIG['dimension_reduction']['random_state']
    ).fit_transform(features_scaled)

    print("Generating visualizations...")
    # Create and save UMAP plot
    umap_fig = Visualizer.create_umap_plot(umap_2d, clusters, best_silhouette, best_n)
    umap_fig.savefig(f"{CONFIG['output']['figures_dir']}/umap_clusters.png", 
                     dpi=CONFIG['visualization']['dpi'], bbox_inches='tight')
    plt.close(umap_fig)

    # Create and save heatmap
    heatmap_fig = Visualizer.create_heatmap(top_features_heatmap, clusters)
    if heatmap_fig is not None:
        heatmap_fig.savefig(f"{CONFIG['output']['figures_dir']}/TOP50_heatmap.png", 
                            dpi=CONFIG['visualization']['dpi'], bbox_inches='tight')
        plt.close(heatmap_fig)

    print("Saving results...")
    ResultSaver.save_results(
        pd.DataFrame({'PatientID': patient_ids, 'Cluster': clusters}),
        metrics_df,
        top_features_summary
    )

    print(f"Optimal number of clusters: {best_n}")
    print(f"Best silhouette score: {best_silhouette:.2f}")
    if not top_features_summary.empty:
        print(f"Total statistically significant features: {len(top_features_summary)}")
        print(f"Most important feature: {top_features_summary.iloc[0]['Feature']} "
              f"(importance={top_features_summary.iloc[0]['Importance']:.3f})")
    else:
        print("No statistically significant features found")
    print(f"Visualizations saved to {CONFIG['output']['figures_dir']}")
    print(f"Results saved to {CONFIG['output']['results_dir']}")


if __name__ == "__main__":
    main()

Loading data...
Selecting features...
Scaling features...
Reducing dimensionality...
Optimizing GMM clustering...
Analyzing top features...
Creating 2D visualization...
Generating visualizations...
Saving results...
Optimal number of clusters: 2
Best silhouette score: 0.76
Total statistically significant features: 412
Most important feature: AP_log-sigma-3-0-mm-3D_firstorder_Minimum (importance=0.682)
Visualizations saved to ./figures
Results saved to ./results
