In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import joblib
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, 
                           roc_auc_score, roc_curve, classification_report, confusion_matrix, auc)
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import os
from datetime import datetime

In [None]:
# Parameters
needle_height = '1.3'
conjugate = 'chlr'
n_trials = 50
dataset_key = f"{needle_height}_{conjugate}"

# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"results/{dataset_key}_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Base directory
base_dir = r"D:\20241129_solid_nN_1.3_2.4_mdck_siRNA_tnsfn_chlr"

# Define dataset path
dataset_path = base_dir + r"\20241129_solid_nN_1.3_mdck_chlr_dataset\solid_1.3_chlr_cell_level.csv"

# Define morphological and intensity features
cell_morph_features = [
    'area', 'perimeter', 'major_axis_length', 'minor_axis_length', 
    'eccentricity', 'circularity', 'solidity', 'orientation'
]

nuclear_morph_features = [
    'nuclear_area', 'nuclear_perimeter', 'nuclear_major_axis_length', 
    'nuclear_minor_axis_length', 'nuclear_eccentricity', 'nuclear_circularity', 
    'nuclear_solidity', 'nuclear_orientation'
]

channel_feature_suffixes = [
    'intensity_p10', 'intensity_p25', 'intensity_p50', 
    'intensity_p75', 'intensity_p90'
]

protein_channels = ['actin', 'caveolin', 'clathrin_hc', 'nuclei']

# Generate feature list with caveolin features first to ensure dominance
feature_list = cell_morph_features + nuclear_morph_features

for suffix in channel_feature_suffixes:
    feature_list.append(f"caveolin_{suffix}")

for ch in protein_channels:
    if ch != 'caveolin':
        for suffix in channel_feature_suffixes:
            feature_list.append(f"{ch}_{suffix}")

def process_dataset(dataset_path, dataset_name, area_percentiles=(2, 98)):
    print(f"\n=== Processing {dataset_name} ===")
    
    # Extract conjugate type from dataset_name
    conjugate_type = dataset_name.split('_')[1] # Will be 'chlr'
    
    # Set the correct intensity column name
    intensity_column = f"{conjugate_type}_intensity_mean"
    
    print(f"Using intensity column: {intensity_column}")
    
    # Load dataset
    df = pd.read_csv(dataset_path)
    
    # Determine threshold for chlr
    intensity_threshold = 300
    
    # Apply area filtering based on percentiles
    cell_area_min, cell_area_max = np.percentile(df['area'], area_percentiles)
    nuclear_area_min, nuclear_area_max = np.percentile(df['nuclear_area'], area_percentiles)
    
    # Filter cells and nuclei based on thresholds
    df_filtered = df[
        (df['area'] >= cell_area_min) & 
        (df['area'] <= cell_area_max) & 
        (df[intensity_column] > intensity_threshold)
    ].copy()
    
    nuclei_threshold = (
        (df_filtered['nuclear_area'] >= nuclear_area_min) & 
        (df_filtered['nuclear_area'] <= nuclear_area_max)
    )
    
    nuclear_cols = [col for col in df_filtered.columns if col.startswith('nuclear_')]
    df_filtered.loc[~nuclei_threshold, nuclear_cols] = np.nan
    
    # Convert target into categorical bins
    num_bins = 5
    df_filtered['conjugate_category'] = pd.qcut(df_filtered[intensity_column], q=num_bins, labels=False)
    
    y = df_filtered['conjugate_category']
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    X = df_filtered[feature_list]
    
    images = df_filtered['image_id']
    
    # Initialise storage for aggregated metrics
    all_fold_metrics = []
    class_report_list = []
    shap_values_list = []
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    
    # Track class distributions
    class_distributions = []
    
    # Outer CV: Stratified Group K-Fold
    outer_cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Create a dummy model for SHAP initialisation
    dummy_model = xgb.XGBClassifier()
    dummy_model.fit(X.iloc[:10], y_encoded[:10])
    explainer = shap.TreeExplainer(dummy_model)
    
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y_encoded, groups=images), start=1):
        print(f"\n=== Outer Fold {fold} ===")
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
        
        # Track class distribution in fold
        class_distributions.append({
            "train": np.bincount(y_train, minlength=num_bins),
            "test": np.bincount(y_test, minlength=num_bins)
        })
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        def objective(trial):
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'gamma': trial.suggest_float('gamma', 0, 5),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
                'n_estimators': trial.suggest_int('n_estimators', 50, 200)
            }
            
            model = xgb.XGBClassifier(random_state=42, **params)
            
            # Inner CV: Stratified K-Fold
            inner_cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
            inner_scores = []
            
            for inner_train_idx, inner_valid_idx in inner_cv.split(X_train_scaled, y_train, groups=images[train_idx]):
                X_inner_train = X_train_scaled[inner_train_idx]
                X_inner_valid = X_train_scaled[inner_valid_idx]
                y_inner_train = y_train[inner_train_idx]
                y_inner_valid = y_train[inner_valid_idx]
                
                model.fit(X_inner_train, y_inner_train)
                y_pred_inner = model.predict(X_inner_valid)
                score = accuracy_score(y_inner_valid, y_pred_inner)
                inner_scores.append(score)
            
            return np.mean(inner_scores)
        
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        
        best_params = study.best_params
        
        best_model = xgb.XGBClassifier(random_state=42, **best_params)
        best_model.fit(X_train_scaled, y_train)
        
        # Calculate comprehensive metrics
        y_test_pred = best_model.predict(X_test_scaled)
        y_test_proba = best_model.predict_proba(X_test_scaled)
        
        # Store fold metrics
        fold_metrics = {
            "fold": fold,
            "accuracy": accuracy_score(y_test, y_test_pred),
            "f1_weighted": f1_score(y_test, y_test_pred, average='weighted'),
            "precision_weighted": precision_score(y_test, y_test_pred, average='weighted'),
            "recall_weighted": recall_score(y_test, y_test_pred, average='weighted'),
            "roc_auc": roc_auc_score(y_test, y_test_proba, multi_class='ovr')
        }
        all_fold_metrics.append(fold_metrics)
        
        # Generate class-wise metrics
        class_report = classification_report(y_test, y_test_pred, output_dict=True)
        class_report_list.append(class_report)
        
        # Calculate SHAP values
        explainer = shap.TreeExplainer(best_model)
        shap_values = explainer.shap_values(X_test_scaled)
        shap_values_list.append(shap_values)
        
        # Plot ROC curves for each class
        try:
            fig, ax = plt.subplots(figsize=(10, 8))
            for i in range(len(np.unique(y_encoded))):
                try:
                    if i < len(y_test_proba[0]):  # Ensure class exists in predictions
                        fpr, tpr, _ = roc_curve((y_test == i).astype(int), y_test_proba[:, i])
                        roc_auc = auc(fpr, tpr)
                        ax.plot(fpr, tpr, label=f'Class {i} (AUC = {roc_auc:.2f})')
                        
                        # Store for aggregate ROC
                        if i == 0:  # Main class of interest
                            interp_tpr = np.interp(mean_fpr, fpr, tpr)
                            interp_tpr[0] = 0.0
                            tprs.append(interp_tpr)
                            aucs.append(roc_auc)
                except Exception as e:
                    print(f"Error plotting ROC for class {i}: {str(e)}")
                    
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title(f'Per-Class ROC Curves - Fold {fold}')
            ax.legend()
            plt.savefig(f"{output_dir}/per_class_roc_fold_{fold}_{dataset_name}.png")
        except Exception as e:
            print(f"Error generating ROC curve: {str(e)}")
        finally:
            plt.close(fig)
        
        # Plot confusion matrix
        cm = confusion_matrix(y_test, y_test_pred)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - Fold {fold}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        plt.savefig(f"{output_dir}/confusion_matrix_fold_{fold}_{dataset_name}.png")
        plt.close(fig)
        
        # Feature-target correlation analysis
        correlation_df = X_test.copy()
        correlation_df['target'] = y_test
        fig, ax = plt.subplots(figsize=(15, 10))
        corr_with_target = correlation_df.corr()['target'].sort_values(ascending=False)
        sns.heatmap(pd.DataFrame(corr_with_target).T, annot=True, cmap='coolwarm', ax=ax)
        ax.set_title(f'Feature-Target Correlations - Fold {fold}')
        plt.savefig(f"{output_dir}/feature_target_corr_fold_{fold}_{dataset_name}.png")
        plt.close(fig)
        
        # SHAP summary plot for this fold
        fig, ax = plt.subplots(figsize=(12, 10))
        shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
        plt.title(f'SHAP Feature Importance - Fold {fold}')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/shap_summary_fold_{fold}_{dataset_name}.png")
        plt.close(fig)
        
        # Feature importance plot
        fig, ax = plt.subplots(figsize=(12, 8))
        xgb.plot_importance(best_model, max_num_features=20, ax=ax)
        ax.set_title(f"Feature Importance - {conjugate_type.upper()} - Fold {fold}")
        plt.savefig(f"{output_dir}/feature_importance_{conjugate_type}_fold_{fold}_{dataset_name}.png")
        plt.close(fig)
        
        # Save model
        model_filename = f"{output_dir}/model_{dataset_name}_fold_{fold}.joblib"
        joblib.dump(best_model, model_filename)
        print(f"Model saved as {model_filename}")
    
    # After all folds complete:
    # 1. Aggregate performance metrics
    metrics_df = pd.DataFrame(all_fold_metrics)
    avg_metrics = {
        'accuracy': metrics_df['accuracy'].mean(),
        'accuracy_std': metrics_df['accuracy'].std(),
        'f1_weighted': metrics_df['f1_weighted'].mean(),
        'f1_weighted_std': metrics_df['f1_weighted'].std(),
        'precision_weighted': metrics_df['precision_weighted'].mean(),
        'precision_weighted_std': metrics_df['precision_weighted'].std(),
        'recall_weighted': metrics_df['recall_weighted'].mean(),
        'recall_weighted_std': metrics_df['recall_weighted'].std(),
        'roc_auc': metrics_df['roc_auc'].mean(),
        'roc_auc_std': metrics_df['roc_auc'].std()
    }
    
    # 2. Generate aggregate ROC curve
    if len(tprs) > 0 and len(aucs) > 0:
        try:
            fig, ax = plt.subplots(figsize=(10, 8))
            
            # Plot individual fold ROC curves (faded)
            for i, tpr in enumerate(tprs):
                ax.plot(mean_fpr, tpr, alpha=0.3, label=f'ROC fold {i+1} (AUC = {aucs[i]:.2f})')
            
            # Plot mean ROC
            mean_tpr = np.mean(tprs, axis=0)
            mean_auc = auc(mean_fpr, mean_tpr)
            std_auc = np.std(aucs)
            ax.plot(mean_fpr, mean_tpr, 'b-', label=f'Mean ROC (AUC = {mean_auc:.2f} ± {std_auc:.2f})', lw=2)
            
            # Plot standard deviation
            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2, label=r'± 1 std. dev.')
            
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('Aggregate ROC Curve')
            ax.legend(loc="lower right")
            plt.savefig(f"{output_dir}/aggregate_roc_{dataset_name}.png")
        except Exception as e:
            print(f"Error generating aggregate ROC curve: {str(e)}")
        finally:
            plt.close(fig)
    else:
        print("Warning: No ROC data collected across folds. Skipping aggregate ROC curve.")
    
    # 3. Class distribution analysis
    class_df = pd.DataFrame()
    for fold, dist in enumerate(class_distributions, 1):
        fold_df = pd.DataFrame({
            'fold': fold,
            'class': range(len(dist['train'])),
            'train_count': dist['train'],
            'test_count': dist['test']
        })
        class_df = pd.concat([class_df, fold_df])
    
    # Plot class distribution
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x='class', y='train_count', data=class_df, ax=ax)
    ax.set_title('Class Distribution Across Folds (Train)')
    plt.savefig(f"{output_dir}/class_distribution_train_{dataset_name}.png")
    plt.close(fig)
    
    return avg_metrics

# Run the analysis
avg_metrics = process_dataset(dataset_path, dataset_key)

# Print final results
print("\n=== Final Results ===")
print(f"Dataset: {dataset_key}")
print(f"Accuracy: {avg_metrics['accuracy']:.4f} ± {avg_metrics['accuracy_std']:.4f}")
print(f"F1 Score (weighted): {avg_metrics['f1_weighted']:.4f} ± {avg_metrics['f1_weighted_std']:.4f}")
print(f"Precision (weighted): {avg_metrics['precision_weighted']:.4f} ± {avg_metrics['precision_weighted_std']:.4f}")
print(f"Recall (weighted): {avg_metrics['recall_weighted']:.4f} ± {avg_metrics['precision_weighted_std']:.4f}")
print(f"ROC AUC: {avg_metrics['roc_auc']:.4f} ± {avg_metrics['roc_auc_std']:.4f}")



=== Processing 1.3_chlr ===
Using intensity column: chlr_intensity_mean


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
[I 2025-04-01 18:31:13,194] A new study created in memory with name: no-name-92972c20-7619-47b8-a68d-8674e81ccb0c



=== Outer Fold 1 ===


[I 2025-04-01 18:31:15,272] Trial 0 finished with value: 0.3880130767965131 and parameters: {'max_depth': 9, 'learning_rate': 0.04172364300364212, 'subsample': 0.7517151964775471, 'colsample_bytree': 0.7314004432092831, 'min_child_weight': 3, 'gamma': 3.439058442179847, 'reg_alpha': 0.8310674202959656, 'reg_lambda': 9.85507587792393, 'n_estimators': 184}. Best is trial 0 with value: 0.3880130767965131.
[I 2025-04-01 18:31:17,668] Trial 1 finished with value: 0.3887237246904276 and parameters: {'max_depth': 10, 'learning_rate': 0.03221584014191672, 'subsample': 0.9947456007665895, 'colsample_bytree': 0.8694315163543268, 'min_child_weight': 7, 'gamma': 2.9228421921621117, 'reg_alpha': 3.160730173101788, 'reg_lambda': 4.696216981559815, 'n_estimators': 174}. Best is trial 1 with value: 0.3887237246904276.
[I 2025-04-01 18:31:20,352] Trial 2 finished with value: 0.38765766868945467 and parameters: {'max_depth': 5, 'learning_rate': 0.02188190779625187, 'subsample': 0.6578604440193249, 'cols

Model saved as results/1.3_chlr_20250401_183111/model_1.3_chlr_fold_1.joblib

=== Outer Fold 2 ===


[I 2025-04-01 18:32:59,172] Trial 0 finished with value: 0.36607854806895834 and parameters: {'max_depth': 4, 'learning_rate': 0.09391413800264574, 'subsample': 0.8475367880567968, 'colsample_bytree': 0.8249414437303852, 'min_child_weight': 4, 'gamma': 2.4828188331260455, 'reg_alpha': 3.854436931539973, 'reg_lambda': 7.659051114493858, 'n_estimators': 185}. Best is trial 0 with value: 0.36607854806895834.
[I 2025-04-01 18:33:00,191] Trial 1 finished with value: 0.3574347660449628 and parameters: {'max_depth': 5, 'learning_rate': 0.09476269156387616, 'subsample': 0.7111541085350708, 'colsample_bytree': 0.9808130899556315, 'min_child_weight': 6, 'gamma': 3.8368001403991996, 'reg_alpha': 9.660101544948198, 'reg_lambda': 3.1826078499622703, 'n_estimators': 193}. Best is trial 0 with value: 0.36607854806895834.
[I 2025-04-01 18:33:02,288] Trial 2 finished with value: 0.36771388473576333 and parameters: {'max_depth': 10, 'learning_rate': 0.035690367316231326, 'subsample': 0.9833427307501631,

Model saved as results/1.3_chlr_20250401_183111/model_1.3_chlr_fold_2.joblib

=== Outer Fold 3 ===


[I 2025-04-01 18:34:35,666] Trial 0 finished with value: 0.3635949943117178 and parameters: {'max_depth': 8, 'learning_rate': 0.08769659463256715, 'subsample': 0.6043124996672513, 'colsample_bytree': 0.597838159063327, 'min_child_weight': 1, 'gamma': 3.896550537134265, 'reg_alpha': 9.325556954200977, 'reg_lambda': 4.603380930175312, 'n_estimators': 172}. Best is trial 0 with value: 0.3635949943117178.
[I 2025-04-01 18:34:37,674] Trial 1 finished with value: 0.3736063708759954 and parameters: {'max_depth': 5, 'learning_rate': 0.04616386610599324, 'subsample': 0.7823125594428819, 'colsample_bytree': 0.6588569226445256, 'min_child_weight': 1, 'gamma': 1.5474574978351834, 'reg_alpha': 4.581907638695235, 'reg_lambda': 5.8452733746239725, 'n_estimators': 198}. Best is trial 1 with value: 0.3736063708759954.
[I 2025-04-01 18:34:39,207] Trial 2 finished with value: 0.37178612059158134 and parameters: {'max_depth': 9, 'learning_rate': 0.05402946847706527, 'subsample': 0.9256221742401918, 'colsa

Model saved as results/1.3_chlr_20250401_183111/model_1.3_chlr_fold_3.joblib

=== Outer Fold 4 ===


[I 2025-04-01 18:36:00,287] Trial 0 finished with value: 0.36803364879074657 and parameters: {'max_depth': 3, 'learning_rate': 0.06062731936062546, 'subsample': 0.8766702446405223, 'colsample_bytree': 0.5820983782422224, 'min_child_weight': 3, 'gamma': 4.530948210942617, 'reg_alpha': 9.257261875062234, 'reg_lambda': 2.6457563211198423, 'n_estimators': 163}. Best is trial 0 with value: 0.36803364879074657.
[I 2025-04-01 18:36:01,351] Trial 1 finished with value: 0.3724734197920318 and parameters: {'max_depth': 9, 'learning_rate': 0.04577612459401719, 'subsample': 0.7576822407293294, 'colsample_bytree': 0.9520511925514024, 'min_child_weight': 8, 'gamma': 4.139050348458767, 'reg_alpha': 4.647634975428617, 'reg_lambda': 6.854525925918418, 'n_estimators': 121}. Best is trial 1 with value: 0.3724734197920318.
[I 2025-04-01 18:36:02,792] Trial 2 finished with value: 0.3752774856875803 and parameters: {'max_depth': 4, 'learning_rate': 0.01505769520725447, 'subsample': 0.8729526251567254, 'cols

Model saved as results/1.3_chlr_20250401_183111/model_1.3_chlr_fold_4.joblib

=== Outer Fold 5 ===


[I 2025-04-01 18:37:01,897] Trial 0 finished with value: 0.37556497550946366 and parameters: {'max_depth': 10, 'learning_rate': 0.06910438202821752, 'subsample': 0.7580465708179256, 'colsample_bytree': 0.8677625326795448, 'min_child_weight': 1, 'gamma': 0.6770326725192682, 'reg_alpha': 0.12302291484161287, 'reg_lambda': 6.442520034831366, 'n_estimators': 109}. Best is trial 0 with value: 0.37556497550946366.
[I 2025-04-01 18:37:03,791] Trial 1 finished with value: 0.38275152687911956 and parameters: {'max_depth': 6, 'learning_rate': 0.02835931926839002, 'subsample': 0.9618125958352092, 'colsample_bytree': 0.9589569377751512, 'min_child_weight': 6, 'gamma': 1.7003546081766285, 'reg_alpha': 3.0657445987098217, 'reg_lambda': 3.063609574347784, 'n_estimators': 83}. Best is trial 1 with value: 0.38275152687911956.
[I 2025-04-01 18:37:04,530] Trial 2 finished with value: 0.3754483078349559 and parameters: {'max_depth': 4, 'learning_rate': 0.07279666229553529, 'subsample': 0.9950261291908022,

Model saved as results/1.3_chlr_20250401_183111/model_1.3_chlr_fold_5.joblib

=== Final Results ===
Dataset: 1.3_chlr
Accuracy: 0.3788 ± 0.0094
F1 Score (weighted): 0.3683 ± 0.0121
Precision (weighted): 0.3699 ± 0.0165
Recall (weighted): 0.3788 ± 0.0165
ROC AUC: 0.7127 ± 0.0097
