In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import joblib
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, 
                            roc_auc_score, roc_curve, classification_report, confusion_matrix, auc)
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import os
import warnings
from datetime import datetime

# Parameters
needle_height = '1.3'
conjugate = 'chlr'
n_trials = 50
dataset_key = f"{needle_height}_{conjugate}" # Dataset name

# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"results/{dataset_key}_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Base directory
base_dir = r"D:\20241129_solid_nN_1.3_2.4_mdck_siRNA_tnsfn_chlr\20241129_solid_nN_1.3_mdck_chlr_dataset"
dataset_path = base_dir + r"\solid_1.3_chlr_cell_level.csv"


In [18]:
# Feature Definition

# Define morphological and intensity features
cell_morph_features = [
    'area', 'perimeter', 'major_axis_length', 'minor_axis_length', 
    'eccentricity', 'circularity', 'solidity', 'orientation'
]

nuclear_morph_features = [
    'nuclear_area', 'nuclear_perimeter', 'nuclear_major_axis_length', 
    'nuclear_minor_axis_length', 'nuclear_eccentricity', 'nuclear_circularity', 
    'nuclear_solidity', 'nuclear_orientation'
]

channel_feature_suffixes = [
    'intensity_p10', 'intensity_p25', 'intensity_p50', 
    'intensity_p75', 'intensity_p90'
]

protein_channels = ['actin', 'caveolin', 'clathrin_hc', 'nuclei']

# Generate feature list with caveolin features first to ensure dominance
feature_list = cell_morph_features + nuclear_morph_features

for suffix in channel_feature_suffixes:
    feature_list.append(f"caveolin_{suffix}")

for ch in protein_channels:
    if ch != 'caveolin':
        for suffix in channel_feature_suffixes:
            feature_list.append(f"{ch}_{suffix}")

print(feature_list)


['area', 'perimeter', 'major_axis_length', 'minor_axis_length', 'eccentricity', 'circularity', 'solidity', 'orientation', 'nuclear_area', 'nuclear_perimeter', 'nuclear_major_axis_length', 'nuclear_minor_axis_length', 'nuclear_eccentricity', 'nuclear_circularity', 'nuclear_solidity', 'nuclear_orientation', 'caveolin_intensity_p10', 'caveolin_intensity_p25', 'caveolin_intensity_p50', 'caveolin_intensity_p75', 'caveolin_intensity_p90', 'actin_intensity_p10', 'actin_intensity_p25', 'actin_intensity_p50', 'actin_intensity_p75', 'actin_intensity_p90', 'clathrin_hc_intensity_p10', 'clathrin_hc_intensity_p25', 'clathrin_hc_intensity_p50', 'clathrin_hc_intensity_p75', 'clathrin_hc_intensity_p90', 'nuclei_intensity_p10', 'nuclei_intensity_p25', 'nuclei_intensity_p50', 'nuclei_intensity_p75', 'nuclei_intensity_p90']


In [19]:
# Data Processing

def process_dataset(dataset_path, dataset_key, area_percentiles=(2, 98)):
    print(f"\n=== Processing {dataset_key} ===")

    # Extract conjugate type from dataset_key
    conjugate_type = dataset_key.split('_')[1]

    # Set the correct intensity column name
    intensity_column = f"{conjugate_type}_intensity_mean"
    intensity_threshold = 300

    print(f"Using intensity column: {intensity_column}")

    # Load dataset
    df = pd.read_csv(dataset_path)

    # Apply area filtering based on percentiles
    cell_area_min, cell_area_max = np.percentile(df['area'], area_percentiles)
    nuclear_area_min, nuclear_area_max = np.percentile(df['nuclear_area'], area_percentiles)

    # Filter cells based on area thresholds
    df_filtered = df[(df['area'] >= cell_area_min) & (df['area'] <= cell_area_max)].copy()

    # Filter nuclei based on area thresholds
    nuclei_threshold = (
        (df_filtered['nuclear_area'] >= nuclear_area_min) & 
        (df_filtered['nuclear_area'] <= nuclear_area_max)
    )

    nuclear_cols = [col for col in df_filtered.columns if col.startswith('nuclear_')]
    df_filtered.loc[~nuclei_threshold, nuclear_cols] = np.nan

    # Create binary target variable
    df_filtered['conjugate_category'] = (df_filtered[intensity_column] > intensity_threshold).astype(int)

    print("Unique conjugate_category values:", df_filtered['conjugate_category'].unique())
    print("Value counts:\n", df_filtered['conjugate_category'].value_counts())

    # Define X, y, and images
    X = df_filtered[feature_list]
    y = df_filtered['conjugate_category']
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    images = df_filtered['image_id']
    
    return X, y_encoded, images


In [20]:
# Model Training and Evaluation

X, y_encoded, images = process_dataset(dataset_path, dataset_key)

# Dictionaries for aggregated metrics
all_fold_metrics = []
class_report_list = []
shap_values_list = []
mean_fpr = np.linspace(0, 1, 100)
tprs = []
aucs = []
    
# Outer CV: Stratified Group K-Fold
outer_cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    
for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y_encoded, groups=images), start=1):
    print(f"\n=== Outer Fold {fold} ===")
    print(f"Fold {fold}: n_test={len(test_idx)}")
    print(f"Class distribution: Class 0: {np.sum(y_encoded[test_idx] == 0)}, Class 1: {np.sum(y_encoded[test_idx] == 1)}")
        
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y_encoded[train_idx], y_encoded[test_idx]
        
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
        
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200)
            }
            
        model = xgb.XGBClassifier(random_state=42, **params)
            
        inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        inner_scores = []
            
        for inner_train_idx, inner_valid_idx in inner_cv.split(X_train_scaled, y_train):
            X_inner_train = X_train_scaled[inner_train_idx]
            X_inner_valid = X_train_scaled[inner_valid_idx]
            y_inner_train = y_train[inner_train_idx]
            y_inner_valid = y_train[inner_valid_idx]

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model.fit(X_inner_train, y_inner_train)
                
            y_pred_inner = model.predict(X_inner_valid)
            score = accuracy_score(y_inner_valid, y_pred_inner)
            inner_scores.append(score)
            
        return np.mean(inner_scores)
        
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    best_model = xgb.XGBClassifier(random_state=42, **best_params)
    best_model.fit(X_train_scaled, y_train)
        
    # Calculate comprehensive metrics
    y_test_pred = best_model.predict(X_test_scaled)
    y_test_proba = best_model.predict_proba(X_test_scaled)
        
    # Store fold metrics
    fold_metrics = {
        "fold": fold,
        "accuracy": accuracy_score(y_test, y_test_pred),
        "f1_weighted": f1_score(y_test, y_test_pred, average='weighted'),
        "precision_weighted": precision_score(y_test, y_test_pred, average='weighted'),
        "recall_weighted": recall_score(y_test, y_test_pred, average='weighted'),
        "roc_auc": roc_auc_score(y_test, y_test_proba[:, 1])
    }
    all_fold_metrics.append(fold_metrics)
        
    # Generate class-wise metrics
    class_report = classification_report(y_test, y_test_pred, output_dict=True)
    class_report_list.append(class_report)
        
    # Calculate SHAP values
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_scaled)
    shap_values_list.append(shap_values)
        
    # Save model
    model_filename = f"{output_dir}/model_{dataset_key}_fold_{fold}.joblib"
    joblib.dump(best_model, model_filename)
    print(f"Model saved as {model_filename}")
        
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_proba[:, 1])
    roc_auc = auc(fpr, tpr)
        
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(roc_auc)
    
# After all folds complete
metrics_df = pd.DataFrame(all_fold_metrics)
avg_metrics = {
    'accuracy': metrics_df['accuracy'].mean(),
    'accuracy_std': metrics_df['accuracy'].std(),
    'f1_weighted': metrics_df['f1_weighted'].mean(),
    'f1_weighted_std': metrics_df['f1_weighted'].std(),
    'precision_weighted': metrics_df['precision_weighted'].mean(),
    'precision_weighted_std': metrics_df['precision_weighted'].std(),
    'recall_weighted': metrics_df['recall_weighted'].mean(),
    'recall_weighted_std': metrics_df['recall_weighted'].std(),
    'roc_auc': metrics_df['roc_auc'].mean(),
    'roc_auc_std': metrics_df['roc_auc'].std()
}
    
best_fold_idx = np.argmax(metrics_df['roc_auc'])
best_fold = metrics_df.iloc[best_fold_idx]['fold']
best_model_path = f"{output_dir}/model_{dataset_key}_fold_{int(best_fold)}.joblib"
best_model = joblib.load(best_model_path)

# Print results
print("\n=== Final Results ===")
print(f"Dataset: {dataset_key}")
print(f"Accuracy: {avg_metrics['accuracy']:.4f} ± {avg_metrics['accuracy_std']:.4f}")
print(f"F1 Score (weighted): {avg_metrics['f1_weighted']:.4f} ± {avg_metrics['f1_weighted_std']:.4f}")
print(f"Precision (weighted): {avg_metrics['precision_weighted']:.4f} ± {avg_metrics['precision_weighted_std']:.4f}")
print(f"Recall (weighted): {avg_metrics['recall_weighted']:.4f} ± {avg_metrics['recall_weighted_std']:.4f}")
print(f"ROC AUC: {avg_metrics['roc_auc']:.4f} ± {avg_metrics['roc_auc_std']:.4f}")
print(f"Best Fold: {best_fold}")



=== Processing 1.3_chlr ===
Using intensity column: chlr_intensity_mean
Unique conjugate_category values: [1 0]
Value counts:
 conjugate_category
0    16180
1    10745
Name: count, dtype: int64


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
[I 2025-04-13 18:20:10,776] A new study created in memory with name: no-name-25be72a2-0bdd-4e67-a8e6-17d268055225



=== Outer Fold 1 ===
Fold 1: n_test=5031
Class distribution: Class 0: 2949, Class 1: 2082


[I 2025-04-13 18:20:11,161] Trial 0 finished with value: 0.8269845619804513 and parameters: {'max_depth': 4, 'learning_rate': 0.08875254008934962, 'subsample': 0.6961156576431756, 'colsample_bytree': 0.9733428027089286, 'min_child_weight': 3, 'gamma': 4.762815923258452, 'reg_alpha': 0.6443986102477228, 'reg_lambda': 9.6972747538607, 'n_estimators': 65}. Best is trial 0 with value: 0.8269845619804513.
[I 2025-04-13 18:20:11,754] Trial 1 finished with value: 0.8239700374531834 and parameters: {'max_depth': 7, 'learning_rate': 0.04986442494080787, 'subsample': 0.6796955169412799, 'colsample_bytree': 0.8132357308959421, 'min_child_weight': 7, 'gamma': 4.065510881829434, 'reg_alpha': 8.476081373992853, 'reg_lambda': 8.414779833019445, 'n_estimators': 68}. Best is trial 0 with value: 0.8269845619804513.
[I 2025-04-13 18:20:12,281] Trial 2 finished with value: 0.8225084498035992 and parameters: {'max_depth': 5, 'learning_rate': 0.03320765549637375, 'subsample': 0.8017897812247957, 'colsample_

Model saved as results/1.3_chlr_20250413_182009/model_1.3_chlr_fold_1.joblib

=== Outer Fold 2 ===
Fold 2: n_test=5247
Class distribution: Class 0: 3241, Class 1: 2006


[I 2025-04-13 18:20:52,685] Trial 0 finished with value: 0.8280284159055263 and parameters: {'max_depth': 10, 'learning_rate': 0.07148398584644437, 'subsample': 0.7004221473772211, 'colsample_bytree': 0.9318156142336065, 'min_child_weight': 4, 'gamma': 3.5666478158648434, 'reg_alpha': 4.7245188170861185, 'reg_lambda': 4.345351344017914, 'n_estimators': 128}. Best is trial 0 with value: 0.8280284159055263.
[I 2025-04-13 18:20:53,276] Trial 1 finished with value: 0.8231847956453547 and parameters: {'max_depth': 10, 'learning_rate': 0.044129342798053195, 'subsample': 0.9578038570035813, 'colsample_bytree': 0.6953370649261748, 'min_child_weight': 1, 'gamma': 3.8157769874155054, 'reg_alpha': 5.81213988729541, 'reg_lambda': 1.8084216776870454, 'n_estimators': 54}. Best is trial 0 with value: 0.8280284159055263.
[I 2025-04-13 18:20:53,687] Trial 2 finished with value: 0.827659378171418 and parameters: {'max_depth': 5, 'learning_rate': 0.08630735837088979, 'subsample': 0.996565050393184, 'cols

Model saved as results/1.3_chlr_20250413_182009/model_1.3_chlr_fold_2.joblib

=== Outer Fold 3 ===
Fold 3: n_test=5462
Class distribution: Class 0: 3327, Class 1: 2135


[I 2025-04-13 18:21:31,624] Trial 0 finished with value: 0.8319435237981928 and parameters: {'max_depth': 6, 'learning_rate': 0.07820193792664017, 'subsample': 0.6687977979284145, 'colsample_bytree': 0.9406762956956888, 'min_child_weight': 3, 'gamma': 1.5470376147987441, 'reg_alpha': 2.5204994053012086, 'reg_lambda': 0.16567327635386153, 'n_estimators': 93}. Best is trial 0 with value: 0.8319435237981928.
[I 2025-04-13 18:21:32,397] Trial 1 finished with value: 0.8293809721125749 and parameters: {'max_depth': 8, 'learning_rate': 0.0390433310494516, 'subsample': 0.9193401210424086, 'colsample_bytree': 0.562632525179986, 'min_child_weight': 4, 'gamma': 3.490063776166157, 'reg_alpha': 8.465229578470767, 'reg_lambda': 8.561060918073753, 'n_estimators': 139}. Best is trial 0 with value: 0.8319435237981928.
[I 2025-04-13 18:21:33,289] Trial 2 finished with value: 0.8314310421142506 and parameters: {'max_depth': 10, 'learning_rate': 0.04358827873218672, 'subsample': 0.8876365582712651, 'colsa

Model saved as results/1.3_chlr_20250413_182009/model_1.3_chlr_fold_3.joblib

=== Outer Fold 4 ===
Fold 4: n_test=5639
Class distribution: Class 0: 3373, Class 1: 2266


[I 2025-04-13 18:22:18,709] Trial 0 finished with value: 0.8342099715595429 and parameters: {'max_depth': 8, 'learning_rate': 0.0930360582329175, 'subsample': 0.6341383768695057, 'colsample_bytree': 0.8470594964429421, 'min_child_weight': 10, 'gamma': 3.4345578963218895, 'reg_alpha': 1.3572220770275933, 'reg_lambda': 7.452079292690669, 'n_estimators': 62}. Best is trial 0 with value: 0.8342099715595429.
[I 2025-04-13 18:22:19,012] Trial 1 finished with value: 0.8232638979925365 and parameters: {'max_depth': 3, 'learning_rate': 0.06716530633931787, 'subsample': 0.8127865421005536, 'colsample_bytree': 0.8256208512492125, 'min_child_weight': 5, 'gamma': 2.8316434988257315, 'reg_alpha': 8.968871955828137, 'reg_lambda': 4.4255424234248615, 'n_estimators': 100}. Best is trial 0 with value: 0.8342099715595429.
[I 2025-04-13 18:22:19,510] Trial 2 finished with value: 0.8249551438455768 and parameters: {'max_depth': 5, 'learning_rate': 0.036886921403890514, 'subsample': 0.7769340020546263, 'col

Model saved as results/1.3_chlr_20250413_182009/model_1.3_chlr_fold_4.joblib

=== Outer Fold 5 ===
Fold 5: n_test=5546
Class distribution: Class 0: 3290, Class 1: 2256


[I 2025-04-13 18:23:01,792] Trial 0 finished with value: 0.8255763945270879 and parameters: {'max_depth': 3, 'learning_rate': 0.08622263763544448, 'subsample': 0.644420379039413, 'colsample_bytree': 0.5120900358411729, 'min_child_weight': 7, 'gamma': 4.275380897103565, 'reg_alpha': 9.142618999815944, 'reg_lambda': 4.060389068082597, 'n_estimators': 177}. Best is trial 0 with value: 0.8255763945270879.
[I 2025-04-13 18:23:02,466] Trial 1 finished with value: 0.8282427394841433 and parameters: {'max_depth': 5, 'learning_rate': 0.04138622413425877, 'subsample': 0.821013977763588, 'colsample_bytree': 0.6389013277621138, 'min_child_weight': 4, 'gamma': 3.114084566162538, 'reg_alpha': 6.2933989807326505, 'reg_lambda': 9.77332313939019, 'n_estimators': 181}. Best is trial 1 with value: 0.8282427394841433.
[I 2025-04-13 18:23:02,930] Trial 2 finished with value: 0.8242199844755554 and parameters: {'max_depth': 9, 'learning_rate': 0.06881780491758585, 'subsample': 0.9487804114116943, 'colsample

Model saved as results/1.3_chlr_20250413_182009/model_1.3_chlr_fold_5.joblib

=== Final Results ===
Dataset: 1.3_chlr
Accuracy: 0.8339 ± 0.0081
F1 Score (weighted): 0.8328 ± 0.0079
Precision (weighted): 0.8331 ± 0.0081
Recall (weighted): 0.8339 ± 0.0081
ROC AUC: 0.9103 ± 0.0086
Best Fold: 2.0


In [21]:
#Visualisation

# Aggregate ROC curve
plt.figure(figsize=(10, 8))
for i, tpr in enumerate(tprs):
    plt.plot(mean_fpr, tpr, alpha=0.3, label=f'ROC fold {i+1} (AUC = {aucs[i]:.2f})')

mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, 'b-', label=f'Mean ROC (AUC = {mean_auc:.2f} ± {std_auc:.2f})', lw=2)

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Aggregate ROC Curve for Class 1')
plt.legend(loc="lower right")
plt.savefig(f"{output_dir}/aggregate_roc_class1_{dataset_key}.png")
plt.close()

# SHAP plot for best fold
fig, ax = plt.subplots(figsize=(12, 10))
X_test = X.iloc[test_idx]
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="beeswarm", show=False)
plt.title(f'SHAP Feature Importance - Best Fold {best_fold}')
plt.tight_layout()
plt.savefig(f"{output_dir}/best_fold_shap_beeswarm_{dataset_key}.png")
plt.close(fig)

# SHAP plot for best model
X_test = X.iloc[test_idx]
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
plt.title(f'SHAP Feature Importance - Best Fold {best_fold}')
plt.tight_layout()
plt.savefig(f"{output_dir}/best_fold_shap_importance_{dataset_key}.png")
plt.close()