In [7]:
pip install numpy pandas matplotlib seaborn joblib scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
# For 50:50 split of actives:inactives

In [23]:
# -*- coding: utf-8 -*-
"""
ML-based Virtual Screening for MIC based activity prediction (v5.1 - RFE + Visualization)
Performs RFECV feature selection, visualizes selected feature space using PCA/t-SNE,
then trains and evaluates classification models using the selected features within pipelines.

Enhancements v5.1:
- Added PCA and t-SNE visualization step after RFECV feature selection
  to assess class separation based on selected features.
- Kept RFECV implementation and pipeline structure from V5.
"""

import os
import pandas as pd
import numpy as np
import joblib # Used for saving/loading models (Pipelines & Selector)
import matplotlib.pyplot as plt
import seaborn as sns
import glob         # For finding files
from PIL import Image # For combining images
import time         # To time feature selection

# Preprocessing & Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # Used for RFE and as a model
from sklearn.svm import SVC
import lightgbm as lgb

# Feature Selection & Dimensionality Reduction
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA         # Import PCA
from sklearn.manifold import TSNE           # Import TSNE

# Model Selection and Evaluation
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    make_scorer
)

# --- Configuration ---
# One seed is reused for NumPy and for every seeded estimator / CV splitter.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input files and the name of the label column.
TRAIN_DATASET_PATH = "train_dataset.csv"
TEST_DATASET_PATH = "test_dataset.csv"
TARGET_COLUMN = "Activity_Label"

# Output layout: <BASE_DIR>/models and <BASE_DIR>/results/{figures,csv}.
BASE_DIR = "mic_activity_prediction_study_v5_1_RFECV_Viz"
MODEL_DIR = os.path.join(BASE_DIR, "models")
RESULTS_DIR = os.path.join(BASE_DIR, "results")
FIGURES_DIR = os.path.join(RESULTS_DIR, "figures")
CSV_DIR = os.path.join(RESULTS_DIR, "csv")

# Recursive feature elimination settings.
RFE_ESTIMATOR = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight='balanced',
)
RFE_CV_FOLDS = 5
RFE_SCORING = 'matthews_corrcoef'
RFE_MIN_FEATURES = 10

# --- Create Directories ---
print("Creating output directories...")
# exist_ok=True keeps the cell re-runnable when the folders already exist.
for dir_path in (MODEL_DIR, RESULTS_DIR, FIGURES_DIR, CSV_DIR):
    os.makedirs(dir_path, exist_ok=True)

# --- Load Data ---
print("\nLoading datasets...")
try:
    train_df = pd.read_csv(TRAIN_DATASET_PATH)
    test_df = pd.read_csv(TEST_DATASET_PATH)
except FileNotFoundError as e:
    # Re-raise rather than calling exit(): the cell still stops, but the
    # notebook kernel is not shut down.
    print(f"Error loading data: {e}.")
    raise
print(f"  Training data shape: {train_df.shape}")
print(f"  Test data shape: {test_df.shape}")

# Check the label column up front so a missing column produces a clear message.
for split_name, split_df in (("training", train_df), ("test", test_df)):
    if TARGET_COLUMN not in split_df.columns:
        raise KeyError(f"Error: Target column '{TARGET_COLUMN}' not found in the {split_name} dataset.")

initial_features = [col for col in train_df.columns if col != TARGET_COLUMN]
print(f"\nIdentified {len(initial_features)} initial features.")
y_test = test_df[TARGET_COLUMN]  # Keep test labels aside

# Clean the training features without in-place mutation (avoids pandas'
# SettingWithCopyWarning):
#   1. +/-inf becomes NaN,
#   2. every row that still contains a NaN is dropped,
#   3. remaining values are limited to [-1e6, 1e6].
X_train_initial = (
    train_df[initial_features]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .clip(-1e6, 1e6)
)
if X_train_initial.empty:
    raise ValueError("No training rows remain after dropping rows with NaN/inf feature values.")

# Align the target with the rows that survived cleaning.
y_train = train_df.loc[X_train_initial.index, TARGET_COLUMN]

Creating output directories...

Loading datasets...
  Training data shape: (680, 7769)
  Test data shape: (170, 7769)

Identified 7768 initial features.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_initial.replace([np.inf, -np.inf], np.nan, inplace=True)


In [19]:
# --- Preprocessing for Feature Selection ---
print("\nPreprocessing data for RFECV (Imputation)...")
fs_imputer = SimpleImputer(strategy="median")
X_train_imputed_fs = fs_imputer.fit_transform(X_train_initial)
# Re-attach column names and the original row index to the imputed array.
X_train_imputed_fs_df = pd.DataFrame(
    X_train_imputed_fs,
    index=X_train_initial.index,
    columns=initial_features,
)
print("  Imputation complete for feature selection step.")

# --- Feature Selection using RFECV ---
print(f"\nStarting Feature Selection using RFECV (Estimator: {RFE_ESTIMATOR.__class__.__name__})...")
cv_strategy_rfe = StratifiedKFold(
    n_splits=RFE_CV_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# step=1 removes one feature per elimination round.
selector = RFECV(
    estimator=RFE_ESTIMATOR,
    step=1,
    cv=cv_strategy_rfe,
    scoring=RFE_SCORING,
    min_features_to_select=RFE_MIN_FEATURES,
    n_jobs=-1,
    verbose=1,
)

start_time = time.time()
print("  Fitting RFECV (this may take some time)...")
selector.fit(X_train_imputed_fs_df, y_train)
end_time = time.time()
print(f"  RFECV fitting completed in {(end_time - start_time):.2f} seconds.")

# Boolean mask over the input columns -> list of retained feature names.
selected_features_mask = selector.support_
selected_features_rfe = list(X_train_imputed_fs_df.columns[selected_features_mask])
print(f"\nRFECV selected {len(selected_features_rfe)} features:")
print(selected_features_rfe)

# Persist the fitted selector next to the models.
selector_filename = os.path.join(MODEL_DIR, "rfecv_selector.pkl")
joblib.dump(selector, selector_filename)
print(f"  RFECV selector object saved to {selector_filename}")


Preprocessing data for RFECV (Imputation)...
  Imputation complete for feature selection step.

Starting Feature Selection using RFECV (Estimator: RandomForestClassifier)...
  Fitting RFECV (this may take some time)...
Fitting estimator with 7768 features.
Fitting estimator with 7767 features.
Fitting estimator with 7766 features.
Fitting estimator with 7765 features.
Fitting estimator with 7764 features.
Fitting estimator with 7763 features.
Fitting estimator with 7762 features.
Fitting estimator with 7761 features.
Fitting estimator with 7760 features.
Fitting estimator with 7759 features.
Fitting estimator with 7758 features.
Fitting estimator with 7757 features.
Fitting estimator with 7756 features.
Fitting estimator with 7755 features.
Fitting estimator with 7754 features.
Fitting estimator with 7753 features.
Fitting estimator with 7752 features.
Fitting estimator with 7751 features.
Fitting estimator with 7750 features.
Fitting estimator with 7749 features.
Fitting estimator wi

In [20]:
# --- Dimensionality Reduction Visualization (using RFECV selected features on Training Data) ---
def _save_class_scatter(coords, labels, title, xlabel, ylabel, fig_path):
    """Draw a 2-D scatter coloured by class label and save it to ``fig_path``.

    Parameters
    ----------
    coords : 2-D array whose first two columns are plotted as x and y.
    labels : values passed to seaborn as ``hue`` (one per row of ``coords``).
    title, xlabel, ylabel : text placed on the figure.
    fig_path : destination image path.

    Returns
    -------
    The matplotlib Axes returned by ``sns.scatterplot``.
    """
    plt.figure(figsize=(10, 7))
    ax = sns.scatterplot(
        x=coords[:, 0], y=coords[:, 1],
        hue=labels,  # Color by activity label
        palette='viridis',
        alpha=0.7,
        legend='full'
    )
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(title=TARGET_COLUMN)
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    plt.close()  # Close plot to free memory
    return ax


print("\n--- Generating PCA and t-SNE plots using RFECV selected features ---")

# 1. Select the features chosen by RFECV, restricted to the cleaned training rows
X_train_selected_viz = train_df.loc[X_train_initial.index, selected_features_rfe]

# 2. Preprocess this selected subset (Impute + Scale) for visualization
print("  Preprocessing selected features for visualization (Impute + Scale)...")
viz_imputer = SimpleImputer(strategy="median")
viz_scaler = StandardScaler()
preprocess_pipeline_viz = Pipeline([
    ('imputer', viz_imputer),
    ('scaler', viz_scaler)
])
X_train_processed_viz = preprocess_pipeline_viz.fit_transform(X_train_selected_viz)
print("  Preprocessing for visualization complete.")

# 3. Apply PCA
print("  Applying PCA...")
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_train_processed_viz)
print(f"  PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")

# 4. Apply t-SNE
print("  Applying t-SNE (this might take a moment)...")
# Newer scikit-learn releases rename TSNE's `n_iter` to `max_iter`. Pick the
# name the installed version exposes so the cell neither warns nor fails.
tsne_iter_param = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"
tsne = TSNE(
    n_components=2,
    random_state=RANDOM_STATE,
    perplexity=30,
    init='pca',
    learning_rate='auto',
    **{tsne_iter_param: 2000},  # more iterations than the default
)
X_tsne = tsne.fit_transform(X_train_processed_viz)
print("  t-SNE fitting complete.")

# 5. Create Plots
plt.style.use('seaborn-v0_8-whitegrid')  # Use a clean style

# PCA Plot
print("  Generating PCA Plot...")
pca_fig_path = os.path.join(FIGURES_DIR, "pca_plot_rfecv_features_v5.png")
scatter_pca = _save_class_scatter(
    X_pca,
    y_train,
    title=f'PCA of Training Data using {len(selected_features_rfe)} RFECV Selected Features',
    xlabel=f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2f} Variance)',
    ylabel=f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2f} Variance)',
    fig_path=pca_fig_path,
)
print(f"  PCA plot saved: {pca_fig_path}")

# t-SNE Plot
print("  Generating t-SNE Plot...")
tsne_fig_path = os.path.join(FIGURES_DIR, "tsne_plot_rfecv_features_v5.png")
scatter_tsne = _save_class_scatter(
    X_tsne,
    y_train,
    title=f't-SNE of Training Data using {len(selected_features_rfe)} RFECV Selected Features',
    xlabel='t-SNE Component 1',
    ylabel='t-SNE Component 2',
    fig_path=tsne_fig_path,
)
print(f"  t-SNE plot saved: {tsne_fig_path}")
print("--- Visualization Complete ---")


--- Generating PCA and t-SNE plots using RFECV selected features ---
  Preprocessing selected features for visualization (Impute + Scale)...
  Preprocessing for visualization complete.
  Applying PCA...
  PCA Explained Variance Ratio: [0.18617181 0.14377548]
  Applying t-SNE (this might take a moment)...




  t-SNE fitting complete.
  Generating PCA Plot...
  PCA plot saved: mic_activity_prediction_study_v5_1_RFECV_Viz/results/figures/pca_plot_rfecv_features_v5.png
  Generating t-SNE Plot...
  t-SNE plot saved: mic_activity_prediction_study_v5_1_RFECV_Viz/results/figures/tsne_plot_rfecv_features_v5.png
--- Visualization Complete ---


In [25]:
# --- Define Pipelines and Hyperparameter Grids (using RFECV selected features) ---
def _build_pipeline(model, scale=False):
    """Return an impute -> (optional scale) -> model Pipeline.

    Every call creates its own SimpleImputer / StandardScaler, so fitting one
    pipeline directly can never overwrite the preprocessing state of another.

    Parameters
    ----------
    model : the final estimator, stored under the step name 'model'.
    scale : when True, a StandardScaler step named 'scaler' is inserted
        between the 'imputer' and 'model' steps.
    """
    steps = [('imputer', SimpleImputer(strategy="median"))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    return Pipeline(steps)


print("\nDefining pipelines and hyperparameter grids using RFECV features...")
pipelines = {}
param_grids = {}

# Grid keys use the 'model__' prefix so GridSearchCV routes them to the final step.
pipelines['kNN'] = _build_pipeline(KNeighborsClassifier(), scale=True)
param_grids['kNN'] = {'model__n_neighbors': list(range(1, 28, 2))}

pipelines['MLP'] = _build_pipeline(
    MLPClassifier(random_state=RANDOM_STATE, early_stopping=True, validation_fraction=0.1),
    scale=True,
)
param_grids['MLP'] = {
    'model__hidden_layer_sizes': [(n,) for n in range(20, 101, 20)] + [(50, 25), (100, 50)],
    'model__activation': ['relu', 'tanh'],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'adaptive'],
    'model__max_iter': [1500],
}

pipelines['Naive Bayes'] = _build_pipeline(GaussianNB())
param_grids['Naive Bayes'] = {'model__var_smoothing': np.logspace(-9, -2, 50)}

pipelines['Decision Tree'] = _build_pipeline(
    DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight='balanced')
)
param_grids['Decision Tree'] = {
    'model__criterion': ["gini", "entropy"],
    'model__min_samples_split': [2, 5, 10, 20, 50],
    'model__min_samples_leaf': [1, 5, 10, 20],
    'model__max_depth': [None] + list(range(5, 31, 5)),
}

pipelines['Random Forest'] = _build_pipeline(
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced')
)
param_grids['Random Forest'] = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 3, 5],
    'model__criterion': ["gini", "entropy"],
}

pipelines['SVM'] = _build_pipeline(
    SVC(probability=True, random_state=RANDOM_STATE, cache_size=500, class_weight='balanced'),
    scale=True,
)
# One sub-grid per kernel so only the parameters relevant to that kernel are searched.
param_grids['SVM'] = [
    {'model__kernel': ['linear'], 'model__C': [0.1, 1, 10, 100]},
    {'model__kernel': ['poly'], 'model__C': [0.1, 1, 10], 'model__degree': [2, 3, 4], 'model__gamma': ['scale', 'auto']},
    {'model__kernel': ['rbf'], 'model__C': [0.1, 1, 10, 100], 'model__gamma': ['scale', 'auto'] + list(np.logspace(-3, 2, 6))},
]

# LightGBM is currently disabled; uncomment both statements to include it.
# pipelines['LightGBM'] = _build_pipeline(
#     lgb.LGBMClassifier(random_state=RANDOM_STATE, class_weight='balanced', n_jobs=1, verbose=-1)
# )
# param_grids['LightGBM'] = {
#     'model__n_estimators': [100, 200, 500],
#     'model__learning_rate': [0.01, 0.05, 0.1],
#     'model__num_leaves': [20, 31, 40],
#     'model__max_depth': [-1, 10, 20],
#     'model__reg_alpha': [0, 0.01, 0.1],
#     'model__reg_lambda': [0, 0.01, 0.1],
# }

# --- Model Training, Tuning, and Evaluation (Using Pipelines on RFECV features) ---
# For every pipeline: tune with GridSearchCV on the training rows, save the best
# pipeline, then score it once on the held-out test set.
results = {}
best_params = {}
classification_reports_dict = {}
SCORING_METRIC_GRID = RFE_SCORING  # Use same scoring as RFE for grid search consistency
CV_FOLDS_GRID = 5
cv_strategy_grid = StratifiedKFold(n_splits=CV_FOLDS_GRID, shuffle=True, random_state=RANDOM_STATE)

print(f"\n--- Starting Model Training and Evaluation V5.1 (CV strategy={cv_strategy_grid.__class__.__name__}, Optimize for {SCORING_METRIC_GRID}) ---")
print(f"--- Using {len(selected_features_rfe)} features selected by RFECV ---")

X_train_selected = train_df.loc[X_train_initial.index, selected_features_rfe]
# The test matrix must be built here: without it every evaluation below fails
# with a NameError that the broad `except` would hide. +/-inf is mapped to NaN
# so the median imputer inside each pipeline can fill it.
# NOTE(review): no earlier definition of X_test_selected is visible in this
# notebook; this mirrors how X_train_selected is built. Confirm it matches the intent.
X_test_selected = test_df[selected_features_rfe].replace([np.inf, -np.inf], np.nan)

for name, pipeline in pipelines.items():
    print(f"\n{'='*10} Processing Model: {name} {'='*10}")
    print(f"  Running GridSearchCV for {name} pipeline on RFECV features...")
    # Grid search runs single-process for LightGBM and on all cores otherwise.
    gridsearch_n_jobs = 1 if name == 'LightGBM' else -1

    # --- Fit: default parameters when no grid exists, grid search otherwise ---
    if name not in param_grids:
        print(f"  Warning: No parameter grid found for model '{name}'. Fitting pipeline with default parameters.")
        try:
            pipeline.fit(X_train_selected, y_train)
            best_pipeline = pipeline
            best_params[name] = {"parameters": "default"}
        except Exception as e:
            print(f"  Error fitting {name} pipeline with default parameters: {e}. Skipping...")
            continue
    else:
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grids[name],
            cv=cv_strategy_grid,
            scoring=SCORING_METRIC_GRID,
            n_jobs=gridsearch_n_jobs,
            verbose=1,
            refit=True,
        )
        try:
            grid_search.fit(X_train_selected, y_train)
            best_pipeline = grid_search.best_estimator_
            best_params[name] = grid_search.best_params_
            print(f"  Best CV Score ({SCORING_METRIC_GRID}): {grid_search.best_score_:.4f}")
            print(f"  Best Parameters for {name}: {best_params[name]}")
        except Exception as e:
            print(f"  Error during GridSearchCV for {name} pipeline: {e}. Skipping...")
            continue

    pipeline_filename = os.path.join(MODEL_DIR, f"{name}_best_pipeline_v5.pkl")
    joblib.dump(best_pipeline, pipeline_filename)

    # --- Evaluate on the held-out test set ---
    try:
        y_pred = best_pipeline.predict(X_test_selected)

        # Column 1 of predict_proba is used as the score for AUC.
        y_pred_prob = None
        final_estimator = best_pipeline.steps[-1][1]
        if hasattr(final_estimator, "predict_proba"):
            try:
                y_pred_prob = best_pipeline.predict_proba(X_test_selected)[:, 1]
            except Exception:
                y_pred_prob = None

        test_acc = accuracy_score(y_test, y_pred)
        test_mcc = matthews_corrcoef(y_test, y_pred)

        # AUC needs both classes present and finite scores; otherwise it stays NaN.
        test_auc = np.nan
        if y_pred_prob is not None:
            try:
                if len(np.unique(y_test)) > 1 and np.all(np.isfinite(y_pred_prob)):
                    test_auc = roc_auc_score(y_test, y_pred_prob)
            except Exception:
                test_auc = np.nan

        results[name] = {
            "Test Accuracy": round(test_acc, 4),
            "Test MCC": round(test_mcc, 4),
            "Test AUC": round(test_auc, 4) if not np.isnan(test_auc) else "N/A",
        }
        print("\n  Test Set Performance:")
        print(f"    Accuracy: {results[name]['Test Accuracy']}")
        print(f"    MCC: {results[name]['Test MCC']}")
        print(f"    AUC: {results[name]['Test AUC']}")

        # Per-class report, kept in memory and written to CSV.
        report = classification_report(
            y_test, y_pred,
            target_names=["Inactive", "Active"],
            output_dict=True,
            zero_division=0,
        )
        classification_reports_dict[name] = report
        report_df = pd.DataFrame(report).transpose()
        report_csv_path = os.path.join(CSV_DIR, f"{name}_classification_report_v5.csv")
        report_df.to_csv(report_csv_path)

        # Confusion-matrix heatmap; the "_v5" suffix is what the combining step globs for.
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(5, 4))
        sns.heatmap(
            cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Predicted Inactive", "Predicted Active"],
            yticklabels=["Actual Inactive", "Actual Active"],
        )
        plt.xlabel("Predicted Label")
        plt.ylabel("Actual Label")
        plt.title(f"Confusion Matrix - {name}")
        plt.tight_layout()
        cm_fig_path = os.path.join(FIGURES_DIR, f"{name}_confusion_matrix_v5.png")
        plt.savefig(cm_fig_path, dpi=150)
        plt.close()
    except Exception as e:
        # Best effort: a failure for one model must not stop the remaining models.
        print(f"  Error during evaluation for {name}: {e}")


print("\n--- Model Training and Evaluation Complete ---")

# --- Create Performance Comparison Plot ---
# Grouped bar chart of the test-set metrics for every model with a numeric MCC.
print("\nGenerating model performance comparison plot...")
valid_results = {k: v for k, v in results.items() if isinstance(v.get("Test MCC"), (int, float, np.number))}
if not valid_results:
    print("  No valid model results found to generate comparison plot.")
else:
    results_df = (
        pd.DataFrame.from_dict(valid_results, orient="index")
        .reset_index()
        .rename(columns={"index": "Model"})
    )
    # "N/A" marks a missing AUC; convert it to NaN and make every metric numeric.
    results_plot_df = results_df.replace("N/A", np.nan)
    for col in ["Test Accuracy", "Test MCC", "Test AUC"]:
        if col in results_plot_df.columns:
            results_plot_df[col] = pd.to_numeric(results_plot_df[col], errors='coerce')

    if results_plot_df.drop(columns="Model", errors='ignore').isnull().all().all():
        print("  Warning: No valid numerical scores.")
    else:
        try:
            results_long = (
                results_plot_df
                .melt(id_vars="Model", var_name="Metric", value_name="Score")
                .dropna(subset=['Score'])
            )
            if results_long.empty:
                print("  Warning: No non-NaN scores.")
            else:
                plt.figure(figsize=(12, 7))
                sns.barplot(data=results_long, x="Model", y="Score", hue="Metric", palette="viridis")
                plt.xlabel("Models")
                plt.ylabel("Performance Score (Test Set)")
                plt.title(f"Model Performance Comparison V5.1 (Optimized for {SCORING_METRIC_GRID})")
                plt.xticks(rotation=45, ha='right')

                min_score = results_long["Score"].min()
                max_score = results_long["Score"].max()
                # MCC lies in [-1, 1], so extend the axis below zero whenever an
                # MCC metric is plotted. The metric is labelled "Test MCC", so a
                # substring match is needed; exact membership of "MCC" never matches.
                has_mcc = results_long["Metric"].str.contains("MCC").any()
                y_bottom = min(
                    -1.05 if has_mcc else 0,
                    min_score - 0.1 if pd.notna(min_score) else 0,
                )
                y_top = max(1.05, max_score + 0.05 if pd.notna(max_score) else 1.05)
                plt.ylim(bottom=y_bottom, top=y_top)

                plt.legend(title="Metrics", bbox_to_anchor=(1.02, 1), loc='upper left')
                plt.grid(axis="y", linestyle="--", alpha=0.7)
                plt.tight_layout(rect=[0, 0, 0.9, 1])
                comparison_fig_path = os.path.join(FIGURES_DIR, "model_performance_comparison_v5.1.png")
                plt.savefig(comparison_fig_path, dpi=300, bbox_inches='tight')
                print(f"  Model comparison figure saved: {comparison_fig_path}")
                plt.close()
        except Exception as e:
            print(f"  Error generating model comparison plot: {e}")


# --- Combine Confusion Matrices ---
# Tile every per-model confusion-matrix PNG into one three-column grid image.
print("\nCombining confusion matrices into a single grid...")
glob_pattern = os.path.join(FIGURES_DIR, "*_confusion_matrix_v5.png")  # suffix written by the model loop
image_paths = sorted(glob.glob(glob_pattern))
if image_paths:
    try:
        images = [Image.open(path) for path in image_paths]
        if images:
            # The size of the first image defines the size of every grid cell.
            width, height = images[0].size
            cols = 3
            rows = (len(images) + cols - 1) // cols  # ceiling division
            combined_image = Image.new('RGB', (cols * width, rows * height), color='white')
            print(f"  Creating a {cols}x{rows} grid for {len(images)} images...")
            for index, img in enumerate(images):
                # Fill the grid row by row, left to right.
                grid_row, grid_col = divmod(index, cols)
                combined_image.paste(img, (grid_col * width, grid_row * height))
                img.close()
            combined_image_path = os.path.join(FIGURES_DIR, "combined_confusion_matrices_v5.1.png")
            combined_image.save(combined_image_path)
            print(f"  Combined confusion matrix grid saved: {combined_image_path}")
        else:
            print("  Could not load any images.")
    except Exception as e:
        print(f"  Error combining confusion matrix images: {e}")
else:
    print(f"  Warning: No confusion matrix images found matching pattern '{glob_pattern}'.")


# --- Save Overall Results ---
# One summary row per model: test metrics, best hyperparameters, features used.
print("\nSaving overall performance summary...")
features_used_str = str(selected_features_rfe) if 'selected_features_rfe' in locals() else "Error in RFE"

# Models without a numeric test MCC are reported as failed or skipped.
feature_list_map = {}
for name in pipelines.keys():
    if name in results and isinstance(results[name].get("Test MCC"), (int, float, np.number)):
        feature_list_map[name] = features_used_str
    else:
        feature_list_map[name] = "N/A (Model Failed or Skipped)"
features_df = pd.DataFrame.from_dict(feature_list_map, orient='index', columns=['Features Used (RFECV)'])
results_summary_df = pd.DataFrame.from_dict(results, orient="index")

# Strip the pipeline step prefix ("model__") from tuned parameter names.
# Taking [-1] keeps a key intact when it contains no "__".
cleaned_best_params = {}
for name, params in best_params.items():
    if isinstance(params, dict) and "parameters" not in params:
        cleaned_params = {k.split('__', 1)[-1]: v for k, v in params.items()}
        cleaned_best_params[name] = str(cleaned_params)
    else:
        cleaned_best_params[name] = str(params)
params_summary_df = pd.DataFrame.from_dict(cleaned_best_params, orient='index', columns=['Best Parameters'])

# The outer join keeps models that appear in only some of the three tables.
summary_df = pd.concat([results_summary_df, params_summary_df, features_df], axis=1, join='outer')
summary_csv_path = os.path.join(CSV_DIR, "model_performance_summary_v5.1.csv")
try:
    summary_df.to_csv(summary_csv_path, index_label="Model")
    print(f"Overall performance summary saved: {summary_csv_path}")
except Exception as e:
    print(f"Error saving summary CSV: {e}")

# Display final summary table (the banner now names the RFECV V5.1 run).
print(f"\n--- Final Model Performance Summary (Test Set - RFECV Features V5.1 - Optimized for {SCORING_METRIC_GRID}) ---")
try:
    from IPython.display import display
    # Limit displayed floats to four decimals without changing the stored values.
    with pd.option_context('display.precision', 4):
        display(summary_df)
except ImportError:
    # Plain-text fallback when IPython is not available.
    print(summary_df.to_string())

print("\nScript execution finished.")


Defining pipelines and hyperparameter grids using RFECV features...

--- Starting Model Training and Evaluation V5.1 (CV strategy=StratifiedKFold, Optimize for matthews_corrcoef) ---
--- Using 25 features selected by RFECV ---

  Running GridSearchCV for kNN pipeline on RFECV features...
Fitting 5 folds for each of 14 candidates, totalling 70 fits
  Best CV Score (matthews_corrcoef): 0.2271
  Best Parameters for kNN: {'model__n_neighbors': 9}

  Test Set Performance:
    Accuracy: 0.5529
    MCC: 0.1061
    AUC: 0.5649

  Running GridSearchCV for MLP pipeline on RFECV features...
Fitting 5 folds for each of 84 candidates, totalling 420 fits
  Best CV Score (matthews_corrcoef): 0.2636
  Best Parameters for MLP: {'model__activation': 'tanh', 'model__alpha': 0.0001, 'model__hidden_layer_sizes': (100, 50), 'model__learning_rate': 'constant', 'model__max_iter': 1500}

  Test Set Performance:
    Accuracy: 0.5118
    MCC: 0.0269
    AUC: 0.5077

  Running GridSearchCV for Naive Bayes pipeli

Unnamed: 0,Test Accuracy,Test MCC,Test AUC,Best Parameters,Features Used (RFECV)
kNN,0.5529,0.1061,0.5649,{'n_neighbors': 9},"['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."
MLP,0.5118,0.0269,0.5077,"{'activation': 'tanh', 'alpha': 0.0001, 'hidde...","['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."
Naive Bayes,0.4941,-0.012,0.4994,{'var_smoothing': 1.9306977288832496e-08},"['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."
Decision Tree,0.4824,-0.0354,0.4929,"{'criterion': 'entropy', 'max_depth': None, 'm...","['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."
Random Forest,0.4882,-0.0236,0.501,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...","['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."
SVM,0.5706,0.1413,0.5487,"{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}","['AATS7p', 'ATSC8m', 'ATSC3p', 'ATSC4s', 'AATS..."



Script execution finished.
