<a href="https://colab.research.google.com/github/muajnstu/CAST/blob/main/CAST_full_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Core ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.metrics import (
    accuracy_score, classification_report,
    matthews_corrcoef, roc_auc_score
)
from imblearn.metrics import geometric_mean_score

# Classifiers
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier,
    GradientBoostingClassifier, VotingClassifier, StackingClassifier
)
from sklearn.linear_model import (
    LogisticRegression, RidgeClassifier, Perceptron,
    SGDClassifier, PassiveAggressiveClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Balancing
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek


# optimization
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFECV

In [15]:
df = pd.read_csv(
    'https://raw.githubusercontent.com/muajnstu/CAST/refs/heads/main/Perception/Perception_1.csv'
)
print(f"Raw dataset shape : {df.shape}")
print(f"Target distribution:\n{df['CAST'].value_counts()}\n")

X_raw = df.drop(columns=['CAST'])
y_raw = df['CAST']

Raw dataset shape : (400, 47)
Target distribution:
CAST
0    218
1    110
2     72
Name: count, dtype: int64



***Data Balancing***

In [16]:

BALANCERS = {
    "B1_RandomOverSampling":  RandomOverSampler(random_state=42),
    "B2_RandomUnderSampling": RandomUnderSampler(random_state=42),
    "B3_ADASYN":              ADASYN(random_state=42),
    "B4_SMOTE":               SMOTE(random_state=42),
    "B5_BorderlineSMOTE":     BorderlineSMOTE(random_state=42),
    "B6_TomekLinks":          TomekLinks(),
    "B7_SMOTETomek":          SMOTETomek(random_state=42),
}


def apply_balancer(name, sampler, X, y):
    try:
        X_b, y_b = sampler.fit_resample(X, y)
        df_b = pd.DataFrame(X_b, columns=X.columns)
        df_b['CAST'] = y_b
        print(f"  {name:30s} -> shape {df_b.shape}  "
              f"dist: {dict(pd.Series(y_b).value_counts())}")
        return df_b
    except Exception as e:
        print(f"  {name:30s} -> FAILED: {e}")
        return None

***Tunning FS Methods***

In [17]:
CV      = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
CV_EST  = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
N_TOTAL = X_raw.shape[1]

def _cv_score(X_sel, y):
    """Mean 5-fold CV accuracy for a given feature subset."""
    return cross_val_score(CV_EST, X_sel, y, cv=CV,
                           scoring='accuracy', n_jobs=-1).mean()

def section_header(title):
    print("\n" + "#"*80)
    print(f"  {title}")
    print("#"*80)

def sub_header(title):
    print(f"\n{'='*70}")
    print(f"  {title}")
    print("="*70)

# Tune FS1
print("\n" + "="*70)
print("  TUNING FEATURE SELECTORS ON RAW DATA (5-fold CV)")
print("="*70)

print("\n[FS1] Sweeping correlation threshold...")
thresholds   = [0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
fs1_results  = {}
for thr in thresholds:
    corr  = X_raw.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    drop  = [c for c in upper.columns if any(upper[c] >= thr)]
    X_sel = X_raw.drop(columns=drop)
    score = _cv_score(X_sel, y_raw)
    fs1_results[thr] = (score, X_sel.shape[1])
    print(f"  threshold={thr:.2f}  features={X_sel.shape[1]:3d}  CV_acc={score:.4f}")

best_fs1_thr   = max(fs1_results, key=lambda t: fs1_results[t][0])
best_fs1_score = fs1_results[best_fs1_thr][0]
best_fs1_feats = fs1_results[best_fs1_thr][1]
print(f"  >> BEST threshold={best_fs1_thr}  features={best_fs1_feats}  CV_acc={best_fs1_score:.4f}")

# Tune FS2: KBest (f_classif)
print("\n[FS2] Sweeping k for SelectKBest (f_classif)...")
k_range     = list(range(1, N_TOTAL + 1))
fs2_results = {}
for k in k_range:
    sel   = SelectKBest(f_classif, k=k).fit(X_raw, y_raw)
    X_sel = X_raw.iloc[:, sel.get_support()]
    score = _cv_score(X_sel, y_raw)
    fs2_results[k] = score
    print(f"  k={k:3d}  CV_acc={score:.4f}")

best_fs2_k     = max(fs2_results, key=fs2_results.get)
best_fs2_score = fs2_results[best_fs2_k]
print(f"  >> BEST k={best_fs2_k}  CV_acc={best_fs2_score:.4f}")

# Tune FS3: RFE
print("\n[FS3] Running RFECV (RFE with built-in CV) to find optimal n_features...")
rfecv = RFECV(
    estimator=RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
    step=1, cv=CV, scoring='accuracy', n_jobs=-1, min_features_to_select=1
)
rfecv.fit(X_raw, y_raw)
best_fs3_n     = rfecv.n_features_
best_fs3_score = rfecv.cv_results_['mean_test_score'][best_fs3_n - 1]
_rfecv_support = rfecv.support_
print(f"  >> BEST n_features={best_fs3_n}  CV_acc={best_fs3_score:.4f}")

# Tune FS4: Mutual Information
print("\n[FS4] Sweeping k for SelectKBest (mutual_info_classif)...")
fs4_results = {}
for k in k_range:
    sel   = SelectKBest(mutual_info_classif, k=k).fit(X_raw, y_raw)
    X_sel = X_raw.iloc[:, sel.get_support()]
    score = _cv_score(X_sel, y_raw)
    fs4_results[k] = score
    print(f"  k={k:3d}  CV_acc={score:.4f}")

best_fs4_k     = max(fs4_results, key=fs4_results.get)
best_fs4_score = fs4_results[best_fs4_k]
print(f"  >> BEST k={best_fs4_k}  CV_acc={best_fs4_score:.4f}")

# Tune FS5: RF Importance cumulative threshold
print("\n[FS5] Sweeping cumulative importance threshold for RF feature importance...")
_rf_imp  = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
_rf_imp.fit(X_raw, y_raw)
imp_series = pd.Series(_rf_imp.feature_importances_, index=X_raw.columns)\
               .sort_values(ascending=False)
cum_imp = imp_series.cumsum()

cum_thresholds = [0.80, 0.85, 0.90, 0.95, 0.99]
fs5_results    = {}
for ct in cum_thresholds:
    top_feats = cum_imp[cum_imp <= ct].index.tolist()
    if len(top_feats) == 0:
        top_feats = [imp_series.index[0]]
    X_sel = X_raw[top_feats]
    score = _cv_score(X_sel, y_raw)
    fs5_results[ct] = (score, len(top_feats))
    print(f"  cum_threshold={ct:.2f}  features={len(top_feats):3d}  CV_acc={score:.4f}")

best_fs5_ct    = max(fs5_results, key=lambda t: fs5_results[t][0])
best_fs5_score = fs5_results[best_fs5_ct][0]
best_fs5_feats = fs5_results[best_fs5_ct][1]
print(f"  >> BEST cum_threshold={best_fs5_ct}  features={best_fs5_feats}  CV_acc={best_fs5_score:.4f}")

# Summary of tuned parameters
print("\n" + "="*70)
print("  TUNING SUMMARY")
print("="*70)
tuning_summary = pd.DataFrame([
    {"FS": "FS1_Correlation",  "Parameter": "threshold",        "Best_Value": best_fs1_thr,  "n_features": best_fs1_feats, "CV_Accuracy": round(best_fs1_score, 4)},
    {"FS": "FS2_KBest",        "Parameter": "k",                "Best_Value": best_fs2_k,    "n_features": best_fs2_k,     "CV_Accuracy": round(best_fs2_score, 4)},
    {"FS": "FS3_RFE",          "Parameter": "n_features (RFECV)","Best_Value": best_fs3_n,   "n_features": best_fs3_n,     "CV_Accuracy": round(best_fs3_score, 4)},
    {"FS": "FS4_MutualInfo",   "Parameter": "k",                "Best_Value": best_fs4_k,    "n_features": best_fs4_k,     "CV_Accuracy": round(best_fs4_score, 4)},
    {"FS": "FS5_RFImportance", "Parameter": "cum_threshold",    "Best_Value": best_fs5_ct,   "n_features": best_fs5_feats, "CV_Accuracy": round(best_fs5_score, 4)},
])
print(tuning_summary.to_string(index=False))



  TUNING FEATURE SELECTORS ON RAW DATA (5-fold CV)

[FS1] Sweeping correlation threshold...
  threshold=0.70  features= 14  CV_acc=0.6575
  threshold=0.75  features= 21  CV_acc=0.6450
  threshold=0.80  features= 38  CV_acc=0.6575
  threshold=0.85  features= 43  CV_acc=0.6575
  threshold=0.90  features= 46  CV_acc=0.6450
  threshold=0.95  features= 46  CV_acc=0.6450
  >> BEST threshold=0.7  features=14  CV_acc=0.6575

[FS2] Sweeping k for SelectKBest (f_classif)...
  k=  1  CV_acc=0.6175
  k=  2  CV_acc=0.6300
  k=  3  CV_acc=0.6275
  k=  4  CV_acc=0.6450
  k=  5  CV_acc=0.6425
  k=  6  CV_acc=0.6100
  k=  7  CV_acc=0.6100
  k=  8  CV_acc=0.6500
  k=  9  CV_acc=0.6350
  k= 10  CV_acc=0.6400
  k= 11  CV_acc=0.6225
  k= 12  CV_acc=0.6225
  k= 13  CV_acc=0.6250
  k= 14  CV_acc=0.6325
  k= 15  CV_acc=0.6425
  k= 16  CV_acc=0.6200
  k= 17  CV_acc=0.6325
  k= 18  CV_acc=0.6500
  k= 19  CV_acc=0.6225
  k= 20  CV_acc=0.6150
  k= 21  CV_acc=0.6225
  k= 22  CV_acc=0.6325
  k= 23  CV_acc=0.6350
 

***FS Methods***

In [18]:
def fs_correlation(data, target_col='CAST', threshold=best_fs1_thr):
    X_ = data.drop(columns=[target_col]); y_ = data[target_col]
    corr  = X_.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    drop  = [c for c in upper.columns if any(upper[c] >= threshold)]
    out   = X_.drop(columns=drop).copy(); out[target_col] = y_.values
    return out

def fs_kbest(data, target_col='CAST', k=best_fs2_k):
    X_ = data.drop(columns=[target_col]); y_ = data[target_col]
    sel = SelectKBest(f_classif, k=min(k, X_.shape[1])).fit(X_, y_)
    out = X_[X_.columns[sel.get_support()]].copy(); out[target_col] = y_.values
    return out

def fs_rfe(data, target_col='CAST', support=_rfecv_support):

    X_ = data.drop(columns=[target_col]); y_ = data[target_col]
    cols = X_.columns[support] if len(support) == X_.shape[1] else X_.columns
    out  = X_[cols].copy(); out[target_col] = y_.values
    return out

def fs_mi(data, target_col='CAST', k=best_fs4_k):
    X_ = data.drop(columns=[target_col]); y_ = data[target_col]
    sel = SelectKBest(mutual_info_classif, k=min(k, X_.shape[1])).fit(X_, y_)
    out = X_[X_.columns[sel.get_support()]].copy(); out[target_col] = y_.values
    return out

def fs_rf_importance(data, target_col='CAST', cum_threshold=best_fs5_ct):
    X_ = data.drop(columns=[target_col]); y_ = data[target_col]
    rf  = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_, y_)
    imp = pd.Series(rf.feature_importances_, index=X_.columns).sort_values(ascending=False)
    top = imp.cumsum()[imp.cumsum() <= cum_threshold].index.tolist()
    if len(top) == 0:
        top = [imp.index[0]]
    out = X_[top].copy(); out[target_col] = y_.values
    return out

FEATURE_SELECTORS = {
    "FS1_Correlation":  lambda d: fs_correlation(d),
    "FS2_KBest":        lambda d: fs_kbest(d),
    "FS3_RFE":          lambda d: fs_rfe(d),
    "FS4_MutualInfo":   lambda d: fs_mi(d),
    "FS5_RFImportance": lambda d: fs_rf_importance(d),
}


***Evaluation Metrics***

In [19]:
def evaluate_dataset(data, target_col='CAST', label="", verbose=True):
    X_ = data.drop(columns=[target_col])
    y_ = data[target_col]
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_, y_, test_size=0.25, random_state=42, stratify=y_
    )
    rows = []
    for clf_name, model in build_models().items():
        try:
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_te)

            # ── Core metrics
            acc     = accuracy_score(y_te, y_pred)
            rep     = classification_report(y_te, y_pred, zero_division=0,
                                            output_dict=True)
            mcc     = matthews_corrcoef(y_te, y_pred)
            gmean   = geometric_mean_score(y_te, y_pred, average='macro')

            # ── AUC:
            n_classes = len(np.unique(y_te))
            try:
                if hasattr(model, 'predict_proba'):
                    y_score = model.predict_proba(X_te)
                else:
                    y_score = model.decision_function(X_te)
                    if n_classes > 2:
                        # decision_function for multiclass → already (n, k)
                        pass
                    else:
                        y_score = y_score.reshape(-1, 1)

                if n_classes == 2:
                    # Binary: use probability of positive class
                    prob = y_score[:, 1] if y_score.ndim == 2 else y_score
                    auc = roc_auc_score(y_te, prob)
                else:
                    auc = roc_auc_score(
                        y_te, y_score,
                        multi_class='ovr', average='macro',
                        labels=np.unique(y_te)
                    )
            except Exception:
                auc = None

            rows.append({
                'Label':           label,
                'Classifier':      clf_name,
                'Accuracy':        round(acc,   4),
                'Macro_F1':        round(rep['macro avg']['f1-score'],   4),
                'Macro_Precision': round(rep['macro avg']['precision'],  4),
                'Macro_Recall':    round(rep['macro avg']['recall'],     4),
                'MCC':             round(mcc,   4),
                'G_Mean':          round(gmean, 4),
                'AUC':             round(auc,   4) if auc is not None else None,
            })
            if verbose:
                auc_str = f"{auc:.4f}" if auc is not None else "  N/A"
                print(f"    {clf_name:25s}  Acc={acc:.4f}  F1={rep['macro avg']['f1-score']:.4f}"
                      f"  MCC={mcc:.4f}  G-Mean={gmean:.4f}  AUC={auc_str}")
        except Exception as e:
            rows.append({'Label': label, 'Classifier': clf_name,
                         'Accuracy': None, 'Macro_F1': None,
                         'Macro_Precision': None, 'Macro_Recall': None,
                         'MCC': None, 'G_Mean': None, 'AUC': None})
            if verbose:
                print(f"    {clf_name:25s}  ERROR: {e}")
    return rows


def section_header(title):
    print("\n" + "#"*80)
    print(f"  {title}")
    print("#"*80)

def sub_header(title):
    print(f"\n{'='*70}")
    print(f"  {title}")
    print("="*70)

***ML Models***

In [20]:
def build_models():
    return {
        "RandomForest":       RandomForestClassifier(random_state=42),
        "ExtraTrees":         ExtraTreesClassifier(random_state=42),
        "Bagging":            BaggingClassifier(random_state=42),
        "GradientBoosting":   GradientBoostingClassifier(random_state=42),
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
        "RidgeClassifier":    RidgeClassifier(),
        "DecisionTree":       DecisionTreeClassifier(random_state=42),
        "NaiveBayes":         GaussianNB(),
        "Perceptron":         Perceptron(random_state=42),
        "SGDClassifier":      SGDClassifier(random_state=42),
        "PassiveAggressive":  PassiveAggressiveClassifier(random_state=42),
        "LDA":                LinearDiscriminantAnalysis(),
        "QDA":                QuadraticDiscriminantAnalysis(),
        "LightGBM":           LGBMClassifier(verbosity=-1, random_state=42),
        "XGBoost":            XGBClassifier(n_estimators=100, random_state=42,
                                            eval_metric='mlogloss',
                                            use_label_encoder=False, verbosity=0),
        "VotingSoft": VotingClassifier(estimators=[
            ('rf',   RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb',   GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('et',   ExtraTreesClassifier(n_estimators=100, random_state=42)),
            ('lgbm', LGBMClassifier(verbosity=-1, n_estimators=100, random_state=42)),
        ], voting='soft', n_jobs=-1),
        "VotingHard": VotingClassifier(estimators=[
            ('rf',   RandomForestClassifier(n_estimators=100, random_state=42)),
            ('et',   ExtraTreesClassifier(n_estimators=100, random_state=42)),
            ('lgbm', LGBMClassifier(verbosity=-1, n_estimators=100, random_state=42)),
        ], voting='hard', n_jobs=-1),
        "Stacking": StackingClassifier(estimators=[
            ('rf',   RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb',   GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('et',   ExtraTreesClassifier(n_estimators=100, random_state=42)),
            ('lgbm', LGBMClassifier(verbosity=-1, n_estimators=100, random_state=42)),
        ], final_estimator=LogisticRegression(max_iter=1000, random_state=42),
           cv=5, stack_method='predict_proba', n_jobs=-1),
    }

# ***# Experiment 1***

In [21]:
section_header("PHASE 1 — 7 Balancing Variants  (all raw features, no feature selection)")

print("\nApplying balancers to raw data...")
balanced_datasets = {}
for bal_name, sampler in BALANCERS.items():
    df_b = apply_balancer(bal_name, sampler, X_raw, y_raw)
    if df_b is not None:
        balanced_datasets[bal_name] = df_b

phase1_rows = []
for bal_name, bal_df in balanced_datasets.items():
    sub_header(f"Balancer: {bal_name}  |  shape={bal_df.shape}")
    rows = evaluate_dataset(bal_df, label=bal_name, verbose=True)
    phase1_rows.extend(rows)

phase1_df = pd.DataFrame(phase1_rows).dropna(subset=['Accuracy'])
phase1_df['Phase'] = 'Phase1_BalancingOnly'
phase1_df.rename(columns={'Label': 'Variant'}, inplace=True)

print("\n\n-- Phase 1 Summary (top 20 by Accuracy) --")
p1_show = phase1_df.sort_values('Accuracy', ascending=False).rename(
    columns={'Variant': 'Balancer'})
METRICS = ['Accuracy','Macro_F1','MCC','G_Mean','AUC']
print(p1_show[['Balancer','Classifier'] + METRICS].head(20).to_string(index=False))

print("\n-- Phase 1: Best Classifier per Balancer --")
best_p1 = (p1_show.groupby('Balancer', group_keys=False)
           .apply(lambda g: g.nlargest(1, 'Accuracy'))
           [['Balancer','Classifier'] + METRICS])
print(best_p1.to_string(index=False))


################################################################################
  PHASE 1 — 7 Balancing Variants  (all raw features, no feature selection)
################################################################################

Applying balancers to raw data...
  B1_RandomOverSampling          -> shape (654, 47)  dist: {0: np.int64(218), 2: np.int64(218), 1: np.int64(218)}
  B2_RandomUnderSampling         -> shape (216, 47)  dist: {0: np.int64(72), 1: np.int64(72), 2: np.int64(72)}
  B3_ADASYN                      -> shape (655, 47)  dist: {1: np.int64(224), 0: np.int64(218), 2: np.int64(213)}
  B4_SMOTE                       -> shape (654, 47)  dist: {0: np.int64(218), 2: np.int64(218), 1: np.int64(218)}
  B5_BorderlineSMOTE             -> shape (654, 47)  dist: {0: np.int64(218), 2: np.int64(218), 1: np.int64(218)}
  B6_TomekLinks                  -> shape (369, 47)  dist: {0: np.int64(202), 1: np.int64(95), 2: np.int64(72)}
  B7_SMOTETomek                  -> shape (642, 

# ***Experiment 2***

In [22]:
section_header("PHASE 2 — 5 Feature Selection Variants  (raw unbalanced data, no balancing)")

phase2_rows = []
for fs_name, fs_fn in FEATURE_SELECTORS.items():
    try:
        fs_df = fs_fn(df)
        sub_header(f"Feature Selector: {fs_name}  |  features={fs_df.shape[1]-1}")
        rows = evaluate_dataset(fs_df, label=fs_name, verbose=True)
        phase2_rows.extend(rows)
    except Exception as e:
        print(f"  {fs_name} FAILED: {e}")

phase2_df = pd.DataFrame(phase2_rows).dropna(subset=['Accuracy'])
phase2_df['Phase'] = 'Phase2_FeatureSelectionOnly'
phase2_df.rename(columns={'Label': 'Variant'}, inplace=True)

print("\n\n-- Phase 2 Summary (top 20 by Accuracy) --")
p2_show = phase2_df.sort_values('Accuracy', ascending=False).rename(
    columns={'Variant': 'FeatureSelector'})
print(p2_show[['FeatureSelector','Classifier'] + METRICS].head(20).to_string(index=False))

print("\n-- Phase 2: Best Classifier per Feature Selector --")
best_p2 = (p2_show.groupby('FeatureSelector', group_keys=False)
           .apply(lambda g: g.nlargest(1, 'Accuracy'))
           [['FeatureSelector','Classifier'] + METRICS])
print(best_p2.to_string(index=False))


################################################################################
  PHASE 2 — 5 Feature Selection Variants  (raw unbalanced data, no balancing)
################################################################################

  Feature Selector: FS1_Correlation  |  features=14
    RandomForest               Acc=0.7000  F1=0.6399  MCC=0.4749  G-Mean=0.7146  AUC=0.8280
    ExtraTrees                 Acc=0.6600  F1=0.5694  MCC=0.4034  G-Mean=0.6706  AUC=0.8236
    Bagging                    Acc=0.6500  F1=0.5550  MCC=0.3780  G-Mean=0.6544  AUC=0.7862
    GradientBoosting           Acc=0.6400  F1=0.5940  MCC=0.3889  G-Mean=0.6826  AUC=0.8077
    LogisticRegression         Acc=0.6900  F1=0.6102  MCC=0.4578  G-Mean=0.6999  AUC=0.7998
    RidgeClassifier            Acc=0.6200  F1=0.4970  MCC=0.3184  G-Mean=0.6191  AUC=  N/A
    DecisionTree               Acc=0.5800  F1=0.5060  MCC=0.2940  G-Mean=0.6253  AUC=0.6225
    NaiveBayes                 Acc=0.6100  F1=0.5371  MCC=0.342

# ***Experiment 3***

In [23]:
section_header("PHASE 3 — Balancing x Feature Selection  (7 x 5 = 35 combos)")

phase3_rows = []
for bal_name, bal_df in balanced_datasets.items():
    for fs_name, fs_fn in FEATURE_SELECTORS.items():
        combo = f"{bal_name} + {fs_name}"
        try:
            fs_df = fs_fn(bal_df)
            sub_header(f"Combo: {combo}  |  shape={fs_df.shape}")
            rows = evaluate_dataset(fs_df, label=combo, verbose=True)
            phase3_rows.extend(rows)
        except Exception as e:
            print(f"  {combo} FAILED: {e}")

phase3_df = pd.DataFrame(phase3_rows).dropna(subset=['Accuracy'])
phase3_df['Phase'] = 'Phase3_Balancing+FS'
phase3_df.rename(columns={'Label': 'Variant'}, inplace=True)

print("\n\n-- Phase 3 Summary (top 30 combos by Accuracy) --")
p3_show = phase3_df.sort_values('Accuracy', ascending=False).rename(
    columns={'Variant': 'Balancer+FS'})
print(p3_show[['Balancer+FS','Classifier'] + METRICS].head(30).to_string(index=False))

print("\n-- Phase 3: Best Classifier per Balancer+FS combo --")
best_p3 = (p3_show.groupby('Balancer+FS', group_keys=False)
           .apply(lambda g: g.nlargest(1, 'Accuracy'))
           [['Balancer+FS','Classifier'] + METRICS]
           .sort_values('Accuracy', ascending=False))
print(best_p3.to_string(index=False))


################################################################################
  PHASE 3 — Balancing x Feature Selection  (7 x 5 = 35 combos)
################################################################################

  Combo: B1_RandomOverSampling + FS1_Correlation  |  shape=(654, 14)
    RandomForest               Acc=0.8659  F1=0.8622  MCC=0.8021  G-Mean=0.8983  AUC=0.9719
    ExtraTrees                 Acc=0.8659  F1=0.8624  MCC=0.8020  G-Mean=0.8983  AUC=0.9734
    Bagging                    Acc=0.8415  F1=0.8365  MCC=0.7716  G-Mean=0.8795  AUC=0.9588
    GradientBoosting           Acc=0.7561  F1=0.7538  MCC=0.6350  G-Mean=0.8144  AUC=0.9245
    LogisticRegression         Acc=0.5549  F1=0.5506  MCC=0.3332  G-Mean=0.6572  AUC=0.8013
    RidgeClassifier            Acc=0.5610  F1=0.5318  MCC=0.3529  G-Mean=0.6624  AUC=  N/A
    DecisionTree               Acc=0.7988  F1=0.7916  MCC=0.7048  G-Mean=0.8469  AUC=0.8500
    NaiveBayes                 Acc=0.6098  F1=0.6008  MCC=0.4

# ***Summary***

In [None]:
out = '/mnt/user-data/outputs/'
combined = pd.concat([phase1_df, phase2_df, phase3_df], ignore_index=True)
combined.sort_values(['Phase','Accuracy'], ascending=[True, False], inplace=True)

combined.to_csv(out + 'pipeline_all_results.csv', index=False)
phase1_df.to_csv(out + 'phase1_balancing_results.csv', index=False)
phase2_df.to_csv(out + 'phase2_feature_selection_results.csv', index=False)
phase3_df.to_csv(out + 'phase3_combined_results.csv', index=False)

print("\n\n" + "="*80)
print("  DONE — 4 CSV files saved:")
print("  pipeline_all_results.csv             (all 3 phases combined)")
print("  phase1_balancing_results.csv         (Phase 1 only)")
print("  phase2_feature_selection_results.csv (Phase 2 only)")
print("  phase3_combined_results.csv          (Phase 3 only)")
print("="*80)