In [1]:
# ===================== Jupyter cell: remove selected models and save as AUC_* =====================
# What this cell does:
# - Given a path to a folder OR a single CSV file with model results (NOT plots),
#   it removes the following models wherever they appear:
#       ["KNN", "NaiveBayes", "NiaveBayes", "LogisticRegression", "SVM_linear"]
# - It handles two file structures:
#   (A) Main per-k CSV: columns = ["k", <model1>, <model2>, ...]
#       -> drops the DISALLOWED model columns if present.
#   (B) Bootstrap CSV: ends with "_bootstrap.csv" OR has a "model" column with rows:
#       columns include ["k","model","mean_auc_boot","ci95_low","ci95_high", ...]
#       -> filters out rows whose "model" is in DISALLOWED.
# - Writes cleaned copies with the SAME file name prefixed by "AUC_" in the SAME folder.
# - Skips files whose names already start with "AUC_" and files ending in "_top20_features.csv"
#   (both checks are case-insensitive because the file name is lower-cased first).
#
# Usage:
#   clean_auc_results("/path/to/folder_or_file.csv")

import os
import glob
import pandas as pd

# Model names to strip from every results file: matched against column names in
# main per-k CSVs and against the "model" column values in bootstrap CSVs.
# NOTE(review): "NiaveBayes" is presumably a misspelled variant of "NaiveBayes"
# that appears in some result files -- confirm before removing either entry.
DISALLOWED = {"KNN", "NaiveBayes", "NiaveBayes", "LogisticRegression", "SVM_linear"}

def _is_main_perk_csv(df: pd.DataFrame) -> bool:
    """Return True when *df* looks like a main per-k results table.

    This is a heuristic: the frame must contain at least two columns in
    total, one of which is named ``k``.
    """
    n_columns = df.shape[1]
    if n_columns < 2:
        return False
    return "k" in df.columns

def _is_bootstrap_csv(path: str, df: pd.DataFrame) -> bool:
    """Return True when the file should be handled as a bootstrap CSV.

    A file qualifies if its path ends with ``_bootstrap.csv`` (compared
    case-insensitively) or if *df* has a column named ``model``.
    """
    if path.lower().endswith("_bootstrap.csv"):
        return True
    return "model" in df.columns

def _should_ignore_file(basename_lower: str) -> bool:
    """Return True for files that must not be processed.

    *basename_lower* is the lower-cased file name. A file is ignored when it
    already carries the ``auc_`` prefix (i.e. it is a previously cleaned
    output) or when it ends with ``_top20_features.csv``.
    """
    already_prefixed = basename_lower.startswith("auc_")
    feature_listing = basename_lower.endswith("_top20_features.csv")
    return already_prefixed or feature_listing

def _process_main_csv(path: str) -> str:
    """Write a copy of a main per-k CSV without the DISALLOWED model columns.

    Every column whose name is in ``DISALLOWED`` is removed; all other columns
    (``k`` included) are kept untouched. The result is saved next to the
    input as ``AUC_<original file name>`` and that output path is returned.
    """
    table = pd.read_csv(path)

    # Collect the banned columns in their original order so the log line
    # reports exactly what was removed.
    cols_to_drop = []
    for column in table.columns:
        if column in DISALLOWED:
            cols_to_drop.append(column)
    if cols_to_drop:
        table = table.drop(columns=cols_to_drop)

    folder, filename = os.path.split(path)
    out_path = os.path.join(folder, "AUC_" + filename)
    table.to_csv(out_path, index=False)
    print(f"[Main] Saved cleaned -> {out_path} (dropped: {cols_to_drop})")
    return out_path

def _process_bootstrap_csv(path: str) -> str:
    """Write a copy of a bootstrap CSV without rows for DISALLOWED models.

    Rows whose ``model`` value is in ``DISALLOWED`` are removed and the index
    is renumbered. The result is saved next to the input as
    ``AUC_<original file name>`` and that output path is returned. When the
    file has no ``model`` column nothing is written and ``""`` is returned.
    """
    frame = pd.read_csv(path)
    if "model" not in frame.columns:
        print(f"[Bootstrap] 'model' column not found, skipping: {path}")
        return ""

    keep_mask = ~frame["model"].isin(DISALLOWED)
    kept = frame.loc[keep_mask].reset_index(drop=True)
    removed = len(frame) - len(kept)

    folder, filename = os.path.split(path)
    out_path = os.path.join(folder, "AUC_" + filename)
    kept.to_csv(out_path, index=False)
    print(f"[Bootstrap] Saved cleaned -> {out_path} (removed {removed} rows)")
    return out_path

def clean_auc_results(path: str):
    """
    Remove DISALLOWED models from result CSVs and save 'AUC_<original>.csv' copies.

    - `path` may be a directory (every *.csv directly inside it is handled,
      without recursing into sub-folders) or a single CSV file.
    - Files rejected by `_should_ignore_file` are reported and skipped.
    - Each remaining file is classified from its first rows: bootstrap files are
      filtered row-wise, main per-k files column-wise, anything else is left alone.
    - Problems with one file are printed and do not stop the remaining files.
    """
    if not os.path.exists(path):
        print(f"[Error] Path not found: {path}")
        return

    if os.path.isdir(path):
        candidates = sorted(glob.glob(os.path.join(path, "*.csv")))
    else:
        candidates = [path]

    if not candidates:
        print(f"[Info] No CSV files found in: {path}")
        return

    for csv_path in candidates:
        if _should_ignore_file(os.path.basename(csv_path).lower()):
            print(f"[Skip] {csv_path}")
            continue

        # A handful of rows is enough to inspect the column layout.
        try:
            preview = pd.read_csv(csv_path, nrows=5)
        except Exception as e:
            print(f"[Warn] Could not read {csv_path}: {e}")
            continue

        try:
            # The bootstrap check runs first: such files also have a 'k' column
            # and would otherwise be mistaken for main per-k tables.
            if _is_bootstrap_csv(csv_path, preview):
                _process_bootstrap_csv(csv_path)
                continue
            if _is_main_perk_csv(preview):
                _process_main_csv(csv_path)
                continue
            print(f"[Skip] Unrecognized CSV structure (no action): {csv_path}")
        except Exception as e:
            print(f"[Error] Processing failed for {csv_path}: {e}")

In [10]:
# ---------------- Example ----------------
# Folder mode: clean every CSV located directly inside this results directory.
clean_auc_results("/home/77462217B/lois/AAImageneAnomalyDetection/results/heavymodelv1/CVMichaud/RECAll")
# Single-file mode (placeholder path, left disabled):
# clean_auc_results("/ruta/a/tu/Betas3CasFAST.csv")

[Skip] /home/77462217B/lois/AAImageneAnomalyDetection/results/heavymodelv1/CVMichaud/RECAll/AUC_MicCas.csv
[Skip] /home/77462217B/lois/AAImageneAnomalyDetection/results/heavymodelv1/CVMichaud/RECAll/AUC_MicCas_bootstrap.csv
[Skip] /home/77462217B/lois/AAImageneAnomalyDetection/results/heavymodelv1/CVMichaud/RECAll/MicCas_top20_features.csv
