In [1]:
import pandas as pd
import numpy as np
import glob

In [14]:
import pandas as pd
import glob

def save_selected_features(dfs, rel_thresh=0.01, freq_thresh=0.5):
    """
    For each dataset:
      - Keeps features with relative importance >= rel_thresh * max importance (per fold).
      - Aggregates across folds and keeps features appearing in >= freq_thresh fraction of folds.
      - Saves stable features to .txt.

    Parameters
    ----------
    dfs : dict
        Dictionary of dataframes keyed by filename.
    rel_thresh : float
        Relative threshold (fraction of max importance within each fold).
    freq_thresh : float
        Minimum fraction of folds a feature must appear in to be kept.
    """
    for fname, df in dfs.items():
        if "Fold" not in df.columns:
            raise ValueError(f"{fname} needs a 'Fold' column for cross-validation info")

        stable_features = []

        # Check across folds
        folds = df["Fold"].unique()
        feat_counts = {}

        for fold in folds:
            fold_df = df[df["Fold"] == fold]
            max_imp = fold_df["Importance"].max()

            # Relative importance filter
            selected = fold_df.loc[fold_df["Importance"] >= rel_thresh * max_imp, "Feature"]

            for feat in selected:
                feat_counts[feat] = feat_counts.get(feat, 0) + 1

        # Keep features that appear in enough folds
        n_folds = len(folds)
        stable_features = [f for f, count in feat_counts.items() if count / n_folds >= freq_thresh]

        # Output file
        out_txt = fname.replace("_Featureimportance.csv", "_stablefeatures.txt")

        with open(out_txt, "w") as f:
            for feat in stable_features:
                f.write(f"{feat}\n")

        print(f"✅ Saved {len(stable_features)} for {fname} stable features (rel>{rel_thresh}, freq>{freq_thresh}) to {out_txt}")




In [34]:
import pandas as pd
import glob

def save_selected_features(dfs, rel_thresh=0.5):
    """
    Save selected features based on importance, one output file per dataset.

    Selection rule, applied to each dataframe as a whole:
      - If any row has Importance == 0: keep every feature that has a row
        with Importance > 0.
      - Otherwise: keep only features with
        Importance >= rel_thresh * max importance.

    Selected feature names are de-duplicated (first-appearance order) and
    written one per line. Nothing is written when the selection is empty;
    a warning is printed instead.

    The output path is the dictionary key with "_Featureimportance.csv"
    replaced by "_specificfeatures.txt". If the key does not contain that
    suffix, the file extension is replaced by "_specificfeatures.txt"
    instead, so the output path can never be the input path.

    Parameters
    ----------
    dfs : dict
        Dictionary of dataframes keyed by filename (str). Each dataframe is
        read through its "Feature" and "Importance" columns.
    rel_thresh : float
        Relative importance threshold (fraction of max importance) when no zeros exist.

    Notes
    -----
    NOTE(review): this notebook defines another ``save_selected_features``
    in an earlier cell with a different signature (it takes ``freq_thresh``);
    whichever cell ran last wins. Consider giving the two distinct names.
    """
    import os  # local import: only needed for the fallback output name

    in_suffix = "_Featureimportance.csv"
    out_suffix = "_specificfeatures.txt"

    for fname, df in dfs.items():
        importance = df["Importance"]

        if (importance == 0).any():
            # Case 1: zeros exist → keep only features with positive importance.
            # NOTE(review): `> 0` also drops negative values; if Importance can be
            # signed (e.g. raw coefficients), confirm this is intended.
            keep_mask = importance > 0
        else:
            # Case 2: no zeros → keep only features above relative threshold
            keep_mask = importance >= rel_thresh * importance.max()

        selected_feats = df.loc[keep_mask, "Feature"].unique().tolist()

        if not selected_feats:
            print(f"⚠️ No features selected for {fname}, file not saved.")
            continue

        # Build output filename; never let it resolve to the input path,
        # otherwise opening it in "w" mode would truncate the source CSV.
        if in_suffix in fname:
            out_txt = fname.replace(in_suffix, out_suffix)
        else:
            out_txt = os.path.splitext(fname)[0] + out_suffix

        # Save to file
        with open(out_txt, "w") as f:
            for feat in selected_feats:
                f.write(f"{feat}\n")

        print(f"✅ Saved {len(selected_feats)} features to {out_txt}")

# --- Usage ---
# Collect the LASSO feature-importance CSVs from the working directory.
# glob returns matches in arbitrary order, so sort them to make the
# processing (and printed) order reproducible between runs.
files = sorted(glob.glob("PredictionsLASSORegression*_*_recall_*_Featureimportance.csv"))
if not files:
    # Without this, an empty match would silently do nothing.
    print("⚠️ No LASSO feature-importance files matched the pattern.")

# Load each CSV, keyed by its path, then write the selected features per file
# using the save_selected_features currently defined in the kernel.
dfs = {f: pd.read_csv(f) for f in files}
save_selected_features(dfs)


✅ Saved 29 features to PredictionsLASSORegression_proteins_residuals_PD_HC_unique_residualised_training_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 16 features to PredictionsLASSORegression_proteins_residuals_PD_HC_unique_PD_OND_residualisedhealthycontrol_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 23 features to PredictionsLASSORegression_proteins_matched_baseline_PD_HC_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 21 features to PredictionsLASSORegression_proteins_residuals_PD_HC_residualisedtraining_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 7 features to PredictionsLASSORegression_proteins_residuals_PD_HC_PD_OND_residualisedtraining_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 7 features to PredictionsLASSORegression_proteins_residuals_PD_HC_PD_OND_residualisedhealthycontrol_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ 

In [35]:
# --- Usage ---
# Collect the LightGBM feature-importance CSVs from the working directory.
# glob returns matches in arbitrary order, so sort them to make the
# processing (and printed) order reproducible between runs.
files = sorted(glob.glob("PredictionsLightGBM*_*_recall_*_Featureimportance.csv"))
if not files:
    # Without this, an empty match would silently do nothing.
    print("⚠️ No LightGBM feature-importance files matched the pattern.")

# Load each CSV, keyed by its path, then write the selected features per file
# using the save_selected_features currently defined in the kernel.
dfs = {f: pd.read_csv(f) for f in files}
save_selected_features(dfs)


✅ Saved 5 features to PredictionsLightGBM_balanced_proteins_matched_prodromals_PD_HC_unique_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 5 features to PredictionsLightGBM_balanced_proteins_residuals_PD_HC_unique_PD_OND_residualised_training_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 5 features to PredictionsLightGBM_balanced_proteins_matched_prodromals_PD_HC_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 6 features to PredictionsLightGBM_balanced_proteins_residuals_PD_HC_residualisedhealthycontrol_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 14 features to PredictionsLightGBM_balanced_proteins_cox_specificPD_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 5 features to PredictionsLightGBM_balanced_proteins_residuals_PD_HC_unique_residualised_training_SIMPLEMEDIAN_recall_Training_all_prodromals_specificfeatures.txt
✅ Saved 9 features to PredictionsLigh