In [3]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import os

In [4]:
# Project root: the directory one level above the current working directory
_parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
data_dir = os.path.abspath(_parent_of_cwd)

# 1. Rank features

In [None]:
def rank_features(file_pattern: str, top_n: int = 50) -> pd.DataFrame:
    """
    Rank features by how often they appear among the first rows of several files.

    Every CSV file matching ``file_pattern`` is read, the ``Feature_ID`` values
    of its first ``top_n`` rows are collected, and the occurrences of each
    feature across all files are counted.

    Parameters:
        file_pattern (str): Glob pattern for matching files.
        top_n (int): Number of leading rows taken from each file. The rows are
            not re-sorted here, so each file is assumed to be already ordered
            by importance -- TODO confirm against the code that writes them.

    Returns:
        pd.DataFrame: Columns ``Feature_ID`` and ``Frequency``, sorted by
        ``Frequency`` in descending order. Empty (with the same two columns)
        when no file matches or the matched files contain no rows.

    Raises:
        KeyError: If a matched file has no ``Feature_ID`` column.
    """
    import warnings  # local import so the block does not depend on the import cell

    # glob returns matches in arbitrary order; sorting the paths makes the
    # order in which tied features are first seen reproducible between runs.
    files = sorted(glob.glob(file_pattern))
    if not files:
        # A typo in the pattern would otherwise silently produce an empty ranking.
        warnings.warn(
            f"No files match pattern {file_pattern!r}; returning an empty ranking.",
            stacklevel=2,
        )

    all_features = []  # Feature IDs collected from every file
    for file in files:
        df = pd.read_csv(file)
        if 'Feature_ID' not in df.columns:
            # Same exception type as the plain column lookup, but names the file.
            raise KeyError(f"Column 'Feature_ID' not found in {file}")
        all_features.extend(df.head(top_n)['Feature_ID'].tolist())

    if not all_features:
        # Build the empty result explicitly so the column names and dtypes do
        # not depend on how pandas labels an empty value_counts() result.
        return pd.DataFrame({
            'Feature_ID': pd.Series(dtype=object),
            'Frequency': pd.Series(dtype='int64'),
        })

    # Count the occurrences of each feature across all files
    feature_counts = pd.Series(all_features).value_counts()

    # Name the columns directly instead of relying on pandas' default labels
    ranked_features = feature_counts.rename_axis('Feature_ID').reset_index(name='Frequency')

    # A stable sort keeps tied features in the order value_counts() produced
    # them, rather than the unspecified order of the default quicksort.
    ranked_features = ranked_features.sort_values(by="Frequency", ascending=False, kind="stable")

    return ranked_features


In [None]:
def _rank_and_save(pattern, output, top_n=50):
    """Rank the files matching `pattern` (relative to data_dir), save the ranking to `output`, and return it."""
    ranking = rank_features(os.path.join(data_dir, pattern), top_n=top_n)
    ranking.to_csv(os.path.join(data_dir, output), index=False)
    return ranking


# Ranking based on the model importance files
model_importance_ranked = _rank_and_save(
    "results/*model_importance.csv",
    "results/model_importance_ranking.csv",
)

# Ranking based on the SHAP importance files
shap_importance_ranked = _rank_and_save(
    "results/*SHAP_importance.csv",
    "results/shap_importance_ranking.csv",
)

# Ranking over every importance file, i.e. both methods together
combined_ranked = _rank_and_save(
    "results/*importance.csv",
    "results/combined_importance_ranking.csv",
)

In [None]:
# Rank features for binary datasets.
# The pattern ends in "importance.csv" (the same convention as the model/SHAP
# patterns) because the broader "*binary*.csv" also matches the
# "binary_importance_ranking.csv" file written on the next line: re-running
# this cell would then count the previous ranking as if it were an input file.
# NOTE(review): assumes every binary input file name ends in "importance.csv" -- confirm.
binary_ranked = rank_features(os.path.join(data_dir, "results/*binary*importance.csv"), top_n=50)
binary_ranked.to_csv(os.path.join(data_dir, "results/binary_importance_ranking.csv"), index=False)

# Rank features for numbers datasets.
# Same reasoning: "*numbers*.csv" would also match "numbers_importance_ranking.csv".
# NOTE(review): assumes every numbers input file name ends in "importance.csv" -- confirm.
numbers_ranked = rank_features(os.path.join(data_dir, "results/*numbers*importance.csv"), top_n=50)
numbers_ranked.to_csv(os.path.join(data_dir, "results/numbers_importance_ranking.csv"), index=False)

# 2. Plot feature rankings

In [None]:
def plot_feature_ranking(ranked_features, title):
    """
    Draw a horizontal bar chart of feature frequencies and display it.

    The figure is shown with ``plt.show()``; it is not saved to disk.

    Parameters:
        ranked_features (pd.DataFrame): DataFrame with a 'Feature_ID' and a
            'Frequency' column. Rows are drawn from top to bottom in the
            order given.
        title (str): Title of the plot.
    """
    # Cast the IDs to str so matplotlib always treats them as category labels.
    # Numeric IDs would otherwise be used as y positions, placing the bars at
    # their numeric values instead of in ranking order.
    labels = ranked_features['Feature_ID'].astype(str)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(labels, ranked_features['Frequency'], color='skyblue')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Feature')
    ax.set_title(title)
    # barh draws the first row at the bottom; flip the axis so it is on top
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

# Show one chart per ranking; nothing is written to disk
for ranking, chart_title in (
    (model_importance_ranked, "Model Importance Ranking"),
    (shap_importance_ranked, "SHAP Importance Ranking"),
    (combined_ranked, "Combined Importance Ranking"),
    (binary_ranked, "Binary Dataset Importance Ranking"),
    (numbers_ranked, "Numbers Dataset Importance Ranking"),
):
    plot_feature_ranking(ranking, chart_title)

NameError: name 'model_importance_ranked' is not defined