In [2]:
"""
================================================================================
FEATURE IMPORTANCE AND OCCURRENCE ANALYSIS
================================================================================
Authors: Giuseppe Riccio, Stefano Cavuoti
Date: 2026-02-24
Description:
    This script analyzes feature importance results generated from machine 
    learning experiments. It processes CSV files to identify the most relevant 
    features (Haar or Haralick) across different bands/experiments.
    
    Key functionalities:
    1. Filters features based on importance thresholds.
    2. Removes "Total" aggregate bands to focus on specific image features.
    3. Calculates statistics across multiple experiments (Mean, Best, Worst rank).
    4. Computes occurrence frequency and percentage for each feature.
    5. Exports detailed ranking and occurrence reports to CSV and TXT files.

Dependencies:
    - numpy
    - matplotlib
    - pandas
    - os
    - IPython (for cell magic capture)

Usage:
    - Set the 'exp' variable to "haar" or "haralick".
    - Ensure the 'importance_analysis/importance_[exp]/' directory exists.
    - The script expects CSV files whose names end with the 'target' string
      followed by '.csv' (e.g., '*CSM.csv'), each providing 'feature',
      'importance' and 'std' columns.
================================================================================
"""



In [3]:
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import pandas as pd

In [4]:
# --- Run configuration -------------------------------------------------------
exp = "haar"          # feature family under analysis
target = "CSM"        # only files named "*<target>.csv" are processed
threshold = 0.0001    # minimum importance for the "over threshold" listing
feat2show = 20        # size of the "best N" listings

# Directory holding the importance CSVs of the selected experiment. The empty
# last component keeps the trailing separator, so file names can be appended
# to this string directly.
folder_path = os.path.join('importance_analysis', 'importance_' + exp, '')

# Alphabetically ordered CSV files that belong to the chosen target
csv_suffix = target + '.csv'
csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith(csv_suffix))
print(csv_files)
nfiles = len(csv_files)

FileNotFoundError: [WinError 3] Impossibile trovare il percorso specificato: 'importance_analysis\\importance_haar\\'

In [58]:
# Capture statistics for each file.
#
# This is the programmatic form of a `%%capture cap --no-stderr` cell: the cell
# body below is passed as one string (built from adjacent string literals) and
# whatever it prints is stored in `cap` instead of being shown; stderr is left
# uncaptured.
#
# For every entry of `csv_files` the cell body:
#   1. reads the CSV and sorts it by the "importance" column, descending;
#   2. prints the first `feat2show` rows;
#   3. drops the rows whose "feature" value contains "Total" and prints the
#      first `feat2show` remaining rows;
#   4. keeps the remaining rows with importance >= `threshold` and prints them;
#   5. stores all of these frames in `stats`, keyed by the CSV file name.
#
# Reads `threshold`, `nfiles`, `folder_path`, `csv_files`, `feat2show` and `exp`
# from the notebook namespace and creates `stats` there.
get_ipython().run_cell_magic('capture', 'cap --no-stderr', 
'print(f"Importance Threshold : {threshold}")\n'
'stats = {}\n\n'
'for i in range(nfiles):\n'
'    current_file = folder_path + csv_files[i]\n\n'
'    print("Statistics on " + csv_files[i] + "\\n-------------------------------------------------\\n")\n'
'    \n'
'    data = pd.read_csv(current_file).sort_values("importance", ascending=False)\n'
'    print(f"First {feat2show} features by importance")\n'
'    best = data.head(feat2show)\n'
'    print(best)\n'
'    print("\\n")\n'
'    print(f"Best {feat2show} {exp.upper()} features by importance")\n'
'    # Remove aggregate "Total" bands\n'
'    nobands = data.drop(data[data["feature"].str.contains("Total")].index)\n'
'    \n'
'    print(nobands.head(feat2show))\n'
'    \n'
'    print("\\n")\n'
'    over_thres = nobands[nobands["importance"] >= threshold]\n'
'    print(f"{exp.upper()} features over threshold {threshold}: {over_thres.shape[0]}")\n'
'    print(over_thres)\n'
'    \n'
'    # Save results in dictionary\n'
'    stats[csv_files[i]] = {"importance": data, "importance_best": data.head(feat2show), '
'                           "nobands_importance": nobands, \n'
'                           "nobands_best": nobands.head(feat2show), "nobands_ot": over_thres}\n'
'    print("\\n\\n")\n'
)

In [59]:
# Dump the text captured in `cap` to a report named after experiment and target
general_stats_path = f"general_stats.{exp.upper()}_{target}.txt"
with open(general_stats_path, 'w') as report_file:
    report_file.write(cap.stdout)

In [60]:
# Analyze common occurrences across different "best N" thresholds.
#
# For every N in (feat2show, feat2show+10, feat2show+20, feat2show+30):
#   1. take the first N non-"Total" features of each experiment in `stats`;
#   2. build a table with one row per distinct feature and, for every band,
#      its rank / importance / std (blank when the feature is not in that
#      band's first N), and save it as a CSV file;
#   3. count in how many bands each feature appears, together with its mean,
#      best and worst rank, and save that table to the TXT report and to CSV.
#
# Reads `stats`, `exp`, `target`, `feat2show` and `nfiles` from the notebook
# namespace.

# Placeholder for "feature not among this band's first N"
MISSING = -999

full_interlist = {}

# The context manager closes the report even when a step inside the loop raises.
with open('occurrences_stats.' + exp.upper() + '_' + target + '.txt', 'w') as outf:
    for n in [feat2show, feat2show+10, feat2show+20, feat2show+30]:
        print(f"Common {exp.upper()} features among experiments (best {n})")
        outf.write(f"Common {exp.upper()} features among experiments (best {n})\n")

        best_dict = {}
        imp_dict = {}
        std_dict = {}
        # Insertion-ordered set of every feature met so far (keys only)
        seen_features = {}

        # Collect the top features for each experiment
        for k, v in stats.items():
            # Band label: file name without its first two "_"-separated tokens
            # and without everything from the first "." onwards
            band_name = k.split("_", 2)[-1].split(".")[0]

            top_n = v["nobands_importance"].head(n)
            best_dict[band_name] = top_n["feature"].to_numpy()
            imp_dict[band_name] = top_n["importance"].to_numpy()
            std_dict[band_name] = top_n["std"].to_numpy()

            # Keys already present keep their original position
            seen_features.update(dict.fromkeys(best_dict[band_name]))

        # INTERLIST: Collection of all unique HAAR/HARALICK features across all bands
        # within the top N features, with no repetitions.
        interlist = list(seen_features)
        full_interlist["nf_" + str(n)] = interlist
        feature_list = list(interlist)

        rank_dict = {}
        for b in best_dict:
            rank_dict["Rank_" + b] = []
            rank_dict["Imp_" + b] = []
            rank_dict["Std_" + b] = []

        for el in interlist:
            for b, band_feats in best_dict.items():
                pos = np.where(band_feats == el)[0]
                if len(pos) > 0:
                    first = pos[0]
                    f_pos = first + 1  # ranks are 1-based
                    imp = imp_dict[b][first]
                    s = std_dict[b][first]
                else:
                    f_pos = imp = s = MISSING
                rank_dict["Rank_" + b].append(f_pos)
                rank_dict["Imp_" + b].append(imp)
                rank_dict["Std_" + b].append(s)

        # Create ranking DataFrame
        rank_df = pd.DataFrame(rank_dict, index=feature_list)
        # astype returns a new frame, so the result has to be assigned back
        rank_df = rank_df.astype({"Rank_" + b: 'int16' for b in best_dict})

        # Replace placeholders with empty strings for cleaner CSV output
        rank_df = rank_df.replace(MISSING, "")
        rank_df.to_csv("feature_ranking_best" + str(n) + "_" + exp.upper() + "_" + target + ".csv")

        # Calculate occurrence statistics: for each feature, the ranks it
        # obtained in the bands where it is present
        rank_columns = [v for k, v in rank_dict.items() if k.startswith("Rank")]
        occ_dict = {
            feat: [col[i] for col in rank_columns if col[i] != MISSING]
            for i, feat in enumerate(feature_list)
        }

        stats_dict = {"N occurrences": [], "% occurrences": [], "Mean": [], "Best": [], "Worst": []}
        for ranks in occ_dict.values():
            n_occurrence = len(ranks)
            stats_dict["N occurrences"].append(n_occurrence)
            stats_dict["% occurrences"].append(n_occurrence / nfiles * 100)
            stats_dict["Mean"].append(np.mean(ranks))
            stats_dict["Best"].append(np.min(ranks))
            stats_dict["Worst"].append(np.max(ranks))

        # Sort results by frequency (descending) and then by average rank (ascending)
        stats_df = pd.DataFrame(stats_dict, index=occ_dict.keys())
        stats_df = stats_df.sort_values(by=['N occurrences', 'Mean'], ascending=[False, True])
        print(stats_df)

        # Save statistics to text and CSV
        outf.write(stats_df.to_string() + "\n\n")
        stats_df.to_csv('occurrences_stats' + str(n) + exp.upper() + '_' + target + '.csv')
        print("\n")

Common HAAR features among experiments (best 20)
                         N occurrences  % occurrences    Mean  Best  Worst
7080norm_haar_2x_8                  10          100.0   3.500     1      9
7080norm_haar_2x_7                  10          100.0   4.000     1      8
7080norm_haar_2y_4_flip             10          100.0   9.600     1     19
7080norm_haar_3x_4                   9           90.0  14.000    10     19
2130norm_haar_2x_8_flip              8           80.0   2.625     1      7
...                                ...            ...     ...   ...    ...
7080norm_haar_2y_6                   1           10.0  19.000    19     19
0512norm_haar_2x_8_flip              1           10.0  19.000    19     19
4050norm_haar_2x_6                   1           10.0  20.000    20     20
4050norm_haar_3x_4                   1           10.0  20.000    20     20
1216norm_haar_2x_4_flip              1           10.0  20.000    20     20

[69 rows x 5 columns]


Common HAAR features among