In [9]:
"""
================================================================================
BEST FEATURES DATASET CREATOR
================================================================================
Authors: Giuseppe Riccio, Stefano Cavuoti
Date: 2026-02-24
Description:
    This script filters and combines the most significant features identified 
    during the importance analysis phase. It extracts the "top N" Haar and 
    Haralick features and merges them with the original physical fluxes 
    (Total, CSM, Ejecta, Mask).
    
    Key functionalities:
    1. Reads occurrence statistics to identify the best performing features.
    2. Dynamically builds the column list for physical data based on filenames.
    3. Extracts selected columns from the "OnlyHaar" and "OnlyHaralick" datasets.
    4. Merges physical data and best features into a final optimized dataset.

Dependencies:
    - numpy
    - pandas
    - os

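Usage:
    - Requires 'occurrences_stats' CSV files generated by the analysis script.
    - Requires the merged 'onlyHaar' and 'onlyHaralick' CSV files.
    - Requires the 'im2' folder of source CSV files; their file names are used
      to build the physical flux column names.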
================================================================================
"""

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import pandas as pd

In [19]:
# --- Run parameters -----------------------------------------------------------
target = "CSM"    # target label embedded in the statistics file names
nfeat = 20        # feature count embedded in the statistics file names
best2get = 3      # number of leading rows to keep from each statistics file

# One statistics file per feature family: Haar first, Haralick second.
# The order matters because features_cols is filled in the same order.
txtfilelist = [
    f"occurrences_stats{nfeat}{family}_{target}.csv"
    for family in ("HAAR", "HARALICK")
]

# For every statistics file, keep the index labels of its first `best2get`
# rows (the feature names are stored in the first CSV column, used as index).
features_cols = []
for stats_path in txtfilelist:
    stats = pd.read_csv(stats_path, index_col=[0])
    print(f"Processing stats from: {stats_path}")
    print(stats)
    features_cols.append(list(stats.index.values[:best2get]))

print(f"Selected features: {features_cols}")

                         N occurrences  % occurrences    Mean  Best  Worst
7080norm_haar_2x_8                  10          100.0   3.500     1      9
7080norm_haar_2x_7                  10          100.0   4.000     1      8
7080norm_haar_2y_4_flip             10          100.0   9.600     1     19
7080norm_haar_3x_4                   9           90.0  14.000    10     19
2130norm_haar_2x_8_flip              8           80.0   2.625     1      7
...                                ...            ...     ...   ...    ...
7080norm_haar_2y_6                   1           10.0  19.000    19     19
0512norm_haar_2x_8_flip              1           10.0  19.000    19     19
4050norm_haar_2x_6                   1           10.0  20.000    20     20
4050norm_haar_3x_4                   1           10.0  20.000    20     20
1216norm_haar_2x_4_flip              1           10.0  20.000    20     20

[69 rows x 5 columns]
                              N occurrences  % occurrences       Mean  Best  

In [20]:
# Merged feature tables that the selected columns are read from
all_haar = 'im2_AllColumns_onlyHaar.csv'
all_haralick = 'im2_AllColumns_onlyHaralick.csv'

# Folder holding the source CSV files (trailing separator included)
folder_path = 'im2' + os.sep

# Names of the CSV files found in that folder, in alphabetical order
csv_files = sorted(
    entry for entry in os.listdir(folder_path) if entry.endswith('.csv')
)

# Four physical flux columns per source file. The column prefix is the part of
# the file name that precedes its first dot.
fluxes_col = [
    csv_name.partition(".")[0] + suffix
    for csv_name in csv_files
    for suffix in ("_Total", "_CSM", "_Ejecta", "_Mask")
]

# Columns requested from the Haar table: physical fluxes + best Haar features
cols = [*fluxes_col, *features_cols[0]]

# The Haar table supplies the fluxes and the Haar features; the Haralick table
# supplies only the best Haralick features.
haar_data = pd.read_csv(all_haar, usecols=cols)
haralick_data = pd.read_csv(all_haralick, usecols=features_cols[1])

In [21]:
# Combine everything into the final dataset.
# pd.concat(axis=1) aligns the two frames on their row index with an outer
# join, so a row-count mismatch would not raise: the shorter table would be
# padded with NaN and a silently corrupted dataset would be written. Fail
# loudly instead.
if len(haar_data) != len(haralick_data):
    raise ValueError(
        "Row count mismatch between Haar data "
        f"({len(haar_data)} rows) and Haralick data "
        f"({len(haralick_data)} rows): the two tables cannot be merged "
        "side by side."
    )

# Side-by-side merge: Haar-table columns first, then the Haralick features
dataset_full = pd.concat([haar_data, haralick_data], axis=1)

# Export to a new CSV file named after the number of features and the target
output_name = f"im2_best{best2get}feat_{target}.csv"
dataset_full.to_csv(output_name, index=False)

print(f"Final dataset saved as: {output_name}")