In [None]:
import os
import pandas as pd

from ms.config.pipeline_constants import CONF
from ms.utils.navigation import pjoin

In [None]:
import numpy as np


def collect_results(root_folder):
    records = []

    for target_model in os.listdir(root_folder):
        target_path = os.path.join(root_folder, target_model)
        if not os.path.isdir(target_path):
            continue

        for selector in os.listdir(target_path):
            selector_path = os.path.join(target_path, selector)
            pred_path = os.path.join(selector_path, 'pred')
            if not os.path.exists(pred_path):
                continue

            for metamodel_file in os.listdir(pred_path):
                if not metamodel_file.endswith('.csv'):
                    continue

                metamodel = metamodel_file.replace('.csv', '')
                file_path = os.path.join(pred_path, metamodel_file)

                df = pd.read_csv(file_path, index_col=0)  # rows are metrics

                for metric, row in df.iterrows():
                    fold_data = {'train': [], 'test': []}

                    for col_name, value in row.items():
                        try:
                            fold_type, fold_num = col_name.split('_')
                            if fold_type in fold_data:
                                fold_data[fold_type].append(float(value))
                        except ValueError:
                            continue  # Skip malformed column names

                    # Add original metric
                    for fold_type in ['train', 'test']:
                        values = fold_data[fold_type]
                        if values:
                            values_series = pd.Series(values)
                            records.append({
                                'target_model': target_model,
                                'selector': selector,
                                'metamodel': metamodel,
                                'metric': metric,
                                'fold_type': fold_type,
                                'mean': values_series.mean(),
                                'std': values_series.std()
                            })

                    # If MSE, also compute and append RMSE
                    if metric.lower() == 'mse':
                        for fold_type in ['train', 'test']:
                            rmse_values = [np.sqrt(v) for v in fold_data[fold_type]]
                            if rmse_values:
                                rmse_series = pd.Series(rmse_values)
                                records.append({
                                    'target_model': target_model,
                                    'selector': selector,
                                    'metamodel': metamodel,
                                    'metric': 'rmse',
                                    'fold_type': fold_type,
                                    'mean': rmse_series.mean(),
                                    'std': rmse_series.std()
                                })

    return pd.DataFrame.from_records(records)

In [None]:
root_folder = pjoin(CONF.results_path, "tabzilla", "raw")  # Replace with your actual path

In [None]:
results_df = collect_results(root_folder)

In [None]:
results_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_rmse_bars(df, save_dir=None):
     # Filter only RMSE on test folds
    rmse_df = df[(df['metric'] == 'rmse') & (df['fold_type'] == 'test')]

    target_models = sorted(rmse_df['target_model'].unique())
    selectors = sorted(rmse_df['selector'].unique())

    width = 0.1  # width of each bar
    spacing = 0.15  # spacing between groups

    x_indices = np.arange(len(target_models))

    for metamodel in rmse_df['metamodel'].unique():
        sub_df = rmse_df[rmse_df['metamodel'] == metamodel]

        fig, ax = plt.subplots(figsize=(12, 6))

        for i, selector in enumerate(selectors):
            selector_df = sub_df[sub_df['selector'] == selector]
            
            means = []
            stds = []

            # Safely get mean and std for each target_model, skip if not available
            for tm in target_models:
                target_data = selector_df[selector_df['target_model'] == tm]
                if not target_data.empty:
                    means.append(target_data['mean'].values[0])
                    stds.append(target_data['std'].values[0])
                else:
                    # Skip this target_model if data is missing
                    break
            else:  # Only plot if all target_models have data
                offset = (i - len(selectors)/2) * width + width/2
                bar_positions = x_indices + offset
                ax.bar(bar_positions, means, width=width, yerr=stds, label=selector, capsize=4)

        ax.set_xticks(x_indices)
        ax.set_xticklabels(target_models, rotation=45)
        ax.set_xlabel('Target Model')
        ax.set_ylabel('RMSE (mean ± std)')
        ax.set_title(f'RMSE (Test) for Metamodel: {metamodel}')
        ax.legend(title='Selector')
        ax.grid(True, axis='y', linestyle='--', alpha=0.5)

        plt.tight_layout()

        if save_dir:
            plt.savefig(f"{save_dir}/rmse_{metamodel}_matplotlib.png")
        else:
            plt.show()

        plt.close()

In [None]:
plot_rmse_bars(results_df)

In [55]:
def aggregate_selected_features(root_dir, normalize_by_min=False):
    """
    Traverse folder structure and return selected features for each selector
    per target model, sorted by mean absolute value across folds.

    Criteria:
    - Selected in >= half the folds (non-NaN).
    - All selected values have same sign.
    - Score = mean of absolute values (across folds).
    - Optionally, reduce all lists to length of smallest list for each target model.

    Args:
        root_dir (str): Root folder path.
        normalize_by_min (bool): Whether to truncate lists to the min number of features per target model.

    Returns:
        dict[target_model][selector] = list of (feature, score), sorted descending.
    """
    results = {}

    for target_model in os.listdir(root_dir):
        target_path = os.path.join(root_dir, target_model)
        if not os.path.isdir(target_path):
            continue

        results[target_model] = {}
        min_len = float('inf')

        for selector in os.listdir(target_path):
            selector_path = os.path.join(target_path, selector)
            data_csv = os.path.join(selector_path, "data.csv")
            if not os.path.isfile(data_csv):
                continue

            df = pd.read_csv(data_csv, index_col=0)
            value_cols = [col for col in df.columns if col.startswith("value_")]
            values = df[value_cols]

            # Selection criteria
            non_nan_counts = values.notna().sum(axis=1)
            min_required = len(value_cols) // 2
            selected_mask = non_nan_counts >= min_required
            filtered = values[selected_mask]

            def has_consistent_sign(row):
                signs = np.sign(row.dropna())
                return np.all(signs > 0) or np.all(signs < 0)

            consistent_mask = filtered.apply(has_consistent_sign, axis=1)
            final_df = filtered[consistent_mask]

            # Scoring and sorting
            feature_scores = final_df.abs().mean(axis=1)
            sorted_features = sorted(feature_scores.items(), key=lambda x: -x[1])

            results[target_model][selector] = sorted_features

            # Track min length if normalization is enabled
            if normalize_by_min:
                min_len = min(min_len, len(sorted_features))
                
        print(f"Target model: {target_model}, Min length: {min_len}")

        # Truncate lists if requested
        if normalize_by_min and min_len < float('inf'):
            for selector in results[target_model]:
                results[target_model][selector] = results[target_model][selector][:min_len]

    return results

In [56]:
sel_f = aggregate_selected_features(root_folder, True)

Target model: LinearModel, Min length: 16
Target model: RandomForest, Min length: 21
Target model: rtdl_FTTransformer, Min length: 10
Target model: rtdl_MLP, Min length: 14
Target model: rtdl_ResNet, Min length: 11
Target model: XGBoost, Min length: 8


In [57]:
sel_f

{'LinearModel': {'base': [('f__pymfe.general.attr_to_inst', 1.0),
   ('f__pymfe.statistical.cor.sd', 1.0),
   ('f__pymfe.statistical.kurtosis.kurtosis', 1.0),
   ('f__pymfe.statistical.iq_range.skewness', 1.0),
   ('f__pymfe.statistical.mad.min', 1.0),
   ('f__pymfe.statistical.mean.mean', 1.0),
   ('f__pymfe.statistical.iq_range.mean', 1.0),
   ('f__pymfe.statistical.gravity', 1.0),
   ('f__pymfe.statistical.g_mean.min', 1.0),
   ('f__pymfe.statistical.g_mean.mean', 1.0),
   ('f__pymfe.statistical.eigenvalues.min', 1.0),
   ('f__pymfe.statistical.cov.min', 1.0),
   ('f__pymfe.statistical.cov.kurtosis', 1.0),
   ('f__pymfe.statistical.cor.skewness', 1.0),
   ('f__pymfe.statistical.cor.min', 1.0),
   ('f__pymfe.statistical.kurtosis.min', 1.0)],
  'corr': [('f__pymfe.model-based.nodes_per_inst', 0.6277515396426375),
   ('f__pymfe.landmarking.naive_bayes.mean', 0.6265063964555322),
   ('f__pymfe.landmarking.naive_bayes.min', 0.5733331312172671),
   ('f__pymfe.statistical.can_cor.min', 0.5