# Step 0: Results Gatherer

In [1]:
# !pip install -U numpy pandas

In [2]:
import json
import os
import pandas as pd

In [3]:
# Key of the per-trial score stored in each framework's result file.
metric = 'f1_score_weighted'

# Dataset identifiers grouped by scenario (scenario name -> list of ids).
datasets = dict(
    binary=[31, 37, 44, 1462, 1479, 1510, 40945],
    multiclass=[23, 36, 54, 181, 1466, 40691, 40975],
    multilabel_native=[285, 41464, 41465, 41468, 41470, 41471, 41473],
)
# The powerset scenario uses the multilabel ids with a "ps" suffix, as strings.
datasets['multilabel_powerset'] = [
    f'{dataset_id}ps' for dataset_id in datasets['multilabel_native']
]

# Names of the AutoML frameworks whose results are gathered (alphabetical).
frameworks = [
    '4intelligence',
    'autogluon',
    'autokeras',
    'autopytorch',
    'autosklearn',
    'evalml',
    'fedot',
    'flaml',
    'gama',
    'h2o',
    'lightautoml',
    'lightwood',
    'mljar',
    'naive',
    'pycaret',
    'tpot',
]

In [4]:
import pandas as pd
import os

def clean_and_save_results(results_list, scenario, output_dir="stats"):
    """
    Cleans the experimental results and saves them as a CSV file.

    Cleaning steps:
    1. Remove frameworks that failed everywhere, i.e. whose rows are all NaN in
       both "F1 Score" and "Training Time".
    2. Keep only rows where "F1 Score" >= 0 and "Training Time" >= 0. Since a
       comparison with NaN is False, this also removes any remaining row with a
       missing score or time (e.g. the placeholder row of a framework that
       failed on a single dataset), not just negative values.

    The cleaned table is written to
    <output_dir>/<scenario>/<scenario>_experimental_results.csv; missing
    directories are created.

    Parameters:
    - results_list (list of dict): List of trial results. Each dict must contain
      the keys "Framework", "Trial", "F1 Score" and "Training Time". May be
      empty, in which case an empty table (header only) is saved and returned.
    - scenario (str): The name of the scenario (used for file naming).
    - output_dir (str): The directory where results will be saved (default: 'stats').

    Returns:
    - pd.DataFrame: The cleaned DataFrame with invalid frameworks and trial
      results removed. Surviving rows keep their original index labels.
    """
    value_columns = ["F1 Score", "Training Time"]

    if len(results_list) == 0:
        # pd.DataFrame([]) has no columns at all, so the column lookups below
        # would raise KeyError; start from an empty table with the row schema
        # used by the results-gathering code instead.
        df = pd.DataFrame(
            columns=["Dataset", "Dataset Type", "Framework", "Trial", *value_columns]
        )
    else:
        df = pd.DataFrame(results_list)

    # Nullable integer dtype: trial numbers stay integers even next to missing values.
    df["Trial"] = df["Trial"].astype("Int64")

    if not df.empty:
        # A framework failed across ALL datasets when every one of its rows is
        # missing both values.
        row_is_blank = df[value_columns].isna().all(axis=1)
        framework_failed = row_is_blank.groupby(df["Framework"]).all()
        failed_frameworks = framework_failed[framework_failed].index.tolist()
        df = df[~df["Framework"].isin(failed_frameworks)]

        # Drop negative and missing measurements (NaN >= 0 evaluates to False).
        df = df[(df["F1 Score"] >= 0) & (df["Training Time"] >= 0)]

    # Save to CSV
    filename = os.path.join(output_dir, scenario, f"{scenario}_experimental_results.csv")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename, index=False)

    return df

In [5]:
# Gather every trial of every framework on every dataset, scenario by scenario.
all_results_list = []  # Results for all scenarios
load_failures = []     # (scenario, dataset_id, framework, error) for unreadable result files

for scenario, dataset_ids in datasets.items():

    this_results_list = []  # Results for this scenario

    for dataset_id in dataset_ids:

        for framework in frameworks:

            results_path = f'results/{scenario}/{dataset_id}/automl_{framework}.json'

            try:
                with open(results_path, 'r') as fp:
                    content = json.load(fp)
                # Extract both fields before building any row, so a malformed
                # file never contributes partial results.
                # `metric` selects the score field; the column is still labelled "F1 Score".
                f1_scores = [x[metric] for x in content['results']]
                training_times = [x['training_time'] for x in content['results']]
            except Exception as exc:
                # Missing file or error → No results for this framework/dataset.
                # Best-effort on purpose, but remember why so failures are not silent.
                load_failures.append((scenario, dataset_id, framework, repr(exc)))
                trial_results = [{
                    "Dataset": dataset_id,
                    "Dataset Type": scenario,
                    "Framework": framework,
                    "Trial": None,
                    "F1 Score": None,
                    "Training Time": None
                }]
            else:
                # One row per trial; trials are 1-based for readability.
                trial_results = [
                    {
                        "Dataset": dataset_id,
                        "Dataset Type": scenario,
                        "Framework": framework,
                        "Trial": trial_number,
                        "F1 Score": score,
                        "Training Time": training_time
                    }
                    for trial_number, (score, training_time)
                    in enumerate(zip(f1_scores, training_times), start=1)
                ]

            this_results_list.extend(trial_results)
            all_results_list.extend(trial_results)

    this_df = clean_and_save_results(this_results_list, scenario)

if load_failures:
    print(f"{len(load_failures)} framework/dataset result file(s) could not be loaded; "
          "inspect `load_failures` for details.")

all_df = clean_and_save_results(all_results_list, "all")

# Show DataFrame
all_df

Unnamed: 0,Dataset,Dataset Type,Framework,Trial,F1 Score,Training Time
0,31,binary,4intelligence,1,0.598200,174.789680
1,31,binary,4intelligence,2,0.602864,171.077410
2,31,binary,4intelligence,3,0.600504,121.127800
3,31,binary,4intelligence,4,0.600504,158.295441
4,31,binary,4intelligence,5,0.599930,168.605997
...,...,...,...,...,...,...
6426,41473ps,multilabel_powerset,tpot,16,0.193780,545.683002
6427,41473ps,multilabel_powerset,tpot,17,0.196621,550.792275
6428,41473ps,multilabel_powerset,tpot,18,0.149474,351.656300
6429,41473ps,multilabel_powerset,tpot,19,0.170534,581.061009
