# Auswertung der Ergebnisse
Dieses Notebook liest die Ergebnisse aus den JSON-Dateien im Ordner `results` aus und stellt sie in einer übersichtlichen Tabelle dar.

In [None]:
import os
import json
import pandas as pd
from IPython.display import display
import itertools

# Show complete tables instead of pandas' truncated preview.
pd.set_option('display.max_rows', None)

# Root folder of the evaluation run; the name of each sub-folder is used as the dataset name.
results_dir = 'results/all-2025-08-08_16:13:57'
# One dict per measured F1 score; each dict becomes one row of the results table.
data = []

for dataset_folder in os.listdir(results_dir):
    dataset_path = os.path.join(results_dir, dataset_folder)
    if not os.path.isdir(dataset_path):
        continue

    for filename in os.listdir(dataset_path):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(dataset_path, filename)
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = json.load(f)

        training_method = content.get('training_method', 'N/A')
        testing_method = content.get('testing_method', 'N/A')
        # Only files whose name contains '_filter' contribute filtered rows.
        is_filtered_file = '_filter' in filename

        for result in content.get('anonymization_results', []):
            anonymization_level = result.get('level', 'N/A')
            # Columns shared by the filtered and the unfiltered rows of this result.
            run_info = {
                'Dataset': dataset_folder,
                'Training Method': training_method,
                'Testing Method': testing_method,
                'Anonymization Level': anonymization_level,
            }

            # `or {}` also covers an explicit null in the JSON, not just a missing key.
            filtering = result.get('filtering') or {}
            filter_results = filtering.get('results')

            if is_filtered_file and filtering.get('enabled') and filter_results:
                for filter_result in filter_results:
                    f1_score = (filter_result.get('metrics') or {}).get('f1_score_avg')
                    if f1_score is None:
                        # Same rule as the unfiltered branch: a missing score is not
                        # recorded as 0, so it is not mistaken for a measured F1 of 0.
                        continue
                    data.append({
                        **run_info,
                        'F1 Score (avg)': f1_score,
                        'Filtered': 'Yes',
                        'Filter Mode': filter_result.get('mode', 'N/A'),
                        'n_duplicates': filter_result.get('n_duplicates', 'N/A'),
                    })
            else:
                # This handles both non-filtered files and non-filtered results within filtered files
                f1_score = (result.get('metrics') or {}).get('f1_score_avg')
                if f1_score is not None:
                    data.append({
                        **run_info,
                        'F1 Score (avg)': f1_score,
                        'Filtered': 'No',
                        'Filter Mode': 'N/A',
                        'n_duplicates': 'N/A',
                    })

# Fail early with an actionable message: with no rows the frame would have no columns
# and the score conversion below would raise an opaque KeyError instead.
if not data:
    raise ValueError(f"No F1 scores found in the JSON files below {results_dir!r}")

df = pd.DataFrame(data)
# Scale by 100 and round to one decimal for display
# (presumably the stored scores are fractions in [0, 1] - confirm).
df['F1 Score (avg)'] = (df['F1 Score (avg)'] * 100).round(1)

# --- Add missing rows ---
# Category orders, reused for building the expected grid and for sorting.
# Training and testing share the same list of methods.
training_method_order = [
    'original',
    'no_preprocessing',
    'forced_generalization',
    'weighted_specialization',
]
testing_method_order = list(training_method_order)
anonymization_level_order = ['no', 'basic', 'all']
# Distinct dataset names that appear in the loaded results.
datasets = pd.unique(df['Dataset'])

# All filter configurations a result can carry. The first entry is the unfiltered
# case; 'unique' is listed with 0 duplicates, every other mode with 1, 3 and 5.
filter_modes = ['unique', 'random', 'imputation', 'knn', 'autoencoder']
filter_configs = [{'Filtered': 'No', 'Filter Mode': 'N/A', 'n_duplicates': 'N/A'}]
filter_configs += [
    {'Filtered': 'Yes', 'Filter Mode': filter_mode, 'n_duplicates': duplicate_count}
    for filter_mode in filter_modes
    for duplicate_count in ([0] if filter_mode == 'unique' else [1, 3, 5])
]

# Every dataset combined with every training method, testing method and anonymization level.
all_combinations = list(itertools.product(datasets, training_method_order, testing_method_order, anonymization_level_order))

_UNFILTERED_CONFIG = {'Filtered': 'No', 'Filter Mode': 'N/A', 'n_duplicates': 'N/A'}
_UNIQUE_FILTER_CONFIG = {'Filtered': 'Yes', 'Filter Mode': 'unique', 'n_duplicates': 0}


def _expected_filter_configs(training_method, anonymization_level):
    """Return the filter configurations expected for one experiment combination."""
    if training_method != 'weighted_specialization':
        # For other training methods, only expect non-filtered results
        return [filter_configs[0]]
    if anonymization_level == 'no':
        # For 'no' anon level, expect non-filtered and one specific filtered result
        return [_UNFILTERED_CONFIG, _UNIQUE_FILTER_CONFIG]
    # For 'basic' or 'all' anon, expect all filter configurations
    return filter_configs


# One row per expected (combination, filter configuration) pair, without scores.
expected_df = pd.DataFrame([
    {
        'Dataset': dataset_name,
        'Training Method': train_method,
        'Testing Method': test_method,
        'Anonymization Level': anon_level,
        'Filtered': config['Filtered'],
        'Filter Mode': config['Filter Mode'],
        'n_duplicates': config['n_duplicates'],
    }
    for dataset_name, train_method, test_method, anon_level in all_combinations
    for config in _expected_filter_configs(train_method, anon_level)
])

# Left-join the measured scores onto the expected grid: every expected row is kept,
# and rows without a measured score are marked 'TODO'.
join_keys = list(expected_df.columns)
merged_df = expected_df.merge(df, how='left', on=join_keys)
merged_df['F1 Score (avg)'] = merged_df['F1 Score (avg)'].fillna('TODO')

df = merged_df

# Define custom sort order
category_orders = (
    ('Training Method', training_method_order),
    ('Testing Method', testing_method_order),
    ('Anonymization Level', anonymization_level_order),
)
for column_name, category_order in category_orders:
    df[column_name] = pd.Categorical(df[column_name], categories=category_order, ordered=True)

sort_columns = [
    'Dataset',
    'Testing Method',
    'Training Method',
    'Anonymization Level',
    'Filtered',
    'Filter Mode',
    'n_duplicates',
]
df = df.sort_values(by=sort_columns)

In [52]:
# Show the rows of the "adult" dataset; the constant Dataset column is dropped.
print("Adult Dataset F1-Scores")
adult_rows = df.loc[df["Dataset"] == "adult"]
display(adult_rows.drop(columns="Dataset"))

Adult Dataset F1-Scores


Unnamed: 0,Training Method,Testing Method,Anonymization Level,Filtered,Filter Mode,n_duplicates,F1 Score (avg)
165,original,original,no,No,,,81.3
166,original,original,basic,No,,,81.3
167,original,original,all,No,,,81.3
179,no_preprocessing,original,no,No,,,80.9
180,no_preprocessing,original,basic,No,,,78.7
181,no_preprocessing,original,all,No,,,48.0
193,forced_generalization,original,no,No,,,81.1
194,forced_generalization,original,basic,No,,,80.4
195,forced_generalization,original,all,No,,,58.2
207,weighted_specialization,original,no,No,,,81.1


In [53]:
# Show the rows of the "diabetes" dataset; the constant Dataset column is dropped.
print("\nDiabetes Dataset F1-Scores")
diabetes_rows = df.loc[df["Dataset"] == "diabetes"]
display(diabetes_rows.drop(columns="Dataset"))


Diabetes Dataset F1-Scores


Unnamed: 0,Training Method,Testing Method,Anonymization Level,Filtered,Filter Mode,n_duplicates,F1 Score (avg)
0,original,original,no,No,,,75.3
1,original,original,basic,No,,,75.3
2,original,original,all,No,,,75.3
15,no_preprocessing,original,no,No,,,74.7
16,no_preprocessing,original,basic,No,,,72.5
17,no_preprocessing,original,all,No,,,33.8
30,forced_generalization,original,no,No,,,74.7
31,forced_generalization,original,basic,No,,,73.2
32,forced_generalization,original,all,No,,,33.8
45,weighted_specialization,original,no,No,,,75.3
