In [9]:
import os
import json
import pandas as pd
import re

# Root directory holding one sub-directory per trained model run
main_directory = 'results/models'

# Run directories are named
#   t5-large_prompt_type_<prompt_type>_epochs_<N>_seed_<S>
# The three capture groups are the prompt type, the epoch count and the seed.
pattern = re.compile(r't5-large_prompt_type_(.*?)_epochs_(\d+)_seed_(\d+)')

# Column order of the resulting table. Passing it to the DataFrame constructor
# keeps these columns present even when no run directory is found.
raw_data_columns = [
    'subdirectory', 'prompt_type', 'epochs', 'seed',
    'f1', 'precision', 'recall', 'accuracy',
]

# One record (dict) per run; converted to a DataFrame once, after the loop
data = []

# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the table reproducible from one machine / run to the next.
for subdir in sorted(os.listdir(main_directory)):
    # Check the name first so unrelated entries are never opened or parsed
    match = pattern.match(subdir)
    if not match:
        continue

    # isfile() is False both when `subdir` is not a directory and when the
    # run has not produced its metrics file, so one check covers both cases.
    metrics_file = os.path.join(main_directory, subdir, 'test_metrics.json')
    if not os.path.isfile(metrics_file):
        continue

    # Explicit encoding: do not depend on the platform's default
    with open(metrics_file, 'r', encoding='utf-8') as f:
        metrics = json.load(f)

    prompt_type = match.group(1)
    epochs = int(match.group(2))
    seed = int(match.group(3))

    data.append({
        'subdirectory': subdir,
        'prompt_type': prompt_type,
        'epochs': epochs,
        'seed': seed,
        # .get() leaves a missing metric as None instead of raising
        'f1': metrics.get('f1'),
        'precision': metrics.get('precision'),
        'recall': metrics.get('recall'),
        'accuracy': metrics.get('accuracy'),
    })

# Build the per-run table in one go from the collected records
raw_data = pd.DataFrame(data, columns=raw_data_columns)

# Display the DataFrame
raw_data


Unnamed: 0,subdirectory,prompt_type,epochs,seed,f1,precision,recall,accuracy
0,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,50,183,0.113541,0.094208,0.142857,0.659459
1,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,50,64,0.113541,0.094208,0.142857,0.659459
2,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,50,98,0.113541,0.094208,0.142857,0.659459
3,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,5,183,0.016131,0.068132,0.009386,0.003604
4,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,5,64,0.005013,0.011905,0.003175,0.0
5,t5-large_prompt_type_with_fallacy_definition_e...,with_fallacy_definition,5,98,0.010485,0.033613,0.006211,0.001802
6,t5-large_prompt_type_with_fallacy_definition_w...,with_fallacy_definition_with_NotA,50,183,0.113541,0.094208,0.142857,0.659459
7,t5-large_prompt_type_with_fallacy_definition_w...,with_fallacy_definition_with_NotA,50,64,0.113541,0.094208,0.142857,0.659459
8,t5-large_prompt_type_with_fallacy_definition_w...,with_fallacy_definition_with_NotA,50,98,0.113541,0.094208,0.142857,0.659459
9,t5-large_prompt_type_with_fallacy_definition_w...,with_fallacy_definition_with_NotA,5,183,0.003072,0.142857,0.001553,0.0


In [12]:
# Show the dtype pandas assigned to each column of the per-run table
raw_data.dtypes

subdirectory     object
prompt_type      object
epochs            int64
seed              int64
f1              float64
precision       float64
recall          float64
accuracy        float64
dtype: object

In [19]:
# Mean of every metric over the runs (seeds) that share a
# (prompt_type, epochs) configuration, ranked from best to worst mean F1.
summary_data = (
    raw_data
    .drop(columns=['seed', 'subdirectory'])  # per-run identifiers, not metrics
    .groupby(['prompt_type', 'epochs'], as_index=False)
    .mean()
    .sort_values(by='f1', ascending=False)
)
summary_data

Unnamed: 0,prompt_type,epochs,f1,precision,recall,accuracy
7,without_fallacy_definition_with_NotA,50,0.159817,0.284321,0.171268,0.661261
5,without_fallacy_definition,50,0.15901,0.329836,0.167176,0.657057
1,with_fallacy_definition,50,0.113541,0.094208,0.142857,0.659459
3,with_fallacy_definition_with_NotA,50,0.113541,0.094208,0.142857,0.659459
6,without_fallacy_definition_with_NotA,5,0.113541,0.094208,0.142857,0.659459
4,without_fallacy_definition,5,0.047139,0.077351,0.097603,0.045646
0,with_fallacy_definition,5,0.010543,0.037883,0.006257,0.001802
2,with_fallacy_definition_with_NotA,5,0.002285,0.087302,0.001165,0.000601


Models trained for 50 epochs perform better than models trained for 5 epochs (this is entirely expected and serves mainly as a sanity check). 
Models that don't include the fallacy definitions in their prompt perform better. 
Adding the note about the "None of the above" (NotA) class to the prompt does not improve performance much.  