## Eval time goal

In [2]:
import pandas as pd

# Paths of the two timing CSVs that the next cell compares.
file_2024, file_2023 = (
    'outputs/eval_time_goal_gen20240920-142509.csv',
    'results/eval_time_goal_gen20230720-142101.csv',
)

# Parse each file with pandas' default CSV settings, yielding one dataframe per path.
df_2024, df_2023 = (pd.read_csv(csv_path) for csv_path in (file_2024, file_2023))



In [7]:
# Keep only the columns needed for the comparison: the four key columns that identify
# a measurement plus the timing column. Any other column in the CSVs is dropped.
merge_keys = ['Goal Type', 'Goal number', 'INPUT - Goal', 'n_run']
kept_columns = merge_keys + ['time (s)']
df_2023_clean = df_2023[kept_columns]
df_2024_clean = df_2024[kept_columns]

# Pair up rows that agree on every key column. pd.merge defaults to an inner join, so
# rows present in only one of the two files do not appear in the result. The two timing
# columns are told apart by the '_2024' / '_2023' suffixes.
comparison_df = pd.merge(
    df_2024_clean,
    df_2023_clean,
    on=merge_keys,
    suffixes=('_2024', '_2023'),
)

times_2024 = comparison_df['time (s)_2024']
times_2023 = comparison_df['time (s)_2023']

# Per-row difference in seconds (2024 minus 2023): positive means the 2024 value is larger.
comparison_df['time_difference'] = times_2024 - times_2023

# Aggregate statistics over the matched rows only.
mean_time_2023 = times_2023.mean()
mean_time_2024 = times_2024.mean()
mean_time_difference = comparison_df['time_difference'].mean()

median_time_2023 = times_2023.median()
median_time_2024 = times_2024.median()

# Collect everything as plain Python floats so the cell output is a simple dict.
comparison_stats = {
    'Mean Time 2023 (s)': float(mean_time_2023),
    'Mean Time 2024 (s)': float(mean_time_2024),
    'Mean Time Difference (s)': float(mean_time_difference),
    'Median Time 2023 (s)': float(median_time_2023),
    'Median Time 2024 (s)': float(median_time_2024),
}

comparison_stats


{'Mean Time 2023 (s)': 3.243075455725193,
 'Mean Time 2024 (s)': 3.9854286834597588,
 'Mean Time Difference (s)': 0.7423532277345657,
 'Median Time 2023 (s)': 1.528114557266235,
 'Median Time 2024 (s)': 3.9244431257247925}

## Eval Goal Gen

In [27]:
# Paths of the two goal-generation result files to compare.
# The 2024 path gets its own `file_` name (matching `file_new_5shot`) so that
# `df_new_2024` only ever refers to a dataframe, never to a path string.
file_new_5shot = "results/complete_eval_goal_gen_5shot.csv"
file_new_2024 = "outputs/eval_goal_gen20240920-160735.csv"

# Both files are decoded as ISO-8859-1 instead of the default UTF-8 to avoid a
# UnicodeDecodeError while reading.
# NOTE(review): ISO-8859-1 accepts any byte sequence, so non-ASCII text may be
# mis-decoded if the files are really in another encoding. The numeric metric columns
# used below should be unaffected, but confirm before relying on the text columns.
df_new_5shot = pd.read_csv(file_new_5shot, encoding='ISO-8859-1')
df_new_2024 = pd.read_csv(file_new_2024, encoding='ISO-8859-1')


In [23]:
# Compare the "unfiltered and ungrounded" variants of the three metrics
# (completeness, correctness, test success) between the 2024 file and the 5-shot file.
metric_names = ['completeness', 'correctness', 'test_success']  # short column names
unfiltered_columns = [
    'METRIC 1: subgoal completeness - unfiltered and ungrounded',
    'METRIC 2: subgoal correctness - unfiltered and ungrounded',
    'METRIC 3: test success - unfiltered and ungrounded',
]
short_names = dict(zip(unfiltered_columns, metric_names))

# For each file: select the metric columns (a new frame, the source is not modified),
# shorten their names, blank out the 'manual_check' placeholder, and coerce anything
# that is still non-numeric to NaN so that .mean() skips it.
df_new_2024_clean = (
    df_new_2024[unfiltered_columns]
    .rename(columns=short_names)
    .replace('manual_check', pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)

df_new_5shot_clean = (
    df_new_5shot[unfiltered_columns]
    .rename(columns=short_names)
    .replace('manual_check', pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)

# One row per metric, one column of means per dataset.
comparison_summary = pd.DataFrame({
    'Metric': ['Completeness', 'Correctness', 'Test Success'],
    '2024 (Mean)': [df_new_2024_clean[name].mean() for name in metric_names],
    '5-shot (Mean)': [df_new_5shot_clean[name].mean() for name in metric_names],
})

comparison_summary


Unnamed: 0,Metric,2024 (Mean),5-shot (Mean)
0,Completeness,0.529412,0.90115
1,Correctness,0.529412,0.857242
2,Test Success,1.0,0.892915


In [24]:
# Same comparison as the previous cell, but on the metric columns whose names do not
# carry the "- unfiltered and ungrounded" suffix.
# NOTE: this rebinds df_new_2024_clean, df_new_5shot_clean and comparison_summary.
plain_metric_columns = [
    'METRIC 1: subgoal completeness',
    'METRIC 2: subgoal correctness',
    'METRIC 3: test success',
]
plain_metric_names = ['completeness', 'correctness', 'test_success']
plain_short_names = dict(zip(plain_metric_columns, plain_metric_names))

# Select the columns of interest, rename them for simplicity, replace invalid
# 'manual_check' entries with a missing value and convert everything to numeric
# (anything unparseable becomes NaN).
df_new_2024_clean = (
    df_new_2024[plain_metric_columns]
    .rename(columns=plain_short_names)
    .replace('manual_check', pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)

df_new_5shot_clean = (
    df_new_5shot[plain_metric_columns]
    .rename(columns=plain_short_names)
    .replace('manual_check', pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)

# Compute the mean of each metric for both evaluations.
comparison_summary = pd.DataFrame({
    'Metric': ['Completeness', 'Correctness', 'Test Success'],
    '2024 (Mean)': [df_new_2024_clean[name].mean() for name in plain_metric_names],
    '5-shot (Mean)': [df_new_5shot_clean[name].mean() for name in plain_metric_names],
})

# Show the summary as plain text.
print(comparison_summary)

         Metric  2024 (Mean)  5-shot (Mean)
0  Completeness     0.470588       0.843540
1   Correctness     0.470588       0.937316
2  Test Success     0.470588       0.926952


## Eval Agent Adapt

In [48]:
# Load the two agent-adaptation result files compared below: `new` comes from outputs/,
# `old` from results/. Both are decoded as ISO-8859-1.
adapt_csv_encoding = 'ISO-8859-1'
new = pd.read_csv(
    'outputs/eval_agent_adapt20240928-203445_complete.csv',
    encoding=adapt_csv_encoding,
)
old = pd.read_csv(
    'results/complete_eval_agent_adapt.csv',
    encoding=adapt_csv_encoding,
)

In [61]:
# Only rows with a non-missing 'GT subgoals to disfavour' value take part in the averages.
new_has_gt_disfavour = new['GT subgoals to disfavour'].notna()

# Mean correctness, then mean completeness, of the disfavour subgoals for the `new` file.
print(new.loc[new_has_gt_disfavour, 'subgoals disfavour correctness'].mean())
print(new.loc[new_has_gt_disfavour, 'subgoals disfavour completeness'].mean())

0.8235294117647058
0.6666666666666665


In [62]:
# Only rows with a non-missing 'GT subgoals to disfavour' value take part in the averages.
old_has_gt_disfavour = old['GT subgoals to disfavour'].notna()

# Mean correctness, then mean completeness, of the disfavour subgoals for the `old` file.
print(old.loc[old_has_gt_disfavour, 'subgoals disfavour correctness'].mean())
print(old.loc[old_has_gt_disfavour, 'subgoals disfavour completeness'].mean())

0.8125
0.698125
