In [2]:
import pandas as pd
from pathlib import Path

# Folder path (adjust if needed)
data_dir = Path("Result")  # or directly Path("/mnt/data")

# List of CSV files to merge (modify if file names differ)
files_to_merge = [
    "GPT_41.csv",
    "00_GPT_41_finetune_1.csv",
    "00_GPT_41_finetune_2.csv",
    "00_GPT_41_finetune_3.csv",
    "00_GPT_41_finetune_4.csv",
    "00_GPT_41_finetune_5.csv",
    "GPT_5_nano_nr.csv",
    "GPT_5_nano_r.csv",
    "GPT_5_nr.csv",
    "GPT_5_r.csv",
]

# Load and merge
merged_df = None

for file in files_to_merge:
    file_path = data_dir / file
    if file_path.exists():
        print(f"üîó Loading: {file}")
        temp_df = pd.read_csv(file_path)

        # Ensure consistent row alignment using index
        if merged_df is None:
            merged_df = temp_df.copy()
        else:
            merged_df = pd.concat([merged_df, temp_df], axis=1)
    else:
        print(f"‚ö†Ô∏è Skipped (not found): {file}")

# Save merged file
output_path = data_dir / "01_Result_All.csv"
merged_df.to_csv(output_path, index=False)
print(f"\n‚úÖ Merged file saved to: {output_path}")


üîó Loading: GPT_41.csv
üîó Loading: 00_GPT_41_finetune_1.csv
üîó Loading: 00_GPT_41_finetune_2.csv
üîó Loading: 00_GPT_41_finetune_3.csv
üîó Loading: 00_GPT_41_finetune_4.csv
üîó Loading: 00_GPT_41_finetune_5.csv
üîó Loading: GPT_5_nano_nr.csv
üîó Loading: GPT_5_nano_r.csv
üîó Loading: GPT_5_nr.csv
üîó Loading: GPT_5_r.csv

‚úÖ Merged file saved to: Result/01_Result_All.csv


In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the dataset
file_path = 'Result/01_Result_All.csv'
df = pd.read_csv(file_path)

# Define the ground truth columns
ground_truth_cols = ['Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8']

# Initialize a dataframe to store the results
complete_results_df = pd.DataFrame(columns=['Theme', 'Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

# Model groupings with their corresponding columns
complete_model_mapping = {
    'GPT_41': [f'GPT_41_{i}' for i in range(1, 9)],
    'GPT_41_finetune_1': [f'GPT_41_finetune_1_{i}' for i in range(1, 9)],
    'GPT_41_finetune_2': [f'GPT_41_finetune_2_{i}' for i in range(1, 9)],
    'GPT_41_finetune_3': [f'GPT_41_finetune_3_{i}' for i in range(1, 9)],
    'GPT_41_finetune_4': [f'GPT_41_finetune_4_{i}' for i in range(1, 9)],
    'GPT_41_finetune_5': [f'GPT_41_finetune_5_{i}' for i in range(1, 9)],
    'GPT_5_r': [f'GPT_5_r_{i}' for i in range(1, 9)],
    'GPT_5_nr': [f'GPT_5_nr_{i}' for i in range(1, 9)],
    'GPT_5_nano_r': [f'GPT_5_nano_r_{i}' for i in range(1, 9)],
    'GPT_5_nano_nr': [f'GPT_5_nano_nr_{i}' for i in range(1, 9)]

}

# Evaluate performance for each model and theme
for model_name, columns in complete_model_mapping.items():
    for idx, theme in enumerate(ground_truth_cols):
        if idx < len(columns):
            accuracy = accuracy_score(df[theme], df[columns[idx]])
            f1 = f1_score(df[theme], df[columns[idx]], average='macro')
            precision = precision_score(df[theme], df[columns[idx]], average='macro')
            recall = recall_score(df[theme], df[columns[idx]], average='macro')

            # Append the results to the dataframe
            complete_results_df = pd.concat([complete_results_df, pd.DataFrame({
                'Theme': [theme],
                'Model': [model_name],
                'Accuracy': [accuracy],
                'F1 Score': [f1],
                'Precision': [precision],
                'Recall': [recall]
            })], ignore_index=True)



  complete_results_df = pd.concat([complete_results_df, pd.DataFrame({
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
complete_results_df

Unnamed: 0,Theme,Model,Accuracy,F1 Score,Precision,Recall
0,Q3_1,GPT_41,0.901257,0.664182,0.677278,0.653508
1,Q3_2,GPT_41,0.892280,0.773945,0.768180,0.780138
2,Q3_3,GPT_41,0.971275,0.810705,0.756429,0.899728
3,Q3_4,GPT_41,0.815081,0.716917,0.695403,0.762821
4,Q3_5,GPT_41,0.910233,0.608452,0.579608,0.741509
...,...,...,...,...,...,...
75,Q3_4,GPT_5_nano_nr,0.834829,0.503565,0.670475,0.521798
76,Q3_5,GPT_5_nano_nr,0.973070,0.551987,0.986511,0.531250
77,Q3_6,GPT_5_nano_nr,0.608618,0.608162,0.673024,0.665354
78,Q3_7,GPT_5_nano_nr,0.619390,0.541969,0.576835,0.553301


In [5]:
complete_results_df.to_csv("Result/02_performance.csv", index=False)