In [1]:
import json
import glob
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict, Counter
import os

# Root directory of the human-study run artifacts.
# Set the HUMAN_STUDY_DIR environment variable to point the notebook at a
# different checkout; the fallback is the original author's local path.
base_path = os.environ.get(
    "HUMAN_STUDY_DIR",
    "/Users/piushorn/PycharmProjects/pdf-to-text-benchmark/artifacts/runs/human_study",
)

# Find all eval_formula_results.json files (one per "<folder>/mistral" directory).
# glob returns matches in arbitrary order, so sort to make the order in which
# scores are appended below reproducible across machines and runs.
json_files = sorted(glob.glob(f"{base_path}/*/mistral/eval_formula_results.json"))
print(f"Found {len(json_files)} JSON files to process")

# Maps judge model name -> flat list of every score that judge assigned,
# accumulated over all formulas in all files.
llm_scores = defaultdict(list)

# Process each JSON file
for json_file in json_files:
    # Explicit encoding so the result does not depend on the platform default.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each top-level entry describes one formula and carries a list of
    # per-judge evaluations under 'llm_evals'.
    for formula_eval in data:
        for llm_eval in formula_eval['llm_evals']:
            llm_scores[llm_eval['judge_model']].append(llm_eval['score'])
    


Found 20 JSON files to process


In [3]:
# Analysis: Formulas with perfect scores from both CDM (1.0) and GPT-5 (10.0)
#
# Re-reads every file listed in `json_files` (built by the loading cell) and,
# for each formula, compares the CDM score with the score given by the GPT-5
# judge. Prints how often each is perfect and how often both are.

# Score values treated as "perfect", and the judge whose scores are compared.
CDM_PERFECT_SCORE = 1.0
GPT5_PERFECT_SCORE = 10.0
GPT5_JUDGE_MODEL = 'gpt-5'


def _format_share(part, whole):
    """Return part/whole as a percentage string such as '62.8%'.

    Returns 'n/a' when `whole` is 0 so that an empty result set (no files
    found, or no perfect scores at all) does not raise ZeroDivisionError.
    """
    if not whole:
        return "n/a"
    return f"{part/whole*100:.1f}%"


print("Analysis: Formulas with Perfect Scores from Both CDM and GPT-5")
print("="*60)

# Initialize counters and storage
perfect_both_count = 0
cdm_perfect_count = 0
gpt5_perfect_count = 0
total_formulas = 0
perfect_both_formulas = []

# Process each JSON file again to collect CDM and GPT-5 scores per formula
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Paths look like <base>/<folder>/mistral/eval_formula_results.json; take
    # <folder> (000, 001, etc.) without depending on the path separator.
    # Computed once per file rather than once per formula.
    source_folder = os.path.basename(os.path.dirname(os.path.dirname(json_file)))

    for formula_eval in data:
        total_formulas += 1
        formula_number = formula_eval['formula_number']

        # Get CDM score; `or {}` also covers a 'cdm_eval' key that is present
        # but null, which would otherwise raise AttributeError on .get().
        cdm_score = (formula_eval.get('cdm_eval') or {}).get('score')

        # Get GPT-5 score: first evaluation by that judge, or None if absent.
        gpt5_score = next(
            (
                llm_eval['score']
                for llm_eval in formula_eval['llm_evals']
                if llm_eval['judge_model'] == GPT5_JUDGE_MODEL
            ),
            None,
        )

        cdm_is_perfect = cdm_score == CDM_PERFECT_SCORE
        gpt5_is_perfect = gpt5_score == GPT5_PERFECT_SCORE

        # Count perfect scores
        if cdm_is_perfect:
            cdm_perfect_count += 1

        if gpt5_is_perfect:
            gpt5_perfect_count += 1

        # Check if both have perfect scores
        if cdm_is_perfect and gpt5_is_perfect:
            perfect_both_count += 1
            perfect_both_formulas.append({
                'file': source_folder,
                'formula_number': formula_number,
                'ground_truth': formula_eval['ground_truth_formula'],
                'extracted': formula_eval['extracted_formula'],
                'formula_type': formula_eval['formula_type']
            })

print(f"Total formulas analyzed: {total_formulas}")
print(f"CDM perfect scores (1.0): {cdm_perfect_count} ({_format_share(cdm_perfect_count, total_formulas)})")
print(f"GPT-5 perfect scores (10.0): {gpt5_perfect_count} ({_format_share(gpt5_perfect_count, total_formulas)})")
print(f"Both CDM=1.0 AND GPT-5=10.0: {perfect_both_count} ({_format_share(perfect_both_count, total_formulas)})")

# Conditional shares: the denominators here are the perfect-score counts,
# which can be zero even when formulas were found.
print("\nOverlap Analysis:")
print(f"Of CDM perfect scores, {perfect_both_count}/{cdm_perfect_count} ({_format_share(perfect_both_count, cdm_perfect_count)}) also got GPT-5 perfect")
print(f"Of GPT-5 perfect scores, {perfect_both_count}/{gpt5_perfect_count} ({_format_share(perfect_both_count, gpt5_perfect_count)}) also got CDM perfect")


Analysis: Formulas with Perfect Scores from Both CDM and GPT-5
Total formulas analyzed: 390
CDM perfect scores (1.0): 245 (62.8%)
GPT-5 perfect scores (10.0): 251 (64.4%)
Both CDM=1.0 AND GPT-5=10.0: 217 (55.6%)

Overlap Analysis:
Of CDM perfect scores, 217/245 (88.6%) also got GPT-5 perfect
Of GPT-5 perfect scores, 217/251 (86.5%) also got CDM perfect
