# Problem Set Analysis: A vs B Naming Convention

Problems ending with '_A': Expected answer = TRUE (positive)
Problems ending with '_B': Expected answer = FALSE (negative)

In [5]:
import json
import pandas as pd

# Load the model.json data (used below as: problem id -> {model name -> answer}).
# JSON text is UTF-8, so pin the encoding rather than relying on the
# platform's default locale encoding.
with open('../model.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Calculate per-problem accuracy (percentage of models that answered correctly).
# Lower accuracy means a harder problem.
def calculate_problem_difficulty(data):
    """Score every problem by the share of models that answered it correctly.

    Args:
        data: Mapping of problem id -> mapping of model name -> answer.
            Ids ending in '_A' expect True; every other id expects False.

    Returns:
        A list of dicts, one per problem in input order, with the keys
        'problem_id', 'accuracy' (a percentage from 0 to 100),
        'correct_models', 'total_models' and 'expected_answer'.
    """
    results = []

    for problem_id, problem_data in data.items():
        # '_A' marks the positive cases; anything else is treated as negative.
        expected_answer = problem_id.endswith('_A')

        total_models = len(problem_data)
        # Only the answers matter here, not which model gave them.
        correct_count = sum(
            1 for answer in problem_data.values() if answer == expected_answer
        )

        # A problem with no recorded answers scores 0 instead of dividing by zero.
        accuracy = (correct_count / total_models) * 100 if total_models > 0 else 0

        results.append({
            'problem_id': problem_id,
            'accuracy': accuracy,
            'correct_models': correct_count,
            'total_models': total_models,
            'expected_answer': expected_answer,
        })

    return results

problem_results = calculate_problem_difficulty(data)
df_problems = pd.DataFrame(problem_results)
# Highest accuracy (easiest) first, lowest accuracy (hardest) last.
df_problems = df_problems.sort_values('accuracy', ascending=False)

# Number of rows shown in each listing. The headers are built from this value
# so the printed title always matches the number of rows requested.
TOP_N = 50


def _print_problem_rows(frame):
    """Print one formatted summary line per problem row in *frame*."""
    for row in frame.itertuples(index=False):
        expected = 'TRUE' if row.expected_answer else 'FALSE'
        print(f"{row.problem_id:<15} {row.accuracy:.1f}% ({row.correct_models}/{row.total_models}) Expected: {expected}")


print(f"TOP {TOP_N} EASIEST PROBLEMS (highest accuracy):")
print("=" * 80)
_print_problem_rows(df_problems.head(TOP_N))

print(f"\n\nTOP {TOP_N} HARDEST PROBLEMS (lowest accuracy):")
print("=" * 80)
# The frame is in descending-accuracy order, so within this listing the
# lowest-accuracy problem is printed last.
_print_problem_rows(df_problems.tail(TOP_N))

TOP 20 EASIEST PROBLEMS (highest accuracy):
0128_A          100.0% (37/37) Expected: TRUE
0572_A          100.0% (37/37) Expected: TRUE
0971_A          100.0% (37/37) Expected: TRUE
0755_A          100.0% (37/37) Expected: TRUE
0094_A          100.0% (37/37) Expected: TRUE
0014_A          100.0% (37/37) Expected: TRUE
0351_B          100.0% (37/37) Expected: FALSE
0732_A          100.0% (37/37) Expected: TRUE
0990_A          100.0% (37/37) Expected: TRUE
0974_A          100.0% (37/37) Expected: TRUE
0184_A          100.0% (37/37) Expected: TRUE
0113_A          100.0% (37/37) Expected: TRUE
0753_A          100.0% (37/37) Expected: TRUE
0252_A          100.0% (37/37) Expected: TRUE
0906_A          100.0% (37/37) Expected: TRUE
0713_A          100.0% (37/37) Expected: TRUE
0023_A          100.0% (37/37) Expected: TRUE
0330_A          100.0% (37/37) Expected: TRUE
0509_A          100.0% (37/37) Expected: TRUE
0159_A          100.0% (37/37) Expected: TRUE
0974_B          100.0% (37/37) Expe