# Multi-Model Bongard Problem Difficulty Analysis

Analyzes all model results to find:
- **Top 100 worst performing problems** (lowest success rate across models)
- **Models that did well in top 100 worst performing problems**
- Overall model comparison

In [2]:
import os
import json
import pandas as pd
from collections import defaultdict
import glob

In [3]:
# Configuration - look in results directory
results_dir = "../results"
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

def analyze_all_models(results_dir):
    """Analyze all JSON files in results directory."""
    
    # Find all JSON files in results directory
    json_files = glob.glob(os.path.join(results_dir, "*.json"))
    print(f"Found {len(json_files)} JSON files in {results_dir}")
    
    if not json_files:
        print("No JSON files found!")
        return {}, {}
    
    # Track results per problem and per model
    problem_results = defaultdict(lambda: {'correct': 0, 'total': 0, 'models_correct': [], 'models_wrong': []})
    model_accuracy = {}
    
    for file_path in json_files:
        model_name = os.path.basename(file_path).replace('.json', '')
        
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            # Check if data is a list
            if not isinstance(data, list):
                continue
                
            correct = 0
            processed = 0
            
            for entry in data:
                # Check if entry is a dict and has required fields
                if not isinstance(entry, dict):
                    continue
                if 'uid' not in entry or 'answer' not in entry:
                    continue
                    
                uid = entry['uid']
                answer = entry['answer']
                
                # Determine expected answer based on UID suffix
                if uid.endswith('A'):
                    expected = 'positive'
                elif uid.endswith('B'):
                    expected = 'negative'
                else:
                    continue
                
                processed += 1
                
                # Track per problem
                problem_results[uid]['total'] += 1
                if answer == expected:
                    correct += 1
                    problem_results[uid]['correct'] += 1
                    problem_results[uid]['models_correct'].append(model_name)
                else:
                    problem_results[uid]['models_wrong'].append(model_name)
            
            if processed > 0:
                # Track per model
                accuracy = (correct / processed) * 100
                model_accuracy[model_name] = {
                    'correct': correct,
                    'total': processed,
                    'accuracy': accuracy
                }
                
        except Exception as e:
            pass  # Silently skip errors
    
    return problem_results, model_accuracy

def find_worst_problems(problem_results, top_n=100):
    """Find the worst performing problems."""
    
    if not problem_results:
        return pd.DataFrame()
    
    problem_stats = []
    for uid, stats in problem_results.items():
        success_rate = (stats['correct'] / stats['total']) * 100 if stats['total'] > 0 else 0
        problem_stats.append({
            'UID': uid,
            'Success_Rate': success_rate,
            'Correct': stats['correct'],
            'Total': stats['total'],
            'Models_Correct': ', '.join(stats['models_correct']) if stats['models_correct'] else 'None'
        })
    
    # Sort by success rate (worst first)
    problem_stats.sort(key=lambda x: x['Success_Rate'])
    
    return pd.DataFrame(problem_stats[:top_n])


In [4]:
# Run the analysis
problem_results, model_accuracy = analyze_all_models(results_dir)

with open('models.json', 'w') as f:
    json.dump({
        'problem_results': dict(problem_results), 
        'model_accuracy': model_accuracy
    }, f, indent=2)

if model_accuracy:
    model_df = pd.DataFrame.from_dict(model_accuracy, orient='index').sort_values('accuracy')

    # Find worst performing problems
    worst_problems = find_worst_problems(problem_results, top_n=100)

    if not worst_problems.empty:
        # Add SCMR column
        worst_problems['SDR_GPT41_Correct'] = worst_problems['Models_Correct'].apply(
            lambda x: 'Yes' if 'sdr_gpt41_gpt41' in str(x) else 'No'
        )
        
        print(f"\nTOP 100 WORST PERFORMING PROBLEMS:")
        print("-" * 80)
        
        # Show all rows without truncation
        with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
            display(worst_problems)
        
    else:
        print("No problems to analyze!")
else:
    print("No valid model results found!")


Found 40 JSON files in ../results

TOP 100 WORST PERFORMING PROBLEMS:
--------------------------------------------------------------------------------


Unnamed: 0,UID,Success_Rate,Correct,Total,Models_Correct,SDR_GPT41_Correct
0,0739_A,0.0,0,40,,No
1,0713_B,0.0,0,40,,No
2,0159_B,0.0,0,40,,No
3,0789_B,0.0,0,40,,No
4,0640_B,2.5,1,40,sdr_gpt5_gptoss,No
5,0753_B,2.5,1,40,sdr_gpt41_llama4,No
6,0025_A,2.5,1,40,sdr_gpt41_llama4,No
7,0386_B,2.5,1,40,cmr_llama4_llama4,No
8,0829_A,2.5,1,40,cvr_gemma3,No
9,0025_B,2.5,1,40,cmr_gpt41_gptoss,No


In [5]:
# %%
# Which models perform best on the worst 100 problems
if not worst_problems.empty and problem_results:
    worst_100_uids = set(worst_problems['UID'].tolist())
    
    # Track each model's performance on worst problems
    model_performance_on_worst = defaultdict(lambda: {'correct': 0, 'total': 0})
    
    for uid in worst_100_uids:
        if uid in problem_results:
            stats = problem_results[uid]
            
            for model in stats['models_correct']:
                model_performance_on_worst[model]['correct'] += 1
                model_performance_on_worst[model]['total'] += 1
            
            for model in stats['models_wrong']:
                model_performance_on_worst[model]['total'] += 1
    
    # Create results
    results = []
    for model, stats in model_performance_on_worst.items():
        if stats['total'] > 0:
            success_rate = (stats['correct'] / stats['total']) * 100
            results.append({
                'Model': model,
                'Correct': stats['correct'],
                'Total': stats['total'],
                'Success_Rate': round(success_rate, 2)
            })
    
    # Sort by success rate and display
    results.sort(key=lambda x: x['Success_Rate'], reverse=True)
    performance_df = pd.DataFrame(results)
    
    print("MODEL PERFORMANCE ON WORST 100 PROBLEMS:")
    display(performance_df)
else:
    print("No data available for analysis")


MODEL PERFORMANCE ON WORST 100 PROBLEMS:


Unnamed: 0,Model,Correct,Total,Success_Rate
0,sdr_gpt5_gptoss,44,100,44.0
1,sdr_gpt5_gpt5,40,100,40.0
2,sdr_gpt41_gptoss,37,100,37.0
3,sdr_gpt41_deepseekr1,37,100,37.0
4,cvr_gpt5,37,100,37.0
5,cmr_llama4_qwen3,34,100,34.0
6,cmr_gpt5_gpt5,32,100,32.0
7,sdr_gpt41_qwen3,31,100,31.0
8,sdr_gpt41_llama4,31,100,31.0
9,cvr_gemini25,31,100,31.0


In [6]:
# # %%
# # Find problems that sdr_gpt41_gpt41 got wrong

# if problem_results:
#     sdr_wrong_problems = []
#     ll = []
#     for uid, stats in problem_results.items():
#         if 'sdr_gpt41_gpt41' in stats['models_wrong']:
#             cvr_correct = 'Yes' if 'cvr_gpt41' in stats['models_correct'] else 'No'
#             scmr2_correct = 'Yes' if 'scmr2_gpt41_gpt41' in stats['models_correct'] else 'No'
#             if cvr_correct == 'Yes' and scmr2_correct == 'Yes':
#                 ll.append(uid)
#             sdr_wrong_problems.append({
#                 'UID': uid,
#                 'CVR_GPT41_Correct': cvr_correct,
#                 'SCMR2_GPT41_Correct': scmr2_correct,

#             })
#     print(ll.sort())
#     print(len(ll))
    
#     if sdr_wrong_problems:
#         sdr_wrong_df = pd.DataFrame(sdr_wrong_problems).sort_values('UID').reset_index(drop=True)
        
#         print(f"PROBLEMS THAT SDR_GPT41_GPT41 GOT WRONG ({len(sdr_wrong_problems)} total):")
#         print("-" * 60)
        
#         # Define styling function
#         def highlight_both_correct(row):
#             if row['CVR_GPT41_Correct'] == 'Yes' and row['SCMR2_GPT41_Correct'] == 'Yes':
#                 return ['background-color: purple'] * len(row)
#             else:
#                 return [''] * len(row)
        
#         with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#             display(sdr_wrong_df.style.apply(highlight_both_correct, axis=1))
        
#     else:
#         print("SDR_GPT41_GPT41 got all problems correct!")
# else:
#     print("No valid problem results found!")

In [7]:
# Find problems that CVR GPT-5 got right but CMR GPT-5 and SDR GPT-5 both got wrong
if problem_results:
    cvr_gpt5_better_problems = []
    
    for uid, stats in problem_results.items():
        # Check if CVR GPT-5 got it right
        cvr_gpt5_correct = 'cvr_gpt5' in stats['models_correct']
        
        # Check if CMR GPT-5 and SDR GPT-5 both got it wrong
        cmr_gpt5_wrong = 'cmr_gpt5_gpt5' in stats['models_wrong']
        sdr_gpt5_wrong = 'sdr_gpt5_gpt5' in stats['models_wrong']
        
        if cvr_gpt5_correct and cmr_gpt5_wrong and sdr_gpt5_wrong:
            # Check CVR GPT-4.1 performance for additional context
            cvr_gpt41_correct = 'Yes' if 'cvr_gpt41' in stats['models_correct'] else 'No'
            
            cvr_gpt5_better_problems.append({
                'UID': uid,
                'CVR_GPT5_Correct': 'Yes',
                'CMR_GPT5_Correct': 'No',
                'SDR_GPT5_Correct': 'No',
                'CVR_GPT41_Correct': cvr_gpt41_correct,
                'Success_Rate': round((stats['correct'] / stats['total']) * 100, 1) if stats['total'] > 0 else 0
            })
    
    if cvr_gpt5_better_problems:
        cvr_better_df = pd.DataFrame(cvr_gpt5_better_problems).sort_values('Success_Rate').reset_index(drop=True)
        
        print(f"PROBLEMS THAT CVR GPT-5 GOT RIGHT BUT CMR & SDR GPT-5 BOTH GOT WRONG ({len(cvr_gpt5_better_problems)} total):")
        print("-" * 85)
        
        # Show summary statistics
        cvr_gpt41_also_correct = sum(1 for p in cvr_gpt5_better_problems if p['CVR_GPT41_Correct'] == 'Yes')
        
        print(f"Of the {len(cvr_gpt5_better_problems)} problems where CVR GPT-5 outperformed both CMR & SDR GPT-5:")
        print(f"- CVR GPT-4.1 also got {cvr_gpt41_also_correct} correct ({cvr_gpt41_also_correct/len(cvr_gpt5_better_problems)*100:.1f}%)")
        print(f"- Average success rate across all models: {sum(p['Success_Rate'] for p in cvr_gpt5_better_problems)/len(cvr_gpt5_better_problems):.1f}%")
        print()
        
        with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
            display(cvr_better_df)
        
    else:
        print("No problems found where CVR GPT-5 outperformed both CMR GPT-5 and SDR GPT-5!")
else:
    print("No valid problem results found!")

PROBLEMS THAT CVR GPT-5 GOT RIGHT BUT CMR & SDR GPT-5 BOTH GOT WRONG (19 total):
-------------------------------------------------------------------------------------
Of the 19 problems where CVR GPT-5 outperformed both CMR & SDR GPT-5:
- CVR GPT-4.1 also got 10 correct (52.6%)
- Average success rate across all models: 41.7%



Unnamed: 0,UID,CVR_GPT5_Correct,CMR_GPT5_Correct,SDR_GPT5_Correct,CVR_GPT41_Correct,Success_Rate
0,0310_B,Yes,No,No,No,7.5
1,0739_B,Yes,No,No,No,10.0
2,0440_B,Yes,No,No,No,12.5
3,0091_A,Yes,No,No,No,20.0
4,0748_B,Yes,No,No,Yes,20.0
5,0228_B,Yes,No,No,Yes,22.5
6,0766_A,Yes,No,No,No,30.0
7,0402_A,Yes,No,No,Yes,32.5
8,0376_B,Yes,No,No,Yes,32.5
9,0165_A,Yes,No,No,Yes,47.5
