# Multi-Model Bongard Problem Difficulty Analysis

Analyzes all model results to find:
- **Top 100 worst performing problems** (lowest success rate across models)
- **Models that did well on the top 100 worst-performing problems**
- Overall model comparison

In [46]:
import os
import json
import pandas as pd
from collections import defaultdict
import glob

In [47]:
# Notebook configuration.
# Directory scanned for result files; every *.json file directly inside it is
# treated as one model's results by analyze_all_models().
results_dir = "results/"

# Lift pandas' column-count and line-width limits so wide result tables are
# rendered in full instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.width = None

def analyze_all_models(results_dir):
    """Score every model result file found in ``results_dir``.

    Each ``*.json`` file directly inside ``results_dir`` is treated as one
    model's results; the file name without its extension is the model name.
    A file must contain a JSON list of dicts, each carrying a ``uid`` and an
    ``answer`` field. The expected answer is derived from the UID suffix:
    ``'positive'`` for UIDs ending in ``A`` and ``'negative'`` for UIDs ending
    in ``B``. Entries that are not dicts, lack either field, have a non-string
    UID, or have any other suffix are skipped.

    Files that cannot be read or parsed, or whose top-level value is not a
    list, are skipped with a printed notice (best effort: one bad file never
    aborts the whole analysis).

    Args:
        results_dir: Directory to scan for ``*.json`` result files.

    Returns:
        A ``(problem_results, model_accuracy)`` tuple.

        ``problem_results`` maps each UID to a dict with ``'correct'`` and
        ``'total'`` counts plus the ``'models_correct'`` and
        ``'models_wrong'`` lists of model names.

        ``model_accuracy`` maps each model name to a dict with ``'correct'``,
        ``'total'`` and ``'accuracy'`` (a percentage). Models without a single
        scorable entry are omitted.

        Both are empty dicts when no JSON file is found.
    """

    # Sorted so the order of the per-problem model lists does not depend on
    # the arbitrary order in which glob returns directory entries.
    json_files = sorted(glob.glob(os.path.join(results_dir, "*.json")))
    print(f"Found {len(json_files)} JSON files in {results_dir}")

    if not json_files:
        print("No JSON files found!")
        return {}, {}

    # Track results per problem and per model
    problem_results = defaultdict(lambda: {'correct': 0, 'total': 0, 'models_correct': [], 'models_wrong': []})
    model_accuracy = {}

    for file_path in json_files:
        # Strip only the trailing extension; str.replace would also remove a
        # '.json' that happens to appear in the middle of the file name.
        model_name = os.path.splitext(os.path.basename(file_path))[0]

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # Nothing has been counted for this file yet, so skipping it here
            # leaves both result structures consistent.
            print(f"Skipping {file_path}: {e}")
            continue

        # Check if data is a list
        if not isinstance(data, list):
            print(f"Skipping {file_path}: expected a JSON list, got {type(data).__name__}")
            continue

        correct = 0
        processed = 0

        for entry in data:
            # Check if entry is a dict and has required fields
            if not isinstance(entry, dict):
                continue
            if 'uid' not in entry or 'answer' not in entry:
                continue

            uid = entry['uid']
            answer = entry['answer']

            # A non-string UID has no suffix to inspect; skip just this entry
            # instead of failing halfway through the file, which would leave
            # the model partially counted in problem_results.
            if not isinstance(uid, str):
                continue

            # Determine expected answer based on UID suffix
            if uid.endswith('A'):
                expected = 'positive'
            elif uid.endswith('B'):
                expected = 'negative'
            else:
                continue

            processed += 1

            # Track per problem
            problem_results[uid]['total'] += 1
            if answer == expected:
                correct += 1
                problem_results[uid]['correct'] += 1
                problem_results[uid]['models_correct'].append(model_name)
            else:
                problem_results[uid]['models_wrong'].append(model_name)

        if processed > 0:
            # Track per model
            accuracy = (correct / processed) * 100
            model_accuracy[model_name] = {
                'correct': correct,
                'total': processed,
                'accuracy': accuracy
            }

    return problem_results, model_accuracy

def find_worst_problems(problem_results, top_n=100):
    """Rank problems from lowest to highest success rate and keep the first ``top_n``.

    Args:
        problem_results: Mapping of UID to a dict with ``'correct'``,
            ``'total'`` and ``'models_correct'`` entries.
        top_n: Number of rows to keep from the top of the ranking.

    Returns:
        A DataFrame with the columns ``UID``, ``Success_Rate`` (percentage),
        ``Correct``, ``Total`` and ``Models_Correct`` (comma-separated model
        names, or the string ``'None'`` when no model was correct). An empty
        DataFrame is returned when ``problem_results`` is empty.
    """
    if not problem_results:
        return pd.DataFrame()

    def _summarise(uid, stats):
        # One table row per problem; a problem with no attempts scores 0.
        attempts = stats['total']
        if attempts > 0:
            rate = (stats['correct'] / attempts) * 100
        else:
            rate = 0
        solvers = stats['models_correct']
        return {
            'UID': uid,
            'Success_Rate': rate,
            'Correct': stats['correct'],
            'Total': attempts,
            'Models_Correct': ', '.join(solvers) if solvers else 'None',
        }

    # sorted() is stable, so problems with equal rates keep their input order.
    ranked = sorted(
        (_summarise(uid, stats) for uid, stats in problem_results.items()),
        key=lambda row: row['Success_Rate'],
    )

    return pd.DataFrame(ranked[:top_n])


In [48]:
# Run the analysis
problem_results, model_accuracy = analyze_all_models(results_dir)

# Bind worst_problems on every path: the next cell reads worst_problems.empty
# and would raise NameError if no valid model results were found.
worst_problems = pd.DataFrame()

if model_accuracy:
    print("MODEL ACCURACY SUMMARY:")
    print("-" * 50)
    # Lowest-accuracy models first.
    model_df = pd.DataFrame.from_dict(model_accuracy, orient='index').sort_values('accuracy')
    display(model_df)

    # Find worst performing problems
    worst_problems = find_worst_problems(problem_results, top_n=100)

    if not worst_problems.empty:
        # Flag the problems that the sdr_gpt41_gpt41 model answered correctly.
        # Exact membership in the per-problem model list is used rather than a
        # substring test on the joined Models_Correct text, which would also
        # match any other model whose name merely contains 'sdr_gpt41_gpt41'.
        worst_problems['SDR_GPT41_Correct'] = worst_problems['UID'].map(
            lambda uid: 'Yes' if 'sdr_gpt41_gpt41' in problem_results[uid]['models_correct'] else 'No'
        )

        print("\nTOP 100 WORST PERFORMING PROBLEMS:")
        print("-" * 80)

        # Show all rows without truncation
        with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
            display(worst_problems)

    else:
        print("No problems to analyze!")
else:
    print("No valid model results found!")


Found 31 JSON files in results/
MODEL ACCURACY SUMMARY:
--------------------------------------------------


Unnamed: 0,correct,total,accuracy
cmr_llava16_gemma3,241,400,60.25
scmr_gemma3_gemma3,244,400,61.0
cmr_llama4_gemma3,250,400,62.5
cmr_llava16_llama4,253,400,63.25
cmr_llava16_qwen3,253,400,63.25
cvr_mistral3,254,400,63.5
cmr_llava16_deepseekr1,255,400,63.75
cmr_qwen25_gemma3,258,400,64.5
cmr_gemma3_gemma3,258,400,64.5
cmr_llama4_llama4,264,400,66.0



TOP 100 WORST PERFORMING PROBLEMS:
--------------------------------------------------------------------------------


Unnamed: 0,UID,Success_Rate,Correct,Total,Models_Correct,SDR_GPT41_Correct
0,0399_A,0.0,0,31,,No
1,0159_B,0.0,0,31,,No
2,0640_B,0.0,0,31,,No
3,0713_B,0.0,0,31,,No
4,0753_B,0.0,0,31,,No
5,0094_B,0.0,0,31,,No
6,0739_B,0.0,0,31,,No
7,0025_A,0.0,0,31,,No
8,0739_A,0.0,0,31,,No
9,0789_B,3.225806,1,31,sdr_gpt41_gpt41,Yes


In [49]:
# Which models perform best on the worst 100 problems
if not worst_problems.empty and problem_results:
    # Walk the UIDs in their ranked table order (duplicates dropped). Iterating
    # a set of strings instead would make the order in which models are first
    # seen depend on per-process hash randomization.
    worst_uid_order = list(dict.fromkeys(worst_problems['UID'].tolist()))
    worst_100_uids = set(worst_uid_order)

    # Track each model's performance on worst problems
    model_performance_on_worst = defaultdict(lambda: {'correct': 0, 'total': 0})

    for uid in worst_uid_order:
        # Membership test first so the lookup never inserts a new default
        # entry into problem_results.
        if uid in problem_results:
            stats = problem_results[uid]

            for model in stats['models_correct']:
                model_performance_on_worst[model]['correct'] += 1
                model_performance_on_worst[model]['total'] += 1

            for model in stats['models_wrong']:
                model_performance_on_worst[model]['total'] += 1

    # Create results
    results = []
    for model, stats in model_performance_on_worst.items():
        if stats['total'] > 0:
            success_rate = (stats['correct'] / stats['total']) * 100
            results.append({
                'Model': model,
                'Correct': stats['correct'],
                'Total': stats['total'],
                'Success_Rate': round(success_rate, 2)
            })

    # Best success rate first; ties are broken by model name so the table is
    # identical on every run.
    results.sort(key=lambda x: (-x['Success_Rate'], x['Model']))
    performance_df = pd.DataFrame(results)

    print("MODEL PERFORMANCE ON WORST 100 PROBLEMS:")
    display(performance_df)
else:
    print("No data available for analysis")


MODEL PERFORMANCE ON WORST 100 PROBLEMS:


Unnamed: 0,Model,Correct,Total,Success_Rate
0,sdr_gpt41_gpt41,40,100,40.0
1,sdr_gpt41_qwen3,38,100,38.0
2,cmr_llama4_qwen3,34,100,34.0
3,cvr_gpt41,30,100,30.0
4,scmr_gpt41_qwen3,29,100,29.0
5,cmr_gpt41_qwen3,27,100,27.0
6,cmr_llama4_deepseekr1,24,100,24.0
7,cmr_llava16_qwen3,24,100,24.0
8,cmr_llama4_llama4,22,100,22.0
9,cmr_gemma3_deepseekr1,22,100,22.0
