In [16]:
import os
import json
import glob
from collections import defaultdict

# Configuration
# Directory that main() scans for per-model "*.json" result files. It is a
# relative path, so it is resolved against the current working directory.
results_dir = "../results"

# Model-name prefixes ("<family>_...") that get their own accuracy figures.
_MODEL_FAMILIES = ('cvr', 'cmr', 'sdr')


def _expected_answer(uid):
    """Return the answer implied by the UID suffix, or None if it has none.

    UIDs ending in 'A' expect 'positive', UIDs ending in 'B' expect
    'negative'. Non-string UIDs and any other suffix yield None.
    """
    if not isinstance(uid, str):
        return None
    if uid.endswith('A'):
        return 'positive'
    if uid.endswith('B'):
        return 'negative'
    return None


def _tally_by_family(models):
    """Return {family: [correct, total]} for a model-name -> result mapping.

    Models whose name does not start with a known "<family>_" prefix are
    not counted in any family.
    """
    tally = {family: [0, 0] for family in _MODEL_FAMILIES}
    for model, result in models.items():
        for family in _MODEL_FAMILIES:
            if model.startswith(family + '_'):
                tally[family][1] += 1
                if result == 'correct':
                    tally[family][0] += 1
                break
    return tally


def _accuracy_pct(correct, total):
    """Return correct/total as a percentage rounded to 0.1, or 0 if total is 0."""
    return round(correct / total * 100, 1) if total > 0 else 0


def _uid_sort_key(uid):
    """Sort key: numeric part first, then the A/B suffix.

    UIDs are expected to look like "<number><separator><A|B>". UIDs whose
    prefix is not an integer are placed after all numeric ones (ordered by
    their text) instead of aborting the whole run with a ValueError.
    """
    try:
        return (0, int(uid[:-2]), uid[-1], '')
    except ValueError:
        return (1, 0, '', uid)


def main(input_dir=None, output_path='problems.json'):
    """Aggregate per-model result files into a per-problem summary.

    Every "*.json" file in the input directory is treated as one model's
    results (the file name without extension is the model name). Each file
    must contain a list of entries with at least 'uid' and 'answer'; the
    expected answer is derived from the UID suffix ('A' -> 'positive',
    'B' -> 'negative'). The summary is written as JSON to ``output_path``
    and per-family accuracies are printed.

    Args:
        input_dir: Directory holding the result files. Defaults to the
            module-level ``results_dir``.
        output_path: Where to write the summary JSON.

    Unreadable or malformed files are skipped with a message on stderr;
    malformed entries inside a readable file are skipped individually.
    """
    import sys  # local import: only needed for the skip warnings

    source_dir = results_dir if input_dir is None else input_dir

    # 'Unknown' stays in place until some file actually supplies the field.
    problems = defaultdict(
        lambda: {'concept': 'Unknown', 'caption': 'Unknown', 'models': {}}
    )

    # Sorted so that the output (and which file wins on conflicting
    # concept/caption values) does not depend on filesystem listing order.
    for file_path in sorted(glob.glob(os.path.join(source_dir, "*.json"))):
        # splitext removes only the extension; str.replace would also strip
        # any ".json" occurring in the middle of the name.
        model_name = os.path.splitext(os.path.basename(file_path))[0]

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {file_path}: {err}", file=sys.stderr)
            continue

        if not isinstance(data, list):
            print(f"Skipping {file_path}: expected a JSON list of entries",
                  file=sys.stderr)
            continue

        for entry in data:
            if not isinstance(entry, dict) or 'uid' not in entry or 'answer' not in entry:
                continue

            uid = entry['uid']
            expected = _expected_answer(uid)
            if expected is None:
                continue

            record = problems[uid]
            # Only overwrite metadata the entry really provides, so a file
            # without these fields cannot clobber known values.
            for field in ('concept', 'caption'):
                if field in entry:
                    record[field] = entry[field]

            record['models'][model_name] = (
                "correct" if entry['answer'] == expected else "incorrect"
            )

    # Add totals and per-family accuracy for each problem
    final_data = {}
    for uid, data in problems.items():
        models = data['models']
        tally = _tally_by_family(models)

        summary = {
            'concept': data['concept'],
            'caption': data['caption'],
            'total': len(models),
            'correct': sum(1 for result in models.values() if result == "correct"),
        }
        for family in _MODEL_FAMILIES:
            summary[f'{family}_accuracy'] = _accuracy_pct(*tally[family])
        summary['models'] = models
        final_data[uid] = summary

    # Sort by UID (number first, then A/B suffix)
    sorted_data = dict(
        sorted(final_data.items(), key=lambda item: _uid_sort_key(item[0]))
    )

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(sorted_data, f, indent=2)

    # Accumulate per-family counts over all problems
    overall = {family: [0, 0] for family in _MODEL_FAMILIES}
    for data in sorted_data.values():
        for family, (correct, total) in _tally_by_family(data['models']).items():
            overall[family][0] += correct
            overall[family][1] += total

    print(f"Processed {len(sorted_data)} problems -> {output_path}")
    for family in _MODEL_FAMILIES:
        label = family.upper()
        correct, total = overall[family]
        if total > 0:
            print(f"{label} Accuracy: {correct}/{total} ({correct/total*100:.1f}%)")
        else:
            print(f"{label} Accuracy: No {label} models found")

# Run the aggregation only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

Processed 400 problems -> problems.json
CVR Accuracy: 2023/2800 (72.2%)
CMR Accuracy: 6806/10000 (68.1%)
SDR Accuracy: 2118/3200 (66.2%)
