In [5]:
import os
import json
import pandas as pd
from collections import defaultdict

def _expected_label(uid):
    """Return the label implied by a UID suffix, or None for any other suffix."""
    if uid.endswith('A'):
        return 'positive'
    if uid.endswith('B'):
        return 'negative'
    return None


def _wrong_predictions_path(results_path):
    """Return the CSV report path that sits next to *results_path*."""
    # Only a trailing ".json" extension is stripped. A blind str.replace would
    # rewrite ".json" inside directory names and, when the substring is absent,
    # return the input path itself so the report would overwrite the results.
    root, ext = os.path.splitext(results_path)
    base = root if ext.lower() == '.json' else results_path
    return f"{base}_wrong_predictions.csv"


def analyze_bongard_results_table(results_path):
    """
    Analyze Bongard problem results and display wrong predictions in a DataFrame table.
    Expected pattern: UIDs ending with 'A' should be positive, 'B' should be negative.

    Args:
        results_path: Path to a JSON file holding a list of entries. Each entry
            must have 'uid' and 'answer'; 'concept', 'query_details' and
            'distinguishing_feature' are optional and default to 'N/A'.

    Returns:
        A DataFrame with columns UID, Expected, Actual and Concept, one row per
        wrong prediction, or an empty DataFrame when nothing is wrong.

    Raises:
        KeyError: If an entry lacks 'uid' or 'answer'.

    Side effects:
        Prints a summary to stdout, sets pandas display options globally, and
        writes '<results>_wrong_predictions.csv' (with the extra detail
        columns) when at least one prediction is wrong.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    correct_predictions = 0
    # Full detail rows for every wrong prediction; both the returned table and
    # the CSV are derived from this single list so they cannot disagree.
    wrong_rows = []

    print(f"Analyzing {len(data)} entries from {results_path}")
    print("=" * 60)

    for entry in data:
        uid = entry['uid']
        answer = entry['answer']

        expected = _expected_label(uid)
        if expected is None:
            # Entries with an unknown suffix are excluded from all statistics.
            print(f"WARNING: UID {uid} doesn't end with A or B")
            continue

        if answer == expected:
            correct_predictions += 1
            continue

        wrong_rows.append({
            'UID': uid,
            'Expected': expected,
            'Actual': answer,
            'Concept': entry.get('concept', 'N/A'),
            'Query_Details': entry.get('query_details', 'N/A'),
            'Distinguishing_Feature': entry.get('distinguishing_feature', 'N/A'),
        })

    # Print summary statistics
    incorrect_predictions = len(wrong_rows)
    total = correct_predictions + incorrect_predictions
    accuracy = (correct_predictions / total) * 100 if total > 0 else 0

    print(f"SUMMARY:")
    print(f"Total entries: {total}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Incorrect predictions: {incorrect_predictions}")
    print(f"Accuracy: {accuracy:.2f}%")
    print("=" * 60)

    if not wrong_rows:
        print("\n🎉 All predictions are correct!")
        return pd.DataFrame()

    df_full = pd.DataFrame(wrong_rows)
    # The on-screen / returned table shows only the compact columns.
    df_wrong = df_full[['UID', 'Expected', 'Actual', 'Concept']].copy()

    # Configure pandas display options for better table formatting
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 50)
    pd.set_option('display.width', None)

    print(f"\nINCORRECT PREDICTIONS ({len(wrong_rows)} entries):")
    print("-" * 80)
    print(df_wrong.to_string(index=True))

    print(f"\nBREAKDOWN BY EXPECTED ANSWER:")
    print("-" * 40)
    wrong_by_expected = df_wrong['Expected'].value_counts()
    for expected, count in wrong_by_expected.items():
        print(f"Expected {expected}, got opposite: {count} entries")

    # Save to CSV for detailed review (including all fields)
    output_file = _wrong_predictions_path(results_path)
    df_full.to_csv(output_file, index=False)
    print(f"\nDetailed wrong predictions saved to: {output_file}")

    return df_wrong

if __name__ == "__main__":
    # Script entry point: analyze one hard-coded results file and print a
    # short per-class tally of the mistakes.
    results_path = "../results/scmr_gpt41_gpt41.json"

    if os.path.exists(results_path):
        # Run the analysis; the returned table holds only the wrong predictions.
        df_wrong = analyze_bongard_results_table(results_path)

        if not df_wrong.empty:
            # Summing a boolean mask counts the rows whose expected label matches.
            print(f"\nANALYSIS:")
            print(f"- Total wrong: {len(df_wrong)}")
            print(f"- Wrong A's (should be positive): {int((df_wrong['Expected'] == 'positive').sum())}")
            print(f"- Wrong B's (should be negative): {int((df_wrong['Expected'] == 'negative').sum())}")
    else:
        # Nothing to analyze: tell the user and fall through without raising.
        print(f"Error: File not found at {results_path}")
        print("Please check the path and try again.")


Analyzing 400 entries from ../results/scmr_gpt41_gpt41.json
SUMMARY:
Total entries: 400
Correct predictions: 284
Incorrect predictions: 116
Accuracy: 71.00%

INCORRECT PREDICTIONS (116 entries):
--------------------------------------------------------------------------------
        UID  Expected    Actual                                Concept
0    0640_B  negative  positive              person holding fish river
1    0878_B  negative  positive                            stack money
2    0999_A  positive  negative           well trimmed bonsai tree pot
3    0572_B  negative  positive                       passenger trains
4    0057_B  negative  positive                       aerial view city
5    0225_B  negative  positive                          tomato dishes
6    0399_B  negative  positive             closeup fingers pair hands
7    0310_B  negative  positive                    bridge across river
8    0098_A  positive  negative                woman red dress dancing
9    0789_B  n