In [None]:
# GSM8K and GSM-Symbolic Evaluation

import json
import pandas as pd
from collections import Counter

# Load the evaluation results. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding, which is not always
# UTF-8 and can fail on (or silently mis-decode) non-ASCII JSON content.
with open('YOUR-PATH-HERE', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Baseline: for every noise-free run (noise_scale == 0.0), score only the first
# branch answer against the expected answer.
baseline_results = []
for item in data['results']:
    for noise_test in item['noise_tests']:
        if noise_test['noise_scale'] != 0.0:
            continue

        branch_answers = noise_test['branch_answers']
        expected = noise_test['expected_answer']
        # A run with no recorded branches has no first answer; score it as wrong
        # instead of aborting the whole evaluation with an IndexError.
        first_branch_answer = branch_answers[0] if branch_answers else None
        is_correct = bool(branch_answers) and (first_branch_answer == expected)

        baseline_results.append({
            'question_id': item['question_id'],
            'is_correct': is_correct,
            'first_answer': first_branch_answer,
            'expected_answer': expected
        })

# Naming the columns keeps the frame usable (and 'is_correct' addressable) even
# when no baseline run was found.
baseline_df = pd.DataFrame(
    baseline_results,
    columns=['question_id', 'is_correct', 'first_answer', 'expected_answer'],
)
baseline_count = len(baseline_df)
baseline_correct = int(baseline_df['is_correct'].sum())
baseline_incorrect = baseline_count - baseline_correct
# Accuracy is undefined without any baseline run; report NaN rather than
# dividing by zero.
baseline_accuracy = (baseline_correct / baseline_count * 100) if baseline_count else float('nan')

print("=== BASELINE ===")
print(f"Question Count: {baseline_count}")
print(f"Correct: {baseline_correct}")
print(f"Incorrect: {baseline_incorrect}")
print(f"Accuracy: {baseline_accuracy:.2f}%")

# Majority voting at noise_scale == 0.2: bucket each question by how many votes
# its most popular answer received, then tabulate accuracy per bucket.
# The bucket labels are written for five branches per question.
results = []
for item in data['results']:
    for noise_test in item['noise_tests']:
        if noise_test['noise_scale'] != 0.2:
            continue

        vote_dist = noise_test['vote_distribution']
        # default=0 keeps an empty distribution from raising ValueError; such a
        # question falls through to the 'Other' bucket.
        max_votes = max(vote_dist.values(), default=0)
        num_with_max = sum(1 for v in vote_dist.values() if v == max_votes)

        if max_votes == 5:
            category = 'Unanimous (5/5)'
        elif max_votes == 4:
            category = 'Strong Majority (4/5)'
        elif max_votes == 3:
            category = 'Moderate Majority (3/5)'
        elif max_votes == 2 and num_with_max == 1:
            category = 'Minimal Majority (2/5)'
        elif max_votes == 2 and num_with_max >= 2:
            # Two (or more) different answers tied with two votes each.
            category = 'Split Vote (2/2)'
        else:
            category = 'Other'

        results.append({
            'question_id': item['question_id'],
            'category': category,
            'is_correct': noise_test['is_correct'],
            'majority_answer': noise_test['majority_answer'],
            'expected_answer': noise_test['expected_answer']
        })

df = pd.DataFrame(results)

summary = df.groupby('category').agg(
    Question_Count=('question_id', 'count'),
    Correct=('is_correct', 'sum'),
).reset_index()

# Derive the wrong-answer count by subtraction rather than summing `~is_correct`:
# `~` is a bitwise inversion and only yields a logical "not" on a bool-dtype
# column, so the subtraction does not depend on how the flags were parsed.
summary['Incorrect'] = summary['Question_Count'] - summary['Correct']
summary['No_Answer'] = 0
summary['Accuracy'] = (summary['Correct'] / summary['Question_Count'] * 100).round(2)

print("\n=== MAJORITY VOTING (noise_scale = 0.2) ===")
print(summary.to_string(index=False))

# Emit the table body as LaTeX: the baseline row, a rule, then one row per
# agreement category in a fixed presentation order. Categories that did not
# occur are printed as an all-zero row with "--" for the accuracy.
print("\n=== For LaTeX table ===")
print(f"& Baseline (No majority) & {baseline_count} & {baseline_correct} & 0 & {baseline_incorrect} & {baseline_accuracy:.2f} \\\\")
print("\\cmidrule(lr){2-7}")

category_order = ['Unanimous (5/5)', 'Strong Majority (4/5)', 'Moderate Majority (3/5)',
                  'Minimal Majority (2/5)', 'Split Vote (2/2)']

# Index the summary rows by category once instead of filtering the frame for
# every category; iterrows() keeps each row as the same mixed-type Series that a
# positional lookup would return, so the numbers format identically.
summary_rows = {summary_row['category']: summary_row for _, summary_row in summary.iterrows()}

for cat in category_order:
    row = summary_rows.get(cat)
    if row is None:
        print(f"& {cat} & 0 & 0 & 0 & 0 & -- \\\\")
        continue
    print(f"& {row['category']} & {row['Question_Count']} & {row['Correct']} & {row['No_Answer']} & {row['Incorrect']} & {row['Accuracy']:.2f} \\\\")

In [None]:
# MMLU Evaluation

import json
import pandas as pd
import re
from collections import Counter

# Answer-extraction regexes, tried in this order: the first pattern that matches
# anywhere in the text decides the result. Compiled once here instead of being
# rebuilt on every call.
_ANSWER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE | re.MULTILINE)
    for pattern in (
        r'\\boxed\s*\{\s*([A-Da-d])\s*\}',
        r'[Tt]he\s+answer\s+is[:\s]*\(?\\?\s*([A-Da-d])\b',
        r'[Tt]he\s+answer\s+is[:\s]*\(?([A-Da-d])\)?[\.\s]*$',
        r'[Tt]he\s+answer\s+is\s+([A-Da-d])[\.\,\s]',
        r'[Aa]nswer[:\s]+is?\s*\(?([A-Da-d])\)?',
        # The pattern above always requires a literal "i" after "answer", so a
        # plain "Answer: B" label (optionally wrapped in markdown asterisks) was
        # missed by every pattern. The trailing lookahead stops this one from
        # grabbing the first letter of a word, as in "Answer: Because ...".
        r'[Aa]nswer\s*:[\s*]*\(?([A-Da-d])\)?(?![A-Za-z])',
        r'[Tt]he\s+correct\s+answer\s+is[:\s]*\(?([A-Da-d])\)?',
        r'[Tt]herefore,?\s+the\s+answer\s+is[:\s]*\(?([A-Da-d])\)?',
        r'[Tt]hus,?\s+the\s+answer\s+is[:\s]*\(?([A-Da-d])\)?',
        r'[Ss]o,?\s+the\s+answer\s+is[:\s]*\(?([A-Da-d])\)?',
        r'[Tt]he\s+answer\s+is\s+([A-Da-d])\.\s*$',
        r'答案是\s*([A-Da-d])',
        r'正确答案是\s*([A-Da-d])',
        r'选择?\s*([A-Da-d])',
        r'答案[：:]\s*([A-Da-d])',
        r'[Oo]ption\s+([A-Da-d])\s*(?:is correct|is the answer)',
        r'\b([A-Da-d])\s+is\s+(?:the\s+)?correct',
        r'\bis\s+([A-Da-d])\.?\s*$',
        r'(?:^|\n)\s*([A-Da-d])[\.\)]\s*$',
    )
)


def extract_answer(text):
    """
    Extract the final multiple-choice answer (A, B, C, or D) from model output.

    The text is checked against a prioritised list of patterns (LaTeX \\boxed{},
    English "the answer is ..." phrasings, "Answer: X" labels, Chinese phrasings,
    and trailing lone letters). The first pattern with at least one match wins,
    and its last match in the text is used. If no pattern matches, the last ten
    lines are scanned bottom-up for an "answer is X" phrase or a short line that
    consists of a single option letter.

    Args:
        text: Raw model output. Anything that is empty or not a string is
            treated as having no answer.

    Returns:
        The upper-case option letter "A", "B", "C" or "D", or the sentinel
        string "NO_ANSWER_FOUND" when nothing could be extracted.
    """
    if not text or not isinstance(text, str):
        return "NO_ANSWER_FOUND"

    text = text.strip()

    for pattern in _ANSWER_PATTERNS:
        # Each pattern has exactly one capture group, so findall returns a flat
        # list of the captured letters.
        matches = pattern.findall(text)
        if matches:
            # Prefer the last occurrence: a model may restate or revise its
            # answer, and the final statement is the one that counts.
            answer = matches[-1].upper()
            if answer in ['A', 'B', 'C', 'D']:
                return answer

    # Fallback: inspect only the tail of the output, last line first.
    lines = text.split('\n')
    for line in reversed(lines[-10:]):
        match = re.search(r'answer\s+is\s+(?:\(?\\?boxed\s*\{?\s*)?([A-Da-d])', line, re.IGNORECASE)
        if match:
            return match.group(1).upper()

        # A short line holding nothing but an option letter (after any leading
        # non-letter characters) is taken as the answer.
        if len(line.strip()) < 20:
            match = re.search(r'^[^a-zA-Z]*([A-Da-d])\.?\s*$', line.strip())
            if match:
                return match.group(1).upper()

    return "NO_ANSWER_FOUND"

# Load the evaluation results. The encoding is given explicitly because open()
# otherwise falls back to the platform's locale encoding; the answer extractor
# in this cell looks for Chinese phrasings, so the file presumably contains
# non-ASCII text that must be decoded as UTF-8.
with open('YOUR-PATH-HERE', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Re-derive each noise test's results from the raw branch texts, overwriting the
# stored per-branch answers, vote tally, majority answer and correctness flag.
for item in data['results']:
    for noise_test in item['noise_tests']:
        extracted = [extract_answer(branch_text) for branch_text in noise_test['branch_texts']]

        noise_test['branch_answers'] = extracted
        # The stored tally keeps failed extractions as their own entry.
        noise_test['vote_distribution'] = dict(Counter(extracted))

        # Failed extractions take no part in the vote itself.
        usable_votes = Counter(a for a in extracted if a != "NO_ANSWER_FOUND")
        if usable_votes:
            majority = usable_votes.most_common(1)[0][0]
        else:
            majority = "NO_ANSWER_FOUND"

        noise_test['majority_answer'] = majority
        noise_test['is_correct'] = (majority == noise_test['expected_answer'])

# Baseline: for every noise-free run (noise_scale == 0.0), score only the first
# branch answer against the expected answer.
baseline_results = []
for item in data['results']:
    for noise_test in item['noise_tests']:
        if noise_test['noise_scale'] != 0.0:
            continue

        branch_answers = noise_test['branch_answers']
        expected = noise_test['expected_answer']
        # A run with no recorded branches has no first answer; score it as wrong
        # instead of aborting the whole evaluation with an IndexError.
        first_branch_answer = branch_answers[0] if branch_answers else None
        is_correct = bool(branch_answers) and (first_branch_answer == expected)

        baseline_results.append({
            'question_id': item['question_id'],
            'is_correct': is_correct,
            'first_answer': first_branch_answer,
            'expected_answer': expected
        })

# Naming the columns keeps the frame usable (and 'is_correct' addressable) even
# when no baseline run was found.
baseline_df = pd.DataFrame(
    baseline_results,
    columns=['question_id', 'is_correct', 'first_answer', 'expected_answer'],
)
baseline_count = len(baseline_df)
baseline_correct = int(baseline_df['is_correct'].sum())
baseline_incorrect = baseline_count - baseline_correct
# Accuracy is undefined without any baseline run; report NaN rather than
# dividing by zero.
baseline_accuracy = (baseline_correct / baseline_count * 100) if baseline_count else float('nan')

print("=== BASELINE ===")
print(f"Question Count: {baseline_count}")
print(f"Correct: {baseline_correct}")
print(f"Incorrect: {baseline_incorrect}")
print(f"Accuracy: {baseline_accuracy:.2f}%")

# Majority voting at noise_scale == 0.2. Each question is bucketed by how many
# branches agreed on the most popular extracted answer (branches whose answer
# could not be extracted do not vote), then results are tabulated per bucket.
# The bucket labels are written for five branches per question.
results = []
for item in data['results']:
    for noise_test in item['noise_tests']:
        if noise_test['noise_scale'] != 0.2:
            continue

        branch_answers = noise_test['branch_answers']
        valid_answers = [a for a in branch_answers if a != "NO_ANSWER_FOUND"]
        vote_counts = Counter(valid_answers)
        # default=0 covers the case where no branch produced a usable answer.
        max_votes = max(vote_counts.values(), default=0)
        num_with_max = sum(1 for v in vote_counts.values() if v == max_votes)

        if max_votes == 5:
            category = 'Unanimous (5/5)'
        elif max_votes == 4:
            category = 'Strong Majority (4/5)'
        elif max_votes == 3:
            category = 'Moderate Majority (3/5)'
        elif max_votes == 2 and num_with_max == 1:
            category = 'Minimal Majority (2/5)'
        elif max_votes == 2 and num_with_max >= 2:
            # Two (or more) different answers tied with two votes each.
            category = 'Split Vote (2/2)'
        else:
            category = 'Other'

        results.append({
            'question_id': item['question_id'],
            'category': category,
            'is_correct': noise_test['is_correct'],
            'majority_answer': noise_test['majority_answer'],
            'expected_answer': noise_test['expected_answer']
        })

df = pd.DataFrame(results)

# A question counts as unanswered when its majority answer is the
# "NO_ANSWER_FOUND" sentinel, i.e. no branch yielded an extractable answer.
# These were previously reported as incorrect with No_Answer fixed at 0.
tallied = df.assign(no_answer=df['majority_answer'] == "NO_ANSWER_FOUND")

summary = tallied.groupby('category').agg(
    Question_Count=('question_id', 'count'),
    Correct=('is_correct', 'sum'),
    No_Answer=('no_answer', 'sum'),
).reset_index()

# Every question is exactly one of correct / unanswered / incorrect.
summary['Incorrect'] = summary['Question_Count'] - summary['Correct'] - summary['No_Answer']
summary['Accuracy'] = (summary['Correct'] / summary['Question_Count'] * 100).round(2)
# Keep the established column order of the printed table.
summary = summary[['category', 'Question_Count', 'Correct', 'Incorrect', 'No_Answer', 'Accuracy']]

print("\n=== MAJORITY VOTING (noise_scale = 0.2) ===")
print(summary.to_string(index=False))

# Emit the table body as LaTeX: the baseline row, a rule, then one row per
# agreement category in a fixed presentation order. The numeric columns are:
# question count, correct, no answer, incorrect, accuracy.
print("\n=== For LaTeX table ===")

# The baseline row used to hard-code 0 in the no-answer column and count failed
# extractions as incorrect. Report them separately so the row uses the same
# correct / no-answer / incorrect split as the category rows.
baseline_no_answer = int((baseline_df['first_answer'] == "NO_ANSWER_FOUND").sum())
baseline_wrong = baseline_incorrect - baseline_no_answer
print(f"& Baseline (No majority) & {baseline_count} & {baseline_correct} & {baseline_no_answer} & {baseline_wrong} & {baseline_accuracy:.2f} \\\\")
print("\\cmidrule(lr){2-7}")

category_order = ['Unanimous (5/5)', 'Strong Majority (4/5)', 'Moderate Majority (3/5)',
                  'Minimal Majority (2/5)', 'Split Vote (2/2)']
for cat in category_order:
    row = summary[summary['category'] == cat]
    if not row.empty:
        row = row.iloc[0]
        print(f"& {row['category']} & {row['Question_Count']} & {row['Correct']} & {row['No_Answer']} & {row['Incorrect']} & {row['Accuracy']:.2f} \\\\")
    else:
        # Category did not occur: print an all-zero row with no accuracy.
        print(f"& {cat} & 0 & 0 & 0 & 0 & -- \\\\")