In [22]:
API_KEY= [UR_API]
GEMINI_API_KEY = [UR_GEMINI_API]

In [23]:
import os
import json
import time
import tiktoken
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from openai import OpenAI
import google.generativeai as genai
import logging
import re

In [24]:
#GEMINI ONE
import os
import json
import time
import tiktoken
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from openai import OpenAI
import google.generativeai as genai
import logging
import re


# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Gemini API Configuration
genai.configure(api_key=GEMINI_API_KEY)

# Load MATH-500 dataset
ds = load_dataset("HuggingFaceH4/MATH-500")

# Models to Compare
models = {
    "Gemini 2.0 Flash": "gemini-2.0-flash",
}

tokenizer = tiktoken.get_encoding("cl100k_base")

In [10]:
#OPEN Router One

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# OpenRouter Client
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=API_KEY,
)

# Gemini API Configuration
genai.configure(api_key=GEMINI_API_KEY)

# Load MATH-500 dataset
ds = load_dataset("HuggingFaceH4/MATH-500")

# Models to Compare
models = {
    "LLaMA 3 8B": "meta-llama/llama-3-8b-instruct:free",
    "DeepSeek Chat": "deepseek/deepseek-chat:free",
    "Dolphin 3.0 R1": "cognitivecomputations/dolphin3.0-r1-mistral-24b:free",
}

#DO one model at time

tokenizer = tiktoken.get_encoding("cl100k_base")

In [27]:
def create_prompt(problem, level="unknown"):
    return f"""Below is a math problem. Solve it step by step and provide the final answer.

PROBLEM:
{problem}

Please take your time to work through this carefully. Do all the steps and reason it properly and give me all the steps.
In case of a word problem, where the answer is in a sentence, give the answer in a word and remove the units like inches or cm etc.
Give me the final answer as
FINAL ANSWER: [your_answer]
Make sure this final answer is a numerical value or word  or an expression. In case of an expression or a word, return it in LaTeX format. """

def clean_latex_answer(answer):
    """
    Clean and normalize LaTeX answers by removing \boxed{}, \left, \right, and inline math markers.
    """
    # Remove all instances of \boxed{...}
    answer = re.sub(r'\\boxed\s*{\s*(.*?)\s*}', r'\1', answer)

    # Remove \left and \right
    answer = re.sub(r'\\left\s*|\s*\\right', '', answer)

    # Remove LaTeX inline math markers like \( ... \) and $...$
    answer = re.sub(r'\\\(|\\\)|\$', '', answer)

    # Remove extra spaces and strip trailing characters
    answer = answer.strip().rstrip(".,:;")
    
    return answer

def extract_answer(response):
    # Check for common final answer patterns
    final_answer_patterns = [
        "Final answer:", "FINAL ANSWER:", "final answer is", "answer:", "Answer:"
    ]

    for phrase in final_answer_patterns:
        if phrase in response:
            answer_part = response.split(phrase)[-1].strip()
            
            # Apply cleanup using regex
            answer_part = clean_latex_answer(answer_part)

            # Remove brackets
            answer_part = answer_part.strip('[]')
            decimal_match = re.search(r"[-+]?\d*\.\d+", answer_part)
            if decimal_match:
                return decimal_match.group(0)

            # Get the first line or until a period
            if "\n" in answer_part:
                return answer_part.split("\n")[0].strip().rstrip(".")
            elif "." in answer_part:
                return answer_part.split(".")[0].strip()
            else:
                return answer_part.strip()

    # For word problems (name, single word answers)
    lines = response.strip().split('\n')
    for line in reversed(lines):
        line = line.strip()
        if '"' in line or "'" in line:
            try:
                return line.split('"')[1] if '"' in line else line.split("'")[1]
            except IndexError:
                continue
        elif len(line.split()) == 1:
            return line

    return "Answer not found"




def normalize_answer(answer):
    """Normalize answer format for consistent comparison."""
    answer = str(answer).strip()

    # Convert LaTeX variants with \text{} to plain text
    if answer.startswith(r"\text{") and answer.endswith("}"):
        answer = answer[6:-1]  # Extract the content within \text{}

    # Remove LaTeX block markers
    answer = re.sub(r'\\\[|\]', '', answer)
    
    answer = re.sub(r"\\!", "", answer)


    # Remove ** or other extra symbols
    answer = re.sub(r'^\*+|\*+$', '', answer)

    if re.match(r'^[\d,\.\s]+$', answer):
        answer = answer.replace(",", "")
        
    # Handle LaTeX-like outputs
    answer = answer.replace("\\dfrac", "\\frac")
    answer = answer.replace("\\left", "").replace("\\right", "")
    answer = answer.replace("^\\circ", "").replace("^{\\circ}", "").replace("^°", "")
    answer = answer.replace("}{", "/").replace("{", "").replace("}", "")

    # Handle LaTeX matrices (pmatrix, bmatrix, etc.)
    matrix_pattern = r"\\begin\{(?:bmatrix|pmatrix|vmatrix|Vmatrix)\}(.*?)\\end\{(?:bmatrix|pmatrix|vmatrix|Vmatrix)\}"
    matrix_match = re.search(matrix_pattern, answer, re.DOTALL)

    if matrix_match:
        matrix_content = matrix_match.group(1)
        # Clean up the matrix and format it as a simple vector
        matrix_vector = "[" + ", ".join(matrix_content.split("\\\\")) + "]"
        answer = matrix_vector.strip()

    return answer.strip().rstrip(".,:;")



def check_numeric_equality(val1, val2):
    try:
        # Handle fractions
        if "/" in val1 and "/" in val2:
            try:
                v1_num, v1_denom = map(float, val1.split("/"))
                v2_num, v2_denom = map(float, val2.split("/"))

                # Check for division by zero
                if v1_denom == 0 or v2_denom == 0:
                    logger.warning(f"Division by zero detected when comparing {val1} and {val2}")
                    return False

                return abs((v1_num / v1_denom) - (v2_num / v2_denom)) < 1e-6
            except ValueError as e:
                logger.warning(f"Error parsing fractions {val1} and {val2}: {str(e)}")
                return False

        # Handle direct numeric comparison
        try:
            return abs(float(val1) - float(val2)) < 1e-6
        except ValueError as e:
            logger.warning(f"Error converting to float {val1} and {val2}: {str(e)}")
            return False
    except Exception as e:
        logger.error(f"Unexpected error in numeric equality check between {val1} and {val2}: {str(e)}")
        return False

def check_angular_equality(val1, val2):
    """Compare angular values, handling degrees and radians."""
    try:
        # Convert all to radians for comparison
        def to_radians(val):
            val = val.replace("\\pi", "pi").replace("π", "pi")

            # Handle degrees
            if "°" in val:
                try:
                    degrees = float(val.replace("°", ""))
                    return degrees * (3.14159265359 / 180)
                except ValueError:
                    pass

            # Handle pi fractions (pi/2, 3pi/4, etc.)
            if "pi" in val:
                try:
                    if "/" in val:
                        parts = val.replace("pi", "").split("/")
                        if parts[0].strip() == "":
                            parts[0] = "1"
                        num = float(parts[0])
                        denom = float(parts[1])
                        return num * 3.14159265359 / denom
                    else:
                        multiplier = 1.0
                        if val.replace("pi", "").strip():
                            multiplier = float(val.replace("pi", ""))
                        return multiplier * 3.14159265359
                except (ValueError, ZeroDivisionError):
                    pass

            # If it's just a number, return it
            try:
                return float(val)
            except ValueError:
                return val

        # Convert both values to radians
        val1_rad = to_radians(val1)
        val2_rad = to_radians(val2)

        # If conversion failed (returned non-numeric), do string comparison
        if isinstance(val1_rad, str) or isinstance(val2_rad, str):
            return val1 == val2

        # Compare angles in radians, taking periodicity into account
        val1_mod = val1_rad % (2 * 3.14159265359)
        val2_mod = val2_rad % (2 * 3.14159265359)

        return abs(val1_mod - val2_mod) < 1e-4
    except Exception as e:
        logger.error(f"Error in angular equality check: {str(e)}")
        return False

def check_answer(predicted, correct):
    """Check if predicted answer matches correct answer with improved normalization."""
    try:
        # Normalize both answers
        predicted_norm = normalize_answer(predicted)
        correct_norm = normalize_answer(correct)

        # Direct string comparison after normalization
        if predicted_norm == correct_norm:
            return True

        # Strip all LaTeX and spaces for strict comparison
        predicted_clean = predicted_norm.replace("\\", "").replace(" ", "").lower()
        correct_clean = correct_norm.replace("\\", "").replace(" ", "").lower()

        if predicted_clean == correct_clean:
            return True

        # Special handling for common cases

        # 1. Angular measurements
        if "°" in predicted_norm or "°" in correct_norm or "pi" in predicted_norm or "pi" in correct_norm:
            return check_angular_equality(predicted_norm, correct_norm)

        # 2. Numeric equality for numbers and fractions
        try:
            # Try direct numeric comparison for simple numbers
            if predicted_clean.replace(".", "").isdigit() and correct_clean.replace(".", "").isdigit():
                return abs(float(predicted_clean) - float(correct_clean)) < 1e-6

            # Handle fractions
            if "/" in predicted_clean and "/" in correct_clean:
                p_parts = predicted_clean.split("/")
                c_parts = correct_clean.split("/")

                if len(p_parts) == 2 and len(c_parts) == 2:
                    p_num, p_denom = float(p_parts[0]), float(p_parts[1])
                    c_num, c_denom = float(c_parts[0]), float(c_parts[1])

                    # Check for division by zero
                    if p_denom == 0 or c_denom == 0:
                        return False

                    return abs((p_num / p_denom) - (c_num / c_denom)) < 1e-6
        except ValueError:
            pass

        # 3. Coordinate pairs (x,y) or (r,θ)
        if "(" in predicted_clean and ")" in predicted_clean and "(" in correct_clean and ")" in correct_clean:
            try:
                # Extract values inside parentheses
                p_coords = predicted_clean.split("(")[1].split(")")[0].split(",")
                c_coords = correct_clean.split("(")[1].split(")")[0].split(",")

                if len(p_coords) == 2 and len(c_coords) == 2:
                    # Compare first coordinate
                    if p_coords[0] != c_coords[0]:
                        try:
                            if abs(float(p_coords[0]) - float(c_coords[0])) > 1e-6:
                                return False
                        except ValueError:
                            if p_coords[0] != c_coords[0]:
                                return False

                    # Compare second coordinate (could be angle)
                    if "pi" in p_coords[1] or "pi" in c_coords[1]:
                        return check_angular_equality(p_coords[1], c_coords[1])
                    else:
                        try:
                            return abs(float(p_coords[1]) - float(c_coords[1])) < 1e-6
                        except ValueError:
                            return p_coords[1] == c_coords[1]
            except Exception as e:
                logger.warning(f"Error comparing coordinates: {str(e)}")

        # If all else fails, return False
        return False
    except Exception as e:
        logger.error(f"Error comparing answers '{predicted}' and '{correct}': {str(e)}")
        return False

import time

request_count = 0

def query_model(model_id, prompt):
    global request_count
    try:
        if model_id == "gemini-2.0-flash":
            model = genai.GenerativeModel(model_id)
            response = model.generate_content(prompt)
            result = response.text
        else:
            completion = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=2048
            )
            if not completion or not completion.choices:
                return "Error: Empty response from model."
            result = completion.choices[0].message.content

        request_count += 1

        # After every 15 requests, wait for 30 seconds
        if request_count % 15 == 0:
            print("Rate limit reached. Pausing for 30 seconds...")
            time.sleep(30)

        return result
        
    except Exception as e:
        logger.error(f"Error querying model {model_id}: {str(e)}")
        return f"Error: {str(e)}"


def evaluate_models():
    results = {model_name: {"correct": 0, "total": 0, "responses": [], "errors": []} for model_name in models}
    test_data = list(ds["test"])[:250]  #Sample size

    print(f"Evaluating {len(test_data)} samples.")

    for idx, sample in enumerate(tqdm(test_data, desc="Evaluating")):
        try:
            problem = sample['problem']
            correct_answer = sample['answer']
            level = sample.get('level', 'unknown')

            prompt = create_prompt(problem, level)

            for model_name, model_id in models.items():
                try:
                    response = query_model(model_id, prompt)

                    try:
                        predicted_answer = extract_answer(response)
                    except Exception as e:
                        logger.error(f"Failed to extract answer for problem {idx}: {str(e)}")
                        predicted_answer = "Error extracting answer"
                        results[model_name]['errors'].append({
                            "problem_idx": idx,
                            "error_type": "extraction_error",
                            "error_msg": str(e)
                        })

                    try:
                        is_correct = check_answer(predicted_answer, correct_answer)
                    except Exception as e:
                        logger.error(f"Failed to check answer for problem {idx}: {str(e)}")
                        is_correct = False
                        results[model_name]['errors'].append({
                            "problem_idx": idx,
                            "error_type": "check_error",
                            "error_msg": str(e)
                        })

                    results[model_name]['total'] += 1
                    if is_correct:
                        results[model_name]['correct'] += 1

                    results[model_name]['responses'].append({
                        "problem": problem,
                        "level": level,
                        "correct_answer": correct_answer,
                        "model_response": response,
                        "extracted_answer": predicted_answer,
                        "is_correct": is_correct
                    })
                except Exception as e:
                    logger.error(f"Failed to process sample {idx} with model {model_name}: {str(e)}")
                    results[model_name]['errors'].append({
                        "problem_idx": idx,
                        "error_type": "processing_error",
                        "error_msg": str(e)
                    })
        except Exception as e:
            logger.error(f"Failed to process sample {idx}: {str(e)}")
            for model_name in models:
                results[model_name]['errors'].append({
                    "problem_idx": idx,
                    "error_type": "sample_error",
                    "error_msg": str(e)
                })

    return results

def calculate_metrics(results):
    metrics = {}
    for model_name, data in results.items():
        accuracy = (data['correct'] / data['total'] * 100) if data['total'] > 0 else 0
        error_count = len(data['errors'])
        metrics[model_name] = {
            "Overall Accuracy": f"{accuracy:.2f}%",
            "Error Count": error_count,
            "Completed Samples": data['total'],
            "Errors Per Sample": f"{error_count/data['total']:.4f}" if data['total'] > 0 else "N/A"
        }
    return metrics

def print_answer_comparison(results):
    """Print a side-by-side comparison of extracted answers vs correct answers with normalization."""
    print("\n===== ANSWER COMPARISON =====")
    print(f"{'Problem #':<8} {'Model':<15} {'Correct?':<8} {'Raw Extracted':<30} {'Normalized':<25} {'Correct Answer':<30}")
    print("-" * 120)

    for model_name, data in results.items():
        for idx, response_data in enumerate(data['responses']):
            correct_mark = "✓" if response_data['is_correct'] else "✗"
            extracted = response_data['extracted_answer']
            correct = response_data['correct_answer']

            # Show normalized version too
            normalized = normalize_answer(extracted)

            # Truncate long answers for display
            if len(str(extracted)) > 28:
                extracted = str(extracted)[:25] + "..."
            if len(str(normalized)) > 23:
                normalized = str(normalized)[:20] + "..."
            if len(str(correct)) > 28:
                correct = str(correct)[:25] + "..."

            print(f"{idx:<8} {model_name:<15} {correct_mark:<8} {extracted:<30} {normalized:<25} {correct:<30}")

    print("\n")

def run_evaluation():
    print("Starting MATH-500 model evaluation...")
    results = evaluate_models()
    metrics = calculate_metrics(results)

    print("\n===== EVALUATION RESULTS =====")
    for model_name, data in metrics.items():
        print(f"{model_name}: {data['Overall Accuracy']} (Errors: {data['Error Count']})")

    # Print answer comparison
    print_answer_comparison(results)

    # Save detailed results to file
    os.makedirs("results", exist_ok=True)
    with open("results/math500_detailed_results.json", "w") as f:
        json.dump(results, f, indent=2)
    with open("results/math500_metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)

    # Create a separate answer comparison file
    answer_comparison = []
    for model_name, data in results.items():
        for idx, response_data in enumerate(data['responses']):
            answer_comparison.append({
                "problem_idx": idx,
                "problem": response_data['problem'][:100] + "..." if len(response_data['problem']) > 100 else response_data['problem'],
                "model": model_name,
                "extracted_answer": response_data['extracted_answer'],
                "correct_answer": response_data['correct_answer'],
                "is_correct": response_data['is_correct']
            })

    with open("results/math500_answer_comparison.json", "w") as f:
        json.dump(answer_comparison, f, indent=2)

    # Create a more readable CSV version
    comparison_df = pd.DataFrame(answer_comparison)
    comparison_df.to_csv("results/math500_answer_comparison.csv", index=False)

    print("Results saved to 'results/' directory.")

    # Create error report
    error_summary = {}
    for model_name, data in results.items():
        error_types = {}
        for error in data['errors']:
            error_type = error['error_type']
            if error_type not in error_types:
                error_types[error_type] = 0
            error_types[error_type] += 1
        error_summary[model_name] = error_types

    with open("results/math500_error_summary.json", "w") as f:
        json.dump(error_summary, f, indent=2)

    print("\n===== ERROR SUMMARY =====")
    for model_name, errors in error_summary.items():
        print(f"{model_name} errors:")
        for error_type, count in errors.items():
            print(f"  - {error_type}: {count}")

    comparison_df = pd.DataFrame.from_dict({k: {m: v[m] for m in ["Overall Accuracy", "Error Count"]}
                                           for k, v in metrics.items()}, orient='index').reset_index()
    comparison_df.columns = ['Model', 'Overall Accuracy (%)', 'Error Count']
    comparison_df.to_csv("results/math500_comparison.csv", index=False)
    print(comparison_df)

# Run the evaluation
run_evaluation()

Starting MATH-500 model evaluation...
Evaluating 250 samples.


Evaluating:   6%|███▊                                                                 | 14/250 [00:40<12:18,  3.13s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  12%|████████                                                             | 29/250 [01:59<11:38,  3.16s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  18%|████████████▏                                                        | 44/250 [03:07<13:50,  4.03s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  24%|████████████████▎                                                    | 59/250 [04:14<06:45,  2.12s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  30%|████████████████████▍                                                | 74/250 [05:25<06:01,  2.06s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  36%|████████████████████████▌                                            | 89/250 [06:41<09:25,  3.52s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  42%|████████████████████████████▎                                       | 104/250 [08:12<14:41,  6.04s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  48%|████████████████████████████████▎                                   | 119/250 [09:41<06:34,  3.01s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  54%|████████████████████████████████████▍                               | 134/250 [11:06<04:50,  2.50s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  60%|████████████████████████████████████████▌                           | 149/250 [12:17<04:12,  2.50s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  66%|████████████████████████████████████████████▌                       | 164/250 [13:37<03:43,  2.59s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  72%|████████████████████████████████████████████████▋                   | 179/250 [15:00<03:22,  2.86s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  78%|████████████████████████████████████████████████████▊               | 194/250 [16:14<02:10,  2.34s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  84%|████████████████████████████████████████████████████████▊           | 209/250 [17:25<01:47,  2.62s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  90%|████████████████████████████████████████████████████████████▉       | 224/250 [18:37<01:10,  2.72s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  96%|█████████████████████████████████████████████████████████████████   | 239/250 [19:54<00:39,  3.56s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 250/250 [21:00<00:00,  5.04s/it]


===== EVALUATION RESULTS =====
Gemini 2.0 Flash: 90.00% (Errors: 0)

===== ANSWER COMPARISON =====
Problem # Model           Correct? Raw Extracted                  Normalized                Correct Answer                
------------------------------------------------------------------------------------------------------------------------
0        Gemini 2.0 Flash ✓        (3, \frac{\pi}{2})             (3, \frac\pi/2)           \left( 3, \frac{\pi}{2} \...  
1        Gemini 2.0 Flash ✓        p - q                          p - q                     p - q                         
2        Gemini 2.0 Flash ✓        \frac{14}{3}                   \frac14/3                 \frac{14}{3}                  
3        Gemini 2.0 Flash ✓        9                              9                         9                             
4        Gemini 2.0 Flash ✓        \text{Evelyn}                  Evelyn                    \text{Evelyn}                 
5        Gemini 2.0 Flash ✓        42    




In [30]:
import json
import os
import pandas as pd

# Function to load existing results
def load_existing_results():
    try:
        with open("results/math500_detailed_results.json", "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Warning: Could not load existing results file. Starting fresh.")
        return {model_name: {"correct": 0, "total": 0, "responses": [], "errors": []} for model_name in models}

# Modify your evaluate_models function to accept start and end indices
def evaluate_models(start_idx=0, end_idx=None):
    # Load existing results
    results = load_existing_results()
    
    # Initialize results dict for models not in existing results
    for model_name in models:
        if model_name not in results:
            results[model_name] = {"correct": 0, "total": 0, "responses": [], "errors": []}
    
    # Get test data for the specified range
    test_data = list(ds["test"])
    if end_idx is None:
        end_idx = len(test_data)
    test_data = test_data[start_idx:end_idx]

    print(f"Evaluating samples from index {start_idx} to {end_idx-1}.")

    for idx, sample in enumerate(tqdm(test_data, desc="Evaluating")):
        # Use the absolute index for reporting
        abs_idx = start_idx + idx
        try:
            problem = sample['problem']
            correct_answer = sample['answer']
            level = sample.get('level', 'unknown')

            prompt = create_prompt(problem, level)

            for model_name, model_id in models.items():
                try:
                    response = query_model(model_id, prompt)

                    try:
                        predicted_answer = extract_answer(response)
                    except Exception as e:
                        logger.error(f"Failed to extract answer for problem {abs_idx}: {str(e)}")
                        predicted_answer = "Error extracting answer"
                        results[model_name]['errors'].append({
                            "problem_idx": abs_idx,
                            "error_type": "extraction_error",
                            "error_msg": str(e)
                        })

                    try:
                        is_correct = check_answer(predicted_answer, correct_answer)
                    except Exception as e:
                        logger.error(f"Failed to check answer for problem {abs_idx}: {str(e)}")
                        is_correct = False
                        results[model_name]['errors'].append({
                            "problem_idx": abs_idx,
                            "error_type": "check_error",
                            "error_msg": str(e)
                        })

                    results[model_name]['total'] += 1
                    if is_correct:
                        results[model_name]['correct'] += 1

                    results[model_name]['responses'].append({
                        "problem": problem,
                        "level": level,
                        "correct_answer": correct_answer,
                        "model_response": response,
                        "extracted_answer": predicted_answer,
                        "is_correct": is_correct
                    })
                except Exception as e:
                    logger.error(f"Failed to process sample {abs_idx} with model {model_name}: {str(e)}")
                    results[model_name]['errors'].append({
                        "problem_idx": abs_idx,
                        "error_type": "processing_error",
                        "error_msg": str(e)
                    })
        except Exception as e:
            logger.error(f"Failed to process sample {abs_idx}: {str(e)}")
            for model_name in models:
                results[model_name]['errors'].append({
                    "problem_idx": abs_idx,
                    "error_type": "sample_error",
                    "error_msg": str(e)
                })

    return results

# Updated run_evaluation function to handle ranges
def run_evaluation(start_idx=0, end_idx=None):
    print(f"Starting MATH-500 model evaluation for samples {start_idx} to {end_idx if end_idx else 'end'}...")
    results = evaluate_models(start_idx, end_idx)
    metrics = calculate_metrics(results)

    print("\n===== EVALUATION RESULTS =====")
    for model_name, data in metrics.items():
        print(f"{model_name}: {data['Overall Accuracy']} (Errors: {data['Error Count']})")

    # Print answer comparison
    print_answer_comparison(results)

    # Save detailed results to file
    os.makedirs("results", exist_ok=True)
    with open("results/math500_detailed_results.json", "w") as f:
        json.dump(results, f, indent=2)
    with open("results/math500_metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)

    # Create a separate answer comparison file
    answer_comparison = []
    for model_name, data in results.items():
        for idx, response_data in enumerate(data['responses']):
            answer_comparison.append({
                "problem_idx": idx,
                "problem": response_data['problem'][:100] + "..." if len(response_data['problem']) > 100 else response_data['problem'],
                "model": model_name,
                "extracted_answer": response_data['extracted_answer'],
                "correct_answer": response_data['correct_answer'],
                "is_correct": response_data['is_correct']
            })

    with open("results/math500_answer_comparison.json", "w") as f:
        json.dump(answer_comparison, f, indent=2)

    # Create a more readable CSV version
    comparison_df = pd.DataFrame(answer_comparison)
    comparison_df.to_csv("results/math500_answer_comparison.csv", index=False)

    print("Results saved to 'results/' directory.")

    # Create error report
    error_summary = {}
    for model_name, data in results.items():
        error_types = {}
        for error in data['errors']:
            error_type = error['error_type']
            if error_type not in error_types:
                error_types[error_type] = 0
            error_types[error_type] += 1
        error_summary[model_name] = error_types

    with open("results/math500_error_summary.json", "w") as f:
        json.dump(error_summary, f, indent=2)

    print("\n===== ERROR SUMMARY =====")
    for model_name, errors in error_summary.items():
        print(f"{model_name} errors:")
        for error_type, count in errors.items():
            print(f"  - {error_type}: {count}")

    comparison_df = pd.DataFrame.from_dict({k: {m: v[m] for m in ["Overall Accuracy", "Error Count"]}
                                           for k, v in metrics.items()}, orient='index').reset_index()
    comparison_df.columns = ['Model', 'Overall Accuracy (%)', 'Error Count']
    comparison_df.to_csv("results/math500_comparison.csv", index=False)
    print(comparison_df)

# Also, update the normalize_answer function to handle the comma issue:
def normalize_answer(answer):
    """Normalize answer format for consistent comparison."""
    answer = str(answer).strip()

    # Convert LaTeX variants with \text{} to plain text
    if answer.startswith(r"\text{") and answer.endswith("}"):
        answer = answer[6:-1]  # Extract the content within \text{}

    # Remove LaTeX block markers
    answer = re.sub(r'\\\[|\]', '', answer)
    
    # Remove LaTeX spacing commands
    answer = re.sub(r"\\!", "", answer)

    # For numeric answers, strip commas and other formatting characters
    if re.match(r'^[\d,\.\s]+$', answer):
        answer = answer.replace(",", "")

    # Remove ** or other extra symbols
    answer = re.sub(r'^\*+|\*+$', '', answer)

    # Handle LaTeX-like outputs
    answer = answer.replace("\\dfrac", "\\frac")
    answer = answer.replace("\\left", "").replace("\\right", "")
    answer = answer.replace("^\\circ", "").replace("^{\\circ}", "").replace("^°", "")
    answer = answer.replace("}{", "/").replace("{", "").replace("}", "")

    # Handle LaTeX matrices (pmatrix, bmatrix, etc.)
    matrix_pattern = r"\\begin\{(?:bmatrix|pmatrix|vmatrix|Vmatrix)\}(.*?)\\end\{(?:bmatrix|pmatrix|vmatrix|Vmatrix)\}"
    matrix_match = re.search(matrix_pattern, answer, re.DOTALL)

    if matrix_match:
        matrix_content = matrix_match.group(1)
        # Clean up the matrix and format it as a simple vector
        matrix_vector = "[" + ", ".join(matrix_content.split("\\\\")) + "]"
        answer = matrix_vector.strip()

    return answer.strip().rstrip(".,:;")

# Example of how to run the next 250 samples
# Assuming you've already run samples 0-249
run_evaluation(260, 500)

Starting MATH-500 model evaluation for samples 260 to 500...
Evaluating samples from index 260 to 499.


Evaluating:   6%|████                                                                 | 14/240 [00:29<08:07,  2.16s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  12%|████████▎                                                            | 29/240 [01:51<17:32,  4.99s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  18%|████████████▋                                                        | 44/240 [03:18<15:35,  4.77s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  25%|████████████████▉                                                    | 59/240 [04:36<08:35,  2.85s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  31%|█████████████████████▎                                               | 74/240 [05:51<09:12,  3.33s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  37%|█████████████████████████▌                                           | 89/240 [07:02<05:52,  2.33s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  43%|█████████████████████████████▍                                      | 104/240 [08:25<05:19,  2.35s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  50%|█████████████████████████████████▋                                  | 119/240 [09:38<05:25,  2.69s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  56%|█████████████████████████████████████▉                              | 134/240 [10:49<05:00,  2.83s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  62%|██████████████████████████████████████████▏                         | 149/240 [12:10<04:46,  3.15s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  68%|██████████████████████████████████████████████▍                     | 164/240 [13:32<06:47,  5.36s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  75%|██████████████████████████████████████████████████▋                 | 179/240 [14:50<02:51,  2.81s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  81%|██████████████████████████████████████████████████████▉             | 194/240 [16:00<01:46,  2.32s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  87%|███████████████████████████████████████████████████████████▏        | 209/240 [17:07<01:08,  2.21s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating:  93%|███████████████████████████████████████████████████████████████▍    | 224/240 [18:25<00:55,  3.47s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating: 100%|███████████████████████████████████████████████████████████████████▋| 239/240 [19:46<00:04,  4.23s/it]

Rate limit reached. Pausing for 30 seconds...


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 240/240 [20:17<00:00,  5.07s/it]


===== EVALUATION RESULTS =====
Gemini 2.0 Flash: 88.58% (Errors: 0)

===== ANSWER COMPARISON =====
Problem # Model           Correct? Raw Extracted                  Normalized                Correct Answer                
------------------------------------------------------------------------------------------------------------------------
0        Gemini 2.0 Flash ✓        (3, \frac{\pi}{2})             (3, \frac\pi/2)           \left( 3, \frac{\pi}{2} \...  
1        Gemini 2.0 Flash ✓        p - q                          p - q                     p - q                         
2        Gemini 2.0 Flash ✓        \frac{14}{3}                   \frac14/3                 \frac{14}{3}                  
3        Gemini 2.0 Flash ✓        9                              9                         9                             
4        Gemini 2.0 Flash ✓        \text{Evelyn}                  Evelyn                    \text{Evelyn}                 
5        Gemini 2.0 Flash ✓        42    


