In [None]:
import json
import os
import re
import time
import logging
import queue
import threading
import traceback
from tqdm import tqdm
from collections import deque
from google import genai

# Configure the root logger once for the whole script: records at INFO and
# above are written both to a log file in the working directory and to the
# console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('json_evaluation.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


class GeminiTranslationApiManager:
    """
    Manages multiple Gemini API keys with rotation and rate limiting for evaluation tasks.

    Every request is placed on an internal queue and executed by a single
    daemon worker thread, which sleeps ``rate_limit_delay`` seconds after each
    API call. Per-key usage is tracked in memory only and is never reset, so a
    long-lived process keeps counting against ``calls_per_day`` indefinitely.
    """

    # Lower-case substrings of an exception message that mark the active key
    # as exhausted for the rest of the run.
    _QUOTA_MARKERS = ("quota", "rate limit", "resource_exhausted")

    def __init__(self, api_keys, calls_per_day=1000, rate_limit_delay=5):
        """
        Args:
            api_keys: Iterable of API key strings; duplicates are dropped.
            calls_per_day: Maximum number of successful calls counted per key.
            rate_limit_delay: Seconds the worker sleeps after each API call.

        Raises:
            ValueError: If no key is supplied.
        """
        unique_keys = self._unique_keys(api_keys) if api_keys else []
        if not unique_keys:
            raise ValueError("api_keys must contain at least one key")

        self.api_keys = deque(unique_keys)
        self.calls_per_day = calls_per_day
        self.rate_limit_delay = rate_limit_delay

        self.usage_count = {key: 0 for key in unique_keys}
        self.current_key = self.api_keys[0]
        self.client = genai.Client(api_key=self.current_key)

        self.lock = threading.Lock()
        self.call_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._process_queue, name="GeminiWorker")
        self.worker_thread.daemon = True
        self.worker_thread.start()

        logging.info(f"Gemini API Manager initialized with {len(unique_keys)} keys")

    @staticmethod
    def _unique_keys(api_keys):
        """Return the keys as a list with duplicates removed, first occurrence wins."""
        # A duplicated key would share one usage counter but occupy two deque
        # slots, which confuses both rotation and the capacity statistics.
        return list(dict.fromkeys(api_keys))

    def _rotate_key(self):
        """Rotate to the next available API key."""
        with self.lock:
            self.api_keys.rotate(1)
            self.current_key = self.api_keys[0]
            self.client = genai.Client(api_key=self.current_key)
            usage = self.usage_count.get(self.current_key, 0)
        logging.info(f"Rotated to new API key (usage: {usage})")

    def _find_available_key(self):
        """
        Make sure the active key is below its daily limit, switching keys if needed.

        Candidates are examined in the same order ``deque.rotate(1)`` would
        visit them. A new client is only built for the key that is actually
        selected, and the deque is left untouched when every key is exhausted.

        Returns:
            True if a usable key is active afterwards, False otherwise.
        """
        switched_usage = None
        with self.lock:
            if self.usage_count.get(self.current_key, 0) < self.calls_per_day:
                return True

            for steps in range(1, len(self.api_keys)):
                # After rotate(steps) the element at index -steps is at the front.
                candidate = self.api_keys[-steps]
                usage = self.usage_count.get(candidate, 0)
                if usage < self.calls_per_day:
                    # Build the client first so a constructor failure leaves
                    # the manager state consistent.
                    new_client = genai.Client(api_key=candidate)
                    self.api_keys.rotate(steps)
                    self.current_key = candidate
                    self.client = new_client
                    switched_usage = usage
                    break

        if switched_usage is None:
            return False
        logging.info(f"Rotated to new API key (usage: {switched_usage})")
        return True

    def _process_queue(self):
        """
        Worker loop: execute queued API calls one at a time, forever.

        Each queue item is ``(args, kwargs, result_queue)``. Exactly one dict is
        normally put on ``result_queue``: ``{"response": ...}`` on success or
        ``{"error": "<message>"}`` on failure. ``task_done()`` is always called,
        even when processing fails unexpectedly.
        """
        while True:
            item = self.call_queue.get()
            result_queue = None
            pause = 0
            try:
                args, kwargs, result_queue = item

                if not self._find_available_key():
                    result_queue.put({"error": "All API keys have reached their daily limit"})
                    pause = 10
                else:
                    key = self.current_key
                    try:
                        response = self.client.models.generate_content(*args, **kwargs)
                    except Exception as api_exc:
                        msg = str(api_exc).lower()
                        if any(marker in msg for marker in self._QUOTA_MARKERS):
                            # Treat the key as spent so the next request rotates away from it.
                            with self.lock:
                                self.usage_count[key] = self.calls_per_day
                            logging.warning(f"API key reached quota/rate-limit: {api_exc}")
                        result_queue.put({"error": str(api_exc)})
                    else:
                        # Count before publishing so usage stats are current
                        # by the time the caller resumes.
                        with self.lock:
                            self.usage_count[key] += 1
                        result_queue.put({"response": response})
                    pause = self.rate_limit_delay

            except Exception as e:
                logging.error(f"Queue processing error: {e}\n{traceback.format_exc()}")
                # Unblock the caller instead of letting it wait for its timeout.
                if result_queue is not None:
                    result_queue.put({"error": f"Queue processing error: {e}"})
                pause = 1
            finally:
                self.call_queue.task_done()

            time.sleep(pause)

    def generate_content(self, *args, timeout=300, **kwargs):
        """
        Queue an API call and block until the worker returns its result.

        Args:
            *args, **kwargs: Forwarded unchanged to ``client.models.generate_content``.
            timeout: Seconds to wait for the worker's answer.

        Returns:
            The response object produced by the API client.

        Raises:
            TimeoutError: If no result arrives within ``timeout`` seconds. The
                request stays queued and may still be executed later.
            Exception: If the worker reported an error; the message is the
                error text.
        """
        result_queue = queue.Queue()
        self.call_queue.put((args, kwargs, result_queue))

        try:
            result = result_queue.get(timeout=timeout)
        except queue.Empty:
            raise TimeoutError("Timed out waiting for API response")

        if "error" in result:
            raise Exception(result["error"])
        return result["response"]

    def get_usage_stats(self):
        """
        Get usage statistics for all keys.

        Returns:
            dict with ``per_key`` (copy of the usage counters), ``total_used``,
            ``total_available`` (number of keys times ``calls_per_day``) and
            ``percent_used``.
        """
        with self.lock:
            per_key = dict(self.usage_count)
        total_used = sum(per_key.values())
        total_available = len(self.api_keys) * self.calls_per_day
        return {
            "per_key": per_key,
            "total_used": total_used,
            "total_available": total_available,
            "percent_used": (total_used / total_available) * 100 if total_available > 0 else 0
        }


class GeminiEvaluator:
    """
    Gemini evaluator that enforces a simple ASCII thinking marker (<<END_THINK>>)
    and parses a machine-friendly JSON block emitted by the model when available.
    """

    # Separator the model is asked to emit between its analysis and its verdict.
    THINK_MARKER = "<<END_THINK>>"

    # Example for few-shot prompting
    EXAMPLE_EVALUATION = """
EXAMPLE:
Student Answer: 3/4
Ground Truth: 0.75
Analysis: Converting fraction to decimal: 3 ÷ 4 = 0.75. Both represent the same mathematical value.
<<END_THINK>>
{"decision":"YES", "confidence":0.95, "short_reason":"3/4 equals 0.75, same value"}
DECISION: YES

EXAMPLE:
Student Answer: x = 2, y = 3
Ground Truth: (2, 3)
Analysis: Both express the same solution set for variables x and y, just in different notation.
<<END_THINK>>
{"decision":"YES", "confidence":0.9, "short_reason":"Same solution values, different format"}
DECISION: YES

EXAMPLE:
Student Answer: 45°
Ground Truth: π/4
Analysis: Converting: 45° = 45 × π/180 = π/4 radians. Equivalent angle representations.
<<END_THINK>>
{"decision":"YES", "confidence":0.95, "short_reason":"45° equals π/4 radians"}
DECISION: YES
"""

    def __init__(self, api_manager: "GeminiTranslationApiManager", seed=42):
        """
        Args:
            api_manager: Object exposing ``generate_content(**kwargs)``.
            seed: Stored on the instance for reporting. NOTE: it is not
                forwarded to the API by this class, so it does not make the
                model output deterministic.
        """
        self.api_manager = api_manager
        self.seed = seed
        self.model_name = "gemini-2.5-flash-lite"

        logging.info(f"Gemini evaluator initialized with model: {self.model_name}")
        print(f"Gemini evaluator initialized with model: {self.model_name}")

    def generate_response(self, prompt):
        """
        Generate response using Gemini API and split at THINK_MARKER.

        The split happens at the LAST occurrence of the marker, so marker
        text echoed earlier in the response ends up in the thinking part.

        Returns: (thinking_text, final_text, full_response_text)
            On any failure the tuple is ``("", "Error: <message>", "")``;
            this method never raises.
        """
        try:
            response = self.api_manager.generate_content(
                model=self.model_name,
                contents=prompt,
                timeout=300
            )

            # Extract response text
            if hasattr(response, 'text'):
                full_response = response.text
            elif isinstance(response, dict) and 'text' in response:
                full_response = response['text']
            else:
                full_response = str(response)

            # A response object can carry no text at all; report that
            # explicitly instead of failing on None.strip() below.
            if full_response is None:
                logging.warning("Gemini response contained no text")
                return "", "Error: model returned no text", ""

            full_response = full_response.strip()

            # Try to find the last occurrence of the THINK_MARKER in the response
            marker = self.THINK_MARKER
            idx = full_response.rfind(marker)
            if idx >= 0:
                thinking = full_response[:idx].strip()
                final = full_response[idx + len(marker):].strip()
                return thinking, final, full_response

            # If no marker present, return empty thinking and whole text as final
            return "", full_response, full_response

        except Exception as e:
            logging.error(f"Error generating response with Gemini: {e}", exc_info=True)
            return "", f"Error: {str(e)}", ""


def validate_inputs(generated_answer, exact_answer, max_chars=100000):
    """
    Validate inputs before processing.

    An answer counts as missing only when it is ``None`` or an empty sized
    value (``""``, ``[]``, ...). Numeric answers such as ``0`` are valid, since
    zero is a legitimate mathematical result.

    Args:
        generated_answer: The model-produced answer (any type; stringified for
            the length check).
        exact_answer: The reference answer (any type; stringified likewise).
        max_chars: Maximum allowed length, in characters, of either
            stringified answer.

    Returns: (is_valid, error_message)
    """
    def _is_missing(value):
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            # Unsized values (int, float, bool, ...) always carry content.
            return False

    if _is_missing(exact_answer) or _is_missing(generated_answer):
        return False, "Missing essential data"

    generated_len = len(str(generated_answer))
    exact_len = len(str(exact_answer))

    # Gemini has very high context limits, so the default limit is lenient.
    if generated_len > max_chars:
        return False, f"Generated answer too long ({generated_len} chars)"

    if exact_len > max_chars:
        return False, f"Exact answer too long ({exact_len} chars)"

    return True, ""


def _extract_json_block(text):
    """
    Find the last valid JSON object in text using improved pattern matching.
    Returns (json_obj, json_text) or (None, None)
    """
    if not text:
        return None, None

    # First try: regex pattern for JSON-like structures
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    matches = re.findall(json_pattern, text)

    # Try matches in reverse order (last match first)
    for match in reversed(matches):
        try:
            obj = json.loads(match)
            return obj, match
        except:
            continue

    # Fallback: original greedy approach
    last_open = text.rfind('{')
    if last_open == -1:
        return None, None

    candidate = text[last_open:]
    brace_count = 0

    for i, char in enumerate(candidate):
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
            if brace_count == 0:
                try:
                    maybe = candidate[:i+1]
                    obj = json.loads(maybe)
                    return obj, maybe
                except:
                    continue

    return None, None


def extract_decision_and_confidence(final_text):
    """
    Try to extract a decision and optional confidence from the final_text.

    Priority:
      1) JSON block with keys 'decision' (YES/NO) and optional 'confidence'
      2) A final line 'DECISION: YES' or 'DECISION: NO' (a bare 'YES'/'NO'
         line is accepted too; surrounding markdown emphasis and a trailing
         period are ignored)
      3) Conservative heuristic text match on the last three non-empty lines.
         Negative and negated phrases are checked BEFORE positive ones so that
         e.g. "not mathematically equivalent" is never read as a 'yes'.

    Returns: (decision_str_or_none, confidence_or_none, short_reason_or_none)
        where the decision is 'yes', 'no' or None.
    """
    if not final_text:
        return None, None, None

    # 1) Try JSON block first
    obj, _ = _extract_json_block(final_text)
    if isinstance(obj, dict):
        decision = None
        conf = None
        short = None

        # Normalize decision key; only the first key that is present counts.
        for key in ("decision", "Decision", "DECISION"):
            if key in obj:
                decision_val = str(obj[key]).strip().upper()
                if decision_val in ("YES", "TRUE", "1"):
                    decision = "yes"
                elif decision_val in ("NO", "FALSE", "0"):
                    decision = "no"
                break

        # Extract confidence
        for key in ("confidence", "conf", "confidence_score"):
            if key in obj:
                try:
                    conf_val = float(obj[key])
                except (TypeError, ValueError):
                    conf_val = None
                if conf_val is not None:
                    if 0.0 <= conf_val <= 1.0:
                        conf = conf_val
                    elif 0.0 <= conf_val <= 100.0:  # Handle percentage format
                        conf = conf_val / 100.0
                break

        # Extract short reason
        for key in ("short_reason", "reason", "short_reasoning"):
            if key in obj:
                short = str(obj[key])
                break

        if decision is not None:
            return decision, conf, short

    # 2) Check the final line for DECISION: YES/NO
    lines = [ln.strip() for ln in final_text.strip().splitlines() if ln.strip()]
    if lines:
        # Drop markdown emphasis/backticks and trailing punctuation around the verdict.
        last_line = lines[-1].strip('*_`. ').upper()
        verdict = re.fullmatch(r"(?:DECISION\s*:?\s*)?(YES|NO)", last_line)
        if verdict:
            return verdict.group(1).lower(), None, None

    # 3) More conservative heuristic matching
    # Only trigger if we see specific mathematical equivalence phrases
    # in the context of a conclusion
    tail = " ".join(lines[-3:]).upper()
    conclusion_indicators = ["THEREFORE", "THUS", "CONCLUSION", "FINAL", "RESULT"]

    if any(indicator in tail for indicator in conclusion_indicators):
        positive_phrases = [
            "MATHEMATICALLY EQUIVALENT", "ARE THE SAME", "REPRESENT THE SAME",
            "SAME VALUE", "EQUIVALENT EXPRESSIONS", "EQUAL VALUES"
        ]
        negative_phrases = [
            "NOT EQUIVALENT", "ARE DIFFERENT", "NOT THE SAME",
            "DIFFERENT VALUES", "NOT EQUAL", "DISTINCT"
        ]
        # A positive phrase preceded (within two words) by NOT / N'T is a negation,
        # e.g. "ARE NOT MATHEMATICALLY EQUIVALENT" or "DO NOT REPRESENT THE SAME".
        negated_positive = re.compile(
            r"(?:\bNOT|N['’]T)\s+(?:\w+\s+){0,2}(?:"
            + "|".join(re.escape(phrase) for phrase in positive_phrases)
            + r")"
        )

        if any(phrase in tail for phrase in negative_phrases) or negated_positive.search(tail):
            return 'no', None, None
        if any(phrase in tail for phrase in positive_phrases):
            return 'yes', None, None

    return None, None, None


def compare_answers_with_gemini(gemini_evaluator, generated_answer, exact_answer, problem_index, require_json=False):
    """
    Compare generated answer with exact answer using Gemini as a judge.

    Args:
        gemini_evaluator: Object whose ``generate_response(prompt)`` returns
            ``(thinking, final_text, full_text)``.
        generated_answer: The answer to grade. If it is a list, only its first
            element is graded.
        exact_answer: The reference answer.
        problem_index: Identifier used in log messages only.
        require_json: Accepted for interface compatibility; the prompt always
            asks for a JSON block and this flag is not consulted here.

    Returns a dict with thinking, final_text, full_decoded, decision
    ('yes'/'no'/None), confidence, short_reason. This function never raises;
    failures are reported with ``decision=None``.
    """
    try:
        if isinstance(generated_answer, list):
            generated_answer_text = str(generated_answer[0]) if generated_answer else "No answer provided"
        else:
            # Only None / empty values are "no answer"; a numeric 0 is a real answer.
            try:
                is_blank = generated_answer is None or len(generated_answer) == 0
            except TypeError:
                is_blank = False
            generated_answer_text = "No answer provided" if is_blank else str(generated_answer)

        # Input validation
        is_valid, error_msg = validate_inputs(generated_answer_text, exact_answer)
        if not is_valid:
            return {
                'thinking': '',
                'final_text': f'Input validation failed: {error_msg}',
                'full_decoded': '',
                'decision': None,
                'confidence': None,
                'short_reason': None,
            }

        prompt = f"""You are an expert mathematics evaluation specialist. Your task is to determine if two mathematical answers are mathematically equivalent according to the rubric below.

EVALUATION CRITERIA:
1) MATHEMATICAL EQUIVALENCE: same value/expression/solution set.
2) PROOF VALIDATION: same logical conclusion for proof tasks.
3) SET EQUIVALENCE: sets equal regardless of order.
4) NUMERICAL PRECISION: accept reasonable rounding differences (±0.01).
5) UNITS: units must be compatible/convertible.
6) FORMATTING: ignore LaTeX/formatting/language differences.
7) TRIGONOMETRIC EQUIVALENCE: accept equivalent angle representations.
8) ALGEBRAIC EQUIVALENCE: accept equivalent algebraic forms.

{GeminiEvaluator.EXAMPLE_EVALUATION}

EVALUATION PROCESS:
Step 1: Extract core mathematical content from both answers.
Step 2: Identify domain (algebra, geometry, calculus, etc.) and apply domain-specific checks.
Step 3: Check for mathematical equivalence using appropriate methods.
Step 4: Note any fundamental differences in mathematical meaning.
Step 5: Make final decision with confidence assessment.

OUTPUT REQUIREMENTS (follow exactly):
1) Give a concise step-by-step analysis.
2) THEN insert the THINKING marker on its own line exactly:
{GeminiEvaluator.THINK_MARKER}
3) After the marker, provide the final machine-friendly JSON block like:
   {{"decision":"YES" or "NO", "confidence":0.0-1.0, "short_reason":"..."}}
4) Finally end the entire response with a single final line (no trailing text):
   DECISION: YES
   OR
   DECISION: NO

Now evaluate the following:

STUDENT ANSWER:
{generated_answer_text}

GROUND TRUTH:
{exact_answer}
"""

        thinking, final_text, full = gemini_evaluator.generate_response(prompt)

        if not full and final_text.startswith("Error:"):
            # The evaluator signals a failed call with an empty full text and an
            # "Error: ..." final text; never mine an error message for a verdict.
            decision, confidence, short_reason = None, None, None
        else:
            # If the marker wasn't present in the response, thinking will be
            # empty and final_text contains everything.
            decision, confidence, short_reason = extract_decision_and_confidence(final_text)

        return {
            'thinking': thinking,
            'final_text': final_text,
            'full_decoded': full,
            'decision': decision,
            'confidence': confidence,
            'short_reason': short_reason,
        }

    except Exception as e:
        logging.error(f"Error comparing answers for problem {problem_index}: {e}", exc_info=True)
        return {
            'thinking': '',
            'final_text': f'Error occurred during comparison: {str(e)}',
            'full_decoded': '',
            'decision': None,
            'confidence': None,
            'short_reason': None,
        }


def validate_and_retry_comparison(gemini_evaluator, generated_answer, exact_answer, problem_index, max_retries=3, require_json=False):
    """
    Ask the judge up to ``max_retries`` times until it yields an explicit verdict.

    The first attempt whose decision is 'yes' or 'no' wins. When no attempt
    produces a verdict, the decision falls back to 'no' and the remaining
    fields are taken from the last attempt (or fixed placeholder values when
    no attempt was made at all).

    Returns (thinking, final_text, decision, confidence, short_reason)
    """
    outcome = None

    for attempt in range(1, max_retries + 1):
        outcome = compare_answers_with_gemini(
            gemini_evaluator,
            generated_answer,
            exact_answer,
            problem_index,
            require_json=require_json
        )

        verdict = outcome['decision']
        reasoning = outcome['thinking']
        answer_text = outcome['final_text']
        score = outcome.get('confidence')
        why = outcome.get('short_reason')

        if verdict not in ('yes', 'no'):
            logging.warning(f"Attempt {attempt}/{max_retries} failed to get explicit decision for problem {problem_index}; decision={verdict}")
            # Short breather before asking again; skipped after the last attempt.
            if attempt < max_retries:
                time.sleep(0.1)
            continue

        # An explicit verdict: note when it also came with a confidence value.
        if require_json and score is not None:
            logging.info(f"Problem {problem_index}: Got JSON decision with confidence {score}")
        return reasoning, answer_text, verdict, score, why

    # Every attempt was inconclusive: fall back to the conservative 'no'.
    logging.error(f"All {max_retries} attempts failed for problem {problem_index}; defaulting to 'no'")
    if outcome is None:
        return '', 'All attempts failed - no output', 'no', None, 'Evaluation failed'

    fallback_reason = outcome.get('short_reason') or 'Failed to get clear decision'
    return (
        outcome['thinking'],
        outcome['final_text'],
        'no',
        outcome.get('confidence'),
        fallback_reason,
    )


def save_json_atomic(obj, path):
    """
    Save JSON file atomically to prevent corruption.

    The data is written to ``<path>.tmp`` in the same directory, flushed to
    disk, and then moved over ``path`` with ``os.replace``. If anything fails,
    the temporary file is removed and an existing ``path`` is left untouched.

    Args:
        obj: JSON-serializable object.
        path: Destination file, as ``str`` or ``os.PathLike``.

    Raises:
        TypeError / ValueError: If ``obj`` cannot be serialized.
        OSError: If the file cannot be written or replaced.
    """
    path = os.fspath(path)
    tmp = path + '.tmp'
    try:
        with open(tmp, 'w', encoding='utf-8') as f:
            json.dump(obj, f, indent=2, ensure_ascii=False)
            # Make sure the bytes are on disk before the rename publishes them.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise


def calculate_confidence_stats(evaluation_results):
    """
    Calculate statistics for confidence scores.

    Args:
        evaluation_results: Iterable of dicts; entries whose 'confidence' is
            missing or None are ignored.

    Returns:
        ``{"count": 0}`` when no entry has a confidence, otherwise a dict with
        count, mean, min, max and median. For an even number of values the
        median is the average of the two middle values.
    """
    confidences = [r.get('confidence') for r in evaluation_results if r.get('confidence') is not None]
    if not confidences:
        return {"count": 0}

    ordered = sorted(confidences)
    count = len(ordered)
    mid = count // 2
    if count % 2:
        median = ordered[mid]
    else:
        median = (ordered[mid - 1] + ordered[mid]) / 2

    return {
        "count": count,
        "mean": sum(confidences) / count,
        "min": ordered[0],
        "max": ordered[-1],
        "median": median
    }


def main():
    """Main function to execute the evaluation process."""

    # Configuration
    JSON_FILE_PATH = '/content/Turkish_Mathstral_150.json'
    OUTPUT_FILE_PATH = '/content/evaluation_results_Turkish_Mathstral_150.json'

    # Judge generation settings
    MAX_RETRIES = 3
    PREFER_JSON = True

    # Load API keys - you'll need to set this up
    # Replace this with your API key loading mechanism
    try:
        with open('/content/part_3_api_key.txt','r') as fh:
            keys = fh.read().strip()

        if keys:
            os.environ['GEMINI_API_KEYS'] = keys
            logging.info("Loaded API keys successfully")

        api_keys = None
        if 'keys' in locals() and keys:
            api_keys = [k.strip() for k in keys.split(',') if k.strip()]

        if not api_keys:
            env_val = os.environ.get('GEMINI_API_KEYS', '').strip()
            if env_val:
                api_keys = [k.strip() for k in env_val.split(',') if k.strip()]

        if not api_keys:
            raise ValueError("No API keys found")

    except Exception as e:
        logging.error(f"Failed to load API keys: {e}")
        print(f"Error loading API keys: {e}")
        return

    # Initialize API manager
    api_manager = GeminiTranslationApiManager(
        api_keys=api_keys,
        calls_per_day=1000,
        rate_limit_delay=5
    )

    # Initialize evaluator
    gemini_evaluator = GeminiEvaluator(
        api_manager=api_manager,
        seed=42  # For reproducibility
    )

    # Load input data
    try:
        with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        logging.info(f"Loaded {len(json_data)} problems from JSON file")
        print(f"Loaded {len(json_data)} problems from JSON file")
    except Exception as e:
        logging.error(f"Failed to load JSON input: {e}")
        print(f"Error loading JSON: {e}")
        return

    evaluation_results = []
    total_problems = len(json_data)
    correct_answers = 0
    processed_problems = 0
    problems_with_confidence = 0

    # Process each problem
    for idx, problem in enumerate(tqdm(json_data, desc='Evaluating answers with Gemini')):
        try:
            problem_index = problem.get('problem_index', idx)
            generated_answers = problem.get('extracted_final_answers', [])
            exact_answer = problem.get('exact_answer', '')
            question = problem.get('question', '')
            raw_answer_type = problem.get('raw_answer_type', '')

            # Skip problems with missing data
            if not generated_answers or not exact_answer:
                logging.warning(f"Skipping problem {problem_index}: missing data")
                evaluation_results.append({
                    'problem_index': problem_index,
                    'question': question,
                    'answer_type': raw_answer_type,
                    'extracted_final_answers': generated_answers,
                    'exact_answer': exact_answer,
                    'decision': 'no',
                    'confidence': None,
                    'short_reason': 'Missing essential data',
                    'thinking': '',
                    'final_text': 'Missing essential data',
                })
                continue

            # Perform evaluation
            thinking, final_text, decision, confidence, short_reason = validate_and_retry_comparison(
                gemini_evaluator,
                generated_answers,
                exact_answer,
                problem_index,
                max_retries=MAX_RETRIES,
                require_json=PREFER_JSON,
            )

            # Store results
            entry = {
                'problem_index': problem_index,
                'question': question,
                'answer_type': raw_answer_type,
                'extracted_final_answers': generated_answers,
                'exact_answer': exact_answer,
                'decision': decision,
                'confidence': confidence,
                'short_reason': short_reason,
                'thinking': thinking,
                'final_text': final_text,
            }
            evaluation_results.append(entry)

            # Update counters
            processed_problems += 1
            if decision == 'yes':
                correct_answers += 1
            if confidence is not None:
                problems_with_confidence += 1

            # Progress logging
            print(f"Problem {problem_index}: {decision.upper()}" +
                  (f" (conf: {confidence:.2f})" if confidence else ""))
            logging.info(f"Problem {problem_index}: {decision.upper()}" +
                        (f" (confidence: {confidence})" if confidence else ""))

            # Intermediate saves and progress reports
            if processed_problems % 50 == 0:
                accuracy = (correct_answers / processed_problems) * 100 if processed_problems else 0
                conf_rate = (problems_with_confidence / processed_problems) * 100
                logging.info(f"Progress: {processed_problems}/{total_problems} "
                           f"Accuracy: {correct_answers}/{processed_problems} ({accuracy:.2f}%) "
                           f"Confidence rate: {problems_with_confidence}/{processed_problems} ({conf_rate:.1f}%)")

                # Save intermediate results
                intermediate_file = OUTPUT_FILE_PATH.replace('.json', f'_checkpoint_{processed_problems}.json')
                save_json_atomic(evaluation_results, intermediate_file)

        except Exception as e:
            logging.error(f"Error processing problem {problem.get('problem_index', idx)}: {e}")
            evaluation_results.append({
                'problem_index': problem.get('problem_index', idx),
                'question' : problem.get('question', ''),
                'answer_type' : problem.get('raw_answer_type', ''),
                'extracted_final_answers': problem.get('extracted_final_answers', []),
                'exact_answer': problem.get('exact_answer', ''),
                'decision': 'no',
                'confidence': None,
                'short_reason': f'Error during evaluation: {str(e)}',
                'thinking': '',
                'final_text': f'Error during evaluation: {str(e)}',
            })
            continue

    # Calculate final statistics
    overall_accuracy = (correct_answers / processed_problems) * 100 if processed_problems else 0
    confidence_rate = (problems_with_confidence / processed_problems) * 100 if processed_problems else 0
    confidence_stats = calculate_confidence_stats(evaluation_results)

    # Prepare final results
    final_results = {
        'evaluation_metadata': {
            'model_used': gemini_evaluator.model_name,
            'total_problems': total_problems,
            'successfully_processed': processed_problems,
            'correct_answers': correct_answers,
            'overall_accuracy_percentage': round(overall_accuracy, 2),
            'problems_with_confidence': problems_with_confidence,
            'confidence_rate_percentage': round(confidence_rate, 2),
            'confidence_statistics': confidence_stats,
            'evaluation_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'settings': {
                'max_retries': MAX_RETRIES,
                'prefer_json': PREFER_JSON,
                'seed': gemini_evaluator.seed,
            }
        },
        'problem_evaluations': evaluation_results,
    }

    # Save final results
    save_json_atomic(final_results, OUTPUT_FILE_PATH)

    # Print summary
    print('\n' + '='*80)
    print('EVALUATION COMPLETED!')
    print('='*80)
    print(f"Model Used: {gemini_evaluator.model_name}")
    print(f"Total Problems: {total_problems}")
    print(f"Successfully Processed: {processed_problems}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Overall Accuracy: {overall_accuracy:.2f}%")
    print(f"Problems with Confidence: {problems_with_confidence} ({confidence_rate:.1f}%)")
    if confidence_stats.get('count', 0) > 0:
        print(f"Average Confidence: {confidence_stats['mean']:.3f}")
        print(f"Confidence Range: {confidence_stats['min']:.3f} - {confidence_stats['max']:.3f}")
    print(f"Results saved to: {OUTPUT_FILE_PATH}")
    print('='*80)

    # Final logging
    logging.info('JSON answer comparison with Gemini completed!')
    logging.info(f"Final Results: {correct_answers}/{processed_problems} correct ({overall_accuracy:.2f}%)")
    logging.info(f"Confidence provided for {problems_with_confidence}/{processed_problems} problems ({confidence_rate:.1f}%)")
    logging.info(f"Results saved to: {OUTPUT_FILE_PATH}")


# Run the evaluation only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

Gemini evaluator initialized with model: gemini-2.5-flash-lite
Loaded 150 problems from JSON file


Evaluating answers with Gemini:   1%|          | 1/150 [00:00<02:14,  1.11it/s]

Problem 0: NO (conf: 1.00)


Evaluating answers with Gemini:   1%|▏         | 2/150 [00:07<10:43,  4.35s/it]

Problem 1: NO (conf: 1.00)


Evaluating answers with Gemini:   2%|▏         | 3/150 [00:13<12:28,  5.09s/it]

Problem 2: NO (conf: 1.00)


Evaluating answers with Gemini:   3%|▎         | 4/150 [00:20<14:01,  5.76s/it]

Problem 3: NO (conf: 1.00)


Evaluating answers with Gemini:   3%|▎         | 5/150 [00:26<14:10,  5.87s/it]

Problem 4: YES


Evaluating answers with Gemini:   4%|▍         | 6/150 [00:32<14:15,  5.94s/it]

Problem 5: NO (conf: 1.00)


Evaluating answers with Gemini:   5%|▍         | 7/150 [00:38<14:19,  6.01s/it]

Problem 6: NO (conf: 1.00)


Evaluating answers with Gemini:   5%|▌         | 8/150 [00:44<14:16,  6.03s/it]

Problem 7: NO (conf: 1.00)


Evaluating answers with Gemini:   6%|▌         | 9/150 [00:50<14:09,  6.03s/it]

Problem 8: NO (conf: 1.00)


Evaluating answers with Gemini:   7%|▋         | 10/150 [00:56<13:53,  5.95s/it]

Problem 9: NO (conf: 1.00)


Evaluating answers with Gemini:   7%|▋         | 11/150 [01:02<13:39,  5.90s/it]

Problem 10: NO (conf: 1.00)


Evaluating answers with Gemini:   8%|▊         | 12/150 [01:08<13:46,  5.99s/it]

Problem 11: YES (conf: 1.00)


Evaluating answers with Gemini:   9%|▊         | 13/150 [01:14<13:49,  6.05s/it]

Problem 12: YES (conf: 1.00)


Evaluating answers with Gemini:   9%|▉         | 14/150 [01:20<13:50,  6.10s/it]

Problem 13: NO (conf: 1.00)


Evaluating answers with Gemini:  10%|█         | 15/150 [01:26<13:37,  6.06s/it]

Problem 14: NO (conf: 0.95)


Evaluating answers with Gemini:  11%|█         | 16/150 [01:33<13:59,  6.26s/it]

Problem 15: NO (conf: 1.00)


Evaluating answers with Gemini:  11%|█▏        | 17/150 [01:39<13:44,  6.20s/it]

Problem 16: NO (conf: 1.00)


Evaluating answers with Gemini:  12%|█▏        | 18/150 [01:46<13:55,  6.33s/it]

Problem 17: NO (conf: 1.00)


Evaluating answers with Gemini:  13%|█▎        | 19/150 [01:56<16:29,  7.56s/it]

Problem 18: NO (conf: 0.90)


Evaluating answers with Gemini:  13%|█▎        | 20/150 [02:02<15:22,  7.10s/it]

Problem 19: NO (conf: 1.00)


Evaluating answers with Gemini:  14%|█▍        | 21/150 [02:08<14:23,  6.70s/it]

Problem 20: YES (conf: 1.00)


Evaluating answers with Gemini:  15%|█▍        | 22/150 [02:14<14:01,  6.58s/it]

Problem 21: NO (conf: 1.00)


Evaluating answers with Gemini:  15%|█▌        | 23/150 [02:22<14:21,  6.78s/it]

Problem 22: NO (conf: 1.00)


Evaluating answers with Gemini:  16%|█▌        | 24/150 [02:28<13:40,  6.51s/it]

Problem 23: NO (conf: 1.00)


Evaluating answers with Gemini:  17%|█▋        | 25/150 [02:34<13:30,  6.49s/it]

Problem 24: NO (conf: 1.00)


Evaluating answers with Gemini:  17%|█▋        | 26/150 [02:40<13:02,  6.31s/it]

Problem 25: NO (conf: 1.00)


Evaluating answers with Gemini:  18%|█▊        | 27/150 [02:46<12:38,  6.16s/it]

Problem 26: NO (conf: 1.00)


Evaluating answers with Gemini:  19%|█▊        | 28/150 [02:52<12:45,  6.27s/it]

Problem 27: NO (conf: 1.00)


Evaluating answers with Gemini:  19%|█▉        | 29/150 [02:59<12:53,  6.39s/it]

Problem 28: YES (conf: 1.00)


Evaluating answers with Gemini:  20%|██        | 30/150 [03:05<12:45,  6.38s/it]

Problem 29: NO (conf: 1.00)


Evaluating answers with Gemini:  21%|██        | 31/150 [03:11<12:21,  6.23s/it]

Problem 30: YES (conf: 1.00)


Evaluating answers with Gemini:  21%|██▏       | 32/150 [03:17<12:05,  6.15s/it]

Problem 31: NO (conf: 1.00)


Evaluating answers with Gemini:  22%|██▏       | 33/150 [03:24<12:36,  6.46s/it]

Problem 32: YES (conf: 0.95)


Evaluating answers with Gemini:  23%|██▎       | 34/150 [03:30<12:01,  6.22s/it]

Problem 33: NO (conf: 1.00)


Evaluating answers with Gemini:  23%|██▎       | 35/150 [03:36<11:41,  6.10s/it]

Problem 34: NO (conf: 1.00)


Evaluating answers with Gemini:  24%|██▍       | 36/150 [03:42<11:32,  6.08s/it]

Problem 35: NO (conf: 1.00)


Evaluating answers with Gemini:  25%|██▍       | 37/150 [03:48<11:30,  6.11s/it]

Problem 36: NO (conf: 1.00)


Evaluating answers with Gemini:  25%|██▌       | 38/150 [03:54<11:29,  6.16s/it]

Problem 37: YES (conf: 0.95)


Evaluating answers with Gemini:  26%|██▌       | 39/150 [04:01<11:31,  6.23s/it]

Problem 38: YES (conf: 0.90)


Evaluating answers with Gemini:  27%|██▋       | 40/150 [04:07<11:35,  6.33s/it]

Problem 39: YES (conf: 1.00)


Evaluating answers with Gemini:  27%|██▋       | 41/150 [04:13<11:03,  6.09s/it]

Problem 40: NO (conf: 1.00)


Evaluating answers with Gemini:  28%|██▊       | 42/150 [04:19<10:53,  6.05s/it]

Problem 41: NO (conf: 1.00)


Evaluating answers with Gemini:  29%|██▊       | 43/150 [04:25<10:45,  6.03s/it]

Problem 42: NO (conf: 1.00)


Evaluating answers with Gemini:  29%|██▉       | 44/150 [04:30<10:33,  5.98s/it]

Problem 43: NO (conf: 1.00)


Evaluating answers with Gemini:  30%|███       | 45/150 [04:36<10:22,  5.93s/it]

Problem 44: NO (conf: 1.00)


Evaluating answers with Gemini:  31%|███       | 46/150 [04:42<10:16,  5.93s/it]

Problem 45: NO (conf: 1.00)


Evaluating answers with Gemini:  31%|███▏      | 47/150 [04:48<10:10,  5.92s/it]

Problem 46: NO (conf: 1.00)


Evaluating answers with Gemini:  32%|███▏      | 48/150 [04:54<10:05,  5.94s/it]

Problem 47: NO (conf: 1.00)


Evaluating answers with Gemini:  33%|███▎      | 49/150 [05:00<10:12,  6.06s/it]

Problem 48: NO (conf: 1.00)


Evaluating answers with Gemini:  33%|███▎      | 50/150 [05:06<10:01,  6.02s/it]

Problem 49: NO (conf: 1.00)


Evaluating answers with Gemini:  34%|███▍      | 51/150 [05:12<09:54,  6.00s/it]

Problem 50: NO (conf: 1.00)


Evaluating answers with Gemini:  35%|███▍      | 52/150 [05:18<09:43,  5.96s/it]

Problem 51: NO (conf: 1.00)


Evaluating answers with Gemini:  35%|███▌      | 53/150 [05:24<09:42,  6.01s/it]

Problem 52: NO (conf: 1.00)


Evaluating answers with Gemini:  36%|███▌      | 54/150 [05:30<09:25,  5.89s/it]

Problem 53: NO (conf: 1.00)


Evaluating answers with Gemini:  37%|███▋      | 55/150 [05:36<09:18,  5.88s/it]

Problem 54: NO (conf: 1.00)


Evaluating answers with Gemini:  37%|███▋      | 56/150 [05:41<09:07,  5.83s/it]

Problem 55: NO (conf: 1.00)


Evaluating answers with Gemini:  38%|███▊      | 57/150 [05:47<09:04,  5.85s/it]

Problem 56: YES (conf: 1.00)


Evaluating answers with Gemini:  39%|███▊      | 58/150 [05:53<08:53,  5.79s/it]

Problem 57: NO (conf: 1.00)


Evaluating answers with Gemini:  39%|███▉      | 59/150 [06:01<09:52,  6.51s/it]

Problem 58: NO (conf: 1.00)


Evaluating answers with Gemini:  40%|████      | 60/150 [06:07<09:37,  6.42s/it]

Problem 59: NO (conf: 1.00)


Evaluating answers with Gemini:  41%|████      | 61/150 [06:13<09:14,  6.23s/it]

Problem 60: YES (conf: 1.00)


Evaluating answers with Gemini:  41%|████▏     | 62/150 [06:20<09:12,  6.28s/it]

Problem 61: NO (conf: 1.00)


Evaluating answers with Gemini:  42%|████▏     | 63/150 [06:25<08:52,  6.12s/it]

Problem 62: NO (conf: 1.00)


Evaluating answers with Gemini:  43%|████▎     | 64/150 [06:31<08:40,  6.05s/it]

Problem 63: YES (conf: 1.00)


Evaluating answers with Gemini:  43%|████▎     | 65/150 [06:37<08:31,  6.02s/it]

Problem 64: NO (conf: 1.00)


Evaluating answers with Gemini:  44%|████▍     | 66/150 [06:43<08:31,  6.08s/it]

Problem 65: NO (conf: 1.00)


Evaluating answers with Gemini:  45%|████▍     | 67/150 [06:49<08:19,  6.01s/it]

Problem 66: NO (conf: 1.00)


Evaluating answers with Gemini:  45%|████▌     | 68/150 [06:55<08:09,  5.97s/it]

Problem 67: NO (conf: 1.00)


Evaluating answers with Gemini:  46%|████▌     | 69/150 [07:01<08:02,  5.96s/it]

Problem 68: YES (conf: 1.00)


Evaluating answers with Gemini:  47%|████▋     | 70/150 [07:07<07:49,  5.87s/it]

Problem 69: NO (conf: 1.00)


Evaluating answers with Gemini:  47%|████▋     | 71/150 [07:12<07:38,  5.81s/it]

Problem 70: NO (conf: 1.00)


Evaluating answers with Gemini:  48%|████▊     | 72/150 [07:19<07:42,  5.94s/it]

Problem 71: NO (conf: 1.00)


Evaluating answers with Gemini:  49%|████▊     | 73/150 [07:25<07:36,  5.93s/it]

Problem 72: YES (conf: 1.00)


Evaluating answers with Gemini:  49%|████▉     | 74/150 [07:30<07:25,  5.87s/it]

Problem 73: NO (conf: 1.00)


Evaluating answers with Gemini:  50%|█████     | 75/150 [07:36<07:18,  5.85s/it]

Problem 74: NO (conf: 1.00)


Evaluating answers with Gemini:  51%|█████     | 76/150 [07:42<07:14,  5.87s/it]

Problem 75: NO (conf: 1.00)


Evaluating answers with Gemini:  51%|█████▏    | 77/150 [07:48<07:14,  5.95s/it]

Problem 76: NO (conf: 1.00)


Evaluating answers with Gemini:  52%|█████▏    | 78/150 [07:54<07:12,  6.01s/it]

Problem 77: NO (conf: 1.00)


Evaluating answers with Gemini:  53%|█████▎    | 79/150 [08:00<07:03,  5.96s/it]

Problem 78: NO (conf: 1.00)


Evaluating answers with Gemini:  53%|█████▎    | 80/150 [08:06<07:01,  6.03s/it]

Problem 79: NO (conf: 1.00)


Evaluating answers with Gemini:  54%|█████▍    | 81/150 [08:12<06:50,  5.95s/it]

Problem 80: NO (conf: 1.00)


Evaluating answers with Gemini:  55%|█████▍    | 82/150 [08:18<06:45,  5.96s/it]

Problem 81: NO (conf: 1.00)


Evaluating answers with Gemini:  55%|█████▌    | 83/150 [08:24<06:43,  6.02s/it]

Problem 82: NO (conf: 1.00)


Evaluating answers with Gemini:  56%|█████▌    | 84/150 [08:30<06:35,  5.99s/it]

Problem 83: NO (conf: 1.00)


Evaluating answers with Gemini:  57%|█████▋    | 85/150 [08:36<06:25,  5.94s/it]

Problem 84: NO (conf: 1.00)


Evaluating answers with Gemini:  57%|█████▋    | 86/150 [08:42<06:22,  5.98s/it]

Problem 85: YES (conf: 1.00)


Evaluating answers with Gemini:  58%|█████▊    | 87/150 [08:54<08:11,  7.80s/it]

Problem 86: NO (conf: 0.90)


Evaluating answers with Gemini:  59%|█████▊    | 88/150 [09:01<07:49,  7.57s/it]

Problem 87: YES (conf: 1.00)


Evaluating answers with Gemini:  59%|█████▉    | 89/150 [09:07<07:15,  7.14s/it]

Problem 88: NO (conf: 1.00)


Evaluating answers with Gemini:  60%|██████    | 90/150 [09:13<06:50,  6.84s/it]

Problem 89: NO (conf: 0.95)


Evaluating answers with Gemini:  61%|██████    | 91/150 [09:20<06:44,  6.85s/it]

Problem 90: YES (conf: 0.85)


Evaluating answers with Gemini:  61%|██████▏   | 92/150 [09:28<06:44,  6.97s/it]

Problem 91: NO (conf: 0.99)


Evaluating answers with Gemini:  62%|██████▏   | 93/150 [09:34<06:25,  6.76s/it]

Problem 92: YES (conf: 1.00)


Evaluating answers with Gemini:  63%|██████▎   | 94/150 [09:40<06:07,  6.56s/it]

Problem 93: NO (conf: 1.00)


Evaluating answers with Gemini:  63%|██████▎   | 95/150 [09:46<05:55,  6.46s/it]

Problem 94: NO (conf: 1.00)


Evaluating answers with Gemini:  64%|██████▍   | 96/150 [09:53<05:49,  6.47s/it]

Problem 95: NO (conf: 1.00)


Evaluating answers with Gemini:  65%|██████▍   | 97/150 [09:59<05:35,  6.32s/it]

Problem 96: NO (conf: 1.00)


Evaluating answers with Gemini:  65%|██████▌   | 98/150 [10:04<05:19,  6.15s/it]

Problem 97: NO (conf: 1.00)


Evaluating answers with Gemini:  66%|██████▌   | 99/150 [10:11<05:13,  6.15s/it]

Problem 98: NO (conf: 1.00)


Evaluating answers with Gemini:  67%|██████▋   | 100/150 [10:16<05:02,  6.05s/it]

Problem 99: NO (conf: 1.00)


Evaluating answers with Gemini:  67%|██████▋   | 101/150 [10:23<05:03,  6.20s/it]

Problem 100: NO (conf: 1.00)


Evaluating answers with Gemini:  68%|██████▊   | 102/150 [10:29<04:51,  6.08s/it]

Problem 101: NO (conf: 1.00)


Evaluating answers with Gemini:  69%|██████▊   | 103/150 [10:35<04:47,  6.13s/it]

Problem 102: NO (conf: 1.00)


Evaluating answers with Gemini:  69%|██████▉   | 104/150 [10:41<04:42,  6.14s/it]

Problem 103: NO (conf: 1.00)


Evaluating answers with Gemini:  70%|███████   | 105/150 [10:47<04:37,  6.18s/it]

Problem 104: NO (conf: 1.00)


Evaluating answers with Gemini:  71%|███████   | 106/150 [10:54<04:44,  6.46s/it]

Problem 105: YES (conf: 0.95)


Evaluating answers with Gemini:  71%|███████▏  | 107/150 [11:03<05:10,  7.22s/it]

Problem 106: NO (conf: 1.00)


Evaluating answers with Gemini:  72%|███████▏  | 108/150 [11:09<04:44,  6.78s/it]

Problem 107: NO (conf: 1.00)


Evaluating answers with Gemini:  73%|███████▎  | 109/150 [11:15<04:29,  6.57s/it]

Problem 108: NO (conf: 1.00)


Evaluating answers with Gemini:  73%|███████▎  | 110/150 [11:21<04:14,  6.37s/it]

Problem 109: NO (conf: 1.00)


Evaluating answers with Gemini:  74%|███████▍  | 111/150 [11:27<04:05,  6.30s/it]

Problem 110: NO (conf: 1.00)


Evaluating answers with Gemini:  75%|███████▍  | 112/150 [11:34<04:02,  6.39s/it]

Problem 111: NO (conf: 0.90)


Evaluating answers with Gemini:  75%|███████▌  | 113/150 [11:40<03:55,  6.37s/it]

Problem 112: NO (conf: 1.00)


Evaluating answers with Gemini:  76%|███████▌  | 114/150 [11:52<04:42,  7.84s/it]

Problem 113: NO (conf: 0.95)


Evaluating answers with Gemini:  77%|███████▋  | 115/150 [11:58<04:17,  7.37s/it]

Problem 114: NO (conf: 0.70)


Evaluating answers with Gemini:  77%|███████▋  | 116/150 [12:04<03:59,  7.04s/it]

Problem 115: NO (conf: 1.00)


Evaluating answers with Gemini:  78%|███████▊  | 117/150 [12:10<03:38,  6.63s/it]

Problem 116: NO (conf: 1.00)


Evaluating answers with Gemini:  79%|███████▊  | 118/150 [12:16<03:31,  6.60s/it]

Problem 117: NO (conf: 1.00)


Evaluating answers with Gemini:  79%|███████▉  | 119/150 [12:23<03:27,  6.70s/it]

Problem 118: YES (conf: 0.90)


Evaluating answers with Gemini:  80%|████████  | 120/150 [12:29<03:14,  6.49s/it]

Problem 119: NO (conf: 1.00)


Evaluating answers with Gemini:  81%|████████  | 121/150 [12:35<03:04,  6.35s/it]

Problem 120: NO (conf: 1.00)


Evaluating answers with Gemini:  81%|████████▏ | 122/150 [12:42<02:58,  6.38s/it]

Problem 121: NO (conf: 1.00)


Evaluating answers with Gemini:  82%|████████▏ | 123/150 [12:48<02:50,  6.32s/it]

Problem 122: NO (conf: 1.00)


Evaluating answers with Gemini:  83%|████████▎ | 124/150 [12:54<02:42,  6.24s/it]

Problem 123: NO (conf: 1.00)


Evaluating answers with Gemini:  83%|████████▎ | 125/150 [13:00<02:34,  6.20s/it]

Problem 124: NO (conf: 0.90)


Evaluating answers with Gemini:  84%|████████▍ | 126/150 [13:06<02:27,  6.13s/it]

Problem 125: YES (conf: 1.00)


Evaluating answers with Gemini:  85%|████████▍ | 127/150 [13:12<02:22,  6.17s/it]

Problem 126: NO (conf: 1.00)


Evaluating answers with Gemini:  85%|████████▌ | 128/150 [13:18<02:12,  6.02s/it]

Problem 127: NO (conf: 1.00)


Evaluating answers with Gemini:  86%|████████▌ | 129/150 [13:24<02:05,  5.98s/it]

Problem 128: NO (conf: 0.95)


Evaluating answers with Gemini:  87%|████████▋ | 130/150 [13:30<01:58,  5.95s/it]

Problem 129: NO (conf: 1.00)


Evaluating answers with Gemini:  87%|████████▋ | 131/150 [13:36<01:54,  6.03s/it]

Problem 130: YES (conf: 0.90)


Evaluating answers with Gemini:  88%|████████▊ | 132/150 [13:42<01:51,  6.17s/it]

Problem 131: NO (conf: 1.00)


Evaluating answers with Gemini:  89%|████████▊ | 133/150 [14:03<02:57, 10.42s/it]

Problem 132: NO (conf: 1.00)


Evaluating answers with Gemini:  89%|████████▉ | 134/150 [14:10<02:30,  9.41s/it]

Problem 133: NO (conf: 1.00)


Evaluating answers with Gemini:  90%|█████████ | 135/150 [14:17<02:09,  8.63s/it]

Problem 134: NO (conf: 1.00)


Evaluating answers with Gemini:  91%|█████████ | 136/150 [14:22<01:48,  7.78s/it]

Problem 135: YES (conf: 1.00)


Evaluating answers with Gemini:  91%|█████████▏| 137/150 [14:29<01:35,  7.34s/it]

Problem 136: NO (conf: 1.00)


Evaluating answers with Gemini:  92%|█████████▏| 138/150 [14:36<01:28,  7.37s/it]

Problem 137: NO (conf: 0.95)


Evaluating answers with Gemini:  93%|█████████▎| 139/150 [14:42<01:15,  6.89s/it]

Problem 138: YES (conf: 1.00)


Evaluating answers with Gemini:  93%|█████████▎| 140/150 [14:48<01:05,  6.59s/it]

Problem 139: YES (conf: 1.00)


Evaluating answers with Gemini:  94%|█████████▍| 141/150 [14:54<00:57,  6.36s/it]

Problem 140: NO (conf: 1.00)


Evaluating answers with Gemini:  95%|█████████▍| 142/150 [14:59<00:49,  6.17s/it]

Problem 141: NO (conf: 1.00)


Evaluating answers with Gemini:  95%|█████████▌| 143/150 [15:05<00:42,  6.02s/it]

Problem 142: NO (conf: 1.00)


Evaluating answers with Gemini:  96%|█████████▌| 144/150 [15:11<00:36,  6.10s/it]

Problem 143: YES (conf: 1.00)


Evaluating answers with Gemini:  97%|█████████▋| 145/150 [15:17<00:30,  6.03s/it]

Problem 144: NO (conf: 1.00)


Evaluating answers with Gemini:  97%|█████████▋| 146/150 [15:25<00:25,  6.46s/it]

Problem 145: NO (conf: 0.99)


Evaluating answers with Gemini:  98%|█████████▊| 147/150 [15:31<00:18,  6.30s/it]

Problem 146: NO (conf: 1.00)


Evaluating answers with Gemini:  99%|█████████▊| 148/150 [15:37<00:12,  6.21s/it]

Problem 147: YES (conf: 1.00)


Evaluating answers with Gemini:  99%|█████████▉| 149/150 [15:43<00:06,  6.20s/it]

Problem 148: NO (conf: 1.00)


Evaluating answers with Gemini: 100%|██████████| 150/150 [15:49<00:00,  6.33s/it]

Problem 149: NO (conf: 1.00)

EVALUATION COMPLETED!
Model Used: gemini-2.5-flash-lite
Total Problems: 150
Successfully Processed: 150
Correct Answers: 28
Overall Accuracy: 18.67%
Problems with Confidence: 149 (99.3%)
Average Confidence: 0.989
Confidence Range: 0.700 - 1.000
Results saved to: /content/evaluation_results_Turkish_Mathstral_150.json





In [None]:
import json
import os
from collections import defaultdict, OrderedDict

def compute_accuracy(path):
    """
    Summarize evaluation decisions per answer type from a results JSON file.

    The file is expected to hold a top-level object whose
    'problem_evaluations' entry is a list of objects with 'answer_type' and
    'decision' fields. A decision counts as 'yes' or 'no' after stripping
    and lower-casing; anything else (including a missing decision) is
    counted as 'other'.

    Answer types are grouped case-insensitively, so "Numerical" and
    "numerical" land in the same bucket. The reported 'Answer Type' is the
    first spelling seen in the file.

    Args:
        path: Path to the evaluation results JSON file.

    Returns:
        An OrderedDict with keys in this order:
        'Numerical', 'Proof', 'Symbolic', 'Overall'.
        Each answer-type entry has, in order:
        'Answer Type', 'accuracy_percent', 'no', 'other', 'total', 'yes'.
        'Overall' has, in order:
        'accuracy_percent', 'decision_known_total', 'decision_known_yes',
        'total_items', 'yes'.
        'accuracy_percent' is yes / total * 100 rounded to 2 decimals, or
        None when there are no items.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    items = data.get('problem_evaluations', [])

    # Buckets are keyed by the lower-cased answer type (None for missing or
    # non-string types) so that spelling variants which differ only in case
    # are counted together instead of being split and partly ignored.
    stats = defaultdict(lambda: {'total': 0, 'yes': 0, 'no': 0, 'other': 0})
    # First-seen spelling for each bucket, used only for display.
    display_names = {}

    for it in items:
        a_type = it.get('answer_type')
        a_type_norm = a_type.strip() if isinstance(a_type, str) else None
        bucket_key = a_type_norm.lower() if a_type_norm is not None else None
        display_names.setdefault(bucket_key, a_type_norm)

        decision = str(it.get('decision', '')).strip().lower()

        bucket = stats[bucket_key]
        bucket['total'] += 1
        if decision == 'yes':
            bucket['yes'] += 1
        elif decision == 'no':
            bucket['no'] += 1
        else:
            bucket['other'] += 1

    def summarize_for(key):
        # .get() avoids creating an empty bucket in the defaultdict when the
        # requested answer type never occurred in the file.
        bucket_key = key.lower()
        v = stats.get(bucket_key)
        if v is None:
            return OrderedDict([
                ('Answer Type', key),
                ('accuracy_percent', None),
                ('no', 0),
                ('other', 0),
                ('total', 0),
                ('yes', 0),
            ])

        total = v['total']
        yes = v['yes']
        acc_frac = (yes / total) if total > 0 else None
        acc_percent = round(acc_frac * 100, 2) if acc_frac is not None else None
        return OrderedDict([
            ('Answer Type', display_names[bucket_key]),
            ('accuracy_percent', acc_percent),
            ('no', v['no']),
            ('other', v['other']),
            ('total', total),
            ('yes', yes),
        ])

    # Overall figures include every bucket, also items whose answer type is
    # missing or is not one of the three reported types.
    total_items = sum(v['total'] for v in stats.values())
    total_yes = sum(v['yes'] for v in stats.values())
    overall_frac = (total_yes / total_items) if total_items > 0 else None
    overall_percent = round(overall_frac * 100, 2) if overall_frac is not None else None

    # Items whose decision was an explicit yes/no (i.e. excluding 'other').
    decision_known_total = sum((v['yes'] + v['no']) for v in stats.values())
    decision_known_yes = total_yes

    overall_summary = OrderedDict([
        ('accuracy_percent', overall_percent),
        ('decision_known_total', decision_known_total),
        ('decision_known_yes', decision_known_yes),
        ('total_items', total_items),
        ('yes', total_yes),
    ])

    # Insertion order defines the order of keys in the printed JSON.
    final = OrderedDict()
    final['Numerical'] = summarize_for('Numerical')
    final['Proof'] = summarize_for('Proof')
    final['Symbolic'] = summarize_for('Symbolic')
    final['Overall'] = overall_summary

    return final

if __name__ == "__main__":
    # Script entry point: summarize one results file and print it as JSON.
    path = '/content/evaluation_results_Turkish_Mathstral_150.json'
    res = compute_accuracy(path)

    # json.dumps emits keys in insertion order, so the printed layout
    # follows the order built by compute_accuracy.
    print(
        json.dumps(
            res,
            ensure_ascii=False,
            indent=4,
        )
    )

{
    "Numerical": {
        "Answer Type": "Numerical",
        "accuracy_percent": 6.0,
        "no": 47,
        "other": 0,
        "total": 50,
        "yes": 3
    },
    "Proof": {
        "Answer Type": "Proof",
        "accuracy_percent": 32.0,
        "no": 34,
        "other": 0,
        "total": 50,
        "yes": 16
    },
    "Symbolic": {
        "Answer Type": "Symbolic",
        "accuracy_percent": 18.0,
        "no": 41,
        "other": 0,
        "total": 50,
        "yes": 9
    },
    "Overall": {
        "accuracy_percent": 18.67,
        "decision_known_total": 150,
        "decision_known_yes": 28,
        "total_items": 150,
        "yes": 28
    }
}


In [None]:
# import json
# import os
# from collections import defaultdict

# def compute_accuracy(path):
#     """
#     Compute accuracy (decision == 'yes') for Answer Types:
#     Symbolic, Numerical, Proof. Returns a dict with summaries,
#     plus overall accuracy (including items with 'other') and
#     decision-known accuracy (only items with yes/no).
#     """
#     if not os.path.exists(path):
#         raise FileNotFoundError(f"File not found: {path}")

#     with open(path, 'r', encoding='utf-8') as f:
#         data = json.load(f)

#     items = data.get('problem_evaluations', [])
#     stats = defaultdict(lambda: {'total': 0, 'yes': 0, 'no': 0, 'other': 0})

#     for it in items:
#         a_type = it.get('answer_type')
#         # Normalize Answer Type (keep None for missing)
#         a_type_norm = a_type.strip() if isinstance(a_type, str) else None

#         decision = str(it.get('decision', '')).strip().lower()

#         stats[a_type_norm]['total'] += 1
#         if decision == 'yes':
#             stats[a_type_norm]['yes'] += 1
#         elif decision == 'no':
#             stats[a_type_norm]['no'] += 1
#         else:
#             stats[a_type_norm]['other'] += 1

#     # helper to find a stored key case-insensitively
#     def summarize_for(key):
#         for stored_key, v in stats.items():
#             if stored_key and stored_key.lower() == key.lower():
#                 total = v['total']
#                 yes = v['yes']
#                 no = v['no']
#                 other = v['other']
#                 acc = (yes / total) if total > 0 else None
#                 return {
#                     'Answer Type': stored_key,
#                     'total': total,
#                     'yes': yes,
#                     'no': no,
#                     'other': other,
#                     'accuracy_frac': acc,
#                     'accuracy_percent': round(acc * 100, 2) if acc is not None else None
#                 }
#         # not found
#         return {
#             'Answer Type': key,
#             'total': 0,
#             'yes': 0,
#             'no': 0,
#             'other': 0,
#             'accuracy_frac': None,
#             'accuracy_percent': None
#         }

#     # overall calculations
#     total_items = sum(v['total'] for v in stats.values())
#     total_yes = sum(v['yes'] for v in stats.values())
#     overall_frac = (total_yes / total_items) if total_items > 0 else None

#     # accuracy only on items where decision is yes/no (ignore 'other')
#     decision_known_total = sum((v['yes'] + v['no']) for v in stats.values())
#     decision_known_yes = sum(v['yes'] for v in stats.values())
#     decision_known_frac = (decision_known_yes / decision_known_total) if decision_known_total > 0 else None

#     overall_summary = {
#         'total_items': total_items,
#         'yes': total_yes,
#         'accuracy_frac': overall_frac,
#         'accuracy_percent': round(overall_frac * 100, 2) if overall_frac is not None else None,
#         'decision_known_total': decision_known_total,
#         'decision_known_yes': decision_known_yes,
#         'decision_known_accuracy_frac': decision_known_frac,
#         'decision_known_accuracy_percent': round(decision_known_frac * 100, 2) if decision_known_frac is not None else None
#     }

#     # return requested summaries plus overall
#     return {
#         'Symbolic': summarize_for('Symbolic'),
#         'Numerical': summarize_for('Numerical'),
#         'Proof': summarize_for('Proof'),
#         'Overall': overall_summary
#     }

# if __name__ == "__main__":
#     # adjust this path to your environment (Colab: /content, sandbox: /mnt/data, etc.)
#     path = '/content/evaluation_results_French_GPT_OSS_20B_150.json'
#     res = compute_accuracy(path)
#     import pprint
#     pprint.pprint(res)

#     # also print a short human-readable summary
#     overall = res['Overall']
#     if overall['accuracy_frac'] is not None:
#         print("\nOverall accuracy (counting every item): "
#               f"{overall['accuracy_percent']}% ({overall['yes']}/{overall['total_items']})")
#     else:
#         print("\nOverall accuracy: No items found.")

#     if overall['decision_known_accuracy_frac'] is not None:
#         print("Decision-known accuracy (only yes/no): "
#               f"{overall['decision_known_accuracy_percent']}% "
#               f"({overall['decision_known_yes']}/{overall['decision_known_total']})")
#     else:
#         print("Decision-known accuracy: No yes/no decisions found.")

{'Numerical': {'Answer Type': 'Numerical',
               'accuracy_frac': 0.7,
               'accuracy_percent': 70.0,
               'no': 15,
               'other': 0,
               'total': 50,
               'yes': 35},
 'Overall': {'accuracy_frac': 0.76,
             'accuracy_percent': 76.0,
             'decision_known_accuracy_frac': 0.76,
             'decision_known_accuracy_percent': 76.0,
             'decision_known_total': 150,
             'decision_known_yes': 114,
             'total_items': 150,
             'yes': 114},
 'Proof': {'Answer Type': 'Proof',
           'accuracy_frac': 0.74,
           'accuracy_percent': 74.0,
           'no': 13,
           'other': 0,
           'total': 50,
           'yes': 37},
 'Symbolic': {'Answer Type': 'Symbolic',
              'accuracy_frac': 0.84,
              'accuracy_percent': 84.0,
              'no': 8,
              'other': 0,
              'total': 50,
              'yes': 42}}

Overall accuracy (counting every 