In [None]:
import json
import os
import re
import time
import logging
import queue
import threading
import traceback
from tqdm import tqdm
from collections import deque
from google import genai

# Configure the root logger once: INFO and above goes both to a log file in
# the working directory and to the console stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler('justification_evaluation.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


class GeminiTranslationApiManager:
    """
    Manages multiple Gemini API keys with rotation and rate limiting for evaluation tasks.

    Every request submitted through ``generate_content`` is placed on an
    internal queue and executed by a single daemon worker thread, which
    pauses ``rate_limit_delay`` seconds after each call. A per-key counter
    tracks usage; a key whose counter reaches ``calls_per_day`` (or that
    reported a quota / rate-limit error) is skipped in favour of the next one.
    """

    def __init__(self, api_keys, calls_per_day=1000, rate_limit_delay=5):
        """
        Args:
            api_keys: Iterable of API key strings; must contain at least one.
            calls_per_day: Per-key call budget before the key is skipped.
            rate_limit_delay: Seconds the worker sleeps after each API call.

        Raises:
            ValueError: If ``api_keys`` is empty or None.
        """
        # Materialise first: a one-shot iterable (e.g. a generator) would
        # otherwise be consumed by the deque and leave the usage table empty.
        api_keys = list(api_keys) if api_keys else []
        if not api_keys:
            raise ValueError("api_keys must contain at least one key")

        self.api_keys = deque(api_keys)
        self.calls_per_day = calls_per_day
        self.rate_limit_delay = rate_limit_delay

        self.usage_count = {key: 0 for key in api_keys}
        self.current_key = self.api_keys[0]
        self.client = genai.Client(api_key=self.current_key)

        # The lock guards current_key, client and usage_count.
        self.lock = threading.Lock()
        self.call_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._process_queue, name="GeminiWorker")
        self.worker_thread.daemon = True
        self.worker_thread.start()

        logging.info(f"Gemini API Manager initialized with {len(api_keys)} keys")

    def _rotate_key(self):
        """Rotate to the next available API key."""
        with self.lock:
            self.api_keys.rotate(1)
            self.current_key = self.api_keys[0]
            self.client = genai.Client(api_key=self.current_key)
            usage = self.usage_count.get(self.current_key, 0)
        logging.info(f"Rotated to new API key (usage: {usage})")

    def _find_available_key(self):
        """
        Find an API key that hasn't reached the daily limit.

        Returns:
            True when ``current_key`` (possibly after rotating) is below
            ``calls_per_day``; False when every key has been tried.
        """
        with self.lock:
            if self.usage_count.get(self.current_key, 0) < self.calls_per_day:
                return True

        initial_key = self.current_key
        for _ in range(len(self.api_keys)):
            self._rotate_key()
            with self.lock:
                if self.usage_count.get(self.current_key, 0) < self.calls_per_day:
                    return True
            # Back at the starting key: every key has been checked.
            if self.current_key == initial_key:
                return False
        return False

    def _execute_call(self, args, kwargs):
        """
        Run one queued request.

        Returns:
            ``(result, delay)`` where ``result`` is ``{"response": ...}`` or
            ``{"error": str}`` and ``delay`` is how long the worker should
            sleep before taking the next request.
        """
        if not self._find_available_key():
            return {"error": "All API keys have reached their daily limit"}, 10

        # Snapshot key and client together so the usage update below is
        # attributed to the key that actually served the call.
        with self.lock:
            key = self.current_key
            client = self.client

        try:
            response = client.models.generate_content(*args, **kwargs)
        except Exception as api_exc:
            msg = str(api_exc).lower()
            if 'quota' in msg or 'rate limit' in msg:
                # Mark the key as exhausted so the next request rotates away.
                with self.lock:
                    self.usage_count[key] = self.calls_per_day
                logging.warning(f"API key reached quota/rate-limit: {api_exc}")
            return {"error": str(api_exc)}, self.rate_limit_delay

        # Count the call before the result is handed back, so a caller that
        # reads the usage stats right after its call sees it included.
        with self.lock:
            self.usage_count[key] += 1
        return {"response": response}, self.rate_limit_delay

    def _process_queue(self):
        """
        Process the queue of API calls.

        Every dequeued request is answered exactly once (with a response or
        an error) and acknowledged with ``task_done()``, even when an
        unexpected exception occurs, so callers never wait for the full
        timeout on an internal failure.
        """
        while True:
            item = self.call_queue.get()
            result_queue = None
            try:
                args, kwargs, result_queue = item
                result, delay = self._execute_call(args, kwargs)
            except Exception as e:
                logging.error(f"Queue processing error: {e}\n{traceback.format_exc()}")
                result = {"error": f"Queue processing error: {e}"}
                delay = 1

            if result_queue is not None:
                result_queue.put(result)
            self.call_queue.task_done()
            time.sleep(delay)

    def generate_content(self, *args, timeout=300, **kwargs):
        """
        Make an API call to generate content.

        Positional and keyword arguments are forwarded unchanged to
        ``client.models.generate_content``.

        Args:
            timeout: Seconds to wait for the worker thread's answer.

        Returns:
            The response object returned by the client.

        Raises:
            TimeoutError: If no answer arrives within ``timeout`` seconds.
            Exception: With the error text if the call failed.
        """
        result_queue = queue.Queue()
        self.call_queue.put((args, kwargs, result_queue))

        try:
            result = result_queue.get(timeout=timeout)
        except queue.Empty:
            raise TimeoutError("Timed out waiting for API response")

        if "error" in result:
            raise Exception(result["error"])
        return result["response"]

    def get_usage_stats(self):
        """
        Get usage statistics for all keys.

        Returns:
            Dict with ``per_key`` (copy of the usage counters),
            ``total_used``, ``total_available`` and ``percent_used``.
        """
        with self.lock:
            per_key = dict(self.usage_count)
        total_used = sum(per_key.values())
        total_available = len(self.api_keys) * self.calls_per_day
        return {
            "per_key": per_key,
            "total_used": total_used,
            "total_available": total_available,
            "percent_used": (total_used / total_available) * 100 if total_available > 0 else 0
        }


class GeminiJustificationEvaluator:
    """
    Judge that asks a Gemini model whether an LLM-written error description
    matches the error that was intentionally planted in a math solution.

    The model is expected to write its analysis, then THINK_MARKER, then the
    final verdict; ``generate_response`` splits the reply at that marker.
    """

    # Separator the judge is told to emit between its analysis and its verdict.
    THINK_MARKER = "<<END_THINK>>"

    # Few-shot examples embedded verbatim into the evaluation prompt.
    EXAMPLE_EVALUATION = """
EXAMPLE 1:
Intended Error: "Incorrect application of the quadratic formula; missed the negative sign in the discriminant calculation"
Erroneous Part: "The step where discriminant is calculated as b² + 4ac instead of b² - 4ac"
LLM Generated Description: "The student incorrectly computed the discriminant using b² + 4ac rather than the correct formula b² - 4ac, leading to an incorrect result"
Analysis: The LLM correctly identified the specific error (wrong sign in discriminant), the exact location (discriminant calculation step), and the consequence. This matches both the error type and the erroneous part precisely.
<<END_THINK>>
{"secondary_decision":"True Positive", "confidence":0.95, "short_reason":"LLM accurately identified discriminant sign error"}
SECONDARY_DECISION: True Positive

EXAMPLE 2:
Intended Error: "Logical fallacy: affirming the consequent; claims 'if P then Q' is true, therefore 'if Q then P' must be true"
Erroneous Part: "Statement 'If a number can be squared to get another number, then it must be rational' - this is a logical fallacy"
LLM Generated Description: "The solution contains a calculation error in step 3 where the student multiplied instead of dividing"
Analysis: The LLM identified a completely different error (calculation mistake) than the intended error (logical fallacy). The LLM missed the actual reasoning flaw and focused on computational aspects instead.
<<END_THINK>>
{"secondary_decision":"False Positive", "confidence":0.90, "short_reason":"LLM identified wrong error type - computational vs logical"}
SECONDARY_DECISION: False Positive

EXAMPLE 3:
Intended Error: "Overgeneralization without proof; claims all square roots of perfect squares are rational without justification"
Erroneous Part: "The statement 'Any square root of a perfect square must be rational' lacks proper justification"
LLM Generated Description: "No errors detected. The solution correctly evaluates √9 = 3 and properly concludes it is rational"
Analysis: The LLM completely failed to detect any error despite the presence of an unjustified overgeneralization. This is a False Negative case.
<<END_THINK>>
{"secondary_decision":"False Negative", "confidence":0.85, "short_reason":"LLM failed to detect overgeneralization error"}
SECONDARY_DECISION: False Negative
"""

    def __init__(self, api_manager, seed=42):
        """
        Args:
            api_manager: Object exposing ``generate_content(model=..., contents=..., timeout=...)``.
            seed: Stored on the instance; not otherwise used by this class.
        """
        self.model_name = "gemini-2.5-flash-lite"
        self.seed = seed
        self.api_manager = api_manager

        # Same announcement goes to the log and to stdout.
        banner = f"Gemini justification evaluator initialized with model: {self.model_name}"
        logging.info(banner)
        print(banner)

    def generate_response(self, prompt):
        """
        Send ``prompt`` to the model and split the reply at the last THINK_MARKER.

        Returns:
            ``(thinking_text, final_text, full_response_text)``. Without a
            marker the thinking part is empty and the whole reply is the final
            text. On any exception the tuple is ``("", "Error: <message>", "")``.
        """
        try:
            reply = self.api_manager.generate_content(
                model=self.model_name,
                contents=prompt,
                timeout=300,
            )

            # Pull the text out of whichever shape the reply has.
            if hasattr(reply, 'text'):
                raw_text = reply.text
            elif isinstance(reply, dict) and 'text' in reply:
                raw_text = reply['text']
            else:
                raw_text = str(reply)

            whole = raw_text.strip()

            # rpartition splits on the LAST marker; an empty separator means
            # the marker never appeared.
            before, separator, after = whole.rpartition(self.THINK_MARKER)
            if not separator:
                return "", whole, whole
            return before.strip(), after.strip(), whole

        except Exception as e:
            logging.error(f"Error generating response with Gemini: {e}", exc_info=True)
            return "", f"Error: {str(e)}", ""


def _extract_json_block(text):
    """
    Find the last valid JSON object in text using improved pattern matching.
    Returns (json_obj, json_text) or (None, None)
    """
    if not text:
        return None, None

    # First try: regex pattern for JSON-like structures
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    matches = re.findall(json_pattern, text)

    # Try matches in reverse order (last match first)
    for match in reversed(matches):
        try:
            obj = json.loads(match)
            return obj, match
        except:
            continue

    # Fallback: original greedy approach
    last_open = text.rfind('{')
    if last_open == -1:
        return None, None

    candidate = text[last_open:]
    brace_count = 0

    for i, char in enumerate(candidate):
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
            if brace_count == 0:
                try:
                    maybe = candidate[:i+1]
                    obj = json.loads(maybe)
                    return obj, maybe
                except:
                    continue

    return None, None


def extract_secondary_decision_and_confidence(final_text):
    """
    Extract secondary decision, confidence, and short reason from the final_text.

    Priority:
      1) JSON block with keys 'secondary_decision', 'confidence', 'short_reason'
         (a few alternative key spellings are accepted)
      2) A final line 'SECONDARY_DECISION: True Positive/False Positive/False Negative'
      3) Heuristic text matching in the last three non-empty lines, only when
         they contain a conclusion word

    Confidence is normalised to 0..1: values in (1, 100] are treated as
    percentages; anything unparsable or out of range yields None. Confidence
    and short reason are only returned via the JSON path.

    Returns: (secondary_decision_str_or_none, confidence_or_none, short_reason_or_none)
    """
    if not final_text:
        return None, None, None

    valid_decisions = ["True Positive", "False Positive", "False Negative"]

    # 1) Try JSON block first
    obj, obj_text = _extract_json_block(final_text)
    if obj is not None:
        decision = None
        conf = None
        short = None

        # Normalize decision key; keep looking at later keys when an earlier
        # one holds an unrecognised value.
        for key in ("secondary_decision", "Secondary_Decision", "SECONDARY_DECISION", "decision"):
            if key in obj:
                decision_val = str(obj[key]).strip()
                for valid in valid_decisions:
                    if decision_val.lower() == valid.lower():
                        decision = valid
                        break
                if decision:
                    break

        # Extract confidence from the first matching key only.
        for key in ("confidence", "conf", "confidence_score"):
            if key in obj:
                try:
                    conf_val = float(obj[key])
                except (TypeError, ValueError, OverflowError):
                    # Non-numeric / absurdly large value: no usable confidence.
                    conf = None
                else:
                    if 0.0 <= conf_val <= 1.0:
                        conf = conf_val
                    elif 0.0 <= conf_val <= 100.0:
                        conf = conf_val / 100.0
                break

        # Extract short reason
        for key in ("short_reason", "reason", "short_reasoning"):
            if key in obj:
                short = str(obj[key])
                break

        if decision in valid_decisions:
            return decision, conf, short

    # 2) Check final lines for SECONDARY_DECISION: [value]
    lines = [ln.strip() for ln in final_text.strip().splitlines() if ln.strip()]
    if lines:
        last_line = lines[-1].strip().rstrip('.')
        for valid in valid_decisions:
            patterns = [
                f"SECONDARY_DECISION: {valid}",
                f"SECONDARY DECISION: {valid}",
                f"SECONDARY_DECISION {valid}",
                valid.upper()
            ]
            for pattern in patterns:
                if last_line.upper() == pattern.upper():
                    return valid, None, None

    # 3) Heuristic matching in conclusion context
    tail = " ".join(lines[-3:]).upper()
    conclusion_indicators = ["THEREFORE", "THUS", "CONCLUSION", "FINAL", "RESULT", "DECISION"]

    has_conclusion_context = any(indicator in tail for indicator in conclusion_indicators)

    if has_conclusion_context:
        for valid in valid_decisions:
            if valid.upper() in tail:
                return valid, None, None

    return None, None, None


def evaluate_justification_with_gemini(gemini_evaluator, problem_data, problem_index):
    """
    Ask the judge model whether the LLM's error description matches the
    intentionally planted error for one problem.

    When the record's 'Decision' is "NO" (case-insensitive) the judge is asked
    to explain the miss and the prompt fixes the verdict to False Negative;
    any other value uses the True Positive / False Positive comparison prompt.

    Returns:
        Dict with keys 'thinking', 'final_text', 'full_decoded',
        'secondary_decision', 'confidence', 'short_reason'. If anything
        raises, the error is logged and a dict with empty/None fields and an
        error message in 'final_text' is returned instead.
    """
    try:
        original_decision = problem_data.get('Decision', '')
        intended_error = problem_data.get('error_description', '')
        faulty_part = problem_data.get('erroneous_part', '')
        flawed_solution = problem_data.get('Wrong_Solution_Version_3', '')
        detected_error = problem_data.get('llm_generated_error_desc', '')
        no_error_claim = problem_data.get('llm_no_error_justification', '')

        detector_missed = original_decision.upper() == "NO"

        if not detector_missed:
            # The detector reported an error: compare its description with the
            # intended one.
            prompt = f"""You are an expert in evaluating mathematical error detection systems. Your task is to determine if an LLM correctly identified the intended errors in a mathematical solution.

EVALUATION CRITERIA:
You must check if the LLM-generated error description matches the intended error on ALL these dimensions:

1) **ERROR TYPE MATCH**: Does the LLM identify the same category of error?
   - Logical fallacies (affirming consequent, circular reasoning, etc.)
   - Overgeneralization without proof
   - Incorrect rule application
   - Step omissions
   - Faulty causal reasoning
   - Calculation errors
   - Conceptual misunderstandings

2) **SPECIFIC LOCATION MATCH**: Does the LLM pinpoint the same erroneous step/statement?

3) **REASONING MATCH**: Does the LLM explain WHY it's wrong in a similar way?

4) **CONSEQUENCE AWARENESS**: Does the LLM understand the impact of the error?

{GeminiJustificationEvaluator.EXAMPLE_EVALUATION}

CLASSIFICATION RULES:
- **True Positive**: LLM correctly identifies the error type, location, and reasoning (matches on all key dimensions)
- **False Positive**: LLM detects an error but identifies WRONG error type/location, or misunderstands the nature of the error

Now evaluate the following case:

INTENDED ERROR DESCRIPTION (what was deliberately introduced):
{intended_error}

SPECIFIC ERRONEOUS PART (the exact problematic component):
{faulty_part}

WRONG SOLUTION (containing the errors):
{flawed_solution}

LLM-GENERATED ERROR DESCRIPTION (what the LLM identified):
{detected_error}

EVALUATION PROCESS:
Step 1: Extract the key error type(s) from the intended error description
Step 2: Extract the key error type(s) from the LLM-generated description
Step 3: Compare if they match (same error category)
Step 4: Check if the LLM identified the correct location/step
Step 5: Assess if the reasoning for why it's wrong aligns
Step 6: Make final classification with confidence

OUTPUT REQUIREMENTS:
1) Provide detailed step-by-step analysis
2) Insert the thinking marker:
{GeminiJustificationEvaluator.THINK_MARKER}
3) Provide JSON block:
   {{"secondary_decision":"True Positive" or "False Positive", "confidence":0.0-1.0, "short_reason":"..."}}
4) End with:
   SECONDARY_DECISION: True Positive
   OR
   SECONDARY_DECISION: False Positive
"""
        else:
            # The detector said "no error": the verdict is predetermined, the
            # judge only explains the miss.
            no_error_text = no_error_claim or "The LLM stated there were no errors."
            prompt = f"""You are an expert in evaluating mathematical error detection systems.

The LLM was tasked with identifying errors in a mathematical solution, but it FAILED to detect any errors (Decision: NO).

However, there WERE intentional errors present in the solution:

INTENDED ERROR DESCRIPTION:
{intended_error}

SPECIFIC ERRONEOUS PART:
{faulty_part}

WRONG SOLUTION (with errors):
{flawed_solution}

LLM'S RESPONSE (claiming no errors):
{no_error_text}

TASK:
Analyze WHY the LLM failed to detect these errors. Consider:
1) Did the error require advanced reasoning the LLM lacks?
2) Was the error subtle or well-disguised?
3) Did the LLM focus on surface-level correctness?
4) What specific detection capability was missing?

Provide your analysis, then output:
<<END_THINK>>
{{"secondary_decision":"False Negative", "confidence":0.0-1.0, "short_reason":"brief explanation of failure"}}
SECONDARY_DECISION: False Negative
"""

        reasoning, verdict_text, raw_output = gemini_evaluator.generate_response(prompt)
        verdict, score, reason = extract_secondary_decision_and_confidence(verdict_text)

        outcome = {
            'thinking': reasoning,
            'final_text': verdict_text,
            'full_decoded': raw_output,
            'secondary_decision': verdict,
            'confidence': score,
            'short_reason': reason,
        }
        return outcome

    except Exception as e:
        logging.error(f"Error evaluating justification for problem {problem_index}: {e}", exc_info=True)
        failure = {
            'thinking': '',
            'final_text': f'Error occurred during evaluation: {str(e)}',
            'full_decoded': '',
            'secondary_decision': None,
            'confidence': None,
            'short_reason': None,
        }
        return failure


def validate_and_retry_evaluation(gemini_evaluator, problem_data, problem_index, max_retries=3):
    """
    Retry loop that attempts to get a clear secondary decision.

    The evaluation is repeated up to ``max_retries`` times until it yields
    one of "True Positive", "False Positive" or "False Negative". If every
    attempt fails, a default is derived from the record's 'Decision' field:
    "False Negative" when it is "NO" (case-insensitive), otherwise
    "False Positive". A missing or None 'Decision' is treated as empty.

    Returns (thinking, final_text, secondary_decision, confidence, short_reason)
    """
    last_result = None
    valid_decisions = ("True Positive", "False Positive", "False Negative")

    for attempt in range(1, max_retries + 1):
        result = evaluate_justification_with_gemini(
            gemini_evaluator,
            problem_data,
            problem_index
        )
        last_result = result
        secondary_decision = result['secondary_decision']

        # Success condition: we got a valid secondary decision
        if secondary_decision in valid_decisions:
            confidence = result.get('confidence')
            if confidence is not None:
                logging.info(f"Problem {problem_index}: {secondary_decision} (conf: {confidence:.2f})")
            return (
                result['thinking'],
                result['final_text'],
                secondary_decision,
                confidence,
                result.get('short_reason'),
            )

        logging.warning(f"Attempt {attempt}/{max_retries} failed to get clear secondary decision for problem {problem_index}; got={secondary_decision}")

        # Brief pause between attempts; none after the last one.
        if attempt < max_retries:
            time.sleep(0.1)

    # If all attempts fail, use conservative default based on original Decision.
    # `or ''` guards against a record whose Decision is present but None,
    # which would otherwise raise AttributeError here, outside any try block.
    decision = str(problem_data.get('Decision') or '').upper()
    if decision == "NO":
        default_secondary = "False Negative"
    else:
        default_secondary = "False Positive"  # Conservative: assume mismatch if unclear

    logging.error(f"All {max_retries} attempts failed for problem {problem_index}; defaulting to {default_secondary}")

    # No attempt was made at all (max_retries <= 0).
    if last_result is None:
        return '', 'All attempts failed - no output', default_secondary, None, 'Evaluation failed'

    return (
        last_result['thinking'],
        last_result['final_text'],
        default_secondary,
        last_result.get('confidence'),
        last_result.get('short_reason') or 'Failed to get clear decision'
    )


def save_json_atomic(obj, path):
    """
    Save JSON file atomically to prevent corruption.

    The data is written to ``<path>.tmp`` and then moved over ``path`` with
    ``os.replace``, so ``path`` is only ever replaced by a completely written
    file. If serialisation or the move fails, the temporary file is removed
    and the original exception is re-raised; an existing ``path`` is left
    untouched.

    Args:
        obj: JSON-serialisable object.
        path: Destination file path (str or os.PathLike).
    """
    tmp = os.fspath(path) + '.tmp'
    try:
        with open(tmp, 'w', encoding='utf-8') as f:
            json.dump(obj, f, indent=2, ensure_ascii=False)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise


def calculate_confidence_stats(evaluation_results):
    """
    Calculate statistics for confidence scores.

    Reads ``result['gemini_evaluation']['confidence']`` from every entry and
    ignores entries where the evaluation block or the confidence is missing
    or None.

    Returns:
        ``{"count": 0}`` when no confidence is available, otherwise a dict
        with "count", "mean", "min", "max" and "median" (for an even count
        the median is the average of the two middle values).
    """
    confidences = []
    for entry in evaluation_results:
        # `or {}` tolerates an explicit None stored under 'gemini_evaluation'.
        conf = (entry.get('gemini_evaluation') or {}).get('confidence')
        if conf is not None:
            confidences.append(conf)

    if not confidences:
        return {"count": 0}

    ordered = sorted(confidences)
    count = len(ordered)
    mid = count // 2
    if count % 2:
        median = ordered[mid]
    else:
        median = (ordered[mid - 1] + ordered[mid]) / 2

    return {
        "count": count,
        "mean": sum(ordered) / count,
        "min": ordered[0],
        "max": ordered[-1],
        "median": median
    }


def _load_api_keys(key_file):
    """
    Return the list of API keys for this run.

    Keys are read as a comma-separated list from ``key_file``; when the file
    is unreadable or empty, the GEMINI_API_KEYS environment variable is used
    instead. Keys read from the file are also exported to GEMINI_API_KEYS.

    Raises:
        ValueError: If neither source yields a key.
    """
    raw = ''
    try:
        with open(key_file, 'r') as fh:
            raw = fh.read().strip()
    except OSError as e:
        logging.warning(f"Could not read API key file {key_file}: {e}; trying GEMINI_API_KEYS")

    if raw:
        os.environ['GEMINI_API_KEYS'] = raw
        logging.info("Loaded API keys successfully")
    else:
        raw = os.environ.get('GEMINI_API_KEYS', '').strip()

    api_keys = [k.strip() for k in raw.split(',') if k.strip()]
    if not api_keys:
        raise ValueError("No API keys found")
    return api_keys


def _fallback_secondary_decision(decision):
    """Default verdict when evaluation crashed: a missed detection stays a False Negative."""
    return "False Negative" if decision.upper() == "NO" else "False Positive"


def main():
    """
    Main function to execute the justification evaluation process.

    Loads the API keys and the input problems, asks the Gemini judge for a
    secondary decision on every problem, writes a checkpoint file every 50
    problems, and finally saves and prints the aggregated statistics. Returns
    early (after logging) when the keys or the input file cannot be loaded.
    """

    # Configuration
    JSON_FILE_PATH = '/content/Perturbation_identification_Mathstral_Bangla_150.json'
    OUTPUT_FILE_PATH = '/content/justification_evaluation_results_Mathstral_Bangla.json'
    API_KEY_FILE = '/content/part_3_api_key.txt'

    # Judge generation settings
    MAX_RETRIES = 3

    # Load API keys
    try:
        api_keys = _load_api_keys(API_KEY_FILE)
    except Exception as e:
        logging.error(f"Failed to load API keys: {e}")
        print(f"Error loading API keys: {e}")
        return

    # Initialize API manager
    api_manager = GeminiTranslationApiManager(
        api_keys=api_keys,
        calls_per_day=1000,
        rate_limit_delay=5
    )

    # Initialize evaluator
    gemini_evaluator = GeminiJustificationEvaluator(
        api_manager=api_manager,
        seed=42
    )

    # Load input data
    try:
        with open(JSON_FILE_PATH, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        logging.info(f"Loaded {len(json_data)} problems from JSON file")
        print(f"Loaded {len(json_data)} problems from JSON file")
    except Exception as e:
        logging.error(f"Failed to load JSON input: {e}")
        print(f"Error loading JSON: {e}")
        return

    evaluation_results = []
    total_problems = len(json_data)

    # Counters for Primary Accuracy (Decision field)
    primary_correct = 0  # Decision = "YES"
    primary_total = 0

    # Counters for Secondary Accuracy (Secondary Decision field)
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    secondary_total = 0

    problems_with_confidence = 0

    # Process each problem
    for idx, problem in enumerate(tqdm(json_data, desc='Evaluating justifications with Gemini')):
        problem_index = idx
        decision = ''

        # Only the evaluation itself is guarded, so a later failure (e.g. a
        # checkpoint write) can never cause the same problem to be recorded
        # and counted twice.
        try:
            problem_index = problem.get('problem_index', idx)
            # A missing or None Decision is treated as empty.
            decision = str(problem.get('Decision') or '')

            thinking, final_text, secondary_decision, confidence, short_reason = validate_and_retry_evaluation(
                gemini_evaluator,
                problem,
                problem_index,
                max_retries=MAX_RETRIES
            )
        except Exception as e:
            logging.error(f"Error processing problem {problem_index}: {e}")
            thinking = ''
            final_text = f'Error during evaluation: {str(e)}'
            short_reason = final_text
            confidence = None
            # Same default rule the retry helper applies when it gives up.
            secondary_decision = _fallback_secondary_decision(decision)

        # Calculate Primary Accuracy
        primary_total += 1
        if decision.upper() == "YES":
            primary_correct += 1

        # Prepare result entry with all original fields plus new evaluation fields
        result_entry = dict(problem)  # Copy all original fields
        result_entry['Secondary_Decision'] = secondary_decision
        result_entry['gemini_evaluation'] = {
            'secondary_decision': secondary_decision,
            'confidence': confidence,
            'short_reason': short_reason,
            'thinking': thinking,
            'final_text': final_text,
        }
        evaluation_results.append(result_entry)

        # Calculate Secondary Accuracy
        secondary_total += 1
        if secondary_decision == "True Positive":
            true_positives += 1
        elif secondary_decision == "False Positive":
            false_positives += 1
        elif secondary_decision == "False Negative":
            false_negatives += 1

        if confidence is not None:
            problems_with_confidence += 1

        # Progress logging (`is not None` so a confidence of 0.0 is still shown)
        print(f"Problem {problem_index}: Decision={decision.upper()}, Secondary={secondary_decision}" +
              (f" (conf: {confidence:.2f})" if confidence is not None else ""))
        logging.info(f"Problem {problem_index}: Decision={decision}, Secondary={secondary_decision}" +
                     (f" (confidence: {confidence})" if confidence is not None else ""))

        # Intermediate saves and progress reports
        if secondary_total % 50 == 0:
            primary_accuracy = (primary_correct / primary_total) * 100 if primary_total else 0
            secondary_accuracy = (true_positives / secondary_total) * 100 if secondary_total else 0
            conf_rate = (problems_with_confidence / secondary_total) * 100

            logging.info(f"Progress: {secondary_total}/{total_problems}")
            logging.info(f"Primary Accuracy (Decision=YES): {primary_correct}/{primary_total} ({primary_accuracy:.2f}%)")
            logging.info(f"Secondary Accuracy (True Positive): {true_positives}/{secondary_total} ({secondary_accuracy:.2f}%)")
            logging.info(f"TP: {true_positives}, FP: {false_positives}, FN: {false_negatives}")
            logging.info(f"Confidence rate: {problems_with_confidence}/{secondary_total} ({conf_rate:.1f}%)")

            # Save intermediate results; a failed checkpoint is logged but
            # must not abort the run.
            intermediate_file = OUTPUT_FILE_PATH.replace('.json', f'_checkpoint_{secondary_total}.json')
            try:
                save_json_atomic(evaluation_results, intermediate_file)
            except Exception as e:
                logging.error(f"Failed to save checkpoint {intermediate_file}: {e}")

    # Calculate final statistics
    primary_accuracy = (primary_correct / primary_total) * 100 if primary_total else 0
    secondary_accuracy = (true_positives / secondary_total) * 100 if secondary_total else 0
    confidence_rate = (problems_with_confidence / secondary_total) * 100 if secondary_total else 0
    confidence_stats = calculate_confidence_stats(evaluation_results)

    detected = true_positives + false_positives
    relevant = true_positives + false_negatives
    precision = (true_positives / detected) * 100 if detected > 0 else 0
    recall = (true_positives / relevant) * 100 if relevant > 0 else 0

    # Prepare final results
    final_results = {
        'evaluation_metadata': {
            'model_used': gemini_evaluator.model_name,
            'total_problems': total_problems,
            'successfully_processed': secondary_total,

            # Primary Accuracy (Decision field)
            'primary_accuracy': {
                'description': 'Accuracy based on Decision field (YES/NO error detection)',
                'ground_truth': 'YES (all problems have intentional errors)',
                'correct_detections': primary_correct,
                'total_problems': primary_total,
                'accuracy_percentage': round(primary_accuracy, 2)
            },

            # Secondary Accuracy (Secondary Decision field)
            'secondary_accuracy': {
                'description': 'Accuracy based on Secondary Decision field (justification quality)',
                'true_positives': true_positives,
                'false_positives': false_positives,
                'false_negatives': false_negatives,
                'total_evaluated': secondary_total,
                'accuracy_percentage': round(secondary_accuracy, 2),
                'precision': round(precision, 2),
                'recall': round(recall, 2)
            },

            'problems_with_confidence': problems_with_confidence,
            'confidence_rate_percentage': round(confidence_rate, 2),
            'confidence_statistics': confidence_stats,
            'evaluation_timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'settings': {
                'max_retries': MAX_RETRIES,
                'seed': gemini_evaluator.seed,
            }
        },
        'problem_evaluations': evaluation_results,
    }

    # Save final results
    save_json_atomic(final_results, OUTPUT_FILE_PATH)

    # Print summary
    print('\n' + '='*80)
    print('JUSTIFICATION EVALUATION COMPLETED!')
    print('='*80)
    print(f"Model Used: {gemini_evaluator.model_name}")
    print(f"Total Problems: {total_problems}")
    print(f"Successfully Processed: {secondary_total}")
    print()
    print("PRIMARY ACCURACY (Error Detection - Decision Field):")
    print("  Ground Truth: All problems should have Decision = YES")
    print(f"  Correct Detections (YES): {primary_correct}/{primary_total}")
    print(f"  Accuracy: {primary_accuracy:.2f}%")
    print()
    print("SECONDARY ACCURACY (Justification Quality - Secondary Decision Field):")
    print(f"  True Positives: {true_positives}")
    print(f"  False Positives: {false_positives}")
    print(f"  False Negatives: {false_negatives}")
    print(f"  Accuracy (TP rate): {secondary_accuracy:.2f}%")
    if detected > 0:
        print(f"  Precision: {precision:.2f}%")
    if relevant > 0:
        print(f"  Recall: {recall:.2f}%")
    print()
    print(f"Problems with Confidence: {problems_with_confidence} ({confidence_rate:.1f}%)")
    if confidence_stats.get('count', 0) > 0:
        print(f"Average Confidence: {confidence_stats['mean']:.3f}")
        print(f"Confidence Range: {confidence_stats['min']:.3f} - {confidence_stats['max']:.3f}")
    print(f"\nResults saved to: {OUTPUT_FILE_PATH}")
    print('='*80)

    # Final logging
    logging.info('Justification evaluation with Gemini completed!')
    logging.info(f"Primary Accuracy: {primary_correct}/{primary_total} ({primary_accuracy:.2f}%)")
    logging.info(f"Secondary Accuracy: TP={true_positives}, FP={false_positives}, FN={false_negatives} ({secondary_accuracy:.2f}%)")
    logging.info(f"Confidence provided for {problems_with_confidence}/{secondary_total} problems ({confidence_rate:.1f}%)")
    logging.info(f"Results saved to: {OUTPUT_FILE_PATH}")

# Script entry point: invoke main() only when this file is executed directly
# (module name is '__main__'), not when it is imported from another module.
if __name__ == '__main__':
    main()

Gemini justification evaluator initialized with model: gemini-2.5-flash-lite
Loaded 150 problems from JSON file


Evaluating justifications with Gemini:   1%|          | 1/150 [00:04<11:36,  4.67s/it]

Problem 0: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:   1%|▏         | 2/150 [00:10<13:36,  5.51s/it]

Problem 1: Decision=YES, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:   2%|▏         | 3/150 [00:18<16:21,  6.68s/it]

Problem 2: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:   3%|▎         | 4/150 [00:29<19:44,  8.11s/it]

Problem 3: Decision=YES, Secondary=False Positive (conf: 0.75)


Evaluating justifications with Gemini:   3%|▎         | 5/150 [00:35<17:46,  7.35s/it]

Problem 4: Decision=YES, Secondary=False Positive (conf: 0.75)


Evaluating justifications with Gemini:   4%|▍         | 6/150 [00:41<16:32,  6.89s/it]

Problem 5: Decision=YES, Secondary=False Positive (conf: 0.85)


Evaluating justifications with Gemini:   5%|▍         | 7/150 [00:51<19:09,  8.04s/it]

Problem 6: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:   5%|▌         | 8/150 [01:00<19:50,  8.39s/it]

Problem 7: Decision=YES, Secondary=False Positive (conf: 0.85)


Evaluating justifications with Gemini:   6%|▌         | 9/150 [01:12<22:29,  9.57s/it]

Problem 8: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:   7%|▋         | 10/150 [01:21<21:46,  9.33s/it]

Problem 9: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:   7%|▋         | 11/150 [01:31<21:54,  9.45s/it]

Problem 10: Decision=YES, Secondary=False Positive


Evaluating justifications with Gemini:   8%|▊         | 12/150 [01:44<24:24, 10.61s/it]

Problem 11: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:   9%|▊         | 13/150 [02:00<28:00, 12.27s/it]

Problem 12: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:   9%|▉         | 14/150 [02:08<24:52, 10.97s/it]

Problem 13: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  10%|█         | 15/150 [02:20<25:32, 11.35s/it]

Problem 14: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  11%|█         | 16/150 [02:30<23:54, 10.70s/it]

Problem 15: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  11%|█▏        | 17/150 [02:43<25:42, 11.60s/it]

Problem 16: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  12%|█▏        | 18/150 [02:55<25:16, 11.49s/it]

Problem 17: Decision=YES, Secondary=False Positive (conf: 0.70)


Evaluating justifications with Gemini:  13%|█▎        | 19/150 [03:03<23:04, 10.57s/it]

Problem 18: Decision=YES, Secondary=False Positive (conf: 0.70)


Evaluating justifications with Gemini:  13%|█▎        | 20/150 [03:11<21:06,  9.75s/it]

Problem 19: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  14%|█▍        | 21/150 [03:17<18:50,  8.76s/it]

Problem 20: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  15%|█▍        | 22/150 [03:26<18:33,  8.70s/it]

Problem 21: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  15%|█▌        | 23/150 [03:39<21:18, 10.07s/it]

Problem 22: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  16%|█▌        | 24/150 [03:48<20:14,  9.64s/it]

Problem 23: Decision=YES, Secondary=False Positive (conf: 0.70)


Evaluating justifications with Gemini:  17%|█▋        | 25/150 [04:00<21:27, 10.30s/it]

Problem 24: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  17%|█▋        | 26/150 [04:09<20:26,  9.89s/it]

Problem 25: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  18%|█▊        | 27/150 [04:18<19:56,  9.73s/it]

Problem 26: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  19%|█▊        | 28/150 [04:27<19:33,  9.62s/it]

Problem 27: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  19%|█▉        | 29/150 [04:42<22:48, 11.31s/it]

Problem 28: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  20%|██        | 30/150 [04:54<22:29, 11.25s/it]

Problem 29: Decision=YES, Secondary=True Positive (conf: 0.85)


Evaluating justifications with Gemini:  21%|██        | 31/150 [05:01<20:15, 10.21s/it]

Problem 30: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  21%|██▏       | 32/150 [05:17<23:10, 11.78s/it]

Problem 31: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  22%|██▏       | 33/150 [05:24<20:28, 10.50s/it]

Problem 32: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  23%|██▎       | 34/150 [05:30<17:37,  9.11s/it]

Problem 33: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  23%|██▎       | 35/150 [05:36<15:36,  8.14s/it]

Problem 34: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  24%|██▍       | 36/150 [05:45<15:53,  8.36s/it]

Problem 35: Decision=YES, Secondary=True Positive (conf: 0.85)


Evaluating justifications with Gemini:  25%|██▍       | 37/150 [05:54<16:24,  8.72s/it]

Problem 36: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  25%|██▌       | 38/150 [06:04<16:37,  8.90s/it]

Problem 37: Decision=NO, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  26%|██▌       | 39/150 [06:11<15:16,  8.26s/it]

Problem 38: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  27%|██▋       | 40/150 [06:20<15:35,  8.50s/it]

Problem 39: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  27%|██▋       | 41/150 [06:29<16:07,  8.87s/it]

Problem 40: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  28%|██▊       | 42/150 [06:41<17:41,  9.83s/it]

Problem 41: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  29%|██▊       | 43/150 [06:51<17:17,  9.70s/it]

Problem 42: Decision=YES, Secondary=False Positive (conf: 0.85)


Evaluating justifications with Gemini:  29%|██▉       | 44/150 [07:02<17:39, 10.00s/it]

Problem 43: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  30%|███       | 45/150 [07:10<16:43,  9.55s/it]

Problem 44: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  31%|███       | 46/150 [07:19<16:11,  9.34s/it]

Problem 45: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  31%|███▏      | 47/150 [07:25<14:19,  8.34s/it]

Problem 46: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  32%|███▏      | 48/150 [07:33<14:14,  8.38s/it]

Problem 47: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  33%|███▎      | 49/150 [07:44<15:14,  9.05s/it]

Problem 48: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  33%|███▎      | 50/150 [08:08<22:43, 13.63s/it]

Problem 49: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  34%|███▍      | 51/150 [08:18<20:34, 12.47s/it]

Problem 50: Decision=YES, Secondary=False Positive (conf: 0.80)


Evaluating justifications with Gemini:  35%|███▍      | 52/150 [08:28<19:08, 11.72s/it]

Problem 51: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  35%|███▌      | 53/150 [08:41<19:37, 12.14s/it]

Problem 52: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  36%|███▌      | 54/150 [08:49<17:33, 10.98s/it]

Problem 53: Decision=YES, Secondary=True Positive (conf: 0.80)


Evaluating justifications with Gemini:  37%|███▋      | 55/150 [08:58<16:15, 10.27s/it]

Problem 54: Decision=YES, Secondary=False Negative (conf: 0.98)


Evaluating justifications with Gemini:  37%|███▋      | 56/150 [09:07<15:40, 10.00s/it]

Problem 55: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  38%|███▊      | 57/150 [09:16<14:50,  9.58s/it]

Problem 56: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  39%|███▊      | 58/150 [09:26<14:55,  9.73s/it]

Problem 57: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  39%|███▉      | 59/150 [09:34<14:07,  9.31s/it]

Problem 58: Decision=YES, Secondary=False Positive (conf: 0.80)


Evaluating justifications with Gemini:  40%|████      | 60/150 [09:49<16:21, 10.90s/it]

Problem 59: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  41%|████      | 61/150 [09:58<15:25, 10.40s/it]

Problem 60: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  41%|████▏     | 62/150 [10:19<19:51, 13.54s/it]

Problem 61: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  42%|████▏     | 63/150 [10:26<16:53, 11.65s/it]

Problem 62: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  43%|████▎     | 64/150 [10:36<15:47, 11.02s/it]

Problem 63: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  43%|████▎     | 65/150 [10:42<13:26,  9.49s/it]

Problem 64: Decision=YES, Secondary=False Positive (conf: 1.00)


Evaluating justifications with Gemini:  44%|████▍     | 66/150 [10:49<12:11,  8.71s/it]

Problem 65: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  45%|████▍     | 67/150 [11:04<14:40, 10.61s/it]

Problem 66: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  45%|████▌     | 68/150 [11:12<13:24,  9.81s/it]

Problem 67: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  46%|████▌     | 69/150 [11:23<13:54, 10.31s/it]

Problem 68: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  47%|████▋     | 70/150 [11:33<13:22, 10.03s/it]

Problem 69: Decision=YES, Secondary=False Positive (conf: 0.80)


Evaluating justifications with Gemini:  47%|████▋     | 71/150 [11:42<12:47,  9.71s/it]

Problem 70: Decision=NO, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  48%|████▊     | 72/150 [11:47<11:07,  8.56s/it]

Problem 71: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  49%|████▊     | 73/150 [11:58<11:43,  9.14s/it]

Problem 72: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  49%|████▉     | 74/150 [12:08<11:55,  9.41s/it]

Problem 73: Decision=YES, Secondary=False Positive (conf: 0.75)


Evaluating justifications with Gemini:  50%|█████     | 75/150 [12:18<11:54,  9.53s/it]

Problem 74: Decision=NO, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  51%|█████     | 76/150 [12:27<11:37,  9.42s/it]

Problem 75: Decision=NO, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  51%|█████▏    | 77/150 [12:35<10:49,  8.90s/it]

Problem 76: Decision=YES, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  52%|█████▏    | 78/150 [12:41<09:38,  8.03s/it]

Problem 77: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  53%|█████▎    | 79/150 [12:48<09:05,  7.69s/it]

Problem 78: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  53%|█████▎    | 80/150 [12:57<09:39,  8.28s/it]

Problem 79: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  54%|█████▍    | 81/150 [13:06<09:38,  8.38s/it]

Problem 80: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  55%|█████▍    | 82/150 [13:12<08:36,  7.60s/it]

Problem 81: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  55%|█████▌    | 83/150 [13:31<12:19, 11.04s/it]

Problem 82: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  56%|█████▌    | 84/150 [13:38<11:01, 10.03s/it]

Problem 83: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  57%|█████▋    | 85/150 [13:55<12:53, 11.91s/it]

Problem 84: Decision=YES, Secondary=False Positive (conf: 0.85)


Evaluating justifications with Gemini:  57%|█████▋    | 86/150 [14:03<11:33, 10.84s/it]

Problem 85: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  58%|█████▊    | 87/150 [14:10<10:20,  9.85s/it]

Problem 86: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  59%|█████▊    | 88/150 [14:17<09:11,  8.90s/it]

Problem 87: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  59%|█████▉    | 89/150 [14:27<09:20,  9.19s/it]

Problem 88: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  60%|██████    | 90/150 [14:40<10:11, 10.19s/it]

Problem 89: Decision=YES, Secondary=False Positive (conf: 0.85)


Evaluating justifications with Gemini:  61%|██████    | 91/150 [14:51<10:30, 10.68s/it]

Problem 90: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  61%|██████▏   | 92/150 [15:00<09:44, 10.09s/it]

Problem 91: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  62%|██████▏   | 93/150 [15:09<09:20,  9.83s/it]

Problem 92: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  63%|██████▎   | 94/150 [15:16<08:17,  8.89s/it]

Problem 93: Decision=YES, Secondary=False Positive (conf: 1.00)


Evaluating justifications with Gemini:  63%|██████▎   | 95/150 [15:36<11:06, 12.11s/it]

Problem 94: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  64%|██████▍   | 96/150 [15:45<10:10, 11.31s/it]

Problem 95: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  65%|██████▍   | 97/150 [15:53<09:07, 10.34s/it]

Problem 96: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  65%|██████▌   | 98/150 [16:02<08:30,  9.82s/it]

Problem 97: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  66%|██████▌   | 99/150 [16:10<08:01,  9.43s/it]

Problem 98: Decision=YES, Secondary=False Positive (conf: 0.70)


Evaluating justifications with Gemini:  67%|██████▋   | 100/150 [16:23<08:37, 10.34s/it]

Problem 99: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  67%|██████▋   | 101/150 [16:31<08:02,  9.84s/it]

Problem 100: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  68%|██████▊   | 102/150 [16:49<09:42, 12.14s/it]

Problem 101: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  69%|██████▊   | 103/150 [16:56<08:19, 10.63s/it]

Problem 102: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  69%|██████▉   | 104/150 [17:05<07:42, 10.05s/it]

Problem 103: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  70%|███████   | 105/150 [17:14<07:18,  9.74s/it]

Problem 104: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  71%|███████   | 106/150 [17:26<07:46, 10.61s/it]

Problem 105: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  71%|███████▏  | 107/150 [17:36<07:17, 10.18s/it]

Problem 106: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  72%|███████▏  | 108/150 [17:44<06:49,  9.75s/it]

Problem 107: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  73%|███████▎  | 109/150 [17:54<06:33,  9.61s/it]

Problem 108: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  73%|███████▎  | 110/150 [18:14<08:38, 12.97s/it]

Problem 109: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  74%|███████▍  | 111/150 [18:24<07:50, 12.07s/it]

Problem 110: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  75%|███████▍  | 112/150 [18:42<08:39, 13.68s/it]

Problem 111: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  75%|███████▌  | 113/150 [18:51<07:31, 12.19s/it]

Problem 112: Decision=YES, Secondary=False Negative (conf: 1.00)


Evaluating justifications with Gemini:  76%|███████▌  | 114/150 [19:04<07:31, 12.55s/it]

Problem 113: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  77%|███████▋  | 115/150 [19:11<06:18, 10.81s/it]

Problem 114: Decision=YES, Secondary=False Positive (conf: 0.99)


Evaluating justifications with Gemini:  77%|███████▋  | 116/150 [19:19<05:45, 10.17s/it]

Problem 115: Decision=YES, Secondary=False Positive (conf: 0.75)


Evaluating justifications with Gemini:  78%|███████▊  | 117/150 [19:35<06:29, 11.82s/it]

Problem 116: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  79%|███████▊  | 118/150 [19:43<05:46, 10.82s/it]

Problem 117: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  79%|███████▉  | 119/150 [19:51<05:00,  9.70s/it]

Problem 118: Decision=YES, Secondary=False Positive (conf: 0.99)


Evaluating justifications with Gemini:  80%|████████  | 120/150 [19:59<04:40,  9.34s/it]

Problem 119: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  81%|████████  | 121/150 [20:08<04:24,  9.11s/it]

Problem 120: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  81%|████████▏ | 122/150 [20:16<04:10,  8.96s/it]

Problem 121: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  82%|████████▏ | 123/150 [20:25<04:04,  9.04s/it]

Problem 122: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  83%|████████▎ | 124/150 [20:36<04:05,  9.43s/it]

Problem 123: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  83%|████████▎ | 125/150 [20:49<04:26, 10.67s/it]

Problem 124: Decision=YES, Secondary=True Positive (conf: 0.85)


Evaluating justifications with Gemini:  84%|████████▍ | 126/150 [20:58<04:00, 10.02s/it]

Problem 125: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  85%|████████▍ | 127/150 [21:11<04:13, 11.03s/it]

Problem 126: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  85%|████████▌ | 128/150 [21:19<03:40, 10.00s/it]

Problem 127: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  86%|████████▌ | 129/150 [21:30<03:35, 10.27s/it]

Problem 128: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  87%|████████▋ | 130/150 [21:38<03:15,  9.79s/it]

Problem 129: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  87%|████████▋ | 131/150 [21:51<03:22, 10.64s/it]

Problem 130: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  88%|████████▊ | 132/150 [21:58<02:53,  9.66s/it]

Problem 131: Decision=YES, Secondary=False Positive (conf: 1.00)


Evaluating justifications with Gemini:  89%|████████▊ | 133/150 [22:07<02:39,  9.36s/it]

Problem 132: Decision=YES, Secondary=False Positive (conf: 0.70)


Evaluating justifications with Gemini:  89%|████████▉ | 134/150 [22:15<02:23,  8.98s/it]

Problem 133: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  90%|█████████ | 135/150 [22:23<02:11,  8.77s/it]

Problem 134: Decision=YES, Secondary=True Positive (conf: 0.90)


Evaluating justifications with Gemini:  91%|█████████ | 136/150 [22:34<02:12,  9.43s/it]

Problem 135: Decision=YES, Secondary=False Negative


Evaluating justifications with Gemini:  91%|█████████▏| 137/150 [22:44<02:02,  9.45s/it]

Problem 136: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  92%|█████████▏| 138/150 [22:52<01:47,  8.95s/it]

Problem 137: Decision=YES, Secondary=False Negative (conf: 0.98)


Evaluating justifications with Gemini:  93%|█████████▎| 139/150 [23:10<02:08, 11.69s/it]

Problem 138: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  93%|█████████▎| 140/150 [23:19<01:47, 10.79s/it]

Problem 139: Decision=NO, Secondary=False Negative (conf: 0.90)


Evaluating justifications with Gemini:  94%|█████████▍| 141/150 [23:27<01:31, 10.15s/it]

Problem 140: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  95%|█████████▍| 142/150 [23:36<01:19,  9.90s/it]

Problem 141: Decision=YES, Secondary=True Positive (conf: 0.95)


Evaluating justifications with Gemini:  95%|█████████▌| 143/150 [23:46<01:07,  9.70s/it]

Problem 142: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  96%|█████████▌| 144/150 [23:51<00:51,  8.50s/it]

Problem 143: Decision=YES, Secondary=False Positive (conf: 0.90)


Evaluating justifications with Gemini:  97%|█████████▋| 145/150 [23:57<00:38,  7.72s/it]

Problem 144: Decision=YES, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  97%|█████████▋| 146/150 [24:07<00:32,  8.25s/it]

Problem 145: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini:  98%|█████████▊| 147/150 [24:16<00:25,  8.52s/it]

Problem 146: Decision=NO, Secondary=False Negative


Evaluating justifications with Gemini:  99%|█████████▊| 148/150 [24:24<00:16,  8.31s/it]

Problem 147: Decision=YES, Secondary=False Positive (conf: 0.95)


Evaluating justifications with Gemini:  99%|█████████▉| 149/150 [24:33<00:08,  8.60s/it]

Problem 148: Decision=NO, Secondary=False Negative (conf: 0.95)


Evaluating justifications with Gemini: 100%|██████████| 150/150 [24:42<00:00,  9.88s/it]

Problem 149: Decision=YES, Secondary=False Positive (conf: 0.95)

JUSTIFICATION EVALUATION COMPLETED!
Model Used: gemini-2.5-flash-lite
Total Problems: 150
Successfully Processed: 150

PRIMARY ACCURACY (Error Detection - Decision Field):
  Ground Truth: All problems should have Decision = YES
  Correct Detections (YES): 117/150
  Accuracy: 78.00%

SECONDARY ACCURACY (Justification Quality - Secondary Decision Field):
  True Positives: 30
  False Positives: 71
  False Negatives: 49
  Accuracy (TP rate): 20.00%
  Precision: 29.70%
  Recall: 37.97%

Problems with Confidence: 147 (98.0%)
Average Confidence: 0.920
Confidence Range: 0.700 - 1.000

Results saved to: /content/justification_evaluation_results_Mathstral_Bangla.json



