In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found beneath it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Gemini API keys are read from the attached Kaggle dataset as a single
# comma-separated line: key1,key2,...
# A missing or unreadable file is not fatal here: `keys` is left empty so the
# GEMINI_API_KEYS environment variable can still be used as a fallback later on.
try:
    with open('/kaggle/input/numerical-symbolic-nctb/gemini_keys.txt', 'r', encoding='utf-8') as fh:
        keys = fh.read().strip()
except OSError as exc:
    keys = ''
    print(f"Could not read Gemini key file ({exc}); will rely on GEMINI_API_KEYS if it is set.")

import os
import re
import time
import json
import queue
import random
import logging
import threading
import traceback
from tqdm import tqdm
from collections import deque
from datetime import datetime, timezone
from google import genai
from typing import Optional, Dict, Any

# ---------------------------
# Logging
# ---------------------------
# Send log records both to a file in the working directory and to the console.
# The file handler is opened explicitly as UTF-8: raw Gemini responses are
# logged verbatim, and the platform-default encoding may not be able to
# represent the Bangla text they contain.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("bangla_mcq_generator.log", encoding="utf-8"),
        logging.StreamHandler()
    ]
)

# Mirror any keys loaded earlier into GEMINI_API_KEYS so that other code can
# pick them up from the environment; otherwise just report that none were loaded.
if not globals().get('keys'):
    print("No keys loaded from Drive; will attempt to read GEMINI_API_KEYS environment variable.")
else:
    os.environ['GEMINI_API_KEYS'] = keys
    print("Loaded keys from Drive into environment variable GEMINI_API_KEYS.")

# ---------------------------
# Gemini API manager
# ---------------------------
class GeminiMCQApiManager:
    """
    Manages multiple Gemini API keys with rotation and rate limiting for MCQ generation tasks.

    Requests submitted through generate_content() are placed on a queue and executed one
    at a time by a single daemon worker thread, which pauses ``rate_limit_delay`` seconds
    after each request. Per-key usage counters and the active key/client are guarded by
    a lock.
    """

    # Seconds the worker pauses after discovering that every key is exhausted.
    EXHAUSTED_BACKOFF_SECONDS = 10
    # Lower-case fragments of an API error message that are treated as a quota/rate-limit signal.
    QUOTA_ERROR_MARKERS = ('quota', 'rate limit')

    def __init__(self, api_keys, calls_per_day=1000, rate_limit_delay=4):
        """
        Args:
            api_keys: Non-empty iterable of API key strings.
            calls_per_day: Number of successful calls after which a key is considered used up.
            rate_limit_delay: Seconds the worker sleeps after each processed request.

        Raises:
            ValueError: If ``api_keys`` is empty.
        """
        if not api_keys:
            raise ValueError("api_keys must contain at least one key")

        # Materialise once so one-shot iterables can feed both the deque and the usage map.
        api_keys = list(api_keys)

        self.api_keys = deque(api_keys)
        self.calls_per_day = calls_per_day
        self.rate_limit_delay = rate_limit_delay

        # Track usage for each key
        self.usage_count = {key: 0 for key in api_keys}
        self.current_key = self.api_keys[0]
        self.client = genai.Client(api_key=self.current_key)

        # Thread-safety
        self.lock = threading.Lock()

        # Set up a queue for API calls
        self.call_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._process_queue, name="GeminiWorker")
        self.worker_thread.daemon = True
        self.worker_thread.start()

        logging.info(f"MCQ API Manager initialized with {len(api_keys)} keys")

    def _rotate_key(self):
        """Rotate to the next API key and build a client for it (thread-safe)."""
        with self.lock:
            self.api_keys.rotate(1)
            self.current_key = self.api_keys[0]
            self.client = genai.Client(api_key=self.current_key)
            usage = self.usage_count.get(self.current_key, 0)
        logging.info(f"Rotated to new API key (usage: {usage})")

    def _find_available_key(self):
        """
        Make a key that is still below ``calls_per_day`` the active one.

        The current key is kept if it still has capacity. Otherwise the remaining keys
        are examined in the order successive ``rotate(1)`` calls would visit them, and a
        new client is created only for the key that is finally selected.

        Returns:
            True if a usable key is active afterwards, False if every key is exhausted.
        """
        rotated_usage = None
        with self.lock:
            for steps in range(len(self.api_keys)):
                # Index -steps is the key that `steps` right-rotations would bring to the front
                # (and -0 is simply the current front).
                candidate = self.api_keys[-steps]
                if self.usage_count.get(candidate, 0) < self.calls_per_day:
                    break
            else:
                return False

            if steps:
                # Build the client first so a constructor failure leaves the state untouched.
                new_client = genai.Client(api_key=candidate)
                self.api_keys.rotate(steps)
                self.current_key = candidate
                self.client = new_client
                rotated_usage = self.usage_count.get(candidate, 0)

        if rotated_usage is not None:
            logging.info(f"Rotated to new API key (usage: {rotated_usage})")
        return True

    def _call_with_rotation(self, args, kwargs):
        """
        Execute one generate_content request, moving on to another key after a quota failure.

        Returns:
            A ``(result, attempted)`` tuple. ``result`` is ``{"response": ...}`` on success
            or ``{"error": "..."}`` on failure; ``attempted`` is False when no API call was
            made because every key was already exhausted.
        """
        attempted = False
        last_quota_error = None

        # Every quota failure marks one key as exhausted, so at most one try per key is needed.
        for _ in range(len(self.api_keys)):
            if not self._find_available_key():
                break

            key, client = self.current_key, self.client
            attempted = True
            try:
                # call into Gemini client (args and kwargs forwarded)
                response = client.models.generate_content(*args, **kwargs)
            except Exception as api_exc:
                msg = str(api_exc).lower()
                if any(marker in msg for marker in self.QUOTA_ERROR_MARKERS):
                    with self.lock:
                        self.usage_count[key] = self.calls_per_day
                    logging.warning(f"API key reached quota/rate-limit: {api_exc}")
                    last_quota_error = api_exc
                    continue
                logging.error(f"API call error: {api_exc}")
                return {"error": str(api_exc)}, attempted

            with self.lock:
                self.usage_count[key] = self.usage_count.get(key, 0) + 1
            return {"response": response}, attempted

        if last_quota_error is not None:
            return {"error": f"Rate limit/quota: {last_quota_error}"}, attempted
        return {"error": "All API keys have reached their daily limit"}, attempted

    def _process_queue(self):
        """Process the queue of API calls in a background worker (runs forever)."""
        while True:
            item = self.call_queue.get()
            pause = self.rate_limit_delay
            try:
                args, kwargs, result_queue = item
                try:
                    result, attempted = self._call_with_rotation(args, kwargs)
                    if not attempted:
                        # Nothing was sent; back off longer instead of spinning through a backlog.
                        pause = self.EXHAUSTED_BACKOFF_SECONDS
                except Exception as e:
                    logging.error(f"Unexpected API invocation error: {e}\n{traceback.format_exc()}")
                    result = {"error": str(e)}
                result_queue.put(result)
            except Exception as e:
                logging.error(f"Queue processing error: {e}\n{traceback.format_exc()}")
                pause = 1
            finally:
                # Always balance the get() above, whatever happened to the request.
                self.call_queue.task_done()
            time.sleep(pause)

    def generate_content(self, *args, timeout=60, **kwargs):
        """
        Make an API call to generate content, automatically handling key rotation.

        Positional and keyword arguments are forwarded to the client's
        ``models.generate_content``.

        Args:
            timeout: Seconds to wait for the worker to deliver a result.

        Returns:
            The response object returned by the client.

        Raises:
            TimeoutError: If no result arrives within ``timeout`` seconds.
            RuntimeError: If the worker reports an error for this request.
        """
        result_queue = queue.Queue()
        self.call_queue.put((args, kwargs, result_queue))
        try:
            result = result_queue.get(timeout=timeout)
        except queue.Empty:
            raise TimeoutError("Timed out waiting for API worker result.") from None

        if "error" in result:
            raise RuntimeError(result["error"])
        return result["response"]

    def reset_usage_counts(self):
        """Reset the usage counts for all keys (e.g., at the start of a new day)."""
        with self.lock:
            self.usage_count = {key: 0 for key in self.api_keys}
        logging.info("Reset API key usage counts")

    def get_usage_stats(self):
        """
        Get usage statistics for all keys.

        Returns:
            Dict with ``per_key`` (copy of the usage counters), ``total_used``,
            ``total_available`` (number of keys times ``calls_per_day``) and
            ``percent_used`` (0 when nothing is available).
        """
        with self.lock:
            per_key = dict(self.usage_count)
        total_used = sum(per_key.values())
        total_available = len(self.api_keys) * self.calls_per_day
        return {
            "per_key": per_key,
            "total_used": total_used,
            "total_available": total_available,
            "percent_used": (total_used / total_available) * 100 if total_available > 0 else 0
        }

# ---------------------------
# Robust Gemini JSON extraction (replaces old extract_json_from_response)
# ---------------------------

# Combined code-fence regex that covers:
#  ```json\n ... \n```
#  ```json ... ```
#  ```\n ... \n```
#  ``` ... ```
_FENCE_RE = re.compile(r'```(?:json)?\s*\n?(.*?)\n?```', re.DOTALL | re.IGNORECASE)

# A backslash, optionally followed by the body of a *valid* JSON escape:
# one of " \ / b f n r t, or u plus exactly four hex digits. Matching valid
# escapes as a unit keeps an already-escaped "\\" from being touched again.
_BACKSLASH_RE = re.compile(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})?')


def _escape_stray_backslashes(candidate: str) -> str:
    """Double every backslash in *candidate* that does not start a valid JSON escape."""
    return _BACKSLASH_RE.sub(
        lambda m: m.group(0) if m.group(1) else '\\\\',
        candidate,
    )


def _loads_dict_lenient(candidate: str) -> Optional[Dict[str, Any]]:
    """Parse *candidate* as a JSON object, retrying once with stray backslashes escaped.

    Returns the parsed dict, or None if parsing fails or the result is not a dict.
    """
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        try:
            parsed = json.loads(_escape_stray_backslashes(candidate))
        except json.JSONDecodeError:
            return None
    return parsed if isinstance(parsed, dict) else None


def parse_gemini_json_object(text: str) -> Optional[Dict[str, Any]]:
    """
    Robustly extract the first JSON object from Gemini output.
    1) Look for fenced code blocks (```json ... ``` and variants) and try each match.
    2) For each candidate try json.loads; if it fails, escape stray backslashes and retry.
    3) If no fenced block parses, try the whole text, then fall back to a
       balanced-brace scan of the entire text.
    Returns a dict on success, or None on failure.

    Note: the brace scan counts every '{' and '}' it sees, including ones inside
    string values, so it relies on braces within strings being balanced.
    """
    if not text:
        return None

    # Normalize whitespace a little
    text = text.strip()

    # Try fenced blocks first (if any). findall returns all non-overlapping matches.
    for match in _FENCE_RE.findall(text):
        candidate = match.strip()
        if not candidate:
            continue
        parsed = _loads_dict_lenient(candidate)
        if parsed is not None:
            return parsed

    # If no fenced block worked, maybe the whole text is the JSON object.
    parsed = _loads_dict_lenient(text)
    if parsed is not None:
        return parsed

    # Fallback: scan for the first balanced {...} substring (robust to extra text around JSON)
    start = None
    depth = 0
    for i, ch in enumerate(text):
        if ch == '{':
            if start is None:
                start = i
            depth += 1
        elif ch == '}' and start is not None:
            depth -= 1
            if depth == 0:
                parsed = _loads_dict_lenient(text[start:i + 1])
                if parsed is not None:
                    return parsed
                # This candidate is unusable; resume scanning after it.
                start = None
    return None



# ---------------------------
# SIMPLIFIED: Direct MCQ building
# ---------------------------

def build_mcq_choices(wrong_answers, exact_answer):
    """
    Build MCQ choices directly from Gemini output.
    NO CLEANING - option texts are preserved exactly as generated.

    Args:
        wrong_answers: Sequence of distractors (may be None). Only the first three
            are used; missing ones are filled with a placeholder. The caller's
            sequence is never modified.
        exact_answer: The correct answer, added as the fourth option.

    Returns:
        ``(choices, correct_letter)`` where ``choices`` is a list of four strings
        formatted as "A. ...", "B. ...", "C. ...", "D. ..." in random order and
        ``correct_letter`` is the letter of the slot holding ``exact_answer``.
    """
    # Work on a copy so padding never leaks back into the caller's list.
    distractors = list(wrong_answers or [])[:3]
    distractors += ["অনির্ণেয়"] * (3 - len(distractors))  # fallback

    # Create all 4 options (3 wrong + 1 correct); the correct one is last.
    options = distractors + [exact_answer]

    # Shuffle positions rather than values so the correct slot is tracked by
    # position even when a distractor happens to equal the correct answer.
    order = list(range(len(options)))
    random.shuffle(order)

    correct_index = order.index(len(options) - 1)
    correct_letter = chr(ord('A') + correct_index)

    # Format as A. B. C. D.
    choices = [f"{chr(ord('A') + slot)}. {options[source]}" for slot, source in enumerate(order)]

    return choices, correct_letter

# ---------------------------
# Advanced prompt template
# ---------------------------

# Prompt sent to Gemini for each question. It is filled in with str.format(), so the
# {exact_answer}, {question} and {solution} placeholders are substituted and the doubled
# braces around the JSON example collapse to literal single braces.
ADVANCED_PROMPT_TEMPLATE = r"""You are a mathematics expert and educational assessment specialist who understands common student misconceptions.

TASK: Generate exactly 3 STRATEGICALLY CRAFTED wrong answers (distractors) for this multiple-choice question.

CORRECT ANSWER: {exact_answer}
QUESTION: {question}
SOLUTION: {solution}

🎯 ADVANCED DISTRACTOR GENERATION STRATEGIES:

A) PROCEDURAL ERRORS (Target: Students who know the method but make execution mistakes):
   • Arithmetic mistakes: 7×8=54 instead of 56, 144÷12=11 instead of 12
   • Sign errors: Getting -5 instead of +5, forgetting negative signs
   • Order of operations: Calculating 2+3×4=20 instead of 14
   • Fraction errors: Adding numerators and denominators incorrectly
   • Decimal placement: 0.25×4=0.10 instead of 1.0
   • Rounding errors: Premature or incorrect rounding

B) CONCEPTUAL MISCONCEPTIONS (Target: Students with fundamental misunderstandings):
   • Formula confusion: Using the area formula for perimeter problems
   • Unit confusion: Mixing cm² with cm, degrees with radians
   • Operation confusion: Using addition instead of multiplication
   • Inverse operations: Using division instead of multiplication
   • Mathematical relationships: Confusing direct vs inverse proportions
   • Geometric misconceptions: Confusing radius with diameter

C) INCOMPLETE SOLUTIONS (Target: Students who stop too early or skip steps):
   • Partial answers: Giving intermediate results as final answers
   • Missing final step: Finding the derivative but not evaluating at a point
   • Solving for wrong variable: Finding x when asked for y
   • Units incomplete: Getting numerical value but missing units

D) OVER-GENERALIZATION (Target: Students applying rules incorrectly):
   • Pattern misapplication: Using the simple interest formula for compound interest
   • Wrong domain application: Using rules outside their valid range
   • Symmetry errors: Assuming all functions have certain properties
   • Distribution errors: (a+b)²=a²+b² instead of a²+2ab+b²

E) MATHEMATICAL ANXIETY PATTERNS (Target: Students under test pressure):
   • First/last option bias: Answers that "look right" superficially
   • Magnitude errors: Answers off by factor of 10, 100, or 1000
   • Complexity avoidance: Choosing simpler-looking wrong answers
   • Overthinking: Getting elaborate wrong answers from overcomplication

🎯 STRATEGIC GUIDELINES:
• Make each wrong answer arise from a DIFFERENT type of error
• Ensure wrong answers are mathematically plausible but verifiably incorrect
• Consider the difficulty level: harder problems need more sophisticated distractors
• For Bengali/Bangla mathematics: Include culture-specific calculation methods
• Match the format exactly: if the answer is a fraction, distractors should be fractions
• Avoid "throwaway" options: each distractor should tempt some students

🎯 DISTRACTOR VALIDATION CHECKLIST:
✓ Would a student who made error X naturally arrive at this answer?
✓ Is this answer close enough to seem plausible but clearly wrong when checked?
✓ Does this distractor test a different misconception than the other two?
✓ Is the format/presentation consistent with the correct answer?

CRITICAL FORMATTING CONSTRAINTS:
⚠️ MANDATORY: Your response must be EXACTLY this JSON format with NO additional text:
{{"wrong_answers": ["answer1", "answer2", "answer3"]}}

⚠️ FORBIDDEN:
- Any text before or after the JSON
- Markdown code blocks (```json or ```)
- Explanations or comments
- Line breaks or whitespace before/after JSON
- The correct answer {exact_answer} in any wrong answer
- Generic distractors like "None of the above" or "Cannot be determined"

⚠️ VALIDATION REQUIREMENTS:
- Each wrong answer must be different from the correct answer: {exact_answer}
- Each wrong answer must represent a distinct mathematical error type
- Maintain the same format/units as the correct answer
- Make answers tempting to students who made specific mistakes

START JSON OUTPUT NOW:
"""

# ---------------------------
# Main MCQ generation function
# ---------------------------

def generate_mcq_options(api_manager, question, solution, exact_answer):
    """
    Generate MCQ options - SIMPLIFIED to preserve Gemini output exactly.

    Args:
        api_manager: Object exposing ``generate_content(model=..., contents=...)``.
        question: Question text inserted into the prompt.
        solution: Worked solution inserted into the prompt.
        exact_answer: Correct answer; becomes one of the four options.

    Returns:
        A dict with ``multiple_choices``, ``correct_answer``, ``generation_successful``
        and ``method``. API results additionally carry ``raw_gemini_response`` and
        ``parsed_wrong_answers``; fallback results carry ``error``. This function
        does not raise for API or parsing failures - they produce a fallback result.
    """

    def _fallback_result(error):
        # Generic distractors used whenever the model output cannot be used.
        choices, correct_letter = build_mcq_choices(
            ["অনির্ণেয়", "অসংজ্ঞায়িত", "0"], exact_answer
        )
        return {
            'multiple_choices': choices,
            'correct_answer': correct_letter,
            'generation_successful': False,
            'method': 'fallback',
            'error': error
        }

    prompt = ADVANCED_PROMPT_TEMPLATE.format(
        exact_answer=exact_answer,
        question=question,
        solution=solution
    )

    try:
        response = api_manager.generate_content(
            model="gemini-2.5-flash-lite",
            contents=prompt
        )

        # Get response text
        if hasattr(response, 'text'):
            response_text = response.text
        elif isinstance(response, dict) and 'text' in response:
            response_text = response['text']
        else:
            response_text = str(response)

        # Guard against a None/non-string text attribute instead of failing on .strip().
        response_text = str(response_text or "").strip()
        logging.info(f"Raw Gemini response: {response_text}")

        # Parse JSON using robust parser
        parsed = parse_gemini_json_object(response_text)
        if not isinstance(parsed, dict) or 'wrong_answers' not in parsed:
            return _fallback_result('JSON parsing failed')

        raw_answers = parsed['wrong_answers']
        if not isinstance(raw_answers, list):
            # A bare string would otherwise be indexed character by character.
            logging.error(
                f"Error accessing wrong_answers array directly: expected a list, "
                f"got {type(raw_answers).__name__}"
            )
            return _fallback_result('wrong_answers is not a list')

        # Keep the first three distractors exactly as generated; pad if the model sent fewer.
        wrong_answers = list(raw_answers[:3])
        if len(wrong_answers) < 3:
            logging.error(
                f"Error accessing wrong_answers array directly: expected 3 entries, "
                f"got {len(wrong_answers)}"
            )
            wrong_answers += ["অনির্ণেয়"] * (3 - len(wrong_answers))

        # Surface (but do not alter) distractors that duplicate the correct answer.
        correct_text = str(exact_answer).strip()
        if any(str(candidate).strip() == correct_text for candidate in wrong_answers):
            logging.warning("A generated distractor is identical to the correct answer")

        # Build choices directly - NO CLEANING. A copy is passed so the list reported
        # in 'parsed_wrong_answers' cannot be changed by the builder.
        choices, correct_letter = build_mcq_choices(list(wrong_answers), exact_answer)

        return {
            'multiple_choices': choices,
            'correct_answer': correct_letter,
            'generation_successful': True,
            'method': 'api',
            'raw_gemini_response': response_text,
            'parsed_wrong_answers': wrong_answers
        }

    except Exception as e:
        logging.error(f"API generation failed: {e}")
        return _fallback_result(str(e))

# ---------------------------
# JSONL Processing
# ---------------------------

def get_field_case_insensitive(obj, *keys):
    """Return the value stored in *obj* under the first matching name from *keys*.

    Candidates are tried in the order given. Each one is looked up verbatim
    first; failing that, the keys of *obj* are scanned in order for one whose
    lower-cased form equals the lower-cased candidate. An empty string is
    returned when no candidate matches.
    """
    not_found = object()
    for wanted in keys:
        if wanted in obj:
            return obj[wanted]
        hit = next(
            (name for name in obj.keys() if name.lower() == wanted.lower()),
            not_found,
        )
        if hit is not not_found:
            return obj[hit]
    return ''

def _is_missing_field(value):
    """Return True for None or an empty string/container; numeric 0 counts as present."""
    if value is None:
        return True
    try:
        return len(value) == 0
    except TypeError:
        return False


def process_jsonl_file(api_manager, input_file_path, output_file_path):
    """
    Process JSONL file to add MCQ options - SIMPLIFIED.

    Each non-blank input line is parsed as JSON, extended with 'Multiple Choices',
    'Correct Answer' and 'MCQ_Metadata', and written to the output file as one JSON
    line. Lines that raise during processing are logged and omitted from the output.

    Args:
        api_manager: Passed through to generate_mcq_options(); also queried for usage stats.
        input_file_path: Path of the UTF-8 JSONL file to read.
        output_file_path: Path of the JSONL file to (over)write.
    """
    logging.info(f"Starting MCQ generation for: {input_file_path}")

    # Count records up front so the progress bar has a total; blank lines are not records.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        total_objects = sum(1 for line in f if line.strip())

    logging.info(f"Found {total_objects} objects to process")

    # Processing counters
    processed_count = 0
    successful_generations = 0

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file, \
         tqdm(total=total_objects, desc="Generating MCQs") as progress:

        # line_num stays the physical line number so log messages match the file.
        for line_num, line in enumerate(input_file, 1):
            record_text = line.strip()
            if not record_text:
                continue

            try:
                obj = json.loads(record_text)

                # Extract required fields
                question = get_field_case_insensitive(obj, 'Question', 'question', 'QuestionText')
                solution = get_field_case_insensitive(obj, 'Solution', 'solution')
                exact_answer = get_field_case_insensitive(obj, 'Exact Answer', 'ExactAnswer', 'exact answer', 'Answer')

                # Explicit emptiness test: a plain truthiness check would reject a numeric answer of 0.
                if _is_missing_field(question) or _is_missing_field(exact_answer):
                    logging.warning(f"Line {line_num}: Missing required fields")
                    # Simple fallback
                    shown_answer = 'Error' if _is_missing_field(exact_answer) else exact_answer
                    obj['Multiple Choices'] = [f"A. {shown_answer}", "B. অনির্ণেয়", "C. অসংজ্ঞায়িত", "D. 0"]
                    obj['Correct Answer'] = "A"
                    obj['MCQ_Metadata'] = {'generation_method': 'error', 'error': 'missing_fields'}
                else:
                    mcq_result = generate_mcq_options(api_manager, question, solution, exact_answer)

                    obj['Multiple Choices'] = mcq_result['multiple_choices']
                    obj['Correct Answer'] = mcq_result['correct_answer']
                    obj['MCQ_Metadata'] = {
                        'generation_method': mcq_result['method'],
                        'generation_successful': mcq_result['generation_successful']
                    }

                    # Add debug info if API was used
                    if 'raw_gemini_response' in mcq_result:
                        obj['MCQ_Metadata']['raw_gemini_response'] = mcq_result['raw_gemini_response']
                        obj['MCQ_Metadata']['parsed_wrong_answers'] = mcq_result.get('parsed_wrong_answers', [])

                    if mcq_result['generation_successful']:
                        successful_generations += 1

                # Write result (proper JSONL format) and flush, so finished records
                # survive an interruption of this long-running job.
                output_file.write(json.dumps(obj, ensure_ascii=False) + '\n')
                output_file.flush()
                processed_count += 1

                # Progress logging
                if processed_count % 10 == 0:
                    stats = api_manager.get_usage_stats()
                    logging.info(f"Progress: {processed_count}/{total_objects}, Success: {successful_generations}")
                    logging.info(f"API Usage: {stats['total_used']}/{stats['total_available']}")

            except Exception as e:
                logging.error(f"Line {line_num}: Processing error: {e}")
            finally:
                progress.update(1)

    # Final statistics
    print("MCQ generation completed!")
    print(f"Total processed: {processed_count}")
    print(f"Successful API generations: {successful_generations}")
    print(f"Output saved to: {output_file_path}")

# ---------------------------
# Main function
# ---------------------------

def main():
    """
    Main function to generate MCQ options for Bangla math problems.

    API keys are taken from the module-level ``keys`` string if it is defined and
    non-empty, otherwise from the GEMINI_API_KEYS environment variable; both are
    comma-separated lists.

    Raises:
        SystemExit: If no API key can be found.
    """

    def _split_keys(raw):
        # "k1, k2,,k3" -> ["k1", "k2", "k3"]
        return [part.strip() for part in raw.split(',') if part.strip()]

    # 1) Try to use the `keys` variable loaded earlier (from the key file)
    loaded = globals().get('keys')
    api_keys = _split_keys(loaded) if loaded else None

    # 2) Otherwise, try the environment variable GEMINI_API_KEYS
    if not api_keys:
        api_keys = _split_keys(os.environ.get('GEMINI_API_KEYS', ''))

    # Fail fast if no keys found
    if not api_keys:
        err_msg = (
            "ERROR: No API keys found. Please provide a text file at "
            "/kaggle/input/numerical-symbolic-nctb/gemini_keys.txt containing a single line with keys "
            "separated by commas (e.g. key1,key2,...) or set the GEMINI_API_KEYS environment variable."
        )
        print(err_msg)
        raise SystemExit(err_msg)

    api_manager = GeminiMCQApiManager(
        api_keys=api_keys,
        calls_per_day=1000,
        rate_limit_delay=4
    )

    # File paths (update as needed)
    input_jsonl_path = "/kaggle/input/numerical-symbolic-nctb/Numerical_Symbolic_Bangla.jsonl"
    output_jsonl_path = "/kaggle/working/MCQ_Numerical_Symbolic_Bangla.jsonl"

    if not os.path.exists(input_jsonl_path):
        print(f"Input file not found: {input_jsonl_path}")
        return

    try:
        process_jsonl_file(api_manager, input_jsonl_path, output_jsonl_path)
    except Exception as e:
        print(f"Error processing file: {e}")
        # logging.exception keeps the traceback in the log for later diagnosis.
        logging.exception(f"Error processing file: {e}")

# Entry point: run main() only when this module is executed as __main__, not when imported.
if __name__ == "__main__":
    main()