In [1]:
!git clone https://github.com/mhizterpaul/leetcode-questions-dataset.git
!mv leetcode-questions-dataset/data .

Cloning into 'leetcode-questions-dataset'...
remote: Enumerating objects: 211, done.[K
remote: Counting objects: 100% (211/211), done.[K
remote: Compressing objects: 100% (124/124), done.[K
remote: Total 211 (delta 104), reused 171 (delta 78), pack-reused 0 (from 0)[K
Receiving objects: 100% (211/211), 2.97 MiB | 15.30 MiB/s, done.
Resolving deltas: 100% (104/104), done.


In [2]:
import pandas as pd

leetcode_dataset = pd.read_csv("data/leetcode_dataset/all_questions_details_cumulative.csv")

interview_dataset = pd.read_csv("data/interview_dataset/SoftwareQuestions.csv", encoding='latin1')

hackerrank_algo_dataset = pd.read_csv("data/hackerrank_dataset/algorithms_questions.csv")

hackerrank_ds_dataset = pd.read_csv("data/hackerrank_dataset/data_structures_questions.csv")

In [3]:
# Process LeetCode dataset
leetcode_processed = leetcode_dataset[[
    'Question ID',
    'Question Title',
    'Question Text',
    'Topic Tagged text',
    'Difficulty Level'
]].copy()

# Rename columns to match target schema where possible
leetcode_processed.rename(columns={
    'Question ID': 'id',
    'Question Title': 'question_title',
    'Question Text': 'question_text',
    'Topic Tagged text': 'tags',
    'Difficulty Level': 'difficulty'
}, inplace=True)

# Combine title and text for the question column
leetcode_processed['question'] = leetcode_processed['question_title'] + "\n" + leetcode_processed['question_text']

# Drop the individual title and text columns
leetcode_processed.drop(columns=['question_title', 'question_text'], inplace=True)

# Display the processed LeetCode data
display(leetcode_processed.head())

Unnamed: 0,id,tags,difficulty,question
0,1,"Array,Hash Table",Easy,Two Sum\nGiven an array of integers nums and a...
1,2,"Linked List,Math,Recursion",Medium,Add Two Numbers\nYou are given two non-empty l...
2,3,"Hash Table,String,Sliding Window",Medium,Longest Substring Without Repeating Characters...
3,4,"Array,Binary Search,Divide and Conquer",Hard,Median of Two Sorted Arrays\nGiven two sorted ...
4,5,"String,Dynamic Programming",Medium,Longest Palindromic Substring\nGiven a string ...


In [4]:
# Define the five target categories
target_categories = [
    'Algorithms',
    'Data Structures',
    'Database Systems',
    'System Design',
    'Security'
]

# Process Interview dataset
interview_processed = interview_dataset[[
    'Question Number',
    'Question',
    'Answer',
    'Category',
    'Difficulty'
]].copy()

# Filter the dataset to include only the target categories
interview_processed = interview_processed[interview_processed['Category'].isin(target_categories)].copy()


# Rename columns to match target schema where possible
interview_processed.rename(columns={
    'Question Number': 'id',
    'Question': 'question',
    'Answer': 'correct',
    'Category': 'category',
    'Difficulty': 'difficulty'
}, inplace=True)

# Interview dataset does not have explicit tags, a, b, c, d columns.
# These will need to be generated by the LLM later.
interview_processed['tags'] = None
interview_processed['a'] = None
interview_processed['b'] = None
interview_processed['c'] = None
interview_processed['d'] = None


# Display the processed Interview data
display(interview_processed.head())

Unnamed: 0,id,question,correct,category,difficulty,tags,a,b,c,d
10,11,What is the difference between an array and a ...,An array has fixed size and stores elements in...,Data Structures,Easy,,,,,
11,12,Explain the time complexity of an algorithm.,Time complexity measures the time an algorithm...,Data Structures,Hard,,,,,
12,13,Describe the difference between a binary searc...,"A binary search tree is hierarchical, maintain...",Data Structures,Medium,,,,,
13,14,What is a linked list and how does it work?,A linked list is a series of nodes each contai...,Data Structures,Medium,,,,,
14,15,Explain the concept of recursion.,Recursion is when a function calls itself to s...,Data Structures,Medium,,,,,


In [5]:
# Process HackerRank Algorithms dataset
hackerrank_algo_processed = hackerrank_algo_dataset[[
    'url',
    'question_text',
]].copy()

# Rename columns to match target schema where possible
hackerrank_algo_processed.rename(columns={
    'url': 'id', # Using url as a temporary ID
    'question_text': 'question',
}, inplace=True)

# Add missing columns, to be generated by LLM
hackerrank_algo_processed['tags'] = None
hackerrank_algo_processed['a'] = None
hackerrank_algo_processed['b'] = None
hackerrank_algo_processed['c'] = None
hackerrank_algo_processed['d'] = None
hackerrank_algo_processed['correct'] = None
hackerrank_algo_processed['category'] = 'Algorithms' # Assign category based on dataset

# Display the processed HackerRank Algorithms data
display(hackerrank_algo_processed.head())

Unnamed: 0,id,question,tags,a,b,c,d,correct,category
0,https://www.hackerrank.com/challenges/count-st...,A regular expression is used to describe a set...,,,,,,,Algorithms
1,https://www.hackerrank.com/challenges/kingdom-...,It has been a prosperous year for King Charles...,,,,,,,Algorithms
2,https://www.hackerrank.com/challenges/morgan-a...,Jack and Daniel are friends. Both of them like...,,,,,,,Algorithms
3,https://www.hackerrank.com/challenges/red-knig...,"In ordinary chess, the pieces are only of two ...",,,,,,,Algorithms
4,https://www.hackerrank.com/challenges/two-robo...,You have a warehouse with\ncontainers filled w...,,,,,,,Algorithms


In [6]:
# Process HackerRank Data Structures dataset
hackerrank_ds_processed = hackerrank_ds_dataset[[
    'url',
    'question_text',
]].copy()

# Rename columns to match target schema where possible
hackerrank_ds_processed.rename(columns={
    'url': 'id', # Using url as a temporary ID
    'question_text': 'question',
}, inplace=True)

# Add missing columns, to be generated by LLM
hackerrank_ds_processed['tags'] = None
hackerrank_ds_processed['a'] = None
hackerrank_ds_processed['b'] = None
hackerrank_ds_processed['c'] = None
hackerrank_ds_processed['d'] = None
hackerrank_ds_processed['correct'] = None
hackerrank_ds_processed['category'] = 'Data Structures' # Assign category based on dataset

# Display the processed HackerRank Data Structures data
display(hackerrank_ds_processed.head())

Unnamed: 0,id,question,tags,a,b,c,d,correct,category
0,https://www.hackerrank.com/challenges/box-oper...,Alice purchased an array of\nwooden boxes that...,,,,,,,Data Structures
1,https://www.hackerrank.com/challenges/is-binar...,"For the purposes of this challenge, we define ...",,,,,,,Data Structures
2,https://www.hackerrank.com/challenges/array-an...,Given two numbers\nand\n.\nindicates the numbe...,,,,,,,Data Structures
3,https://www.hackerrank.com/challenges/subseque...,A subsequence of a sequence is a sequence whic...,,,,,,,Data Structures
4,https://www.hackerrank.com/challenges/company-...,The\nLRT Company\nhas\nemployees. Each employe...,,,,,,,Data Structures


In [7]:
import os

# Create the interim directory if it doesn't exist
os.makedirs("data/interim", exist_ok=True)

# List of processed dataframes
processed_dfs = [
    leetcode_processed,
    interview_processed,
    hackerrank_algo_processed,
    hackerrank_ds_processed
]

# Combine all processed dataframes
combined_df = pd.concat(processed_dfs, ignore_index=True)

# Deduplicate based on the 'question' column
combined_df_deduplicated = combined_df.drop_duplicates(subset=['question'])

# Select only the 'id' and 'question' columns
output_df = combined_df_deduplicated.reindex(columns=['id', 'question'])

# Define the output path
OUTPUT_CSV = "data/interim/filtered_dataset.csv"

# Save the combined dataframe to a CSV file
output_df.to_csv(OUTPUT_CSV, index=False)

print(f"Combined and deduplicated dataset with only 'id' and 'question' saved to {OUTPUT_CSV}")
display(output_df.head())

Combined and deduplicated dataset with only 'id' and 'question' saved to data/interim/filtered_dataset.csv


Unnamed: 0,id,question
0,1,Two Sum\nGiven an array of integers nums and a...
1,2,Add Two Numbers\nYou are given two non-empty l...
2,3,Longest Substring Without Repeating Characters...
3,4,Median of Two Sorted Arrays\nGiven two sorted ...
4,5,Longest Palindromic Substring\nGiven a string ...


In [None]:
# Step 1: Install the Hugging Face Hub tools
!pip install -q huggingface_hub transformers accelerate bitsandbytes pandas tqdm

# Step 2: Import and login using your token
from huggingface_hub import login

# Paste your token here (get it from https://huggingface.co/settings/tokens)
login("YOUR_KEY")

import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import snapshot_download
import re # Import regex module

# --- Config ---
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
LOCAL_MODEL_DIR = "/content/Phi-3-mini-4k" # Changed to absolute path
INPUT_CSV = "data/interim/filtered_dataset.csv"
OUTPUT_CSV = "data/dist/combined_questions_with_llm_features.csv"
MAX_RETRIES = 3
TEMPERATURE = 0.7
TOP_P = 0.9
MAX_TOKENS = 1024  # Reverted MAX_TOKENS
BATCH_SIZE = 3

# --- Step 1: Download Model ---
if not os.path.exists(LOCAL_MODEL_DIR):
    print(f"⬇️ Downloading Phi-3-mini-4k to {LOCAL_MODEL_DIR}")
    snapshot_download(
        repo_id=MODEL_NAME,
        local_dir=LOCAL_MODEL_DIR,
        local_dir_use_symlinks=False
    )
else:
    print(f"✅ Using cached model from {LOCAL_MODEL_DIR}")

# --- Step 2: Load Model ---
print("🔧 Phi-3-mini-4k...")

# Configure 4-bit quantization for GPU
# Remove CPU-specific settings
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", # nf4 is generally recommended for 4-bit on GPU
    bnb_4bit_compute_dtype=torch.float16
)


tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODEL_DIR,
    trust_remote_code=True,
    device_map="auto", # Use device_map="auto" to automatically use the GPU if available
    quantization_config=bnb_config # Use the quantization config
)
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# --- Step 3: Prompt Template ---
def build_prompt(question: str) -> str:
    return tokenizer.apply_chat_template([
        {
            "role": "system",
            "content": (
                "You are an expert tutor that generates multiple-choice questions for technical interviews. "
                "Your job is to create **difficult** questions with five answer choices: "
                "four **incorrect** options labeled as 'a', 'b', 'c', and 'd' (called **distractors**), and one **correct** answer labeled as 'correct'. "
                "⚠️ The correct answer **must not** appear as options 'a', 'b', 'c', or 'd'. It must be able to replace options 'a', 'b', 'c' or 'd'. "
                "Your output should also include a list of relevant technical **tags** (e.g., linked list, recursion, microservices) based on the question content. "
                "Each question must belong to **one category only**, chosen from the following fixed list: "
                "[algorithms, data structures, database systems, system design, security]."
                "Format your output strictly as a JSON object containing fields: a, b, c, d, correct, tags, and category."

            )
        },
        {
            "role": "user",
            "content": f"""
Question:
\"\"\"{question.strip()}\"\"\"

Please generate the output as a JSON object with the following structure and fields:
{{
  "a": "text for option a",
  "b": "text for option b",
  "c": "text for option c",
  "d": "text for option d",
  "correct": "text of the correct answer",
  "tags": ["tag1", "tag2"],
  "category": "category_name"
}}
Ensure the JSON is enclosed in ```json and ```.
""",
        },
    ], tokenize=False, add_generation_prompt=True)

# --- Step 4: Load Data ---
df = pd.read_csv(INPUT_CSV)[['id', 'question']]
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# Check for existing output and load processed IDs
processed_ids = set()
failed_or_incomplete_ids = set() # Initialize set for failed/incomplete IDs

if os.path.exists(OUTPUT_CSV):
    print(f"Found existing output file: {OUTPUT_CSV}. Loading processed IDs.")
    try:
        existing_df = pd.read_csv(OUTPUT_CSV)
        # Ensure 'id' column exists and check if generated fields are NOT None
        if 'id' in existing_df.columns:
            # Consider an ID processed only if at least 'a', 'b', 'c', 'd', 'correct', and 'category' are not None
            successfully_processed_df = existing_df.dropna(subset=['a', 'b', 'c', 'd', 'correct', 'category'])
            processed_ids = set(successfully_processed_df['id'].astype(str).tolist())

            # Identify IDs that were in the file but not considered successfully processed
            all_existing_ids = set(existing_df['id'].astype(str).tolist())
            failed_or_incomplete_ids = all_existing_ids - processed_ids

            print(f"Loaded {len(processed_ids)} successfully processed questions.")
            if failed_or_incomplete_ids:
                 print(f"Found {len(failed_or_incomplete_ids)} questions in the output file that will be retried (missing generated data).")

        else:
            print(f"Warning: 'id' column not found in {OUTPUT_CSV}. Starting from scratch.")
            # If 'id' column is missing, treat as if no questions were processed
            processed_ids = set()

    except pd.errors.EmptyDataError:
        print(f"Existing output file {OUTPUT_CSV} is empty. Starting from scratch.")
        processed_ids = set()
    except Exception as e:
        print(f"Error loading existing output file: {e}")
        # Continue without skipping if there's an error loading the file
        processed_ids = set()


# Filter out already successfully processed questions
initial_rows = len(df)
df = df[~df['id'].astype(str).isin(processed_ids)].copy()
skipped_successfully_processed_rows = initial_rows - len(df)
if skipped_successfully_processed_rows > 0:
    print(f"Skipping {skipped_successfully_processed_rows} questions that have already been successfully processed.")

# At this point, df contains both unprocessed questions AND failed/incomplete questions from the previous run.
# We need to ensure that the failed/incomplete IDs are *not* treated as successfully processed when checking again.
# The current filtering correctly keeps them in df because they are not in the 'processed_ids' set.
# No further filtering needed here for the retry logic to work as intended with the current structure.


# --- [NEW] Batched Inference Loop ---
from itertools import islice

def batched(iterable, batch_size):
    it = iter(iterable)
    while True:
        batch = list(islice(it, batch_size))
        if not batch:
            break
        yield batch

# Convert 'question' column to string type
df['question'] = df['question'].astype(str)

# --- Custom Workflow Preprocessing Function ---
def clean_json_string_custom_workflow(json_str_raw):
    """
    Attempts to clean a raw string based on the user's specified workflow:
    remove {}, split by ,, split by :, validate key (as string), remove newlines in value, merge valid, add {}.
    This is a highly experimental and likely fragile heuristic.
    Trims whitespace from keys and values.
    Does NOT include explicit delimiter balance check.
    """
    # 1. Remove outer {}
    # Be more robust to whitespace around braces
    cleaned_raw = json_str_raw.strip()
    if cleaned_raw.startswith('{') and cleaned_raw.endswith('}'):
        content_without_braces = cleaned_raw[1:-1].strip()
    else:
        # If braces are missing or misplaced, this workflow won't work as intended.
        # Return the raw string or an empty string to indicate failure.
        print(f"Warning: Raw string does not start/end with {{}} as expected for custom workflow: {json_str_raw}")
        return "" # Cannot apply workflow if basic structure is missing

    # 2. Split by comma (carefully, to not split inside strings)
    segments = []
    current_segment = []
    in_string = False

    i = 0
    while i < len(content_without_braces):
        char = content_without_braces[i]

        if char == '"':
            current_segment.append(char)
            in_string = not in_string
        elif char == ',' and not in_string:
            # Found a comma outside a string - this is a segment boundary
            segments.append("".join(current_segment).strip())
            current_segment = [] # Start new segment
        else:
            current_segment.append(char)

        i += 1

    # Add the last segment after the loop
    segments.append("".join(current_segment).strip())

    valid_segments_processed = []

    # 3. & 4. Split by colon, Validate key, Trim key/value, Remove newlines in value
    for segment in segments:
        # Split by the first colon
        colon_index = segment.find(':')
        if colon_index != -1:
            key_part = segment[:colon_index].strip() # Trim key part
            value_part_raw = segment[colon_index + 1:] # Raw value part

            # Check if the key part is a valid JSON string (starts and ends with ")
            if key_part.startswith('"') and key_part.endswith('"'):
                 # This segment appears to have a valid string key.
                 # Now, process the value part to remove newlines within strings.
                 cleaned_value_chars = []
                 in_string_value = False
                 j = 0
                 while j < len(value_part_raw):
                      val_char = value_part_raw[j]
                      if in_string_value and val_char == '\n':
                           # Remove newline within string in value
                           pass
                      elif val_char == '"':
                            cleaned_value_chars.append(val_char)
                            in_string_value = not in_string_value
                      else:
                           # Keep other characters in value (including whitespace and structural chars outside quotes)
                           cleaned_value_chars.append(val_char)
                      j += 1

                 value_part_cleaned = "".join(cleaned_value_chars).strip() # Trim cleaned value
                 valid_segments_processed.append(key_part + ":" + value_part_cleaned) # Reconstruct with cleaned value
            else:
                 # Key is not a valid string. Discard this segment.
                 # print(f"Debug (Workflow): Discarding segment with invalid key: {segment}")
                 pass # Discard the segment
        else:
            # Segment does not contain a colon. This might be a structural element like an empty object/array.
            # Or it could be malformed. Let's check if it's empty or contains valid structural chars.
            # A simple check: if it contains only whitespace or valid structural chars, keep it.
            stripped_segment = segment.strip()
            if stripped_segment == '' or all(c in '{}[]:, \n\t' for c in stripped_segment):
                 valid_segments_processed.append(stripped_segment)
            else:
                 # print(f"Debug (Workflow): Discarding segment without colon and not structural: {segment}")
                 pass # Discard


    # 5. Merge valid segments
    content_without_braces_cleaned = ",".join(valid_segments_processed)

    # 6. Add back {}
    json_str_cleaned = "{" + content_without_braces_cleaned + "}"

    # No explicit delimiter balance check as requested.

    return json_str_cleaned


# --- Step 5: Inference Loop ---
print("🚀 Generating MCQs...")
# Use 'output' list to accumulate entries before writing in case of batch failure
batch_output_list = []

# Clear failed_or_incomplete_ids at the start of the main processing loop
# This is crucial to ensure that only IDs that fail in the *current* run
# are marked for retry in the *next* run.
failed_or_incomplete_ids.clear()


for batch_rows in tqdm(batched(df.iterrows(), BATCH_SIZE), total=(len(df) + BATCH_SIZE - 1) // BATCH_SIZE):
    prompts = []
    metadata = []
    batch_output_list = [] # Reset batch_output_list for each batch


    for _, row in batch_rows:
        prompt = build_prompt(row["question"])
        prompts.append(prompt)
        metadata.append((row["id"], row["question"]))

        # IMPORTANT: Remove the ID from failed_or_incomplete_ids set once it's included in the current batch
        # This ensures that if it fails again, it's still marked as incomplete for the next run.
        # No need to remove from processed_ids here, as processed_ids only contains successfully processed IDs.
        # The filtering of df at the start of the script handles skipping successfully processed ones.
        pass # No change needed here, the initial filtering already ensures retries are in df


    try:
        results = llm(prompts, max_new_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, do_sample=True, use_cache=False)

        for i, res in enumerate(results):
            qid, question = metadata[i]
            retries = 0
            success = False
            response_text = None # Initialize response_text to None
            # Initialize entry with default values for the 9 output columns
            entry = {
                "id": qid,
                "question": question,
                "a": None,
                "b": None,
                "c": None,
                "d": None,
                "correct": None,
                "tags": None,
                "category": None
            }
            # Temporarily store raw_output for debugging within the loop if needed
            raw_output_temp = None


            # --- Extract generated_text from the response list/dict ---
            if isinstance(res, list) and res and isinstance(res[0], dict) and 'generated_text' in res[0]:
                 # Handle cases where the result might be a list containing the dictionary
                response_text = res[0]['generated_text']
            elif isinstance(res, dict) and 'generated_text' in res:
                 # Handle cases where the result might be a single dictionary (less common for pipeline batching but good practice)
                 response_text = res['generated_text']
            else:
                print(f"❌ Unexpected response format from pipeline for ID {qid}. Raw result: {res}")
                raw_output_temp = str(res) # Store raw result in temp
                batch_output_list.append(entry) # Append entry with default values
                # Add this ID to the failed_or_incomplete_ids set
                failed_or_incomplete_ids.add(str(qid))
                continue # Move to the next result

            raw_output_temp = response_text # Store raw response text in temp


            # --- Extract text after the last <|assistant|> tag ---
            assistant_tag = "<|assistant|>"
            last_assistant_index = response_text.rfind(assistant_tag)

            extracted_text_after_assistant = ""
            if last_assistant_index != -1:
                # Extract text from the end of the last assistant tag to the end of the string
                extracted_text_after_assistant = response_text[last_assistant_index + len(assistant_tag):].strip()
            else:
                 print(f"❌ Could not find the last '{assistant_tag}' tag in response for ID {qid}. Raw response text: {response_text}")
                 # Store error info in raw_output for logging/debugging if needed later, but not in final CSV
                 # entry["raw_output_debug"] = f"Could not find '{assistant_tag}' tag.\nRaw Response Text: {response_text}" # Removed as raw_output is not in final CSV
                 batch_output_list.append(entry) # Append entry with default values
                 # Add this ID to the failed_or_incomplete_ids set
                 failed_or_incomplete_ids.add(str(qid))
                 continue # Move to the next result


            # --- Now, find the JSON block using ```json and ``` within the extracted text ---
            json_block_start = extracted_text_after_assistant.find('```json')
            json_block_end = extracted_text_after_assistant.find('```', json_block_start + 7)

            json_str_raw = ""
            if json_block_start != -1 and json_block_end != -1:
                # Extract and trim the content within the block
                json_str_raw = extracted_text_after_assistant[json_block_start + 7 : json_block_end].strip()
            else:
                 print(f"❌ Could not find ```json block within text after '{assistant_tag}' for ID {qid}. Extracted Text: {extracted_text_after_assistant}")
                 # Store error info in raw_output for logging/debugging if needed later, but not in final CSV
                 # entry["raw_output_debug"] = f"Could not find JSON block within text after '{assistant_tag}'.\nExtracted Text: {extracted_text_after_assistant}" # Removed as raw_output is not in final CSV
                 batch_output_list.append(entry) # Append entry with default values
                 # Add this ID to the failed_or_incomplete_ids set
                 failed_or_incomplete_ids.add(str(qid))
                 continue # Move to the next result


            # --- Parsing and Retries (Inside Retry Loop) ---
            json_str_to_parse = json_str_raw # Start with the extracted raw string

            while retries < MAX_RETRIES and not success:
                parsed = None
                json_decode_error = None

                try:
                    parsed = json.loads(json_str_to_parse)
                    success = True # If parsing is successful, set success to True

                except json.JSONDecodeError as e:
                    json_decode_error = e
                    print(f"JSON Decode Error for ID {qid}, attempt {retries + 1}: {e}")
                    # Log the string that failed parsing
                    print(f"--- Failed JSON string for ID {qid}, attempt {retries + 1} ---\n{json_str_to_parse}\n--- End Failed JSON string ---")


                    if retries < MAX_RETRIES -1 : # Only attempt cleanup if retries remain
                         print(f"Attempting JSON cleanup due to decode error for ID {qid}, attempt {retries + 1}.")
                         # --- Preprocessing Step (Custom Workflow) - ONLY applied on JSONDecodeError ---
                         # Use the raw extracted string for the custom workflow, as it expects the {}
                         json_str_cleaned = clean_json_string_custom_workflow(json_str_raw)
                         json_str_to_parse = json_str_cleaned # Now try parsing the cleaned string
                         # No extra log here, the failed string is logged above before this retry attempt
                         # print(f"Attempting to parse cleaned JSON string for ID {qid}, attempt {retries + 1}: {json_str_to_parse}") # Removed extra log
                         # --- End Preprocessing Step ---
                    else:
                         print(f"Max retries reached for ID {qid}. Skipping cleanup attempt.")


                    retries += 1 # Increment retry count


                # If parsing was successful, we break the retry loop and process the data
                if success:
                     break


            # --- Process Parsed Data (Outside Retry Loop, after success or max retries) ---
            if success:
                 # Check if the parsed output is a dictionary
                 if not isinstance(parsed, dict):
                     print(f"❌ Parsed JSON is not a dictionary for ID {qid} after successful decode. Parsed object: {parsed}")
                     success = False # Mark as failed if not a dictionary
                 else:
                     # Try extracting data based on the parsed dictionary structure
                     try:
                        options_dict = parsed.get('options', parsed) # Try 'options' dict, fallback to top-level keys
                        correct_answer = parsed.get('correct')
                        category = parsed.get('category')
                        tags_list = parsed.get('tags', [])

                        # Ensure required keys exist (including 'correct' now being the text)
                        if not (isinstance(options_dict, dict) and all(key in options_dict for key in ['a', 'b', 'c', 'd']) and correct_answer is not None and category is not None):
                             print(f"❌ Parsed dictionary missing required keys ('options', 'correct', 'category' or top-level a,b,c,d) for ID {qid}. Parsed dict: {parsed}")
                             success = False # Mark as failed if keys are missing
                        else:
                             # Case-insensitive and whitespace-stripped comparison for leak detection
                             correct_stripped = str(correct_answer).strip().lower()
                             leaked = False
                             # Check only the 'a', 'b', 'c', 'd' options for leaks
                             for option_key in ['a', 'b', 'c', 'd']:
                                 option_text = options_dict.get(option_key)
                                 if option_text is not None and isinstance(option_text, str) and option_text.strip().lower() == correct_stripped:
                                     leaked = True
                                     break

                             if leaked:
                                  print(f"⚠️ Correct leaked into options for ID {qid}. Skipping.")
                                  success = False # Mark as failed if leak detected

                             # --- NEW Check: If correct is 'a', 'b', 'c', or 'd', mark as failed ---
                             if success and isinstance(correct_answer, str) and correct_answer.strip().lower() in ['a', 'b', 'c', 'd']:
                                  print(f"❌ Correct answer is a character ('a', 'b', 'c', or 'd') for ID {qid}. Skipping and retrying.")
                                  success = False # Mark as failed

                             if success: # Only update entry if no leak, keys are present, AND correct is not a single character option
                                 # Update the entry dictionary with parsed data for the 9 columns
                                 entry.update({
                                     "a": options_dict.get('a'),
                                     "b": options_dict.get('b'),
                                     "c": options_dict.get('c'),
                                     "d": options_dict.get('d'),
                                     "correct": correct_answer, # Store the text of the correct answer
                                     "tags": ", ".join([str(tag) for tag in tags_list if tag is not None]),
                                     "category": category
                                 })

                     except Exception as extract_e:
                        print(f"❌ Error extracting data from parsed JSON for ID {qid}: {extract_e}.Parsed dict: {parsed}")
                        success = False # Mark as failed on extraction error


            if not success:
                print(f"❌ Skipped ID {qid} after {MAX_RETRIES} failed attempts or parsing issues.")
                # Add this ID to the failed_or_incomplete_ids set
                failed_or_incomplete_ids.add(str(qid))
                # Do NOT append the entry to batch_output_list if parsing failed or validation failed (correct is a single character option)
            else:
                 # ONLY append the entry if processing was successful and validation passed
                 batch_output_list.append(entry)


    except Exception as e:
        print(f"❌ Batch failed: {e}")
        # Log entries for the entire failed batch with an error indicator
        for i, (qid, question) in enumerate(metadata):
             # Initialize entry with default values for the 9 output columns for failed batch
             # These will NOT be appended to batch_output_list here, they are just for logging/tracking
             # The IDs are added to failed_or_incomplete_ids below
             print(f"Batch failed for ID {qid}: {e}")

             # Add this ID to the failed_or_incomplete_ids set
             failed_or_incomplete_ids.add(str(qid))


    # Write the accumulated batch output to the CSV
    if batch_output_list:
        # Define the columns explicitly to ensure order and presence for the 9 columns
        output_columns = ["id", "question", "a", "b", "c", "d", "correct", "tags", "category"]
        # Create DataFrame with defined columns from the successfully processed entries
        batch_df = pd.DataFrame(batch_output_list, columns=output_columns)
        batch_df.to_csv(OUTPUT_CSV, mode='a', index=False, header=not os.path.exists(OUTPUT_CSV))


# --- Final Save (optional) ---
# After the main loop finishes, save the failed_or_incomplete_ids for the next run
if failed_or_incomplete_ids:
    print(f"Found {len(failed_or_incomplete_ids)} questions that failed or were incomplete in this run.")
    # You might want to save these IDs to a separate file or log them clearly
    # For now, they will be picked up by the loading logic in the next run
print("✅ Script completed. Entries already written to disk incrementally.")

✅ Using cached model from /content/Phi-3-mini-4k
🔧 Phi-3-mini-4k...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


🚀 Generating MCQs...


  0%|          | 1/1083 [02:30<45:09:34, 150.25s/it]

❌ Correct answer is a character ('a', 'b', 'c', or 'd') for ID 2. Skipping and retrying.
❌ Skipped ID 2 after 3 failed attempts or parsing issues.


  0%|          | 2/1083 [04:36<40:50:33, 136.02s/it]

⚠️ Correct leaked into options for ID 4. Skipping.
❌ Skipped ID 4 after 3 failed attempts or parsing issues.
