In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from datasets import load_dataset

# Load the "ARC-Easy" configuration of the allenai/ai2_arc dataset.
ds = load_dataset(path="allenai/ai2_arc", name="ARC-Easy")

README.md: 0.00B [00:00, ?B/s]

ARC-Easy/train-00000-of-00001.parquet:   0%|          | 0.00/331k [00:00<?, ?B/s]

ARC-Easy/test-00000-of-00001.parquet:   0%|          | 0.00/346k [00:00<?, ?B/s]

ARC-Easy/validation-00000-of-00001.parqu(…):   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2376 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/570 [00:00<?, ? examples/s]

In [3]:
# Convert each dataset split into its own pandas DataFrame.
train_df, test_df, validation_df = (
    ds[split_name].to_pandas() for split_name in ('train', 'test', 'validation')
)

In [4]:
# Preview the first rows of the training split (id, question, choices, answerKey).
train_df.head()

Unnamed: 0,id,question,choices,answerKey
0,Mercury_7220990,Which factor will most likely cause a person t...,{'text': ['a leg muscle relaxing after exercis...,B
1,MCAS_2007_8_5189,Lichens are symbiotic organisms made of green ...,"{'text': ['carbon dioxide', 'food', 'protectio...",B
2,Mercury_SC_401169,When a switch is used in an electrical circuit...,"{'text': ['cause the charge to build.', 'incre...",D
3,MCAS_2004_8_27,Which of the following is an example of an ass...,"{'text': ['contact lens', 'motorcycle', 'rainc...",A
4,NYSEDREGENTS_2006_8_10,"Rocks are classified as igneous, metamorphic, ...","{'text': ['their color', 'their shape', 'how t...",3


In [5]:
# Print the question, the raw choices dict and the answer key of the first ten training rows.
for indx, rows in train_df.head(10).iterrows():
    print(rows['question'], rows['choices'], rows['answerKey'], sep='\n')

Which factor will most likely cause a person to develop a fever?
{'text': array(['a leg muscle relaxing after exercise',
       'a bacterial population in the bloodstream',
       'several viral particles on the skin',
       'carbohydrates being digested in the stomach'], dtype=object), 'label': array(['A', 'B', 'C', 'D'], dtype=object)}
B
Lichens are symbiotic organisms made of green algae and fungi. What do the green algae supply to the fungi in this symbiotic relationship?
{'text': array(['carbon dioxide', 'food', 'protection', 'water'], dtype=object), 'label': array(['A', 'B', 'C', 'D'], dtype=object)}
B
When a switch is used in an electrical circuit, the switch can
{'text': array(['cause the charge to build.', 'increase and decrease the voltage.',
       'cause the current to change direction.',
       'stop and start the flow of current.'], dtype=object), 'label': array(['A', 'B', 'C', 'D'], dtype=object)}
D
Which of the following is an example of an assistive device?
{'text': a

In [6]:
import os
import pandas as pd
import json
import time
from openai import OpenAI, RateLimitError
from tenacity import retry, wait_random_exponential, stop_after_attempt
from kaggle_secrets import UserSecretsClient
# Secrets client; the main block of this cell uses it to fetch the "openai" secret.
user_secrets = UserSecretsClient()


def parse_choices_and_answer(row):
    """
    Extract the option texts and the text of the correct option from a row.

    Args:
        row: Mapping-like object (e.g. a pandas Series or a dict) with a
            'choices' entry holding parallel 'text' and 'label' sequences
            and an 'answerKey' entry holding the label of the correct option.
            The sequences may be NumPy arrays (as produced by
            ``Dataset.to_pandas()``) or plain lists/tuples.

    Returns:
        tuple: ``(all_options_text, correct_answer_text)`` where
        ``all_options_text`` is a list of the option texts and
        ``correct_answer_text`` is the text whose label equals 'answerKey',
        or None when the answer key matches no label (or no text exists at
        the matching position).
    """
    choices_dict = row['choices']
    answer_key = row['answerKey']

    def _as_list(values):
        # NumPy arrays expose .tolist(); plain sequences are simply copied.
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    all_options_text = _as_list(choices_dict['text'])
    all_labels = _as_list(choices_dict['label'])

    try:
        correct_answer_text = all_options_text[all_labels.index(answer_key)]
    except (ValueError, IndexError):
        # ValueError: answerKey is not among the labels.
        # IndexError: there are more labels than option texts.
        correct_answer_text = None

    return all_options_text, correct_answer_text


# --- Function 1: Initialize OpenAI client ---
def initialize_client(api_key):
    """Build an OpenAI client for ``api_key``.

    Raises:
        ValueError: If ``api_key`` is empty or None.
    """
    if api_key:
        return OpenAI(api_key=api_key)
    raise ValueError("API key not provided.")

# --- Function 2: Construct Prompt Messages ---
def construct_prompt_messages(row):
    """
    Build the chat messages (system + user) for one question.

    The user message contains the fixed instructions, two worked examples and
    then the question, option list and original correct answer taken from
    ``row`` via ``parse_choices_and_answer``.

    Args:
        row: DataFrame row (or mapping) with 'question', 'choices' and
            'answerKey' entries.

    Returns:
        list[dict]: Two messages, each with 'role' and 'content' keys.
    """
    option_texts, gold_answer = parse_choices_and_answer(row)
    question_text = row['question']

    # Doubled braces below are literal braces in the rendered prompt.
    prompt_body = f"""
Core Constraints:
1.  **Strictly use provided options:** You MUST only choose additional correct options from the provided list of 'all_options'.
2.  **Find all correct answers:** Identify all other options that are factually correct answers to the question.
3.  **Handle single-correct cases:** If no other options are correct, the 'additional_correct_answers' list in your output MUST be empty. Do not invent new answers.
4.  **Preserve formatting:** Output your response as a single, valid JSON object.

Examples:

Input:
Question: What are some common sources of protein?
All Options: ["Milk", "Eggs", "Meat", "Vegetables", "Soda"]
Original Correct Answer: "Eggs"

Output JSON:
{{
  "question": "What are some common sources of protein?",
  "original_correct_answer": "Eggs",
  "additional_correct_answers": ["Milk", "Meat"]
}}

Input:
Question: What is the largest organ in the human body?
All Options: ["Heart", "Brain", "Skin", "Lungs"]
Original Correct Answer: "Skin"

Output JSON:
{{
  "question": "What is the largest organ in the human body?",
  "original_correct_answer": "Skin",
  "additional_correct_answers": []
}}

Now, your turn. Generate the JSON output for the following.

Input:
Question: {question_text}
All Options: {option_texts}
Original Correct Answer: {gold_answer}
"""
    system_message = {
        "role": "system",
        "content": "You are an expert at identifying all correct options for a multiple-choice question. You respond only with a valid JSON object."
    }
    user_message = {"role": "user", "content": prompt_body}
    return [system_message, user_message]

# --- 4. Call GPT-4o-mini with Robustness ---
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def call_gpt4o_mini(client, messages):
    """
    Send one chat request to gpt-4o-mini and return the decoded JSON reply.

    The ``@retry`` decorator re-invokes this function with randomized
    exponential waits and stops after 6 attempts. No retry filter is
    configured, so any exception re-raised below triggers another attempt.

    Args:
        client: OpenAI client exposing ``chat.completions.create``.
        messages: List of chat messages to send.

    Returns:
        The object produced by ``json.loads`` on the first choice's content.
    """
    request_options = {
        "model": "gpt-4o-mini",
        "messages": messages,
        "response_format": {"type": "json_object"},
        "temperature": 0.7,
        "n": 1,
    }
    try:
        completion = client.chat.completions.create(**request_options)
        raw_reply = completion.choices[0].message.content
        return json.loads(raw_reply)
    except RateLimitError as rate_err:
        print(f"Rate limit error: {rate_err}. Retrying...")
        raise
    except json.JSONDecodeError as decode_err:
        print(f"Received invalid JSON from API: {decode_err}. Retrying...")
        raise decode_err
    except Exception as other_err:
        print(f"An unexpected error occurred during the API call: {other_err}")
        raise other_err

# --- 5. Orchestrate the Process ---
def _failed_row_placeholder(row, reason):
    """Return a result entry for a row that could not be processed.

    The entry has an empty 'additional_correct_answers' list so that
    consumers which filter on that key ignore it, while the saved results
    stay positionally aligned with the input DataFrame.
    """
    return {
        "question": row['question'],
        "original_correct_answer": None,
        "additional_correct_answers": [],
        "error": reason,
    }


def process_dataframe(client, df, output_file, batch_size=50):
    """
    Query the model for every row of ``df`` and save all results as JSON.

    Exactly one entry is written per input row, in row order. Rows whose API
    call ultimately fails are stored as placeholders (empty
    'additional_correct_answers' plus an 'error' field) instead of being
    dropped, so entry ``i`` of the output always corresponds to row ``i`` of
    ``df``.

    Args:
        client: OpenAI client passed through to ``call_gpt4o_mini``.
        df (pd.DataFrame): Rows with 'question', 'choices' and 'answerKey'.
        output_file (str): Path of the JSON file to write.
        batch_size (int): Number of rows handled between short pauses.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    processed_results = []
    failed_rows = []
    num_rows = len(df)

    print(f"Starting to process {num_rows} rows in batches of {batch_size}.")

    for i in range(0, num_rows, batch_size):
        batch = df.iloc[i:i + batch_size]
        print(f"Processing batch starting at row {i+1}...")

        for index, row in batch.iterrows():
            messages_for_api = construct_prompt_messages(row)
            if not messages_for_api:
                # Keep the slot so later rows do not shift position.
                failed_rows.append(index)
                processed_results.append(
                    _failed_row_placeholder(row, "no prompt could be built")
                )
                continue

            try:
                generated_json = call_gpt4o_mini(client, messages_for_api)
                print(f"  Processed row {index} successfully.")
            except Exception as e:
                # Best effort: report the cause and keep going with a placeholder.
                print(f"  Failed to process row {index} after multiple retries. Skipping. ({e!r})")
                failed_rows.append(index)
                generated_json = _failed_row_placeholder(row, repr(e))
            processed_results.append(generated_json)

        # Brief pause between batches.
        time.sleep(0.01)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(processed_results, f, indent=4)

    print(f"Processing complete. {len(processed_results)} results saved to {output_file}")
    if failed_rows:
        print(f"{len(failed_rows)} rows failed and were saved as empty placeholders: {failed_rows}")

# --- Main Execution Block ---
if __name__ == "__main__":
    # --- STEP 1: Set up your API key ---
    # The key is read from the Kaggle secret labelled "openai", not from an
    # environment variable.
    openai_key = user_secrets.get_secret("openai")
    if not openai_key:
        raise ValueError('Kaggle secret "openai" returned an empty value. Please store your OpenAI API key in it.')
    client = initialize_client(openai_key)

    # --- STEP 2: Query the model for every training row and save the results ---
    output_json_file = 'expanded_arc_answers.json'
    process_dataframe(client, train_df, output_json_file, batch_size=10)


Starting to process 2251 rows in batches of 10.
Processing batch starting at row 1...
  Processed row 0 successfully.
  Processed row 1 successfully.
  Processed row 2 successfully.
  Processed row 3 successfully.
  Processed row 4 successfully.
  Processed row 5 successfully.
  Processed row 6 successfully.
  Processed row 7 successfully.
  Processed row 8 successfully.
  Processed row 9 successfully.
Processing batch starting at row 11...
  Processed row 10 successfully.
  Processed row 11 successfully.
  Processed row 12 successfully.
  Processed row 13 successfully.
  Processed row 14 successfully.
  Processed row 15 successfully.
  Processed row 16 successfully.
  Processed row 17 successfully.
  Processed row 18 successfully.
  Processed row 19 successfully.
Processing batch starting at row 21...
  Processed row 20 successfully.
  Processed row 21 successfully.
  Processed row 22 successfully.
  Processed row 23 successfully.
  Processed row 24 successfully.
  Processed row 25 su

In [7]:
# Read the saved model results back into memory.
with open('/kaggle/working/expanded_arc_answers.json', mode='r', encoding='utf-8') as f:
    gpt_output_data = json.loads(f.read())

In [8]:
import pandas as pd
import json
import numpy as np

def create_filtered_dataset_by_index(original_df, gpt_output_data, output_csv_file):
    """
    Build and save a dataset of the questions that have additional correct answers.

    Rows of ``original_df`` and entries of ``gpt_output_data`` are paired by
    position, so both inputs must be in the same order. A question is kept
    only when its GPT entry lists at least one additional correct answer
    other than the original correct answer.

    Args:
        original_df (pd.DataFrame): The DataFrame with original questions and choices.
        gpt_output_data (list): The list of dictionaries from the GPT-generated JSON file.
        output_csv_file (str): The filename for the final CSV output.

    Returns:
        pd.DataFrame: Columns 'id', 'question', 'correct_ans',
        'additional_correct' and 'distractors'; the last two are
        '; '-joined strings. The same frame is written to ``output_csv_file``.

    Raises:
        IndexError: If ``gpt_output_data`` has fewer entries than
            ``original_df`` has rows, so the inputs cannot be paired.
    """
    num_rows = len(original_df)
    if len(gpt_output_data) < num_rows:
        # Fail before doing any work rather than part-way through the loop.
        raise IndexError(
            f"gpt_output_data has {len(gpt_output_data)} entries but original_df has "
            f"{num_rows} rows; the inputs cannot be aligned by index."
        )

    output_columns = ['id', 'question', 'correct_ans', 'additional_correct', 'distractors']
    final_data_list = []

    # Iterate through both datasets by index
    for i in range(num_rows):
        original_row = original_df.iloc[i]
        gpt_entry = gpt_output_data[i]

        # --- CRITICAL FILTERING STEP ---
        additional_corrects = gpt_entry.get('additional_correct_answers', [])
        if not additional_corrects:
            continue
        if isinstance(additional_corrects, str):
            # A bare string would otherwise break the list handling below.
            additional_corrects = [additional_corrects]

        original_correct = gpt_entry['original_correct_answer']

        # Normalise to unique strings and drop a repeat of the original answer.
        additional_corrects = list(dict.fromkeys(
            str(answer)
            for answer in additional_corrects
            if answer is not None and answer != original_correct
        ))
        if not additional_corrects:
            continue

        # Get all original options to find the distractors
        option_texts = original_row['choices']['text']
        all_original_options = (
            option_texts.tolist() if hasattr(option_texts, 'tolist') else list(option_texts)
        )

        # Distractors are the options not marked correct, kept in their
        # original order so the output is identical from run to run.
        correct_options = {original_correct, *additional_corrects}
        distractors = list(dict.fromkeys(
            option for option in all_original_options if option not in correct_options
        ))

        final_data_list.append({
            'id': original_row['id'],
            'question': original_row['question'],
            'correct_ans': original_correct,
            'additional_correct': '; '.join(additional_corrects),
            'distractors': '; '.join(distractors)
        })

    # Explicit columns keep the CSV header stable even when nothing qualifies.
    final_df = pd.DataFrame(final_data_list, columns=output_columns)
    final_df.to_csv(output_csv_file, index=False)

    print(f"Dataset created with {len(final_df)} entries (only questions with additional correct answers).")
    print(f"Output saved to '{output_csv_file}'.")

    return final_df

if __name__ == '__main__':
    # Build the filtered training dataset, write it to CSV and preview it.
    final_dataset = create_filtered_dataset_by_index(
        original_df=train_df,
        gpt_output_data=gpt_output_data,
        output_csv_file='final_arc_train_dataset.csv',
    )
    print("\nFinal DataFrame head:", final_dataset.head(), sep="\n")

Dataset created with 477 entries (only questions with additional correct answers).
Output saved to 'final_arc_train_dataset.csv'.

Final DataFrame head:
                       id                                           question  \
0        MCAS_2007_8_5189  Lichens are symbiotic organisms made of green ...   
1  NYSEDREGENTS_2006_8_10  Rocks are classified as igneous, metamorphic, ...   
2         Mercury_7013388  A chewable calcium carbonate tablet is a commo...   
3       Mercury_SC_400124  A scientist wanting to document a change in a ...   
4         Mercury_7109393  A group of students are researching changes in...   

                   correct_ans  \
0                         food   
1              how they formed   
2  neutralizes digestive acid.   
3                       years.   
4                   tree rings   

                                  additional_correct               distractors  
0                              carbon dioxide; water                protection  