In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for input_dir, _, input_names in os.walk('/kaggle/input'):
    for input_path in (os.path.join(input_dir, name) for name in input_names):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sciencehotpotqa/hotpot_scienceandtech.csv
/kaggle/input/sciencehotpotqa/pure_science_hotpot.csv


In [2]:
# Read the science subset of HotpotQA and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/sciencehotpotqa/pure_science_hotpot.csv')
df.head(n=5)

Unnamed: 0,question,answer,supporting_facts_text,all_sentences,document,topic_id,topic_category
0,Cadmium Chloride is slightly soluble in this c...,alcohol,It is a hygroscopic solid that is highly solu...,['Cadmium chloride is a white crystalline comp...,Cadmium Chloride is slightly soluble in this c...,14,Chemistry and Chemical Compounds
1,What type of vegetation does Kniphofia and Bap...,plant,"Kniphofia , also called tritoma, red hot poker...",['The Dinaric calcareous silver fir forests ar...,What type of vegetation does Kniphofia and Bap...,0,Biology and Life Sciences
2,Teri W. Odom is a member of a scientific jour...,the American Chemical Society,Teri W. Odom is an American chemist and materi...,"['""Chelonian Conservation and Biology: Interna...",Teri W. Odom is a member of a scientific jour...,28,Physics and Astronomy
3,Who won a Nobel Prize in 1943 and is associate...,Otto Stern,"The Stern–Volmer relationship, named after Ott...","['Since 1901, the Nobel Prize in Literature (S...",Who won a Nobel Prize in 1943 and is associate...,8,Physics and Astronomy
4,What type of species is a Boreo-arctic Montane...,beetle,Carabus glabratus is a species of beetle. The ...,"[""The five main latitude regions of the Earth'...",What type of species is a Boreo-arctic Montane...,0,Biology and Life Sciences


In [3]:
# Number of questions per topic category.
df['topic_category'].value_counts()

topic_category
Biology and Life Sciences               1783
Physics and Astronomy                    320
Earth Sciences and Natural Phenomena     139
Chemistry and Chemical Compounds         119
Name: count, dtype: int64

In [4]:
import os
import json
import time
import pandas as pd
import random
from openai import OpenAI, RateLimitError
from tenacity import retry, wait_random_exponential, stop_after_attempt
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# --- 1. Setup and API Call Functions ---

# The API key is read from the Kaggle secret named "openai" (not from an
# environment variable), so the error message points at that secret.
openai_key = user_secrets.get_secret("openai")
if not openai_key:
    raise ValueError("Kaggle secret 'openai' is empty or not set.")
client = OpenAI(api_key=openai_key)


def call_gpt4o_mini(messages, max_attempts=6):
    """Send chat messages to gpt-4o-mini and return the parsed JSON reply.

    Rate-limit errors and replies that are not valid JSON are retried with
    randomized exponential backoff. Any other error is reported and raised
    immediately without retrying.

    Args:
        messages: Chat messages (list of ``{"role": ..., "content": ...}``
            dicts) passed straight to ``client.chat.completions.create``.
        max_attempts: Total number of attempts (first call included) before
            the last retryable error is re-raised.

    Returns:
        The object produced by ``json.loads`` on the first choice's content.

    Raises:
        RateLimitError: Still rate limited after ``max_attempts`` attempts.
        json.JSONDecodeError: Reply was still invalid JSON after
            ``max_attempts`` attempts.
        Exception: Any other error from the API call or from parsing.
    """
    # Imported locally: the notebook's import cell only brings in retry,
    # wait_random_exponential and stop_after_attempt from tenacity.
    from tenacity import retry_if_exception_type

    retryable_errors = (RateLimitError, json.JSONDecodeError)

    def _announce_retry(retry_state):
        # Runs only when another attempt will actually follow.
        error = retry_state.outcome.exception()
        if isinstance(error, RateLimitError):
            print(f"Rate limit error: {error}. Retrying...")
        else:
            print(f"Received invalid JSON from API: {error}. Retrying...")

    @retry(
        retry=retry_if_exception_type(retryable_errors),
        wait=wait_random_exponential(min=1, max=60),
        stop=stop_after_attempt(max_attempts),
        before_sleep=_announce_retry,
        reraise=True,  # surface the original exception, not tenacity.RetryError
    )
    def _request():
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            response_format={"type": "json_object"},
            temperature=0.7,
            n=1
        )
        content = response.choices[0].message.content
        return json.loads(content)

    try:
        return _request()
    except retryable_errors as e:
        print(f"Giving up after {max_attempts} attempts: {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred during the API call: {e}")
        raise

# --- 2. Data Loading and Filtering ---

def load_and_filter_scienceqa_data(
    csv_path='/kaggle/input/sciencehotpotqa/pure_science_hotpot.csv',
    excluded_categories=('Physics and Astronomy',
                         'Earth Sciences and Natural Phenomena'),
):
    """
    Loads the science HotpotQA CSV, drops the excluded topic categories, and
    keeps only rows that have a usable text context.

    Args:
        csv_path: CSV file to read. It must contain the ``topic_category``
            and ``document`` columns.
        excluded_categories: ``topic_category`` values whose rows are dropped.

    Returns:
        A new DataFrame with an added ``text_context`` column (a copy of
        ``document``). Rows whose context is missing, empty, or only
        whitespace are removed. The original row index is preserved.
    """
    print("Loading ScienceQA dataset...")
    raw_df = pd.read_csv(csv_path)

    # Work on an explicit copy so adding a column below does not write into a
    # slice of raw_df (avoids pandas' SettingWithCopyWarning).
    keep_mask = ~raw_df['topic_category'].isin(list(excluded_categories))
    df = raw_df.loc[keep_mask].copy()

    print(df.shape)

    df['text_context'] = df['document']

    # Filter out rows where the context is missing, empty, or whitespace-only.
    has_context = df['text_context'].notna() & (
        df['text_context'].astype(str).str.strip().str.len() > 0
    )
    df = df.loc[has_context].copy()

    print(f"Loaded and filtered {len(df)} text-only questions.")
    return df

# --- 3. Revised Single-Prompt Function with Constraints ---

def construct_single_stage_prompt(context):
    """
    Builds the chat messages for the single-stage generation prompt.

    The user message asks the model to read ``context`` and return one JSON
    object with a question, a list of atomic correct answers, and a list of
    plausible distractors. The required keys named in the prompt match the
    keys used by the two embedded examples (`question`, `correct_answers`,
    `incorrect_answers`), so the instructions and examples do not contradict
    each other.

    Args:
        context: Text inserted verbatim under the "Context:" heading.

    Returns:
        A two-element list of chat messages: a fixed system message followed
        by the user message containing the instructions, the context, and the
        example outputs.
    """
    # Doubled braces ({{ }}) are literal braces in the f-string; only
    # {context} is interpolated.
    user_content = f"""
    You are an expert at identifying and extracting atomic facts and plausible distractors from a given context. You are given a task to generate questionaire to select students' understanding.

    Your task is to read the context and extract two distinct lists:
    1.  A list of 3-5 independent, atomic correct answers that are TRUE based on the context.
    2.  A list of 3-5 plausible but FALSE statements (distractors) based on the context.
    
    Let's think step by step:
    1.  Read the context carefully and identify a specific thematic category (e.g., "states of matter," "living organisms").
    2.  If the context contains "**Required Context:**", this information is critical and before answering that question, this context has to be present as the question needs this context.
    3.  From the context, extract a set of independent, atomic correct answers. Prioritize facts that describe relationships, processes, or less obvious conclusions, not just simple definitions.
    4.  Also, create a list of plausible, incorrect facts (distractors). These should be highly convincing but factually wrong based on the context. Ensure they use correct terminology from the context but apply it incorrectly.
    5.  Order the list of correct answers in order of their relative importance.
    6.  The intention is to later create a difficult QA dataset combining correct answer1 OR correct answer2/ correct answer1 AND correct answer2. That's why they have to be multiple atomic answers. This would make the combination creation easier.
    7.  Everything has to sound natural. Students answering this shouldn't have any confusion. Also don't just use the same contents from the context because it would be a giveaway to them.
    8.  To make it challenging, make the structure of both correct and incorrect answers alike wherever possible. This is not strict, only optional.
    9.  Frame the question to require multi-step reasoning, focusing on the logical correctness of both correct and incorrect answers. You can use the question present at the beginning of every context for this part.
    10. The options have to be representative of the question.
    11. Format the final output as a single JSON object.

    Strict Constraints:
    -   **Fact-Grounded:** The entire question, all answers, and all options must be derived from the context. 
    -   **No Pronouns:** All options must use proper subject names or descriptive phrases (e.g., "solids" instead of "they").
    -   **Multi-Step:** The answers must be statements that requires combining at least two facts from the context to arrive at it.
    -   **No Repetition:** The correct option MUST be a novel combination. It must NOT be a direct copy of a compound statement from the context.
    -   Each item in the lists must be a concise, single sentence.
    
    -   **JSON Format:** The final output must be a single JSON object with the following keys: `question`, `correct_answers`, `incorrect_answers`.


    Context:
    {context}

    Output JSON Example:
    {{
      "question": "Which of the following describes the key characteristics of both solids and liquids?",
      "correct_answers": [
        "Solids have a definite shape.",
        "Solids have a definite volume.",
        "Liquids take the shape of their container.",
        "Liquids have a definite volume."
      ],
      "incorrect_answers": [
        "Solids can be bent or broken easily.",
        "Liquids have a definite shape.",
        "Gases have a definite volume.",
        "Solids take the shape of their container."
      ]
    }}
    
    
    {{
  "question": "Which of the following statements about renewable energy is correct?",
  "correct_answers": [
    "Solar energy is a renewable resource.",
    "Wind energy is a renewable resource.",
    "Solar panels convert sunlight into electricity.",
    "Wind turbines convert wind into electricity."
  ],
  "incorrect_answers": [
    "Solar energy is a nonrenewable resource.",
    "Wind turbines convert coal into electricity.",
    "Solar panels generate energy at night."
  ]
  }}
    """
    messages = [
        {"role": "system", "content": "You are a QA system that strictly follows instructions."},
        {"role": "user", "content": user_content}
    ]
    return messages

# --- 4. Main Execution Block ---
def process_single_stage_pipeline(df, output_file_csv):
    """
    Runs the single-stage generation for every row of ``df`` and writes the
    collected results to a CSV file.

    For each row, a prompt is built from ``row['text_context']`` and sent to
    the model. The parsed JSON reply is extended with the row's
    ``text_context`` (stored as ``original_context``), ``all_sentences`` and
    ``document`` values. Rows that raise during generation or metadata
    copying are skipped, and the reason is printed so failures can be
    diagnosed.

    Args:
        df: DataFrame with ``text_context``, ``all_sentences`` and
            ``document`` columns.
        output_file_csv: Path of the CSV file to write (UTF-8, no index).

    Returns:
        None. The results are written to ``output_file_csv``.
    """
    print(f"Starting single-stage pipeline for {len(df)} rows.")
    final_output_list = []
    failed_rows = []

    # `i` is the positional row number (not the DataFrame index label), which
    # is what the progress messages report.
    for i, (_, row) in enumerate(df.iterrows()):
        # This is where the model generates the question
        messages = construct_single_stage_prompt(row['text_context'])
        try:
            result = call_gpt4o_mini(messages)

            # This is where we add the metadata to the final output
            result['original_context'] = row['text_context']
            result['all_sentences'] = row['all_sentences']
            result['document'] = row['document']

            final_output_list.append(result)
            print(f"  Processed row {i} successfully.")
        except Exception as e:
            # Best effort: keep going, but report why the row was dropped.
            failed_rows.append(i)
            print(f"  Failed to process row {i} ({type(e).__name__}: {e}). Skipping.")
        time.sleep(0.01)

    final_df = pd.DataFrame(final_output_list)
    final_df.to_csv(output_file_csv, index=False, encoding='utf-8')
    print(f"\nProcessing complete. {len(final_output_list)} final results saved to '{output_file_csv}'.")
    if failed_rows:
        print(f"{len(failed_rows)} rows failed and were skipped: {failed_rows}")

if __name__ == '__main__':
    # Load the filtered questions; generate and save only when rows remain.
    filtered_df = load_and_filter_scienceqa_data()
    if not filtered_df.empty:
        output_csv_file = 'sciencehotpotqa_single_stage_generated.csv'
        process_single_stage_pipeline(
            df=filtered_df,
            output_file_csv=output_csv_file,
        )

Loading ScienceQA dataset...
(1902, 7)
Loaded and filtered 1902 text-only questions.
Starting single-stage pipeline for 1902 rows.
  Processed row 0 successfully.
  Processed row 1 successfully.
  Processed row 2 successfully.
  Processed row 3 successfully.
  Processed row 4 successfully.
  Processed row 5 successfully.
  Processed row 6 successfully.
  Processed row 7 successfully.
  Processed row 8 successfully.
  Processed row 9 successfully.
  Processed row 10 successfully.
  Processed row 11 successfully.
  Processed row 12 successfully.
  Processed row 13 successfully.
  Processed row 14 successfully.
  Processed row 15 successfully.
  Processed row 16 successfully.
  Processed row 17 successfully.
  Processed row 18 successfully.
  Processed row 19 successfully.
  Processed row 20 successfully.
  Processed row 21 successfully.
  Processed row 22 successfully.
  Processed row 23 successfully.
  Processed row 24 successfully.
  Processed row 25 successfully.
  Processed row 26 su

In [5]:
# Reload the generated questions written by the pipeline cell.
new_df = pd.read_csv(
    filepath_or_buffer='/kaggle/working/sciencehotpotqa_single_stage_generated.csv'
)

In [6]:
# Preview the first five generated rows.
new_df.head(n=5)

Unnamed: 0,question,correct_answers,incorrect_answers,original_context,all_sentences,document
0,Which of the following statements accurately d...,['Cadmium Chloride is slightly soluble in wate...,['Cadmium Chloride is highly soluble in alcoho...,Cadmium Chloride is slightly soluble in this c...,['Cadmium chloride is a white crystalline comp...,Cadmium Chloride is slightly soluble in this c...
1,What characteristics do Kniphofia and Baptisia...,['Both Kniphofia and Baptisia produce flowerin...,['Kniphofia and Baptisia are both classified a...,What type of vegetation does Kniphofia and Bap...,['The Dinaric calcareous silver fir forests ar...,What type of vegetation does Kniphofia and Bap...
2,Which of the following statements best describ...,['Boreo-arctic Montane species are typically f...,['Boreo-arctic Montane species are primarily f...,What type of species is a Boreo-arctic Montane...,"[""The five main latitude regions of the Earth'...",What type of species is a Boreo-arctic Montane...
3,What are the shared characteristics of Fotherg...,['Fothergilla and Clerodendrum both belong to ...,['Fothergilla and Clerodendrum are both native...,What type of vegetation does Fothergilla and C...,"['Clerodendrum infortunatum, the hill glory bo...",What type of vegetation does Fothergilla and C...
4,Which statements accurately reflect the charac...,['Dendranthema boreale belongs to the Asterace...,['Dendranthema boreale is classified in the Or...,What are the plants in the same genus as the D...,['Navarretia is a genus of about 30 species of...,What are the plants in the same genus as the D...


In [7]:
# Print the generated fields of every row, one value per line, with a blank
# line between rows.
printed_columns = [
    'question',
    'correct_answers',
    'incorrect_answers',
    'original_context',
    'document',
]
for _, record in new_df.iterrows():
    for column in printed_columns:
        print(record[column])
    print()

Which of the following statements accurately describes the properties of Cadmium Chloride and Ethanol?
['Cadmium Chloride is slightly soluble in water.', 'Ethanol is highly soluble in water.', 'Ethanol is also known as ethyl alcohol.', 'Ethanol has the chemical formula C2H5OH.']
['Cadmium Chloride is highly soluble in alcohol.', 'Ethanol is a compound with the chemical formula C2H6OH.', 'Ethanol is a solid that is not hygroscopic.']
Cadmium Chloride is slightly soluble in this chemical, it is also called what?  It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol. Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and simple alcohol with the chemical formula C2H5OH .
Cadmium Chloride is slightly soluble in this chemical, it is also called what?  It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol. Ethanol, also called alcohol, ethyl alcohol, and drinking alcohol, is a compound and s