## Step 3: Use an LLM to answer the MCQs for each document

In [1]:
import json, os
from openai import OpenAI

# Location of the input data for this step
source_data_path = "json/20ng/"
filename = "topic_9_10_documents.json"
source_filename = f"{source_data_path}{filename}"

# Read the documents that the model will be asked about
with open(source_filename, encoding="utf-8") as f:
    articles = json.load(f)

# Read the multiple-choice questions
questions_filename = f"{source_data_path}topic_9_10_mcqs.json"
with open(questions_filename, mode="r", encoding="utf-8") as f:
    questions = json.load(f)

# Build the API client; the key is read from the OPENAI_API_KEY environment
# variable (passing it explicitly mirrors the library default).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
def get_answers(article, questions, model):
    """Ask the chat model to answer all questions about one article.

    Builds a single prompt containing the instructions, the article text and
    the questions, and sends it through the module-level ``client``.

    Args:
        article: Mapping read with ``.get("id")`` (used only for the progress
            message) and ``.get("text")`` (inserted into the prompt).
        questions: Object interpolated into the prompt via its string form.
        model: Model name passed to ``client.chat.completions.create``.

    Returns:
        The response object returned by the client, or ``None`` when the
        request raised an exception (the error is printed).
    """
    article_title = article.get("id")
    article_content = article.get("text")
    print(f"Proceeding article {article_title}")

    # NOTE: the backslash continuations keep the leading indentation of each
    # line inside the string, so this literal is left exactly as it was to
    # avoid changing the prompt sent to the model.
    pre = "You are an AI trained to understand documents and generate concise answers to multiple-choice questions based on the content. \
        Please read the following document carefully. After reading, answer ALL the questions listed below. \
            Your answers must be in capital letters and formatted as a single string, where each question number is followed by its corresponding answer letter. \
                Separate each question-answer pair with a semicolon. \
                    Example format: 1A;2B;3C;4D;... \n\n"

    prompt = pre + f"Document Content:\n{article_content}\n\n Questions: {questions}\n"
    try:
        return client.chat.completions.create(
            model=model, messages=[{"role": "user", "content": prompt}], temperature=0
        )
    except Exception as e:
        # Best effort: report the failure and hand back None. Previously the
        # function fell through to `return response` with `response` unbound,
        # which raised UnboundLocalError and hid the real error.
        print(f"Error: {e}")
        return None

In [None]:
"""
def convert_string_to_dict(answer_string):
    if answer_string == "":
        return {}
    if answer_string[-1] == ";":
        answer_string = answer_string[:-1]
    return {item[:-1]: item[-1] for item in answer_string.split(";")}
"""


def convert_string_to_dict(answer_string):
    """Parse an answer string such as ``"1A;2B;3C"`` into a dictionary.

    Each ``;``-separated segment is split into a key (everything but the last
    character) and an answer letter (the last character). Letters A-E are
    converted, case-insensitively, to the indices 0-4; any other character is
    kept unchanged as the value.

    Surrounding whitespace is ignored, and empty segments (trailing ``;``,
    ``;;``, a trailing newline) or segments without a key are skipped instead
    of raising or producing an empty key.

    Args:
        answer_string: The raw answer text; ``None`` or an empty string gives
            an empty result.

    Returns:
        dict mapping the question key (str) to the option index (int), or to
        the raw character when it is not one of A-E.
    """
    if not answer_string:
        return {}

    # Mapping from the answer letter to its option index
    mapping = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}

    result = {}
    for raw_item in answer_string.split(";"):
        item = raw_item.strip()
        if not item:
            # Nothing between two separators (or after the last one)
            continue
        key = item[:-1].strip()
        if not key:
            # A lone letter with no question number cannot be attributed
            continue
        value_char = item[-1]
        # Look the letter up case-insensitively; fall back to the raw character
        result[key] = mapping.get(value_char.upper(), value_char)

    return result


results = []
iteration = 0
max_iteration = len(articles)
des_data_path = "json/20ng/"
# Number of processed articles between intermediate saves, so that a failure
# late in the run does not lose everything collected so far.
checkpoint_interval = 500

for iteration, article in enumerate(articles, start=1):

    answers = get_answers(article, questions, model="gpt-4.1-nano")

    # Tolerate a missing response as well as a message without text content;
    # both are recorded as "no answers" instead of aborting the whole run.
    pure_answers = answers.choices[0].message.content if answers is not None else None
    if pure_answers is None:
        print(f"No answer returned for article {article.get('id')}")
    print(pure_answers)

    # Convert the answer string to a dictionary keyed by question number, with
    # the option letters mapped to indices:
    # "1A;2B;3C;44D" -> {'1': 0, '2': 1, '3': 2, '44': 3}
    processed_answers = convert_string_to_dict(pure_answers or "")
    article["answers"] = processed_answers

    results.append(article)
    print(f"Processed {iteration} out of {max_iteration}")

    # Periodically save the results collected so far to a JSON file
    if iteration % checkpoint_interval == 0:
        des_filename = des_data_path + f"topic_9_10_documents_{iteration}.json"
        with open(des_filename, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

# Save the complete results; the file name carries the number of processed articles
des_filename = des_data_path + f"topic_9_10_documents_{iteration}.json"
with open(des_filename, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

Proceeding article 0
1B;2D;3D
Processed 1 out of 1914
Proceeding article 1
1B;2D;3D
Processed 2 out of 1914
Proceeding article 2
1B;2D;3D
Processed 3 out of 1914
Proceeding article 3
1B;2D;3D
Processed 4 out of 1914
Proceeding article 4
1A;2D;3D
Processed 5 out of 1914
Proceeding article 5
1B;2D;3D
Processed 6 out of 1914
Proceeding article 6
1A;2D;3C
Processed 7 out of 1914
Proceeding article 7
1B;2D;3D
Processed 8 out of 1914
Proceeding article 8
1B;2D;3C
Processed 9 out of 1914
Proceeding article 9
1B;2D;3D
Processed 10 out of 1914
Proceeding article 10
1A;2D;3A
Processed 11 out of 1914
