In [1]:
from openai import OpenAI
import json, os, sys
import os
import re
import random

In [2]:
# The API key is taken from the OPENAI_API_KEY environment variable so it is
# never stored in the notebook; a missing variable yields api_key=None.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
def generate_questions(chunk, client, model="gpt-4o", n=1):
    """Ask the chat model for multiple-choice questions about a set of summaries.

    Args:
        chunk: Text inserted verbatim at the end of the prompt (the caller in
            this notebook passes a JSON-encoded list of article summaries).
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the API call.
        n: Number shown in the prompt in front of "Summaries:".

    Returns:
        The message content of the first choice in the response. If anything
        raises while calling the API or reading the response, the error is
        printed and the fixed string "Sorry, error from GPT." is returned
        instead of propagating the exception.
    """
    prompt_text = f"""
Role: You are an expert in news analysis, skilled at creating multiple-choice questions that effectively differentiate topics between articles.

Task: Generate unique and comprehensive multiple-choice questions based on the provided article summaries. These questions should:

	•	Help to distinguish the topics covered in the articles.
	•	Be based on the content of the summaries provided.
	•	Be applicable to all articles, rather than specific to any single article.
	•	Be comprehensive, covering a wide range of summaries.
	•	Be unique, avoiding repetition.
	•	Be objective, ensuring that answers are fact-based and not open to interpretation.
	•	Include “None of the above” as one of the options for each question.

Requirements:

	1.	Question Format: Each question should be concise, clear, and directly related to the content of the summaries.
	2.	Answer Options: Provide five options for each question, with one correct answer and “None of the above” as one of the options.
	3.	Topic Differentiation: Ensure that the questions are designed to highlight differences in topics.

Example Structure:

	1.	What was done?
	•	A. The Chinese coast guard seized one of four food packs dropped by a Philippine military plane for Filipino navy personnel at a territorial outpost. After discovering the package contained food, they dumped it into the sea.
	•	B. Philippine soldiers were reported to have pointed guns at Chinese coast guard personnel during a resupply mission to the grounded Sierra Madre ship. The Chinese coast guard responded to the resupply operation, which included food drops, by observing armed Philippine soldiers on the ship’s deck.
	•	C. Turkish Foreign Minister Hakan Fidan began a trip to China and expressed priorities to support Hamas against Israel and increase trade with China, without condemning the Uyghur genocide.
	•	D. The Philippines is collaborating with the United States and Japan to ensure the West Philippine Sea (WPS) remains free and safe amid tensions with Chinese maritime forces.
	•	E. None of the above
    
Questions that need multiple choice answers to be generated (Generate your own unique topical questions when needed based on the provided summaries.):

	1. What is the main theme of the article? 
	2. What are the primary keywords or phrases in this article? 
	3. What event or issue is the article centered around?
	4. Which region or country is primarily discussed in the article?
	5. Who is most affected by the issues discussed in the article?
	6. What type of sources does the article cite?
	
Output: Provide a list of multiple-choice questions and answers following the above structure. Each question should effectively differentiate between the topics of the provided article summaries.

{n} Summaries: 
	{chunk}

"""
    # The whole prompt travels as a single user turn.
    conversation = [{"role": "user", "content": prompt_text}]
    try:
        # temperature=0 is passed on every request.
        completion = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=0,
        )
        # Reading the reply stays inside the try so a malformed response is
        # handled the same way as a failed request.
        return completion.choices[0].message.content
    except Exception as err:  # any failure falls back to the sentinel text
        print(f"Error: {err}")
        return "Sorry, error from GPT."

In [4]:
def process_batches(input_file, output_file, client, model, label):
    """Generate questions for a selected subset of article summaries and save them.

    Args:
        input_file: Path to a JSON file holding a list of article objects,
            each with a "summary" key.
        output_file: Path the result is written to as JSON.
        client: Client object forwarded to generate_questions.
        model: Model name forwarded to generate_questions.
        label: Iterable of integer positions selecting which articles'
            summaries are included.

    The selected summaries are shuffled, serialised into one JSON string and
    sent in a single generate_questions call. The output file contains a
    one-element list of the form [{"questions": <returned text>}].

    Raises:
        KeyError: if an article object has no "summary" key.
        IndexError: if a position in ``label`` is out of range.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which can mis-read non-ASCII article text.
    with open(input_file, "r", encoding="utf-8") as f:
        articles = json.load(f)

    article_summaries = [article["summary"] for article in articles]
    # Keep only the summaries at the requested positions.
    batch = [article_summaries[i] for i in label]

    # Randomise the order in which the summaries appear in the prompt.
    random.shuffle(batch)

    chunk = json.dumps(batch)  # all selected summaries as one JSON string
    questions = generate_questions(chunk, client, model=model, n=len(batch))

    # A single-entry list is written, matching what extract_questions iterates over.
    responses = [{"questions": questions}]

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(responses, f, indent=4)

In [5]:
import pandas as pd

# Load the label file: one unnamed column, exposed as "label".
labels = pd.read_csv("labels_4o.csv", names=["label"], header=None)
# Label-based selection (rows whose label is not 1) is currently disabled:
# label = list(labels[labels["label"] != 1].index)
# Instead, positions 0..199 are used, in random order.
label = list(range(200))
random.shuffle(label)
print(len(label))

200


In [6]:
# Source articles and destination for the raw model output.
input_file = "combined_Russo_Ukrainian_War.json"
output_file = "output_questions_test.json"

# Relies on `client` and `label` created in earlier cells.
process_batches(
    input_file=input_file,
    output_file=output_file,
    client=client,
    model="gpt-4o",
    label=label,
)

In [7]:
def extract_questions(batch_questions):
    """Parse raw model output into a mapping of question text to answer choices.

    Args:
        batch_questions: Iterable of dicts, each with a "questions" key whose
            value is the raw text returned by the model.

    Returns:
        dict mapping each question string to a dict with keys "A" to "E"
        holding the stripped option texts. When the same question text occurs
        more than once, the first occurrence is kept.

    The text is split on blank lines, and a piece is accepted only if it has
    this layout (option lines must be indented and start with "- "):

        1. Question text
           - A. ...
           - B. ...
           - C. ...
           - D. ...
           - E. ...

    Pieces that do not match (headers, error text, partial questions) are
    skipped silently.
    """
    # Compiled once per call rather than re-specified for every piece.
    # DOTALL lets an option's text span several lines.
    question_re = re.compile(
        r"\d+\. (.+?)\n\s+- A\. (.+?)\n\s+- B\. (.+?)\n\s+- C\. (.+?)\n\s+- D\. (.+?)\n\s+- E\. (.+?)$",
        re.DOTALL,
    )
    questions = {}
    for batch in batch_questions:
        for piece in batch["questions"].split("\n\n"):
            match = question_re.match(piece.strip())
            if match is None:
                continue
            question = match.group(1).strip()
            choices = {
                letter: match.group(index).strip()
                for index, letter in enumerate("ABCDE", start=2)
            }
            # setdefault keeps the first set of choices seen for a question.
            questions.setdefault(question, choices)
    return questions


def convert_to_json2_format(unique_questions):
    """Re-key a question -> choices mapping as "Q1", "Q2", ... entries.

    Args:
        unique_questions: dict mapping question text to its choices.

    Returns:
        dict whose keys are "Q<k>" (k counting from 1 in the input's iteration
        order) and whose values are {"question": <text>, "choices": <choices>}.
        The choices objects are reused, not copied.
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }


def process_json(input_json):
    """Turn raw generation output into the numbered "Q<k>" question format.

    Passes ``input_json`` to extract_questions and feeds the resulting
    question -> choices mapping to convert_to_json2_format.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [8]:
# Read back the raw model output saved by the generation step.
with open("output_questions_test.json") as f:
    json1 = json.load(f)

# Parse it into the numbered question/choices structure.
json2_format = process_json(json1)

# Persist the structured questions.
with open("formated_output_questions_v3.json", "w") as f:
    f.write(json.dumps(json2_format, indent=4))