In [1]:
from openai import OpenAI
import json, os, sys
import os
import re
import random

In [2]:
# Shared API client for the notebook; the key is looked up in the
# OPENAI_API_KEY environment variable (None is passed if it is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
def generate_questions(chunk, client, model="gpt-4o"):
    """Ask a chat model for multiple-choice questions about article summaries.

    Args:
        chunk: Text interpolated into the prompt after "Summaries: ".
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the completion call.

    Returns:
        The ``content`` of the first choice's message, or the fixed string
        "Sorry, error from GPT." if any exception is raised while calling the
        client or reading the response (the error is printed, not re-raised).
    """
    instructions = f"""
You are an expert in news analysis and skilled at creating multiple-choice question answers that help to differentiate between articles. 
I have some summaries of news articles and I need to generate unique, comprehensive multiple-choice questions based on these summaries.

Your task is to generate questions that:
- Focusing on Kenneth Burke’s Dramatistic Pentad, which is a tool for analyzing stories and narratives.
- Help to differentiate between the articles.
- Are common to all articles, not specific to any one article.
- Are unique and not repeated in any other batch.
- Include "None of the above" as one of the options for each question.

Below is the format and example I need:

1. What was done?
   - A. The Chinese coast guard seized one of four food packs dropped by a Philippine military plane for Filipino navy personnel at a territorial outpost. After discovering the package contained food, they dumped it into the sea.
   - B. Philippine soldiers were reported to have pointed guns at Chinese coast guard personnel during a resupply mission to the grounded Sierra Madre ship. The Chinese coast guard responded to the resupply operation, which included food drops, by observing armed Philippine soldiers on the ship’s deck.
   - C. Turkish Foreign Minister Hakan Fidan began a trip to China and expressed priorities to support Hamas against Israel and increase trade with China, without condemning the Uyghur genocide.
   - D. The Philippines is collaborating with the United States and Japan to ensure the West Philippine Sea (WPS) remains free and safe amid tensions with Chinese maritime forces.
   - E. None of the above

Below is the questions that need multiple choice answers to be generated:

1. Why did the event happen?
2. How was the event carried out?
3. What was the outcome?
4. What was the primary motive behind the actions?
5. What was the reaction of the involved parties?
6. What was the broader context of the event?

Please generate as many unique and comprehensive answers as possible based on the given article summaries below. 
Make sure each answer is designed to highlight differences between the articles.

Summaries: {chunk}

Generate as many as you can unique questions based on the above summaries.
"""
    try:
        # temperature=0 is passed on every call.
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": instructions}],
            temperature=0,
        )
        return completion.choices[0].message.content
    except Exception as err:
        # Any failure is reported on stdout and replaced by a sentinel
        # string so the caller keeps running.
        print(f"Error: {err}")
        return "Sorry, error from GPT."

In [4]:
def process_batches(input_file, output_file, client, model, batch_size=None):
    """Generate questions for a selection of article summaries and save them.

    Reads a JSON list of article objects from ``input_file``, keeps the
    summaries at positions 200-299 plus everything from position 500 onwards,
    prints a fixed set of the selected summaries as a preview, sends the
    selection to ``generate_questions`` batch by batch, and writes one record
    per batch (``batch_start``, ``batch_end``, ``questions``) to
    ``output_file`` as indented JSON.

    Args:
        input_file: Path to a JSON file containing a list of objects, each
            with a "summary" key.
        output_file: Path of the JSON file to write.
        client: Passed through unchanged to ``generate_questions``.
        model: Passed through unchanged to ``generate_questions``.
        batch_size: Number of summaries per request. ``None`` (the default)
            sends the whole selection in a single request, which is what this
            function did before the parameter existed.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(input_file, "r", encoding="utf-8") as f:
        articles = json.load(f)

    summaries = [article["summary"] for article in articles]
    # Keep positions 200-299 and 500+ of the input. The result is not a fixed
    # count: it depends on how many articles the file holds.
    selected = summaries[200:300] + summaries[500:]
    num_summaries = len(selected)

    # Positions (within the selection) printed as a preview. Indices beyond
    # the end of a short selection are skipped instead of raising IndexError.
    preview_indices = (
        0, 5, 6, 8, 9, 11, 12, 14, 15, 17,
        18, 20, 22, 23, 27, 28, 29, 48, 108, 173,
    )
    for i in preview_indices:
        if i < num_summaries:
            print(selected[i])

    # A step of 0 makes range() raise ValueError, so an empty selection must
    # not be used as the step; with max(..., 1) the loop simply does not run.
    step = batch_size if batch_size is not None else max(num_summaries, 1)

    responses = []
    for start in range(0, num_summaries, step):
        batch = selected[start : start + step]
        chunk = json.dumps(batch)  # the prompt receives the batch as a JSON string
        questions = generate_questions(chunk, client, model=model)
        responses.append(
            {
                "batch_start": start,
                "batch_end": min(start + step - 1, num_summaries - 1),
                "questions": questions,
            }
        )

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(responses, f, indent=4)

In [5]:
# Source summaries and destination for the raw model responses.
input_file = "../data/QnA_data/combined_summary.json"
output_file = "output_questions.json"

# Uses the notebook-level `client` created in an earlier cell.
process_batches(
    input_file=input_file,
    output_file=output_file,
    client=client,
    model="gpt-4o",
)

The Philippine military chief, Gen Romeo Brawner, reported that the Chinese coast guard seized one of four food packages dropped by a plane for Filipino navy personnel at a territorial outpost in the disputed South China Sea. The Chinese personnel, suspecting the packages contained construction materials, dumped the food into the sea after discovering it was just rice and biscuits. The incident occurred at Second Thomas Shoal, where a Philippine navy ship has been grounded since 1999 to serve as an outpost. The May 19 airdrop led to a scramble between Chinese and Filipino forces to retrieve the packages, with the Chinese managing to seize one. The Philippines claims the shoal is within its exclusive economic zone, citing a 2016 international arbitration ruling that invalidated China's claims. The territorial disputes have heightened tensions, with potential implications for US-China relations, as the US is obligated to defend the Philippines under a mutual defense treaty. Other countri

In [6]:
# One numbered question followed by its five options A-E. Indentation before
# each "- X." option is optional (\s*), so replies that list the options
# flush-left are parsed too. DOTALL lets question/option text span lines.
_QUESTION_BLOCK_RE = re.compile(
    r"\d+\. (.+?)\n\s*- A\. (.+?)\n\s*- B\. (.+?)\n\s*- C\. (.+?)\n\s*- D\. (.+?)\n\s*- E\. (.+?)$",
    re.DOTALL,
)


def extract_questions(batch_questions):
    """Parse multiple-choice questions out of raw batch responses.

    Each batch's "questions" text is split on blank lines and every piece
    that looks like ``N. question`` followed by options ``- A.`` to ``- E.``
    is turned into an entry. Pieces that do not match (introductory text, an
    error message, malformed questions) are skipped.

    Args:
        batch_questions: Iterable of dicts, each with a "questions" string.

    Returns:
        Dict mapping question text to a dict of its choices keyed "A".."E".
        If the same question text appears more than once, the last
        occurrence wins.
    """
    questions = {}
    for batch in batch_questions:
        # Normalise Windows line endings so the blank-line split still works.
        text = batch["questions"].replace("\r\n", "\n")
        for block in text.split("\n\n"):
            match = _QUESTION_BLOCK_RE.match(block.strip())
            if match is None:
                continue
            stem, *options = (part.strip() for part in match.groups())
            questions[stem] = dict(zip("ABCDE", options))
    # Kept so the notebook cell keeps showing the parsed questions.
    print(questions)
    return questions


def convert_to_json2_format(unique_questions):
    """Re-key a question -> choices mapping as "Q1", "Q2", ... entries.

    Entries are numbered from 1 in the mapping's iteration order; each value
    is a dict with the original key under "question" and its value under
    "choices".
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }


def process_json(input_json):
    """Extract questions from raw batch responses and number them Q1, Q2, ...

    Thin composition of ``extract_questions`` followed by
    ``convert_to_json2_format``.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [7]:
# Read the raw batch responses back from disk
with open("output_questions.json") as f:
    json1 = json.loads(f.read())

In [8]:
# Turn the raw responses into the numbered Q1/Q2/... structure
json2_format = process_json(input_json=json1)

{'Why did the event happen?': {'A': 'To assert territorial claims and sovereignty in the South China Sea.', 'B': 'To strengthen economic and military ties between the Philippines and the United States.', 'C': 'To address and condemn human rights violations and geopolitical tensions.', 'D': 'To promote regional stability and cooperation through dialogue and negotiation.', 'E': 'None of the above'}, 'How was the event carried out?': {'A': 'Through military exercises and patrols in disputed waters.', 'B': 'By conducting high-level diplomatic meetings and dialogues.', 'C': 'Via public speeches and international forums addressing security issues.', 'D': 'Through economic investments and trade agreements.', 'E': 'None of the above'}, 'What was the outcome?': {'A': 'Increased tensions and confrontations in the South China Sea.', 'B': 'Strengthened alliances and commitments between the Philippines and the United States.', 'C': 'Public condemnation and criticism of aggressive actions by involve

In [9]:
# Write the formatted questions out as indented JSON
with open("formated_output_questions_5.json", mode="w") as f:
    f.write(json.dumps(json2_format, indent=4))