In [1]:
from openai import OpenAI
import json, os, sys
import os
import re
import random

In [2]:
# Shared API client; the key is looked up in the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
def generate_questions(chunk, client, model="gpt-4o"):
    """Ask the chat model for multiple-choice questions about a batch of summaries.

    Args:
        chunk: Text interpolated into the prompt after "Summaries:".
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the completion request.

    Returns:
        The ``content`` of the first returned choice, or the fixed string
        "Sorry, error from GPT." when the request (or reading its result)
        raises any exception.
    """
    instructions = f"""
You are an expert in news analysis and skilled at creating multiple-choice question answers that help to differentiate topics between articles. 
I have article summary and I need to generate unique, comprehensive multiple-choice questions based on these summaries.

Your task is to generate questions that:
- Help to differentiate topics between the articles.
- Are common to all articles, not specific to any one article.
- Are unique and not repeated in any other batch.
- Include "None of the above" as one of the options for each question.

Below is the format and example I need:

1. What is the primary focus of the article?
   - A. The expansion of the U.S. Navy's fleet.
   - B. The development of new space exploration technologies.
   - C. The signing of a peace treaty with neighboring countries.
   - D. The introduction of new economic reforms.
   - E. None of the above

Below is the questions that need multiple choice answers to be generated:

1. What is the main theme of the article?
2. What are the primary keywords or phrases in this article? 
3. What event or issue is the article centered around?
4. Which region or country is primarily discussed in the article?
5. Who is most affected by the issues discussed in the article?
6. What type of sources does the article cite?

Summaries: {chunk}

Generate as many as you can unique questions based on the above summaries.

"""
    try:
        # temperature=0 is passed on every request.
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": instructions}],
            temperature=0,
        )
        return completion.choices[0].message.content
    except Exception as err:  # best-effort: report the failure and return a sentinel
        print(f"Error: {err}")
        return "Sorry, error from GPT."

In [4]:
def process_batches(input_file, output_file, client, model, batch_size=200, seed=None):
    """Generate questions for every batch of article summaries and save them.

    The input JSON must be a list of objects that each carry a "summary" key.
    The list is shuffled, split into consecutive batches, and each batch is
    sent to ``generate_questions``. One record per batch is written to
    ``output_file``.

    Args:
        input_file: Path of the JSON file holding the articles.
        output_file: Path of the JSON file to (over)write with the results.
        client: API client forwarded to ``generate_questions``.
        model: Model name forwarded to ``generate_questions``.
        batch_size: Number of summaries per request (default 200).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If an article has no "summary" key.

    Note:
        "batch_start"/"batch_end" are inclusive positions in the *shuffled*
        list; the shuffled order itself is not saved.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(input_file, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Shuffle so that each batch mixes articles from across the file.
    if seed is None:
        random.shuffle(articles)
    else:
        random.Random(seed).shuffle(articles)

    article_summaries = [article["summary"] for article in articles]
    num_summaries = len(article_summaries)

    responses = []

    # Process the summaries in batches of `batch_size`.
    for start in range(0, num_summaries, batch_size):
        batch = article_summaries[start : start + batch_size]
        chunk = json.dumps(batch)  # the batch is sent as one JSON-encoded list

        questions = generate_questions(chunk, client, model=model)
        responses.append(
            {
                "batch_start": start,
                "batch_end": min(start + batch_size - 1, num_summaries - 1),
                "questions": questions,
            }
        )

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must
    # be opened with an encoding that can represent all of them.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(responses, f, ensure_ascii=False, indent=4)

In [5]:
# Source summaries and destination for the raw per-batch model output.
input_file = "combined_Russo_Ukrainian_War.json"
output_file = "output_questions_eng.json"

# Uses the `client` created in an earlier cell.
process_batches(
    input_file=input_file,
    output_file=output_file,
    client=client,
    model="gpt-4o",
)

In [6]:
def extract_questions(batch_questions):
    """Parse the raw model output of every batch into unique questions.

    Each batch's "questions" text is split on blank lines and every piece is
    matched against the expected layout::

        1. Question text
           - A. ...
           - B. ...
           - C. ...
           - D. ...
           - E. ...

    Pieces that do not match the layout are ignored.

    Args:
        batch_questions: Iterable of dicts with a "questions" entry holding
            the model's text. Entries whose value is missing or not a string
            (e.g. ``None`` when the API returned no content) are skipped.

    Returns:
        dict mapping question text to ``{"A": ..., "B": ..., "C": ..., "D": ...,
        "E": ...}``. When the same question text appears more than once, the
        choices from its first occurrence are kept.
    """
    # Compiled once instead of once per candidate block.
    question_re = re.compile(
        r"\d+\. (.+?)\n\s+- A\. (.+?)\n\s+- B\. (.+?)\n\s+- C\. (.+?)\n\s+- D\. (.+?)\n\s+- E\. (.+?)$",
        re.DOTALL,
    )

    questions = {}
    for batch in batch_questions:
        raw_text = batch.get("questions")
        if not isinstance(raw_text, str):
            continue

        # Split on blank lines, also accepting whitespace-only lines and CRLF
        # endings. Otherwise two questions stay glued together and the final
        # DOTALL group for choice E swallows everything that follows.
        for block in re.split(r"\n\s*\n", raw_text):
            match = question_re.match(block.strip())
            if match is None:
                continue
            question, *options = (group.strip() for group in match.groups())
            if question not in questions:
                questions[question] = dict(zip("ABCDE", options))
    return questions


def convert_to_json2_format(unique_questions):
    """Label each question "Q1", "Q2", ... following the mapping's iteration order.

    Args:
        unique_questions: Mapping of question text to its choices.

    Returns:
        dict of ``"Q<n>"`` -> ``{"question": <text>, "choices": <choices>}``,
        numbered from 1.
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }


def process_json(input_json):
    """Turn the per-batch records into the numbered question/choices mapping.

    Runs ``extract_questions`` on ``input_json`` and feeds the result to
    ``convert_to_json2_format``.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [7]:
# Load JSON 1: the raw per-batch model output.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open("output_questions_eng.json", "r", encoding="utf-8") as f:
    json1 = json.load(f)

# Process JSON 1 to get the numbered question/choices layout (JSON 2)
json2_format = process_json(json1)

# Save the result to a new JSON file. ensure_ascii=False writes non-ASCII
# characters verbatim, which a non-UTF-8 default encoding may fail to encode.
with open("formated_output_questions_eng.json", "w", encoding="utf-8") as f:
    json.dump(json2_format, f, ensure_ascii=False, indent=4)

In [8]:
# Show the raw per-batch records held in `json1` (prints the entire list).
print(json1)

[{'batch_start': 0, 'batch_end': 199, 'questions': "1. What is the main theme of the article?\n   - A. Military aid and support for Ukraine.\n   - B. Economic development in the Gulf Cooperation Council (GCC) countries.\n   - C. Technological advancements in the electric toothbrush market.\n   - D. Political dynamics within the European Union.\n   - E. None of the above\n\n2. What are the primary keywords or phrases in this article?\n   - A. Military aid, NATO, Ukraine.\n   - B. Real estate, tourism, oil prices.\n   - C. Oral hygiene, smart toothbrushes, e-commerce.\n   - D. Sanctions, EU, Dr. Moshe Kantor.\n   - E. None of the above\n\n3. What event or issue is the article centered around?\n   - A. The Russia-Ukraine conflict and international responses.\n   - B. The economic growth forecast for GCC countries.\n   - C. The market growth of electric toothbrushes.\n   - D. The EU's sanctions regime and its flaws.\n   - E. None of the above\n\n4. Which region or country is primarily disc