In [54]:
from openai import OpenAI
import json, os, sys
import os
import re

In [38]:
# Shared OpenAI client used by the cells below. The key is read from the
# OPENAI_API_KEY environment variable so it never appears in the notebook;
# os.getenv returns None when the variable is not set.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [43]:
def generate_questions(chunk, client, model="gpt-4-turbo"):
    """Request multiple-choice questions for one chunk of article summaries.

    The chunk is interpolated verbatim at the end of a fixed instruction prompt,
    which is sent as a single user message with ``temperature=0``.

    Args:
        chunk: Text of the summaries to embed in the prompt.
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name forwarded to the completion call.

    Returns:
        The ``message.content`` of the first returned choice, or the literal
        string ``"Sorry, error from GPT."`` when the call raises any exception
        (the exception is printed, not re-raised).
    """
    request_text = f"""
I have summaries of 100 news articles and I need to generate multiple-choice questions based on these summaries. 
The questions should help differentiate between articles and provide a clear understanding of the content just by reading the answers.
The questions should be generic to all articles and shouldn't have answer related to one specific article.
Make sure to add None of the above as one of the option as I will ask these questions to articles which are not of the same topic. 
Provide me 20 Questions for the given article summaries. Make sure all the questions are unique and not repeated.
Here are the example questions and format I need:

1. What is the primary focus of the article?
   - A. Military actions and attacks
   - B. Political statements and negotiations
   - C. Humanitarian impact and casualties
   - D. Protests and public reactions
   - E. None of the above

2. Which international organization is prominently mentioned in the article?
   - A. United Nations
   - B. NATO
   - C. European Union
   - D. African Union
   - E. None of the above

3. What is the main geographical area discussed in the article?
   - A. Gaza Strip
   - B. West Bank
   - C. Israel
   - D. International (e.g. U.S. Europe)
   - E. None of the above

The questions should be comprehensive and cover various aspects like focus, primary actors, geographical area, outcomes, public reaction, and any specific mentions of organizations or events. Please generate similar questions for the given article summaries below:

Summaries: {chunk}
"""
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": request_text}],
            temperature=0,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:  # best effort: report the failure and fall through to the sentinel
        print(f"Error: {err}")
    return "Sorry, error from GPT."


In [52]:
def process_batches(input_file, output_file, client, model="gpt-4-turbo",
                    batch_size=50, start_index=0):
    """Generate questions for every batch of article summaries and save them.

    Reads a JSON array of article objects from ``input_file``, takes each
    article's ``'summary'`` field, sends the summaries to ``generate_questions``
    in consecutive batches, and writes the collected responses to
    ``output_file`` as a JSON list.

    Args:
        input_file: Path to a JSON file containing a list of objects that each
            have a ``'summary'`` key.
        output_file: Path the list of batch results is written to (overwritten).
        client: Client object passed through to ``generate_questions``.
        model: Model name passed through to ``generate_questions``.
        batch_size: Number of summaries sent per request. Must be positive.
        start_index: Index of the first summary to process. Defaults to 0 so
            the whole file is covered; pass a larger value only to resume a
            partially completed run.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``start_index`` is
            negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if start_index < 0:
        raise ValueError("start_index must not be negative")

    # Read the article summaries from the input JSON file
    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    # Extract summaries from the articles
    article_summaries = [article['summary'] for article in articles]
    num_summaries = len(article_summaries)

    responses = []

    # Walk the summaries in consecutive windows of `batch_size`
    for i in range(start_index, num_summaries, batch_size):
        batch = article_summaries[i:i + batch_size]
        chunk = json.dumps(batch)  # Convert the batch to a JSON string

        questions = generate_questions(chunk, client, model=model)
        responses.append({
            "batch_start": i,
            # The last batch may be short, so clamp to the final valid index
            "batch_end": min(i + batch_size - 1, num_summaries - 1),
            "questions": questions
        })

    # Write the generated questions to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(responses, f, indent=4)

In [53]:
# Input: combined article summaries. Output: raw per-batch model responses.
# Both paths are relative to the notebook's working directory.
input_file = '../data/QnA_data/combined_summary.json'
output_file = '../data/QnA_data/output_questions.json'

# Requires the `client` object created in the client-setup cell earlier in the notebook.
# The model is explicitly overridden to "gpt-4o" here instead of the function's default.
process_batches(input_file, output_file, client, model="gpt-4o")

In [55]:
# One numbered question followed by options A-E, each on its own "- X. text" line.
# Whitespace before each dash is optional so indented and flush-left lists both parse.
_QUESTION_BLOCK_RE = re.compile(
    r"\d+\. (.+?)\n\s*- A\. (.+?)\n\s*- B\. (.+?)\n\s*- C\. (.+?)\n\s*- D\. (.+?)\n\s*- E\. (.+?)$",
    re.DOTALL,
)


def extract_questions(batch_questions):
    """Parse raw batch responses into a mapping of unique questions to choices.

    Each batch's ``"questions"`` text is split on blank lines, and every piece
    that looks like ``"<n>. <question>"`` followed by ``- A.`` through ``- E.``
    option lines is parsed. Pieces that do not match (intro text, error
    sentinels, malformed questions) are skipped.

    Args:
        batch_questions: Iterable of dicts carrying the model output under the
            ``"questions"`` key. Entries whose value is missing or not a string
            are ignored.

    Returns:
        Dict mapping question text to ``{"A": ..., "B": ..., "C": ..., "D": ...,
        "E": ...}``. When the same question text appears more than once, the
        first occurrence is kept.
    """
    questions = {}
    for batch in batch_questions:
        raw_text = batch.get("questions")
        if not isinstance(raw_text, str):
            # e.g. None when the API returned no content for this batch
            continue

        # Normalise Windows line endings so the blank-line split below applies
        normalized = raw_text.replace("\r\n", "\n")

        # Blank lines (possibly containing stray spaces) separate the questions
        for block in re.split(r"\n\s*\n", normalized):
            match = _QUESTION_BLOCK_RE.match(block.strip())
            if match is None:
                continue

            question = match.group(1).strip()
            if question in questions:
                continue  # keep the first version of a repeated question

            # Groups 2..6 hold the text of options A..E, in order
            questions[question] = {
                letter: match.group(index).strip()
                for index, letter in enumerate("ABCDE", start=2)
            }
    return questions

def convert_to_json2_format(unique_questions):
    """Number the questions and wrap each one in the output record layout.

    Args:
        unique_questions: Mapping of question text to its choices.

    Returns:
        Dict keyed ``"Q1"``, ``"Q2"``, ... in the mapping's iteration order,
        where each value is ``{"question": <text>, "choices": <choices>}``.
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }

def process_json(input_json):
    """Turn raw batch responses into the numbered question layout.

    Passes ``input_json`` to ``extract_questions`` and feeds the result to
    ``convert_to_json2_format``, returning whatever the latter produces.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [56]:
# Load the raw per-batch responses (the file written by the process_batches cell)
with open('../data/QnA_data/output_questions.json', 'r') as f:
    json1 = json.load(f)

# Parse the responses, keep unique questions, and renumber them as "Q1", "Q2", ...
json2_format = process_json(json1)

# Save the formatted questions; note the output file name is spelled "formated"
with open('../data/QnA_data/formated_output_questions.json', 'w') as f:
    json.dump(json2_format, f, indent=4)