In [1]:
from openai import OpenAI
import json, os, sys
import os
import re
import random

In [2]:
# Build the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable; os.getenv returns None when that variable is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Topic number. It selects the input file json/topic_{topic}.json and is
# embedded in the name of the formatted output file
# (json/topic_{topic}_formated_output_questions_v2.json).
topic = 19

In [4]:
def generate_questions(chunk, client, model="gpt-4o-mini"):
    """Ask a chat model to write differentiating MCQs for a batch of abstracts.

    Args:
        chunk: Text holding the abstracts of this batch. It is interpolated
            verbatim after "Abstracts: " at the end of the prompt.
        client: Object exposing ``chat.completions.create(...)`` (for example
            an ``openai.OpenAI`` instance).
        model: Name of the chat model to query.

    Returns:
        The message content of the first completion choice, or the sentinel
        string ``"Sorry, error from GPT."`` when the request raised.
    """
    # The two numbered examples below are few-shot demonstrations of the output
    # format. They use different question stems on purpose: the prompt asks for
    # unique questions, so the examples must not repeat each other.
    prompt = f"""
You are an AI assistant analyzing a corpus of NIPS papers spanning multiple years. 
Your task is to design multiple-choice questions (MCQs) that highlight key differences among those papers. 
I will provide abstracts of NIPS papers, and you will generate unique, comprehensive MCQs based on them. 

These questions should:
- Help to differentiate between the papers.
- Are common to all articles, not specific to any one paper.
- Cover various aspects, including but not limited to research focus, methodology, theoretical advancements, and applications.
- Are unique and not repeated in any other batch.
- Include "None of the above" as one of the options for each question.

Below is the format and example I need:

1. What is the primary focus of the paper?
   - A. Reinforcement learning
   - B. Deep learning
   - C. Bayesian methods
   - D. Kernel methods
   - E. None of the above

2. What is the main advantage of the proposed method?
   - A. Faster convergence compared to existing methods
   - B. Higher accuracy
   - C. Better scalability to large datasets
   - D. Improved interpretability of results
   - E. None of the above

Generate as many unique and comprehensive questions as possible based on the given paper abstracts below. 
Each question should be designed to highlight key differences between the papers.

Abstracts: {chunk}

"""
    messages = [{"role": "user", "content": prompt}]
    try:
        # temperature=0 keeps the sampling as deterministic as the API allows.
        response = client.chat.completions.create(
            model=model, messages=messages, temperature=0
        )
        return response.choices[0].message.content
    except Exception as e:  # if the model fails to return a response
        # Deliberate best-effort: report the failure and hand back a sentinel
        # so that one failed batch does not abort the whole run.
        print(f"Error: {e}")
        return "Sorry, error from GPT."

In [5]:
def process_batches(input_file, output_file, client, model, batch_size=100, seed=None):
    """Generate MCQs for every batch of abstracts and save the raw responses.

    Loads a JSON list of article objects from ``input_file``, shuffles it,
    sends the ``"abstract_new"`` values to ``generate_questions`` in groups of
    ``batch_size``, and writes one record per batch to ``output_file``.

    Args:
        input_file: Path to a JSON file containing a list of objects, each
            with an ``"abstract_new"`` key.
        output_file: Path the list of batch records is written to; an
            existing file is overwritten.
        client: Passed through to ``generate_questions``.
        model: Model name passed through to ``generate_questions``.
        batch_size: Number of abstracts sent per request (default 100).
        seed: Optional seed for the shuffle. With ``None`` the module-level
            ``random`` generator is used, so the order differs between runs;
            pass a value to make the batch composition reproducible.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Each output record has the keys ``"batch_start"`` and ``"batch_end"``
    (inclusive positions in the shuffled order, not in the input file) and
    ``"questions"`` (whatever ``generate_questions`` returned).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Read the article summaries from the input JSON file
    with open(input_file, "r", encoding="utf-8") as f:
        articles = json.load(f)

    # Shuffle the data; a private generator is used when a seed is given so the
    # global random state is left untouched.
    if seed is None:
        random.shuffle(articles)
    else:
        random.Random(seed).shuffle(articles)

    # Extract summaries from the articles
    article_summaries = [article["abstract_new"] for article in articles]
    num_summaries = len(article_summaries)

    responses = []

    # Process the article summaries batch by batch
    for start in range(0, num_summaries, batch_size):
        batch = article_summaries[start : start + batch_size]
        chunk = json.dumps(batch)  # Convert the batch to a JSON string

        questions = generate_questions(chunk, client, model=model)
        responses.append(
            {
                "batch_start": start,
                # The last batch may be shorter than batch_size.
                "batch_end": start + len(batch) - 1,
                "questions": questions,
            }
        )

    # Write the generated questions to the output JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(responses, f, indent=4)

In [6]:
# Input: the article list for the selected topic.
# Output: the raw per-batch model responses. This path does not contain the
# topic number and is opened for writing, so each run overwrites the file.
input_file = f"json/topic_{topic}.json"
output_file = "json/output_questions_v2.json"

# Assuming you have your OpenAI API client initialized as `client`.
# model="gpt-4o" is passed explicitly, so the "gpt-4o-mini" default of
# generate_questions is not used for this run.
process_batches(input_file, output_file, client, model="gpt-4o")

In [7]:
def extract_questions(batch_questions):
    """Parse raw model output into a de-duplicated question -> choices mapping.

    Args:
        batch_questions: Iterable of batch records (dicts). The text under
            each record's ``"questions"`` key is expected to contain blocks
            separated by a blank line, each of the form::

                1. Question text
                   - A. ...
                   - B. ...
                   - C. ...
                   - D. ...
                   - E. ...

    Returns:
        A dict mapping each question text to ``{"A": ..., ..., "E": ...}`` in
        first-seen order. When the same question text occurs more than once,
        the first occurrence wins. Blocks that do not match the format, and
        records whose ``"questions"`` value is missing or not a string, are
        skipped.
    """
    # Compiled once instead of once per block. DOTALL lets a question or an
    # option span several lines.
    pattern = re.compile(
        r"\d+\. (.+?)\n\s+- A\. (.+?)\n\s+- B\. (.+?)\n\s+- C\. (.+?)\n\s+- D\. (.+?)\n\s+- E\. (.+?)$",
        re.DOTALL,
    )

    questions = {}
    for batch in batch_questions:
        raw = batch.get("questions")
        if not isinstance(raw, str):
            # A completion without text content is stored as JSON null; there
            # is nothing to parse for such a batch.
            continue
        # Normalise Windows line endings so the blank-line split below still
        # separates the blocks instead of folding them into option E.
        for block in raw.replace("\r\n", "\n").split("\n\n"):
            match = pattern.match(block.strip())
            if match is None:
                continue
            question, *options = (part.strip() for part in match.groups())
            if question not in questions:
                questions[question] = dict(zip("ABCDE", options))
    return questions


def convert_to_json2_format(unique_questions):
    """Number the questions and wrap each one in a question/choices record.

    Args:
        unique_questions: Mapping of question text to its choices.

    Returns:
        A dict keyed ``"Q1"``, ``"Q2"``, ... in the iteration order of
        ``unique_questions``; each value is
        ``{"question": <text>, "choices": <choices>}``.
    """
    return {
        f"Q{position}": {"question": text, "choices": options}
        for position, (text, options) in enumerate(unique_questions.items(), start=1)
    }


def process_json(input_json):
    """Turn the raw batch records into the numbered question format.

    The records are first reduced to unique questions by
    ``extract_questions``; that mapping is then numbered by
    ``convert_to_json2_format``, whose result is returned.
    """
    return convert_to_json2_format(extract_questions(input_json))

In [8]:
# Load JSON 1: the raw per-batch responses stored at the path that the
# generation step uses as its output file.
with open("json/output_questions_v2.json", "r") as f:
    json1 = json.load(f)

# Process JSON 1 to get JSON 2 format:
# {"Q1": {"question": ..., "choices": {...}}, "Q2": ...}
json2_format = process_json(json1)

# Save the result to a new, topic-specific JSON file
# ("formated" is the spelling used in the file name).
with open(f"json/topic_{topic}_formated_output_questions_v2.json", "w") as f:
    json.dump(json2_format, f, indent=4)