In [1]:
%pip install openai
from openai import OpenAI
import os
from utils import *

# Build the API client; the key is read from the OPENAI_API_KEY environment variable
# (a missing variable raises KeyError here rather than failing later on the first request).
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Function to extract questions and answers from the generated question/answer pairs
def parse_qa_string(qa_string):
    """
    Parse model output of the form
    "<n>. question: <question>, answer: <answer>" (one pair per line)
    into a list of (question, answer) tuples.

    Lines that do not contain both the 'question: ' and 'answer: ' markers are
    skipped instead of raising, so a single malformed line does not discard the
    whole response.

    Parameters:
    - qa_string (str): Raw text containing one question/answer pair per line.

    Returns:
    - list[tuple[str, str]]: (question, answer) pairs in the order they appear.
    """
    qa_list = []
    for line in qa_string.strip().split("\n"):
        # Everything before the first 'question: ' (e.g. the "1. " numbering) is dropped.
        _, question_marker, remainder = line.partition('question: ')
        if not question_marker:
            continue
        # Split on the FIRST 'answer: ' only, so an answer that itself contains
        # the text 'answer: ' no longer breaks tuple unpacking.
        question, answer_marker, answer = remainder.partition('answer: ')
        if not answer_marker:
            continue
        # The requested format separates the two fields with ", ", which would
        # otherwise leave a dangling comma at the end of every question.
        question = question.strip().rstrip(',').rstrip()
        qa_list.append((question, answer.strip()))

    return qa_list

In [5]:
def generate_qa_pairs(article_text, client, max_retries=5, retry_delay=1.0):
    """
    Takes in a text excerpt from a Wikipedia article and generates a list of question/answer pairs using GPT-3.5.

    A failed attempt (an API error or a response that cannot be parsed) is retried
    with exponential backoff, up to `max_retries` attempts in total, instead of
    looping forever on errors that will never succeed.

    Parameters:
    - article_text (str): The excerpt of the Wikipedia article.
    - client (openai.OpenAI): An instance of the OpenAI API client.
    - max_retries (int): Maximum number of attempts before giving up (values below 1 are treated as 1).
    - retry_delay (float): Delay in seconds before the first retry; doubled after every further failure.

    Returns:
    - list[tuple[str, str]]: (question, answer) pairs as produced by parse_qa_string.

    Raises:
    - RuntimeError: If every attempt fails; chained from the last underlying exception.
    """
    # Imported locally so the function does not depend on (or get shadowed by)
    # names pulled in through the notebook's star import.
    import time

    prompt = (
        "I want to generate a list of 10 questions/answers from this wikipedia excerpt:\n"
        + article_text +
        "\nAn example of this might be question: 'When was George Washington born', answer: 'George Washington was born in 1732'.\n"
        "Please format your response as a numbered list of the form\n"
        "1. question: <generated question>, answer: <generated answer>\n"
        "2. question: <generated question>, answer: <generated answer>\n"
        "etc...\n"
        "where the question and answer are on the same line. Please only return the list as your response with no other text surrounding it.\n"
        "Keep in mind two points while generating these questions/answers:\n"
        "1) The questions should be answerable even without the given text (i.e. the question should not require something that was defined in the text to answer it).\n"
        "2) The answers should either be a single word or a short phrase.\n"
    )

    attempts = max(1, max_retries)
    last_error = None
    for attempt in range(attempts):
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        # NOTE: kept verbatim (including the backslash continuation) so the
                        # text sent to the model is unchanged.
                        "content": "You are an AI. Please answer the following questions as if you were an expert on the subject. Please make sure that each question/answer pair\
                        is on the same line anad that the response has no other text surrounding it.",
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                temperature=0.5,
                max_tokens=1000,
                n=1,
            )
            qa_text = completion.choices[0].message.content
            return parse_qa_string(qa_text)
        except Exception as e:
            print(e)
            last_error = e
            # Back off before the next attempt; no point sleeping after the final one.
            if attempt < attempts - 1:
                time.sleep(retry_delay * (2 ** attempt))

    raise RuntimeError(
        f"generate_qa_pairs failed after {attempts} attempts"
    ) from last_error

In [6]:
# Checkpoint qa_map: resume from a previous partial run if a checkpoint exists.
# The checkpoint is written back to the same path below, so an interrupted run
# can actually pick up where it left off.
CHECKPOINT_PATH = 'qa_map.json'
CHECKPOINT_EVERY = 100  # report progress and save after this many articles

if os.path.exists(CHECKPOINT_PATH):
    qa_map = load_json(CHECKPOINT_PATH)
else:
    qa_map = {}

# Load articles
post_2024_articles = load_json('../datasets/post_2024_articles.json')
pre_2021_articles = load_json('../datasets/pre_2021_articles.json')

# Generate question/answer pairs for each article (post-2024 set first, then pre-2021).
# Titles already present in qa_map are skipped, which is what makes resuming possible.
for articles in (post_2024_articles, pre_2021_articles):
    for article_title, article_text in articles.items():
        if article_title in qa_map:
            continue
        qa_list = generate_qa_pairs(article_text, client)
        qa_map[article_title] = qa_list
        if len(qa_map) % CHECKPOINT_EVERY == 0:
            print(len(qa_map))
            save_json(qa_map, CHECKPOINT_PATH)

# Final checkpoint so the tail of the run (fewer than CHECKPOINT_EVERY articles) is not lost.
save_json(qa_map, CHECKPOINT_PATH)

In [14]:
# Persist the article-title -> question/answer-pairs map built in the generation cell.
# NOTE(review): the articles are read from '../datasets/' and the resume checkpoint is
# looked up at 'qa_map.json', while this writes to 'datasets/qa_map.json' — confirm the
# intended output directory.
save_json(qa_map, 'datasets/qa_map.json')