In [3]:
import os
import json

import pandas as pd
import openai
import random
from tqdm import tqdm
from dotenv import load_dotenv
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()
# Load variables from a local .env file into the environment (read below via os.getenv).
load_dotenv()
# Fix the seed so the random chunk sampling further down is repeatable.
random.seed(42)

In [4]:
# Number of sentences grouped into one text chunk when a passage is split up.
CHUNK_SIZE = 10
# Number of multiple-choice questions requested from the model for each chunk.
NUM_QUESTIONS_PER_CHUNK = 4

In [5]:
# Azure OpenAI credentials and deployment details come from the environment
# (populated by load_dotenv()); any variable that is not set yields None.
OPENAI_API_KEY = os.environ.get("GPT4_OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("GPT4_OPENAI_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.environ.get("GPT4_DEPLOYMENT_NAME")
OPENAI_DEPLOYMENT_VERSION = os.environ.get("GPT4_DEPLOYMENT_VERSION")

# Module-level settings on the `openai` package. The requests further down go
# through an explicit AzureOpenAI client that receives its own arguments.
openai.api_type = 'azure'
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_version = OPENAI_DEPLOYMENT_VERSION

In [6]:
# Hard-coded deployment name; this replaces the value read from GPT4_DEPLOYMENT_NAME.
OPENAI_DEPLOYMENT_NAME = "GPT-4-1106-Preview"

In [7]:
# System prompt for question generation. A trailing backslash joins a source
# line to the next one, so it must be the LAST character on the line: a
# backslash followed by a space is not a continuation and would leave a stray
# "\ " plus a line break inside the prompt.
meta_prompt = """\
Context information is below.\n
Given the context information and no prior knowledge.\n
Generate only questions based on the below query.\n
The context of each question should be easily inferred by reading the question. If you are referring to a specific event, mention the exact name and date of the event.\n

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} multiple-choice questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Do not repeat the same question twice. Restrict the questions to the \
context information provided. Each question should have exactly four possible answers. Only one answer should be correct. \
Your reply must be a single number (0, 1, 2, or 3) related to the correct answer. \
Return a JSON formatted string with the following fields: \
question, possible_answers, correct_answer. \
"""

# User-message template: the passage to ask about, followed by the ids the
# model should use as keys for the generated questions.
context_prompt = """---------------------\n{context_str}\n---------------------\n\
Each question key should have and index value, given by the following list: {question_indices}.\
"""

# Hand-written sample used to try the prompts on a single request.
num_questions_per_chunk = 4
question_indices = ["Q_23", "Q_16254", "Q_224", "Q_3"]
context_str = "The 2023 NBA playoffs was the postseason tournament of the National Basketball Association's 2022–23 season. \
The playoffs began on April 15 and concluded on June 12 with the Denver Nuggets winning the 2023 NBA Finals."

message= [{"role": "system", "content": meta_prompt.format(num_questions_per_chunk=num_questions_per_chunk)},
          {"role": "user", "content": context_prompt.format(context_str=context_str, question_indices=question_indices)}]

In [6]:
from openai import AzureOpenAI

# Client used for all chat-completion requests below. The API version is
# pinned here rather than taken from OPENAI_DEPLOYMENT_VERSION.
client = AzureOpenAI(
    api_key=OPENAI_API_KEY,
    api_version="2023-12-01-preview",
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)

In [7]:
# Try the generation prompt on the single hand-written sample message,
# asking the API for a JSON-object reply.
response = client.chat.completions.create(
    messages=message,
    model=OPENAI_DEPLOYMENT_NAME,
    response_format={"type": "json_object"},
)

## Load full data

In [8]:
# Load the scraped passages (the first CSV column is the index) and replace
# missing values with the literal string "None", so the string concatenation
# that builds the prefix below never produces NaN.
wiki_sections = pd.read_csv("wiki_data/wiki_events_aug-nov_2023.csv", index_col=0).fillna("None")

# Keep only passages longer than 100 characters. The explicit .copy() makes
# the filtered frame independent of the unfiltered one, so adding a column to
# it does not trigger pandas' SettingWithCopyWarning.
is_long_enough = wiki_sections['text'].str.len() > 100
wiki_sections = wiki_sections.loc[is_long_enough].copy()

# Header naming the article / section / subsection a passage comes from; it is
# prepended to every chunk later on.
wiki_sections['prefix'] = ("### Article: " + wiki_sections['topic_name'] + ".\n" +
                           "### Section: " + wiki_sections['section'] + ".\n" +
                           "### Subsection: " + wiki_sections['subsection'] + ".\n" +
                           "### Text: ")

# Only the header and the passage text are needed downstream.
df = wiki_sections[['prefix', 'text']].reset_index(drop=True)
df

Unnamed: 0,prefix,text
0,### Article: 2023 Louisiana wildfires.\n### Se...,In a three-month period from August to October...
1,### Article: 2023 Louisiana wildfires.\n### Se...,"On August 22, a fire started in Beauregard Par..."
2,### Article: 2023 Louisiana wildfires.\n### Se...,"On August 24, a fire described as ""out of cont..."
3,### Article: 2023 Louisiana wildfires.\n### Se...,The Federal Emergency Management Agency approv...
4,### Article: United States abortion protests (...,A series of ongoing protests supporting aborti...
...,...,...
680,### Article: 2023 Virginia Senate election.\n#...,"Five incumbent senators, four Democrats and on..."
681,### Article: 2023 Virginia Senate election.\n#...,District 21: Won by State Delegate Angelia Wil...
682,### Article: 2023 Virginia Senate election.\n#...,District 1: Won by farmer Timmy French\nDistri...
683,### Article: 2023 Virginia Senate election.\n#...,"Incumbent Republican Jen Kiggans, first electe..."


In [9]:
def _split_into_chunks(text, chunk_size):
    """Split ``text`` into strings of at most ``chunk_size`` sentences each.

    Sentences are obtained with a plain split on "." (so abbreviations and
    decimal numbers are split as well); every piece is stripped and gets its
    period back before the pieces are re-joined with single spaces.

    Args:
        text: Passage to split.
        chunk_size: Maximum number of sentences per chunk.

    Returns:
        List of chunk strings; empty when ``text`` contains no non-blank
        sentence.
    """
    # Strip BEFORE filtering: a whitespace-only fragment (e.g. what follows
    # the final ". " of a passage) must be dropped, not turned into a lone "."
    # sentence.
    sentences = [piece.strip() + '.' for piece in text.split(".") if piece.strip()]
    return [' '.join(sentences[start:start + chunk_size])
            for start in range(0, len(sentences), chunk_size)]


df_dict = df.to_dict(orient='records')
corpus = []

for row in tqdm(df_dict):
    chunks = _split_into_chunks(row['text'], CHUNK_SIZE)

    # Keep at most 2 randomly chosen chunks per row of `df`
    if len(chunks) > 2:
        chunks = random.sample(chunks, 2)

    # Adding the prefix to each chunk
    chunks = [row['prefix'] + chunk for chunk in chunks]
    corpus.extend(chunks)

100%|██████████| 685/685 [00:00<00:00, 140851.96it/s]


## Generating the dataset

In [12]:
json_path = 'current_events_questions.json'
# Number of chunks processed successfully so far; question ids are derived
# from it, so ids stay contiguous even when a chunk fails.
count = 0
# Every question generated so far, keyed by question id. Kept in memory and
# written out after each chunk, so the file never has to be read back.
json_dict = {}

for chunk_idx, chunk in enumerate(tqdm(corpus)):
    first_id = count * NUM_QUESTIONS_PER_CHUNK
    question_indices = [f"Q_{i}" for i in range(first_id, first_id + NUM_QUESTIONS_PER_CHUNK)]
    message = [{"role": "system", "content": meta_prompt.format(num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK)},
               {"role": "user", "content": context_prompt.format(context_str=chunk, question_indices=question_indices)}]

    try:
        response = client.chat.completions.create(
            model=OPENAI_DEPLOYMENT_NAME,
            response_format={"type": "json_object"},
            messages=message)
        # Parsing is inside the try block as well: one malformed or
        # unexpectedly shaped reply should skip this chunk, not abort the run.
        python_dict = json.loads(response.choices[0].message.content)
        for question in python_dict.values():
            question['context'] = chunk
    except Exception as err:
        # `Exception` (not a bare except) so that Ctrl-C still stops the loop.
        print(f"Error in generating questions for chunk number: {chunk_idx}")
        print(f"Chunk: {chunk}")
        print(f"Reason: {err!r}")
        continue

    json_dict.update(python_dict)

    # Checkpoint after every successful chunk so an interrupted run keeps
    # what has been generated so far.
    if len(json_dict) > 0:
        with open(json_path, 'w') as json_file:
            json.dump(json_dict, json_file)

    count += 1

100%|██████████| 938/938 [26:02<00:00,  1.67s/it]


## Correcting the dataset
Keep only the most specific questions for each context (at most two per context).

In [8]:
# Read the questions JSON and transpose it so that each row is one question.
questions_dataset = pd.read_json('../../data/current_events_questions.json').transpose()

In [9]:
# Give every distinct context string an integer id, so questions that share a
# context can be selected together.
context_codes, _unique_contexts = pd.factorize(questions_dataset['context'])
questions_dataset['context_num'] = context_codes

In [17]:
# System prompt for the filtering pass: the model is asked to return up to two
# "specific" questions per context, unchanged. Note that this rebinds
# `meta_prompt`, the name the question-generation cells also use.
# A trailing backslash joins two source lines with nothing in between, so a
# space is kept before it wherever two sentences meet.
meta_prompt = """\
Context information is below.\n
A number of multiple choice questions are given for this context. Your job is to select the **2** most specific questions, and return them, as is (i.e., do not change their content or format), in their json format.\n
We define specific to mean that a student who knows the information included in the context should be able to answer the question without knowing which context the question refers to. Questions that are more knowledge intensive are better. \
If none are specific, return an empty dictionary.\n
If only **1** is specific, return only **1**. \
If more than **2** are specific, return only the top 2."""

# Few-shot example 1 (user turn): a context with plenty of information and the
# four questions generated for it.
user_example1 = """\
Context: "### Article: 2023 Louisiana wildfires.\\n### Section: Response.\\n### Subsection: None.\\n### Text: The Federal Emergency Management Agency approved the Fire Management Assistance Grant in Beauregard Parish. The American Red Cross assisted with relief efforts, and the United States Army Blackhawk helicopters assisted with firefighting operations. Assistance also came from Texas, Oklahoma, Arkansas, and Tennessee, and 161 bulldozers from the National Guard were deployed. A Boeing CH-47 Chinook was also deployed to help with efforts to drop water onto the fires; at least 161,000 gallons of water were dropped from the CH-47. Four strike teams were to arrive from Florida with 160 firefighters and 20 bulldozers. Merryville, Louisiana was under a mandatory evacuation order that has since been lifted. Emergency operations centers were activated on August 23, and a church was opened for a temporary shelter in DeRidder, Louisiana. On August 25, Texas governor Greg Abbott announced that firefighters and emergency personnel would be dispatched to Merryville, Louisiana to assist in firefighting efforts. More than 200 Louisiana National Guard troops were  dispatched to assist in firefighting efforts. Louisiana governor John Bel Edwards stated that \\"Nobody alive in Louisiana has ever seen these conditions." \n
Questions: '{"Q_16":{"question":"Which agency approved the Fire Management Assistance Grant for Beauregard Parish during the 2023 Louisiana wildfires?","possible_answers":["The American Red Cross","The United States Army","The Federal Emergency Management Agency","The Louisiana National Guard"],"correct_answer":"2"},"Q_17":{"question":"What type of aircraft was deployed to drop water onto the fires during the 2023 Louisiana wildfires?","possible_answers":["Boeing 747","Boeing CH-47 Chinook","Lockheed C-130 Hercules","Blackhawk helicopter"],"correct_answer":"1"},"Q_18":{"question":"How many gallons of water were dropped from the CH-47 during the firefighting efforts of the 2023 Louisiana wildfires?","possible_answers":["Approximately 161,000 gallons","Around 200,000 gallons","Nearly 100,000 gallons","Over 250,000 gallons"],"correct_answer":"0"},"Q_19":{"question":"Which governor announced the dispatch of firefighters and emergency personnel to Merryville, Louisiana on August 25 during the 2023 Louisiana wildfires?","possible_answers":["Governor of Arkansas","Governor of Texas","Governor of Tennessee","Governor of Louisiana"],"correct_answer":"1"}}'
"""


# Few-shot example 1 (assistant turn): two of the four questions (Q_16 and
# Q_19) returned unchanged.
# NOTE(review): the example answer is wrapped in single quotes, so as written
# it is not a bare JSON object -- confirm this is intended.
ai_answer_example_1 = """\
'{"Q_16":{"question":"Which agency approved the Fire Management Assistance Grant for Beauregard Parish during the 2023 Louisiana wildfires?","possible_answers":["The American Red Cross","The United States Army","The Federal Emergency Management Agency","The Louisiana National Guard"],"correct_answer":"2"}, "Q_19":{"question":"Which governor announced the dispatch of firefighters and emergency personnel to Merryville, Louisiana on August 25 during the 2023 Louisiana wildfires?","possible_answers":["Governor of Arkansas","Governor of Texas","Governor of Tennessee","Governor of Louisiana"],"correct_answer":"1"}}'
"""

# Few-shot example 2 (user turn): a context whose text is just "EDT. .", with
# the four questions generated for it.
user_example_2 = """\
Context: "### Article: August 2023 Ohio Issue 1.\\n### Section: Turnout and outcome.\\n### Subsection: None.\\n### Text: EDT. ." \n
Questions: '{"Q_164":{"question":"What is the timezone mentioned in the context for the event?","possible_answers":["PDT","EDT","CST","EST"],"correct_answer":"1"},"Q_165":{"question":"Which issue is the article related to?","possible_answers":["Ohio Issue 1","Ohio Issue 2","Ohio Issue 3","Ohio Issue 4"],"correct_answer":"0"},"Q_166":{"question":"In what month and year is the event taking place?","possible_answers":["July 2023","August 2023","September 2023","October 2023"],"correct_answer":"1"},"Q_167":{"question":"What is the main subject of the article?","possible_answers":["A festival","A state issue","A city council meeting","A sports event"],"correct_answer":"1"}}'
"""

# Few-shot example 2 (assistant turn): an empty dictionary, i.e. no question
# is kept for that context.
ai_answer_example_2 = "{}"

# Template for the real user turn; `context` and `questions` are filled in per
# request with str.format.
user_prompt = """\
Context: {context}\n
Questions: {questions} 
"""

# Sample context used to try the filtering prompt on a single request.
example_context = """\
"### Article: Federal prosecution of Donald Trump (election obstruction case).\\n### Section: Reactions.\\n### Subsection: Democrats in support of the indictment.\\n### Text: Senate Majority Leader Chuck Schumer in a joint statement with House Democratic Leader Hakeem Jeffries said \\"No one is above the law \\u2013 including Donald Trump\\". House members Nancy Pelosi, Joaquin Castro and Rashida Tlaib also came out in support of the indictment."
"""

# The four questions that go with the sample context.
example_questions = """\
'{"Q_256":{"question":"Who made the joint statement \'No one is above the law \\u2013 including Donald Trump\'?","possible_answers":["Nancy Pelosi and Joaquin Castro","Chuck Schumer and Hakeem Jeffries","Rashida Tlaib and Nancy Pelosi","Chuck Schumer and Nancy Pelosi"],"correct_answer":"1"},"Q_257":{"question":"Which position is currently held by Chuck Schumer?","possible_answers":["House Democratic Leader","Senate Majority Leader","Speaker of the House","House Majority Whip"],"correct_answer":"1"},"Q_258":{"question":"Which of the following individuals did not express support for the indictment?","possible_answers":["Joaquin Castro","Hakeem Jeffries","Rashida Tlaib","Nancy Pelosi"],"correct_answer":"1"},"Q_259":{"question":"How many House members were mentioned to support the indictment?","possible_answers":["One","Two","Three","Four"],"correct_answer":"2"}}'
"""

# Conversation sent to the model: system prompt, the two few-shot exchanges
# (user turn followed by the expected assistant turn), then the actual query.
message= [{"role": "system", "content": meta_prompt},
          {"role": "user", "content": user_example1}, {"role": "assistant", "content": ai_answer_example_1},
          {"role": "user", "content": user_example_2}, {"role": "assistant", "content": ai_answer_example_2},
          {"role": "user", "content": user_prompt.format(context=example_context, questions=example_questions)}]

In [18]:
from openai import AzureOpenAI

# Client used for the filtering requests below. The API version is pinned
# here rather than taken from OPENAI_DEPLOYMENT_VERSION.
client = AzureOpenAI(
    api_key=OPENAI_API_KEY,
    api_version="2023-12-01-preview",
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)

In [19]:
# Try the filtering prompt on the single sample message, asking the API for a
# JSON-object reply.
response = client.chat.completions.create(
    messages=message,
    model=OPENAI_DEPLOYMENT_NAME,
    response_format={"type": "json_object"},
)

In [39]:
# Questions kept by the model, keyed by question id, each with its context
# attached.
responses = {}

for ii in tqdm(range(questions_dataset.context_num.nunique())):
    try:
        # All questions that share context number `ii`.
        context_rows = questions_dataset[questions_dataset.context_num == ii]
        cur_context = context_rows['context'].iloc[0]
        # Serialise the questions without the context columns; the transpose
        # makes question ids the top-level JSON keys.
        cur_questions = context_rows.drop(columns=['context', 'context_num']).T.to_json()
        message = [{"role": "system", "content": meta_prompt},
                   {"role": "user", "content": user_example1}, {"role": "assistant", "content": ai_answer_example_1},
                   {"role": "user", "content": user_example_2}, {"role": "assistant", "content": ai_answer_example_2},
                   {"role": "user", "content": user_prompt.format(context=cur_context, questions=cur_questions)}]
        completion = client.chat.completions.create(
            model=OPENAI_DEPLOYMENT_NAME,
            response_format={"type": "json_object"},
            messages=message)
        # Parse the reply as JSON. eval() must not be used on model output: it
        # executes arbitrary expressions and fails on the JSON literals
        # true / false / null.
        selected = json.loads(completion.choices[0].message.content)
        for question in selected.values():
            question['context'] = cur_context
        responses.update(selected)
    except Exception as e:
        # Best effort: report the failing context and carry on with the next.
        print(e)
        print(f'Failed on context number {ii}')


100%|██████████| 749/749 [2:26:59<00:00, 11.78s/it]  


In [42]:
# `responses` maps question id -> question fields; transpose so that each
# question becomes one row.
updated_dataset = pd.DataFrame(responses).transpose()

In [45]:
# Display the questions that were kept.
updated_dataset

Unnamed: 0,question,possible_answers,correct_answer,context
Q_2,What was the name of the largest wildfire reco...,"[Beauregard Blaze, Merryville Inferno, Singer ...",3,### Article: 2023 Louisiana wildfires.\n### Se...
Q_3,Which Parish received an assistance grant from...,"[Caddo Parish, Beauregard Parish, Livingston P...",1,### Article: 2023 Louisiana wildfires.\n### Se...
Q_4,What was the cause of the 2023 Louisiana wildf...,"[Lightning, Arson, Electrical fault, Accidenta...",1,### Article: 2023 Louisiana wildfires.\n### Se...
Q_5,On what date was the Tiger Island fire in 2023...,"[August 22, August 25, August 27, August 29]",2,### Article: 2023 Louisiana wildfires.\n### Se...
Q_8,What percentage of the Tiger Island fire was c...,"[45%, 55%, 65%, 75%]",2,### Article: 2023 Louisiana wildfires.\n### Se...
...,...,...,...,...
Q_3741,Which newly created district in the 2023 Virgi...,"[District 1, District 3, District 10, District...",1,### Article: 2023 Virginia Senate election.\n#...
Q_3744,In which year was Jen Kiggans first elected to...,"[2017, 2018, 2019, 2020]",2,### Article: 2023 Virginia Senate election.\n#...
Q_3745,What position did Jen Kiggans resign from to b...,"[Virginia State Senator, Virginia Governor, U....",0,### Article: 2023 Virginia Senate election.\n#...
Q_3748,In what year was Jennifer McClellan first elec...,"[2015, 2016, 2017, 2018]",2,### Article: 2023 Virginia Senate election.\n#...


In [46]:
def clean_correct_answers(x):
    """Convert a raw ``correct_answer`` value to ``int``.

    Args:
        x: Value to convert, e.g. ``"2"`` or ``2``.

    Returns:
        ``int(x)``, or ``None`` when ``x`` cannot be converted (for example
        ``None``, NaN, infinity or non-numeric text).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unusable answer". Anything else
        # (e.g. KeyboardInterrupt) must propagate, which a bare `except:`
        # would prevent.
        return None
# Convert every answer to an int (unconvertible ones become missing), then
# discard all rows that have a missing value in any column.
updated_dataset['correct_answer'] = updated_dataset['correct_answer'].apply(clean_correct_answers)
updated_dataset = updated_dataset.dropna()

In [57]:
# Crude heuristic for "this context carries no information": keep only
# questions whose context has more than 100 whitespace-separated words.
is_informative = updated_dataset['context'].apply(lambda passage: len(passage.split()) > 100)

# The question ids move from the index into `orig_question_num`, and `context`
# is renamed to `text` -- so this cell cannot be re-run on its own output.
updated_dataset = (
    updated_dataset[is_informative]
    .reset_index()
    .rename(columns={'index': 'orig_question_num', 'context': 'text'})
)

In [58]:
# Write the final question set to CSV; the positional index is not written
# because the original question ids are stored in `orig_question_num`.
updated_dataset.to_csv('../../data/current_events_questions_updated.csv', index=False)