# My own attempt at parallelizing OpenAI API requests

Found this after I made an earlier version of this: https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py 

It requires data in JSONL format, which is inconvenient at this time. I might move to it in the future to take advantage of being able to explicitly set rate limits and abide by those rules. In this implementation, I'm eyeballing the rate/token limits, trying to operate at 50-75% of each limit so as not to get messed-up results... ymmv

In [None]:
from bs4 import BeautifulSoup
import re
import os
import openai
import pickle
import time
import json
import random
from itertools import chain
import concurrent.futures

test = False  # sample or run on full dataset?
sample_size = 4  # number of pages to sample; only used if test=True

# NOTE(review): unpickling can execute arbitrary code. Only load a
# scraper_output.p that was produced by a trusted, local scraper run.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('data/scraper_output.p', 'rb') as scraper_file:
    # The name "scape_output" is kept as-is because later cells reference it.
    scape_output = pickle.load(scraper_file)
openai.api_key = os.getenv("OPENAI_API_KEY")

if test:
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available pages.
    keys = random.sample(list(scape_output.keys()), min(sample_size, len(scape_output)))
    scape_output = {key: scape_output[key] for key in keys}

In [3]:
def process_item(key, value, question_count):
    """Ask the chat model to turn one scraped page into question/answer text.

    Args:
        key: Identifier of the page; echoed in the progress message and
            returned unchanged.
        value: Object whose ``content`` attribute holds the page HTML
            (presumably a ``requests`` response -- confirm against the scraper).
        question_count: Upper bound on the number of questions requested in
            the prompt.

    Returns:
        A ``(key, qa)`` tuple, where ``qa`` is the message text of the first
        completion choice.
    """
    # The timer starts before the HTML is parsed, so the reported duration
    # covers both the clean-up and the API round trip.
    began = time.time()

    # Drop the markup, trim the ends, then collapse runs of newlines.
    page_text = BeautifulSoup(value.content, "html.parser").text
    cleaned = re.sub('[\n]+', '\n', page_text.strip())

    prompt = f"""
    Based on the cleaned HTML given below, generate as many questions possible with their answers.
    Try to make the questions relevant from the perspective of a prospective or current student, as well as faculty and staff.
    Format your responce in JSON, with the "instruction" field containing the question, an empty "input" field, and the answer in the "output" field.
    Include up to {question_count} questions, each being a sentence or two long in length. Do not include question number.
    Keep answers somewhat brief, but be enthusiastic in your response!\n\n"""

    chat_messages = [
        {"role": "system", "content": "You are a helpful question answer generator."},
        {"role": "user", "content": f"{prompt}\n This is the cleaned HTML: \n{cleaned}\n Start:."},
    ]
    reply = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=chat_messages,
    )

    # Wall-clock seconds elapsed since the timer was started.
    response_time = time.time() - began

    generated_text = reply.choices[0].message.content
    tokens = reply.usage["total_tokens"]

    print(f"Success! Complete in {response_time:.2f}s with {tokens} tokens for {key}")

    return key, generated_text

In [5]:
# Raw model replies, stored under the key each worker returns.
gpt_output = {}

# Each task spends most of its time waiting on the API, so a thread pool
# lets those waits overlap.
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    # Map every submitted future back to its input key for error reporting.
    future_to_item = {}
    for key, value in scape_output.items():
        pending = executor.submit(process_item, key, value, 25)
        future_to_item[pending] = key

    # Handle results in completion order rather than submission order.
    for future in concurrent.futures.as_completed(future_to_item):
        key = future_to_item[future]
        try:
            returned_key, answer = future.result()
            gpt_output[returned_key] = answer
        except Exception as exc:
            # Best effort: report the failing item and keep going.
            print('%r generated an exception: %s' % (key, exc))

Success! Complete in 31.36s with 2572 tokens for https://www2.brockport.edu/support/administration-finance/enterprise-risk-management
Success! Complete in 33.47s with 2456 tokens for https://www2.brockport.edu/support/parking/campus-parking/faculty
Success! Complete in 35.10s with 1677 tokens for https://www2.brockport.edu/about/contact_us
Success! Complete in 35.40s with 2366 tokens for https://www2.brockport.edu/live/profiles/5407-student-injury-policy
Success! Complete in 35.66s with 2555 tokens for https://www2.brockport.edu/academics/school-business-management/?program=marketing-major-minor
Success! Complete in 36.46s with 3198 tokens for https://www2.brockport.edu/academics/neuroscience/directory
Success! Complete in 39.28s with 4097 tokens for https://www2.brockport.edu/live/blurbs/2080-adirondack-cc-course-equivalencies
Success! Complete in 41.34s with 2160 tokens for https://www2.brockport.edu/academics/computing-sciences/careers-computing
Success! Complete in 41.94s with 2276

In [11]:
# Persist the raw GPT replies. The context manager guarantees the file is
# flushed and closed, even if pickling raises part-way through.
with open('data/gpt_output.p', 'wb') as gpt_file:
    pickle.dump(gpt_output, gpt_file)

# Parse GPT output to JSON

While the prompt above, which translates clean HTML to question/answer format, does specify JSON output, GPT-3.5 does not always produce it perfectly. However, it always gets close. Instead of trying to fix the JSON output from GPT, in this step I'm using regex to parse out all the instructions (questions) and outputs (answers). These are returned as a Python list of dictionaries, which is appended for each webpage. Eventually I shuffle this so the questions are all mixed up instead of grouped by webpage, and dump it to a JSON file.

For any questions which seem off, investigate the original webpage. I've left gpt_output as a python dictionary specifically for this reason, so that we can always refer back to the data and know exactly where it came from.

In [None]:
# Regular expression for an "instruction"/"output" pair inside (possibly
# malformed) JSON text. Each captured group consumes either a backslash
# escape (such as \") as one unit or any single non-backslash character, so
# an escaped quote inside a value does not end the match early. For values
# without backslashes this matches exactly like a lazy ``.*?``.
pattern = r'"instruction":\s*"((?:\\.|[^\\])*?)",.*?"output":\s*"((?:\\.|[^\\])*?)"'


def _decode_json_string(raw):
    """Return *raw*, the body of a JSON string literal, with escapes decoded.

    If *raw* is not a valid JSON string body (for example it contains an
    unescaped quote or an unknown escape), it is returned unchanged so that
    imperfect model output is kept rather than dropped.
    """
    try:
        # strict=False tolerates literal control characters such as raw
        # newlines inside the string.
        return json.loads('"' + raw + '"', strict=False)
    except json.JSONDecodeError:
        return raw


def extract_data(s: str) -> list:
    """Extract question/answer records from one raw GPT reply.

    Args:
        s: Text that contains ``"instruction": "..."`` and ``"output": "..."``
            pairs; it does not have to be valid JSON as a whole.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` (always
        an empty string) and ``"output"``, in the order the pairs occur in
        *s*. The list is empty when nothing matches.
    """
    # DOTALL lets a value, and the gap between the two fields, span lines.
    matches = re.findall(pattern, s, flags=re.DOTALL)
    return [
        {
            "instruction": _decode_json_string(question),
            "input": "",
            "output": _decode_json_string(answer),
        }
        for question, answer in matches
    ]

# Parse every page's raw GPT reply and flatten the per-page lists into one.
jsonqa = list(
    chain.from_iterable(extract_data(value) for value in gpt_output.values())
)

# Mix the records so they are no longer grouped by webpage.
random.shuffle(jsonqa)

# Write the entire list to the JSON file in a single dump.
with open('data/gpt_output_json.json', 'w') as out_file:
    json.dump(jsonqa, out_file, indent=4)