In [1]:
import json, random
from openai import OpenAI
from dotenv import dotenv_values

from typing import List

In [5]:
# Settings returned by python-dotenv's dotenv_values()
# (presumably read from a local .env file — confirm where it lives).
env = dotenv_values()

# Both keys are required: a missing entry raises KeyError right here.
API_KEY = env['OPENAI_API_KEY']
MODEL = env['OPENAI_MODEL']

# Directory the summaries file is read from.
DATASET_DIR = "../../../data/dataset"
# Input file: document summaries, opened as DATASET_DIR/summaries_filename.
summaries_filename = "01_summaries.json"
# Output file: generated queries. NOTE: the save cell opens this name as-is,
# i.e. relative to the working directory, not under DATASET_DIR.
queries_filename = "02_queries.json"

In [162]:
# OpenAI API client shared by every cell below; authenticated with API_KEY.
client = OpenAI(api_key=API_KEY)


In [3]:
def get_value_from_dict(dict):
    """Return the value stored under the first key of ``dict``.

    Meant for dictionaries that hold a single key-value pair whose key name
    is not known in advance. If several keys are present, the value of the
    first one (in insertion order) is returned.

    Raises:
        IndexError: If ``dict`` is empty.
    """
    values = list(dict.values())
    return values[0]


# Pipeline


In [164]:
# Register the assistant that writes the search queries.
# response_format asks the API for a JSON object as the reply.
assistant = client.beta.assistants.create(
    model=MODEL,
    name="asst_bachelor_queries",
    temperature=0.7,
    response_format={"type": "json_object"},
    instructions=(
        "You are a helpful assistant that generates keyword-based, Google-like "
        "user search queries that retrieve a subset of a specified list of documents."
    ),
)

In [None]:
# Instruction text sent after the JSON dump of a batch in every request.
# Adjacent literals are concatenated by Python into one single-line string.
prompt = (
    "Generate 10 English user search queries that would retrieve a subset of "
    "an arbitrary size from the list of documents provided above. "
    "Each object contains a document id, title, and a short summary that "
    "describes its contents. "
    "Use advanced, domain-specific keywords directly from the document "
    "summaries to craft the queries. "
    "Return a JSON array of strings, where each string represents a single query. "
    "The queries should be 3 to 5 words long, keyword-based and should "
    "resemble Google search queries."
)

In [7]:
# Load summaries, leaving out those with None/null value.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the JSON.

with open(f"{DATASET_DIR}/{summaries_filename}", "r", encoding="utf-8") as f:
    summaries = json.load(f)

# Every record is expected to carry a "summary" key; a missing key raises KeyError.
summaries = [s for s in summaries if s["summary"] is not None]

In [167]:
def get_queries_for_batch(batch):
    """Ask the assistant for search queries covering one batch of summaries.

    A new thread is created for the batch. The batch (dumped as JSON) and the
    module-level ``prompt`` are posted as one user message, the assistant is
    run and polled until it finishes, and the newest message in the thread is
    parsed as JSON. The thread is deleted before returning, including when
    the run fails or the reply cannot be parsed.

    Args:
        batch: JSON-serializable list of document records to send.

    Returns:
        The list stored under the first key of the JSON object in the reply
        (expected to be a list of query strings).

    Raises:
        RuntimeError: If the run does not finish with status "completed", or
            the thread holds no message afterwards.
        json.JSONDecodeError: If the reply text is not valid JSON.
        ValueError: If the extracted value is not a list.
    """
    # Create a single thread for this entire step
    thread = client.beta.threads.create()

    try:
        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=[
                {
                    "type": "text",
                    "text": json.dumps(
                        batch,
                        indent=2,
                        ensure_ascii=False
                    )
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ],
            role="user"
        )

        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        print(run)

        if run.status != "completed":
            # One formatted message instead of a two-element args tuple.
            raise RuntimeError(f"Run failed: {run.status}")

        # Newest message first, so the first item is the assistant's reply.
        messages_cursor = client.beta.threads.messages.list(
            thread.id,
            limit=1,
            order="desc"
        )

        # Pull only the first item; building a full list from the cursor just
        # to read element 0 walks every message in the thread.
        message = next(iter(messages_cursor), None)
        if message is None:
            raise RuntimeError("Run completed but the thread contains no messages")

        response_content = message.content[0].text.value
        response_dict = json.loads(response_content)
        results: List[str] = get_value_from_dict(response_dict)

        # Callers extend a list with this value; a bare string would be
        # spliced in character by character, so reject anything but a list.
        if not isinstance(results, list):
            raise ValueError(
                f"Expected a list of queries, got {type(results).__name__}"
            )

        return results
    finally:
        # Always clean up the thread, even when the run or parsing failed.
        client.beta.threads.delete(thread.id)

In [None]:
# Generate queries for each batch of summaries,
# repeating the full pass NUM_ROUNDS times with a fresh shuffle each time.

BATCH_SIZE = 40  # number of summaries sent to the assistant per request
NUM_ROUNDS = 5   # number of shuffled passes over all summaries

queries = []

for _ in range(NUM_ROUNDS):
    # Shuffle a copy so `summaries` itself keeps its loaded order.
    shuffled_summaries = summaries.copy()
    random.shuffle(shuffled_summaries)

    # Consecutive slices of BATCH_SIZE items; the last batch may be shorter.
    batches = [shuffled_summaries[i:i + BATCH_SIZE]
               for i in range(0, len(shuffled_summaries), BATCH_SIZE)]

    for batch in batches:
        batch_queries = get_queries_for_batch(batch)
        queries.extend(batch_queries)

        print(json.dumps(
            batch_queries,
            indent=2,
            ensure_ascii=False
        ))

[{'id': 'a9c61094-1086-494f-a535-d786d08aea46', 'title': 'The Little Book of Semaphores', 'summary': 'The document explores classical synchronization problems in concurrent programming, focusing on the mechanisms that enable multiple threads to safely and efficiently share resources. It delves into several well-known problems such as the Dining Philosophers and Producer-Consumer scenarios. The Dining Philosophers problem illustrates the challenges of resource allocation among threads needing exclusive access to shared resources—in this case, forks required for eating. The document discusses potential solutions to avoid deadlock, such as limiting the number of philosophers at the table or altering the order in which they pick up forks. Similarly, the Producer-Consumer problem is analyzed, highlighting the necessity for synchronization to prevent race conditions, where consumers and producers interact with a shared buffer. The text explicates the use of semaphores as a synchronization to

In [169]:
# Save generated queries to a json file.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform's locale
# encoding (which may be unable to encode them).
# NOTE(review): the file is written relative to the working directory, not
# under DATASET_DIR — confirm this is intended.

with open(queries_filename, 'w', encoding='utf-8') as f:
    json.dump(queries, f, indent=2, ensure_ascii=False)

In [170]:
# Delete the assistant created earlier with client.beta.assistants.create
# and print the API's response object as confirmation.

res = client.beta.assistants.delete(assistant.id)
print(res)

AssistantDeleted(id='asst_SreO4Pk8aTxVftJmDMm2ybHM', deleted=True, object='assistant.deleted')
