In [1]:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/assistant


from openai import AzureOpenAI
import os
from dotenv import load_dotenv
env_path = os.path.join(".venv/.env")
load_dotenv(dotenv_path=env_path)


True

In [2]:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-05-01-preview",
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    )

In [3]:
# Create a vector store called "Financial Statements"

# need to check whether that exists first

vector_store = client.beta.vector_stores.create(name="CMA_merger")
 

In [4]:
# Ready the files for upload to OpenAI
file_paths = [r"data\2024_00020.00002.pdf"]
file_streams = [open(path, "rb") for path in file_paths]

In [5]:
# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store.id, files=file_streams
)

In [6]:
# You can print the status and the file counts of the batch to see the result of this operation.
print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


In [7]:
# If we need to update assistant after setting up the vector store

# assistant = client.beta.assistants.create(
#   name="Financial Analyst Assistant",
#   instructions="You are a UK CMA merger specialist and very competent in fields of Antitrust, Economic theories of mergers (e.g., theories of harm). Use your knowledge base to answer questions about CMA merger decisions.",
#   model="document_summarisation",
#   tools=[{"type": "file_search"}],
#   temperature=0
# )


# # To make the files accessible to your assistant, update the assistant’s tool_resources with the new vector_store ID.
# assistant = client.beta.assistants.update(
#   assistant_id=assistant.id,
#   tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
# )


# if we set up the assistant after setting up the vector store (all in one go)

assistant = client.beta.assistants.create(
  name="Financial Analyst Assistant",
  instructions="You are a UK CMA merger specialist and very competent in fields of Antitrust, Economic theories of mergers (e.g., theories of harm). Use your knowledge base to answer questions about CMA merger decisions.",
  model="document_summarisation",
  tools=[{"type": "file_search"}],
  temperature=0,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}}
)

# alternatively can attach the pdf as an attachment in the thread, for example

# # Upload the user provided file to OpenAI
# message_file = client.files.create(
#   file=open(r"data\2024_00020.00002.pdf", "rb"), purpose="assistants"
# )
 

In [8]:
# https://platform.openai.com/docs/assistants/quickstart


# Start a thread
# https://platform.openai.com/docs/api-reference/vector-stores-files/file-object
thread = thread = client.beta.threads.create()
print(f"Thread ID: {thread.id}")

Thread ID: thread_hhzAAOd3PZgO3pCOcd2YNrIA


In [9]:
message = client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
content="What is this document about, provide an executive summary of no longer than 200 words."
)

In [41]:
# check message is associated with thread correctly

# thread_messages = client.beta.threads.messages.list(thread.id)
# print(thread_messages.model_dump_json(indent=2))

In [10]:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/assistants-reference-runs?tabs=python

# run thread

run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

In [13]:
# retrieve thread status

# Retrieve the status of the run
run = client.beta.threads.runs.retrieve(
  thread_id=thread.id,
  run_id=run.id
)

status = run.status
print(status)

completed


In [14]:
# List thread messages post run

messages_ = client.beta.threads.messages.list(
  thread_id=thread.id
)

In [25]:
all_messages = [i.content[0].text.value for i in messages_.data]

break_line = '-'*30 + '\n\n'

print('Original question:\n'+ break_line + all_messages[-1])

print( '\n\n' + '=' * 30 + '\n\n')

print('Response:\n'+ break_line  +''.join(all_messages[:-1]))

Original question:
------------------------------

What is this document about, provide an executive summary of no longer than 200 words.




Response:
------------------------------

The document is a decision by the Competition and Markets Authority (CMA) regarding the anticipated acquisition of PGS ASA by TGS ASA. The CMA concluded that the merger does not give rise to a realistic prospect of a substantial lessening of competition (SLC) within any market in the United Kingdom. 

TGS and PGS are global suppliers of scientific data and intelligence to the energy sector, particularly in marine seismic data used for oil and gas exploration, carbon capture and storage (CCS), and offshore wind infrastructure. The merger was reviewed because both companies have a significant presence in the UK market, with a combined share of supply exceeding 25% in the multiclient marine seismic data sector.

The CMA's assessment focused on three main theories of harm: horizontal unilateral effects in the

In [27]:
# ask a follow-up question

# Add a new user question to the thread
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="What are some key parties involved and their respective market shares?"
)

In [28]:
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  #instructions="New instructions" #You can optionally provide new instructions  but these will override the default instructions
)

# Retrieve the status of the run
run = client.beta.threads.runs.retrieve(
  thread_id=thread.id,
  run_id=run.id
)

status = run.status
print(status)

queued


In [52]:
# Retrieve the status of the run
run = client.beta.threads.runs.retrieve(
  thread_id=thread.id,
  run_id=run.id
)

status = run.status
print(status)

completed


### Looks like we have some memory preserved but not managed well yet, and looks like it can retrieve table based market share

Note that the original quesiton still uses the start message when creating the thread, and some response for earlier question was not discarded

In [56]:
messages_output_2 = client.beta.threads.messages.list(
  thread_id=thread.id
)


all_messages_2 = [i.content[0].text.value for i in messages_output_2.data]

break_line = '-'*30 + '\n\n'

print('Original question:\n'+ break_line + all_messages_2[-1])

print( '\n\n' + '=' * 30 + '\n\n')

print('Response:\n'+ break_line  +''.join(all_messages_2[:-1]))

Original question:
------------------------------

What is this document about, provide an executive summary of no longer than 200 words.




Response:
------------------------------

The key parties involved in the merger are TGS ASA and PGS ASA. Their respective market shares in the supply of new streamer multiclient marine seismic data from 2020 to 2023 are as follows:

- **TGS ASA**: 
  - North Sea Area: 2020 ([5-10]%), 2021 ([10-20]%), 2022 ([10-20]%), 2023 ([0-5]%)
  - Global: 2020 ([20-30]%), 2021 ([20-30]%), 2022 ([20-30]%), 2023 ([10-20]%)

- **PGS ASA**: 
  - North Sea Area: 2020 ([30-40]%), 2021 ([20-30]%), 2022 ([5-10]%), 2023 ([40-50]%)
  - Global: 2020 ([20-30]%), 2021 ([20-30]%), 2022 ([20-30]%), 2023 ([20-30]%)

- **Combined Entity**: 
  - North Sea Area: 2020 ([40-50]%), 2021 ([30-40]%), 2022 ([20-30]%), 2023 ([40-50]%)
  - Global: 2020 ([40-50]%), 2021 ([50-60]%), 2022 ([50-60]%), 2023 ([40-50]%)

Other significant competitors in the market include:

- **CGG**: 
  - N

## Drafts/ failed runs below -> to further explore

### Run the assistant

In [31]:
# https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python

from typing_extensions import override
from openai import AssistantEventHandler#, OpenAI

# First, we create a EventHandler class to define
# how we want to handle the events in the response stream.


 
class EventHandler(AssistantEventHandler):
    @override
    def on_text_created(self, text) -> None:
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        # print a citation to the file searched
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))



In [32]:
# https://platform.openai.com/docs/assistants/deep-dive [Context window management]

# When using the File Search tool, we recommend setting the max_prompt_tokens to no less than 20,000. 
# For longer conversations or multiple interactions with File Search, consider increasing this limit to 50,000, or ideally, 
# removing the max_prompt_tokens limits altogether to get the highest quality results.




# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.

with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    instructions="Please address the user as Li. The user has a premium account.",
    event_handler=EventHandler(),
) as stream:
    stream.until_done()


assistant > file_search


assistant > The document is a decision by the Competition and Markets Authority (CMA) regarding the anticipated acquisition of PGS ASA by TGS ASA. The CMA concluded that the merger does not give rise to a realistic prospect of a substantial lessening of competition (SLC) within any market in the United Kingdom. 

TGS and PGS are global suppliers of scientific data and intelligence to companies in the energy sector, particularly in marine seismic data used for oil and gas exploration, carbon capture and storage (CCS), and offshore wind infrastructure. The CMA reviewed the merger to ensure it would not harm competition, given that both companies are significant players in the market.

The CMA's assessment focused on three main theories of harm: 
1. Horizontal unilateral effects in the supply of new streamer multiclient marine seismic data in the North Sea Area.
2. Horizontal unilateral effects in the supply of proprietary marine seismic data in the North Sea Ar

In [37]:
# ask follow-up questions

follow_up_response = client.beta.threads.messages.create(
    thread_id=thread.id,
    role = 'user',
    content="What are the key parties mentioned in the document and their market shares"
)

# Print the assistant's response to the follow-up question
print(follow_up_response.content)

[TextContentBlock(text=Text(annotations=[], value='What are the key parties mentioned in the document and their market shares'), type='text')]


In [38]:
follow_up_response

Message(id='msg_qqDzLchK484RRydHAmAluKK8', assistant_id=None, attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='What are the key parties mentioned in the document and their market shares'), type='text')], created_at=1735927691, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='user', run_id=None, status=None, thread_id='thread_fKjequmTOYrovEFUzfxh7JnT')

In [None]:
response = client.beta.assistants.messages.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
    content="What are the key decisions in the CMA merger document?"
)

In [None]:

# Create a thread and attach the file to the message
thread = client.beta.threads.create(
  messages=[
    {
      "role": "user",
      "content": "How many company shares were outstanding last quarter?",
      # Attach the new file to the message.
      "attachments": [
        { "file_id": message_file.id, "tools": [{"type": "file_search"}] }
      ],
    }
  ]
)
 
# The thread now has a vector store with that file in its tool resources.
print(thread.tool_resources.file_search)
