# Pipeline

1) Use Document Intelligence to parse the layout of the relevant PDF

2) Use Python to heuristically extract the chunk with information relevant to the metrics of interest

3) Define a prompt template for metrics of interest

4) Pass prompt + text chunk through LLM 

5) Retrieve and assess completion

# Document intelligence

In [4]:
"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""
import json
import os

import numpy as np
import openai
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

In [5]:
# Credentials are read from environment variables so that no secret is stored
# in the notebook. Remember to remove the key from your code when you're done,
# and never post it publicly. For production, use secure methods to store and
# access your credentials. For more information, see
# https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
#
# Both lookups yield None when the variable is not set.
endpoint = os.environ.get("YOUR_FORM_RECOGNIZER_ENDPOINT")
key = os.environ.get("FORM_RECOGNIZER_KEY")

In [6]:
# Sample document (public URL) that can be used instead of a local file:
# formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

# Build the Form Recognizer client from the endpoint and key read from the
# environment; the key is wrapped in an AzureKeyCredential.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [7]:
# Alternative for a document reachable by URL:
#poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)

# Run the "prebuilt-layout" model on the local 10-Q PDF.
# The file is opened in a `with` block so the handle is always closed, even if
# the analysis raises; the block stays open until the poller has returned the
# final result.
with open("10qs/Q/q_baml_2023q2.pdf", 'rb') as form:
    poller = document_analysis_client.begin_analyze_document("prebuilt-layout", form)
    result = poller.result()

# for idx, style in enumerate(result.styles):
#     print(
#         "Document contains {} content".format(
#          "handwritten" if style.is_handwritten else "no handwritten"
#         )
#     )

# for page in result.pages:
#     for line_idx, line in enumerate(page.lines):
#         print(
#          "...Line # {} has text content '{}'".format(
#         line_idx,
#         line.content.encode("utf-8")
#         )
#     )

#     for selection_mark in page.selection_marks:
#         print(
#          "...Selection mark is '{}' and has a confidence of {}".format(
#          selection_mark.state,
#          selection_mark.confidence
#          )
#     )

# for table_idx, table in enumerate(result.tables):
#     print(
#         "Table # {} has {} rows and {} columns".format(
#         table_idx, table.row_count, table.column_count
#         )
#     )
        
#     for cell in table.cells:
#         print(
#             "...Cell[{}][{}] has content '{}'".format(
#             cell.row_index,
#             cell.column_index,
#             cell.content.encode("utf-8"),
#             )
#         )

# print("----------------------------------------")



In [8]:
# Convert the analysis result to a dict (JSON-like format) so it can be indexed by key
result_dict = result.to_dict()

# find_word = 'Derivative Notionals'
# # find page with Derivatives otional table
# for i in range(0, len(result_dict['paragraphs'])):
#     para = result_dict['paragraphs'][i]
#     if (para['role']=='sectionHeading') & (para['content']==find_word):
#         page = para['bounding_regions'][0]['page_number']
# page

# Extract content

In [9]:
# Flatten every paragraph of the analysis result into one DataFrame row:
#   idx     - position of the paragraph in result_dict["paragraphs"]
#   role    - value of the paragraph's 'role' key
#   content - paragraph text
#   page    - page number of the paragraph's first bounding region
idx_list = list(range(len(result_dict["paragraphs"])))
role_list = [entry['role'] for entry in result_dict["paragraphs"]]
content_list = [entry['content'] for entry in result_dict["paragraphs"]]
page_list = [
    entry['bounding_regions'][0]['page_number']
    for entry in result_dict["paragraphs"]
]

df_paragraph = pd.DataFrame(
    {
        'idx': idx_list,
        'role': role_list,
        'content': content_list,
        'page': page_list,
    }
)
df_paragraph

Unnamed: 0,idx,role,content,page
0,0,title,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,1
1,1,,(Mark One),1
2,2,,:selected:,1
3,3,,QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(...,1
4,4,,EXCHANGE ACT OF 1934,1
...,...,...,...,...
15767,15767,,"2. I hereby certify, pursuant to 18 U.S.C. Sec...",114
15768,15768,,• the Quarterly Report on Form 10-Q of the reg...,114
15769,15769,,• the information contained in the periodic re...,114
15770,15770,,"Date: July 31, 2023",114


In [10]:
def topic_content(topic, paragraphs=None):
    """Return the body paragraphs that follow a given section heading.

    The section starts at the first row whose ``role`` is ``'sectionHeading'``
    and whose ``content`` equals ``topic`` exactly, and ends just before the
    next ``'sectionHeading'`` row. If there is no later heading, the section
    runs to the end of the frame. Only rows whose ``role`` is missing (NaN/None)
    are returned; rows with any other role inside the section are skipped.

    Args:
        topic: Exact text of the section heading to look up.
        paragraphs: DataFrame with at least the columns ``idx``, ``role`` and
            ``content``. Defaults to the notebook-level ``df_paragraph``.

    Returns:
        list: The ``content`` values of the matching rows, in frame order.

    Raises:
        IndexError: If no ``'sectionHeading'`` row has ``content == topic``.
    """
    if paragraphs is None:
        paragraphs = df_paragraph

    # Computed once; the original repeated this identical lookup for every row.
    is_heading = paragraphs['role'] == 'sectionHeading'

    heading_idx = paragraphs.loc[is_heading & (paragraphs['content'] == topic), 'idx']
    if heading_idx.empty:
        raise IndexError(f"no sectionHeading paragraph with content {topic!r}")
    start_idx = heading_idx.iloc[0]

    # Rows without a role that come after the heading ...
    in_section = paragraphs['role'].isna() & paragraphs['idx'].gt(start_idx)

    # ... and before the next heading, when there is one. The last section of
    # the document has no following heading and simply runs to the end.
    later_headings = paragraphs.loc[is_heading & paragraphs['idx'].gt(start_idx), 'idx']
    if not later_headings.empty:
        in_section &= paragraphs['idx'].lt(later_headings.iloc[0])

    return paragraphs.loc[in_section, 'content'].tolist()

In [11]:
# Section heading to extract. topic_content compares it with `==` against the
# content of 'sectionHeading' paragraphs, so it must match the heading exactly.
topic = 'Capital Management'
# List of paragraph strings found under that heading
asked_topic = topic_content(topic)
asked_topic

['The Board of Governors of the Federal Reserve System (Federal Reserve) requires BHCs to submit a capital plan and planned capital actions on an annual basis, consistent with the rules governing the Comprehensive Capital Analysis and Review (CCAR) capital plan. On July 27, 2023, the Federal Reserve released final 2023 CCAR supervisory stress test results for Bank of America. Based on the results, our stress capital buffer (SCB) will be 2.5 percent, 90 basis points (bps) lower than the current level of 3.4 percent, and our Common equity tier 1 (CET1) minimum requirement will decline to 9.5 percent effective October 1, 2023. Beginning January 1, 2024, we expect our minimum CET1 requirement to increase 50 bps, aligned with planned growth in the global systemically important bank (G-SIB) surcharge.',
 'On July 27, 2023, U.S. banking regulators issued proposed rules that would update future U.S. regulatory capital requirements, including the calculation of risk-weighted assets and the G-SI

# Put into prompt

### GPT4

In [12]:
# Note: The openai-python library support for Azure OpenAI is in preview.
# Note: This code sample requires OpenAI Python library version 0.28.1 or lower.
# `os` and `openai` are imported in the notebook's import cell at the top.

# Point the openai module at the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
# The key is read from the environment; os.getenv returns None when
# OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [20]:
# Chat messages for the request: a system prompt describing the JSON fields to
# extract (CET1 ratio values, direction, driver) and a user message carrying
# the string form of the extracted paragraph list.
message_text = [
    dict(
        role="system",
        content="""You are a financial analysis AI assistant that helps people find information. 
           You will receive a text extracted from a firm's quarterly report and retrieve some metricts from it.
           You will structure your answer in a JSON format with the following fields:
           Current value: the value of the metrics CET1 ratio for the latest period.
           Previous value: the value of the metrics CET1 ratio for the previous period.
           Direction: the direction of the change either up, down or no change.
           Driver: a one sentence summary of the reason for the change, if not found fill with "None".
           """,
    ),
    dict(role="user", content=str(asked_topic)),
]

# Send the messages to the "10qs_poc" deployment.
completion = openai.ChatCompletion.create(
    engine="10qs_poc",
    messages=message_text,
    temperature=0.7,
    top_p=0.95,
    max_tokens=464,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)

In [21]:
# Text of the first choice returned by the chat completion.
answer = completion.choices[0].message.content.strip()

# The reply may be wrapped in a Markdown code fence (```json ... ```).
# str.strip("```json\n") is not a prefix/suffix removal: it strips any of the
# characters `, j, s, o, n and newline from both ends, which only works by
# accident when the payload starts with "{" and ends with "}". Remove the
# fence explicitly instead.
if answer.startswith("```"):
    answer = answer[3:]
    if answer.endswith("```"):
        answer = answer[:-3]
    if answer.startswith("json"):
        answer = answer[4:]

# json.loads tolerates the whitespace left around the payload.
# NOTE(review): `result` is also the name of the Document Intelligence analysis
# result created earlier in the notebook; this assignment replaces it, so the
# `result.to_dict()` cell cannot be re-run afterwards without re-analysing.
result = json.loads(answer)
result

{'Current value': '9.5',
 'Previous value': '10.4',
 'Direction': 'down',
 'Driver': "The reduction is due to the Federal Reserve's final 2023 CCAR supervisory stress test results lowering the stress capital buffer requirement."}

### GPT3.5

In [15]:
# Same messages as the GPT-4 request: a system prompt describing the JSON
# fields to extract and a user message carrying the string form of the
# extracted paragraph list.
message_text = [
    dict(
        role="system",
        content="""You are a financial analysis AI assistant that helps people find information. 
           You will receive a text extracted from a firm's quarterly report and retrieve some metricts from it.
           You will structure your answer in a JSON format with the following fields:
           Current value: the value of the metrics CET1 ratio for the latest period.
           Previous value: the value of the metrics CET1 ratio for the previous period.
           Direction: the direction of the change either up, down or no change.
           Driver: a one sentence summary of the reason for the change, if not found fill with "None".
           """,
    ),
    dict(role="user", content=str(asked_topic)),
]

# Send the messages to the "10qs_poc_gpt3" deployment.
completion = openai.ChatCompletion.create(
    engine="10qs_poc_gpt3",
    messages=message_text,
    temperature=0.7,
    top_p=0.95,
    max_tokens=464,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)

In [16]:
# Text of the first choice returned by the chat completion.
answer = completion.choices[0].message.content.strip()

# The reply may be wrapped in a Markdown code fence (```json ... ```).
# str.strip("```json\n") is not a prefix/suffix removal: it strips any of the
# characters `, j, s, o, n and newline from both ends, which only works by
# accident when the payload starts with "{" and ends with "}". Remove the
# fence explicitly instead.
if answer.startswith("```"):
    answer = answer[3:]
    if answer.endswith("```"):
        answer = answer[:-3]
    if answer.startswith("json"):
        answer = answer[4:]

# json.loads tolerates the whitespace left around the payload.
# NOTE(review): `result` is also the name of the Document Intelligence analysis
# result created earlier in the notebook; this assignment replaces it, so the
# `result.to_dict()` cell cannot be re-run afterwards without re-analysing.
result = json.loads(answer)
result

{'Current value': '3.4 percent',
 'Previous value': 'None',
 'Direction': 'Down',
 'Driver': 'The stress capital buffer will decrease to 2.5 percent, resulting in a decline in the Common equity tier 1 (CET1) minimum requirement to 9.5 percent.'}

In [17]:

# Show the extracted paragraphs again (the text sent as the user message above)
asked_topic

['The Board of Governors of the Federal Reserve System (Federal Reserve) requires BHCs to submit a capital plan and planned capital actions on an annual basis, consistent with the rules governing the Comprehensive Capital Analysis and Review (CCAR) capital plan. On July 27, 2023, the Federal Reserve released final 2023 CCAR supervisory stress test results for Bank of America. Based on the results, our stress capital buffer (SCB) will be 2.5 percent, 90 basis points (bps) lower than the current level of 3.4 percent, and our Common equity tier 1 (CET1) minimum requirement will decline to 9.5 percent effective October 1, 2023. Beginning January 1, 2024, we expect our minimum CET1 requirement to increase 50 bps, aligned with planned growth in the global systemically important bank (G-SIB) surcharge.',
 'On July 27, 2023, U.S. banking regulators issued proposed rules that would update future U.S. regulatory capital requirements, including the calculation of risk-weighted assets and the G-SI