## Housekeeping

In [7]:
import openai
import azure
import os
import PyPDF2
import json 
import re

In [8]:
# Point the openai client at the Azure OpenAI resource used by this notebook.
# The API key is read from the environment so that it never appears in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_version = "2023-07-01-preview"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_type = "azure"

## Document Intelligence functions

In [9]:
"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
# Form Recognizer endpoint and key are both taken from environment variables;
# each is None when the corresponding variable is not set.
endpoint, key = (
    os.environ.get("YOUR_FORM_RECOGNIZER_ENDPOINT"),
    os.environ.get("FORM_RECOGNIZER_KEY"),
)


In [10]:
# Load pdf
# Open the Citi 2022 Q3 10-Q in binary mode. The handle is deliberately left open:
# the following cell hands it to the Form Recognizer client.
# The path uses forward slashes so it resolves on every OS and does not rely on
# the invalid "\Q" / "\q" escape sequences of the backslash spelling.
citi = open('10qs/Q/q_citi_2022q3.pdf', 'rb')


In [11]:
# Send the open PDF handle to the "prebuilt-layout" model and keep the analysis result.
form = citi

form_recognizer_credential = AzureKeyCredential(key)
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=form_recognizer_credential,
)

# begin_analyze_document hands back a poller; result() is called on it to obtain the analysis.
poller = document_analysis_client.begin_analyze_document("prebuilt-layout", form)
result = poller.result()



## Find pages with tables

In [12]:
entries = result.tables

# Collect the page number of every page that holds (part of) a table.
# The numbers are read from each table's bounding regions instead of being
# regex-scraped out of str(entry): the repr format is not a stable interface.
# They are kept as strings (e.g. "174") because that is what the regex-based
# version produced and what string membership checks against this set expect.
page_numbers_set = {
    str(region.page_number)
    for entry in (entries or [])
    for region in (entry.bounding_regions or [])
}

# Convert the set to a list if needed
page_numbers_list = list(page_numbers_set)

## Find page with term & table

In [14]:
pdf_file_path = '10qs/Q/q_citi_2022q3.pdf'

# Initialize found variable
found = False

# Text of every page that both mentions the term and holds a table, keyed by the
# 0-based page index, so the hits survive the loop instead of being overwritten
# by the next iteration.
matched_pages = {}

with open(pdf_file_path, 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Iterate through the pages
    for page_num in range(len(pdf_reader.pages)):
        # Extract once per page; the lowercase copy is only used for the
        # case-insensitive search
        raw_text = pdf_reader.pages[page_num].extract_text()
        page_text = raw_text.lower()

        # Skip pages that do not mention the target term
        if 'total derivative notionals' not in page_text:
            continue

        print(f"Found 'total derivative notionals' on page {page_num+1}")
        found = True

        # page_numbers_set is compared against the 1-based page number as a string
        if str(page_num+1) in page_numbers_set:
            print(f'{page_num}')
            page_text = raw_text
            matched_pages[page_num] = raw_text

Found 'total derivative notionals' on page 174
173


In [12]:
# Extract and display the text of the PDF page at 0-based index 173.
pdf_page = pdf_reader.pages[173]
page = pdf_page.extract_text()
page

'19.  DERIVATIVES\n \nIn the ordinary course of business, Citigroup enters into \nvarious types of derivative transactions. All derivatives are \nrecorded in Trading account assets/Trading account liabilities  \non the Consolidated Balance Sheet. For additional information \nregarding Citi’s use of and accounting for derivatives, see \nNote 22 to the Consolidated Financial Statements in Citi’s \n2021  Form 10-K.\nInformation pertaining to Citigroup’s derivatives \nactivities, based on notional amounts, is presented in the table \nbelow. Derivative notional amounts are reference amounts \nfrom which contractual payments are derived and do not \nrepresent a complete measure of Citi’s exposure to derivative \ntransactions. Citi’s derivative exposure arises primarily from market fluctuations (i.e., market risk), counterparty failure \n(i.e., credit risk) and/or periods of high volatility or financial \nstress (i.e., liquidity risk), as well as any market valuation \nadjustments that may be

In [13]:
# Ask GPT
# Azure OpenAI connection settings for this request.
openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")

# Chat messages: a fixed system role plus a user question with the page text appended.
system_message = {"role":"system","content":"You are a financial analysis AI assistant that helps people find information."}
user_message = {
    "role":"user",
    "content":f'consider the following text, ignore text that is not part of a table. What is the total value of derivative notinals  under ASC 815? answer only with the metric requested. {page}'}
message_text = [system_message, user_message]

# Sampling parameters forwarded unchanged to the chat completion call.
sampling_options = dict(
    temperature=0.7,
    max_tokens=800,
    top_p=0.95,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)

completion = openai.ChatCompletion.create(
    engine="10qs_poc",
    messages=message_text,
    **sampling_options,
)

In [17]:
# Display the message text of the first choice in the completion.
first_choice = completion.choices[0]
first_choice.message.content

'The total value of derivative notionals under ASC 815 is $327,957 million as of September 30, 2022.'

## Capital

In [45]:
def find_page(role, word, paragraphs=None):
    """Return the page number of the paragraph matching a role and exact content.

    Args:
        role: Value the paragraph's 'role' entry must equal (e.g. 'sectionHeading').
        word: Value the paragraph's 'content' entry must equal exactly.
        paragraphs: Optional list of paragraph dicts to search. When omitted,
            the notebook-level citi_dict['paragraphs'] is used.

    Returns:
        The 'page_number' of the first bounding region of the last matching
        paragraph, or None when no paragraph matches.
    """
    if paragraphs is None:
        paragraphs = citi_dict['paragraphs']

    # Start from None so a miss returns None instead of raising UnboundLocalError.
    page = None
    for para in paragraphs:
        # Logical `and` is the intended test here; bitwise `&` only worked
        # because both operands happened to be booleans.
        if para['role'] == role and para['content'] == word:
            # No early exit: when several paragraphs match, the last one wins.
            page = para['bounding_regions'][0]['page_number']
    return page



In [48]:
# Look up the page of the paragraph whose role is 'sectionHeading' and whose content is exactly 'Capital'.
find_page(role='sectionHeading', word='Capital')

6

In [44]:
# Display the current value of the notebook-level `page` variable.
# NOTE(review): the recorded output for this cell is a tuple, which no visible
# assignment to `page` produces — presumably stale; confirm after a fresh run.
page

(6, None)

In [15]:
# Extract and display the text of the PDF page at 0-based index 5.
pdf_page = pdf_reader.pages[5]
page = pdf_page.extract_text()
page

'exchange translation, partially offset by the issuance of \ninstitutional certificates of deposit, reflected in Corporate/\nOther , as Citigroup continued to diversify its funding profile.\nExpenses\nCitigroup’s operating expenses of $12.7 billion increased 8% \nversus the prior-year period, largely driven by continued \ninvestments in Citi’s transformation, business-led investments \nand volume-related expenses, other risk and control \ninvestments and inflation, partially offset by productivity \nsavings and the benefit of foreign exchange translation. As \ndiscussed above, reported expenses included approximately \n$107 million of divestiture-related costs. Excluding these \ndivestiture-related costs, expenses increased 7% versus the \nprior-year period, largely driven by the following:\n•Approximately 2% was driven by transformation \ninvestments, with about two-thirds related to the risk, \ncontrols, data and finance programs (approximately 25% \nof the program investments were r

In [16]:
# Ask GPT
# Azure OpenAI connection settings for this request.
openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")

# System instructions describing the JSON fields the model must return.
system_instructions = """You are a financial analysis AI assistant that helps people find information. 
           You will receive a text extracted from a firm's quarterly report and retrieve some metricts from it.
           You will structure your answer in a JSON format with the following fields:
           CET1 Ratio: the value of the metric CET1 ratio for the latest period.
           Previous CET1 Ratio: the value of the metric CET1 ratio for the previous period.
           Direction: the direction of the change either up, down or no change.
           Driver: a one sentence summary of the reason for the change, if not found fill with None.
           """

# The user turn carries nothing but the extracted page text.
prompt = [
    {"role":"system","content":system_instructions},
    {"role":"user","content":f'{page}'},
]
message_text = prompt

# Sampling parameters forwarded unchanged to the chat completion call.
request_options = {
    "temperature": 0.7,
    "max_tokens": 800,
    "top_p": 0.95,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "stop": None,
}

completion = openai.ChatCompletion.create(
    engine="10qs_poc",
    messages=message_text,
    **request_options,
)

In [36]:
# Display the message text of the first choice in the completion.
first_choice = completion.choices[0]
first_choice.message.content

'```json\n{\n  "CET1 Ratio": "12.3%",\n  "Previous CET1 Ratio": "11.7%",\n  "Direction": "up",\n  "Driver": "The increase was primarily driven by net income, the impacts related to the closing of the Australia and Philippines consumer business sales, and business actions, including a reduction in RWA, partially offset by interest rate impacts on Citigroup\'s investment portfolio and the return of capital to common shareholders."\n}\n```'

In [40]:
answer = completion.choices[0].message.content
# The reply can arrive wrapped in a Markdown code fence (```json ... ```).
# str.strip("```json\n") treats its argument as a set of characters to trim from
# both ends, not as a prefix/suffix, so the fenced payload is captured explicitly;
# an unfenced reply is parsed as-is after trimming whitespace.
fenced_payload = re.search(r"```(?:json)?\s*(.*?)\s*```", answer, re.DOTALL)
answer = fenced_payload.group(1) if fenced_payload else answer.strip()
# NOTE(review): `result` is also the name bound to the Form Recognizer analysis
# earlier in this notebook; this assignment replaces that object.
result = json.loads(answer)
result

{'CET1 Ratio': '12.3%',
 'Previous CET1 Ratio': '11.7%',
 'Direction': 'up',
 'Driver': "The increase was primarily driven by net income, the impacts related to the closing of the Australia and Philippines consumer business sales, and business actions, including a reduction in RWA, partially offset by interest rate impacts on Citigroup's investment portfolio and the return of capital to common shareholders."}

In [47]:
# Preview only the first few paragraph records instead of dumping the whole list into the cell output.
citi_dict['paragraphs'][:5]

[{'role': 'title',
  'content': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM 10-Q',
  'bounding_regions': [{'page_number': 1,
    'polygon': [{'x': 1.8468, 'y': 0.725},
     {'x': 6.6465, 'y': 0.7121},
     {'x': 6.6492, 'y': 1.7235},
     {'x': 1.8495, 'y': 1.7364}]}],
  'spans': [{'offset': 0, 'length': 81}]},
 {'role': None,
  'content': '(Mark One)',
  'bounding_regions': [{'page_number': 1,
    'polygon': [{'x': 0.4871, 'y': 1.9002},
     {'x': 1.0697, 'y': 1.9002},
     {'x': 1.0697, 'y': 2.0243},
     {'x': 0.4871, 'y': 2.0243}]}],
  'spans': [{'offset': 82, 'length': 10}]},
 {'role': None,
  'content': '☒ :selected:',
  'bounding_regions': [{'page_number': 1,
    'polygon': [{'x': 0.8166, 'y': 2.2084},
     {'x': 0.9138, 'y': 2.2084},
     {'x': 0.9138, 'y': 2.3024},
     {'x': 0.8166, 'y': 2.3024}]}],
  'spans': [{'offset': 93, 'length': 12}]},
 {'role': None,
  'content': 'QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCH

## Check performance of GPT-3.5


In [17]:
# System instructions describing the JSON fields the model must return.
system_instructions = """You are a financial analysis AI assistant that helps people find information. 
           You will receive a text extracted from a firm's quarterly report and retrieve some metricts from it.
           You will structure your answer in a JSON format with the following fields:
           CET1 Ratio: the value of the metric CET1 ratio for the latest period.
           Previous CET1 Ratio: the value of the metric CET1 ratio for the previous period.
           Direction: the direction of the change either up, down or no change.
           Driver: a one sentence summary of the reason for the change, if not found fill with None.
           """

# The user turn carries nothing but the extracted page text.
prompt = [
    {"role":"system","content":system_instructions},
    {"role":"user","content":f'{page}'},
]
message_text = prompt

# Sampling parameters forwarded unchanged to the chat completion call.
request_options = {
    "temperature": 0.7,
    "max_tokens": 800,
    "top_p": 0.95,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "stop": None,
}

# Same request as the CET1 extraction, sent to the "10qs_poc_gpt3" deployment.
completion = openai.ChatCompletion.create(
    engine="10qs_poc_gpt3",
    messages=message_text,
    **request_options,
)

In [18]:
answer = completion.choices[0].message.content
# The reply can arrive wrapped in a Markdown code fence (```json ... ```).
# str.strip("```json\n") treats its argument as a set of characters to trim from
# both ends, not as a prefix/suffix, so the fenced payload is captured explicitly;
# an unfenced reply is parsed as-is after trimming whitespace.
fenced_payload = re.search(r"```(?:json)?\s*(.*?)\s*```", answer, re.DOTALL)
answer = fenced_payload.group(1) if fenced_payload else answer.strip()
# NOTE(review): `result` is also the name bound to the Form Recognizer analysis
# earlier in this notebook; this assignment replaces that object.
result = json.loads(answer)
result

{'CET1 Ratio': 12.3,
 'Previous CET1 Ratio': 11.7,
 'Direction': 'up',
 'Driver': "The increase in Citigroup's CET1 Capital ratio was primarily driven by net income, impacts related to the closing of the Australia and Philippines consumer business sales, and business actions including a reduction in risk-weighted assets, partially offset by interest rate impacts on Citigroup’s investment portfolio and the return of capital to common shareholders."}