# Document intelligence

In [62]:
"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs
https://learn.microsoft.com/azure/applied-ai-services/form-recognizer/quickstarts/get-started-v3-sdk-rest-api?view=doc-intel-3.1.0&pivots=programming-language-python
"""
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

import os

import pandas as pd
import numpy as np

import json

In [13]:
"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
# Service endpoint and API key come from the environment; each is None when its variable is unset.
endpoint = os.environ.get("YOUR_FORM_RECOGNIZER_ENDPOINT")
key = os.environ.get("YOUR_FORM_RECOGNIZER_KEY")

In [14]:
# sample document
# formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

# Client used for all document analysis calls, authenticated with the API key read above.
document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

In [15]:
#poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", formUrl)
# Open the PDF in a context manager so the file handle is closed once the
# analysis has completed (the original left it open for the kernel's lifetime).
with open("10qs/Q/q_citi_2022q3.pdf", 'rb') as form:
    poller = document_analysis_client.begin_analyze_document("prebuilt-layout", form)
    # Block until the long-running operation finishes and fetch its result
    result = poller.result()

# for idx, style in enumerate(result.styles):
#     print(
#         "Document contains {} content".format(
#          "handwritten" if style.is_handwritten else "no handwritten"
#         )
#     )

# for page in result.pages:
#     for line_idx, line in enumerate(page.lines):
#         print(
#          "...Line # {} has text content '{}'".format(
#         line_idx,
#         line.content.encode("utf-8")
#         )
#     )

#     for selection_mark in page.selection_marks:
#         print(
#          "...Selection mark is '{}' and has a confidence of {}".format(
#          selection_mark.state,
#          selection_mark.confidence
#          )
#     )

# for table_idx, table in enumerate(result.tables):
#     print(
#         "Table # {} has {} rows and {} columns".format(
#         table_idx, table.row_count, table.column_count
#         )
#     )
        
#     for cell in table.cells:
#         print(
#             "...Cell[{}][{}] has content '{}'".format(
#             cell.row_index,
#             cell.column_index,
#             cell.content.encode("utf-8"),
#             )
#         )

# print("----------------------------------------")



In [16]:
# Convert the analysis result into a plain dict (JSON-like structure)
result_dict = result.to_dict()

# find_word = 'Derivative Notionals'
# # find the page with the Derivative Notionals table
# for i in range(0, len(result_dict['paragraphs'])):
#     para = result_dict['paragraphs'][i]
#     if (para['role']=='sectionHeading') & (para['content']==find_word):
#         page = para['bounding_regions'][0]['page_number']
# page

# Find tables and text

In [17]:
def find_page(role, word):
    """Return the page number of the paragraph matching `role` and `word` exactly.

    Searches the module-level ``result_dict['paragraphs']``. When several
    paragraphs match, the page of the last one in document order is returned.

    Args:
        role: Paragraph role to match (e.g. 'sectionHeading'); compared with ``==``.
        word: Exact paragraph content to match.

    Returns:
        The ``page_number`` of the matching paragraph's first bounding region.

    Raises:
        ValueError: If no paragraph matches (previously this surfaced as an
            UnboundLocalError on the ``return`` line).
    """
    # Walk backwards so the first hit is the last match in document order
    for para in reversed(result_dict['paragraphs']):
        if para['role'] == role and para['content'] == word:
            return para['bounding_regions'][0]['page_number']
    raise ValueError(f"No paragraph with role {role!r} and content {word!r} was found")

In [18]:
# Page that carries the "Derivative Notionals" section heading
page = find_page(role='sectionHeading', word='Derivative Notionals')
page

174

In [19]:
def find_table(page):
    """Return the index in ``result_dict['tables']`` of the table located on `page`.

    The page of a table is taken from its first bounding region. When several
    tables are on the page, the index of the last one is returned.

    Args:
        page: Page number to look for.

    Returns:
        Integer index into the module-level ``result_dict['tables']`` list.

    Raises:
        ValueError: If no table is on `page` (previously this surfaced as an
            UnboundLocalError on the ``return`` line).
    """
    index_table = None
    for idx, atable in enumerate(result_dict["tables"]):
        if atable['bounding_regions'][0]['page_number'] == page:
            index_table = idx  # keep overwriting: the last table on the page wins
    if index_table is None:
        raise ValueError(f"No table found on page {page!r}")
    return index_table

In [20]:
# Index (into result_dict["tables"]) of the table found on `page`
index_tbl = find_table(page)
index_tbl

225

In [21]:
def create_table(index_table):
    """Rebuild one extracted table as a DataFrame.

    Reads table `index_table` from the module-level ``result_dict["tables"]``.

    Args:
        index_table: Position of the table in ``result_dict["tables"]``.

    Returns:
        A tuple ``(frame, texts)``: ``frame`` is a DataFrame sized
        ``row_count`` x ``column_count`` with each listed cell's content at its
        (row_index, column_index) position and NaN elsewhere; ``texts`` is the
        list of cell contents in the order the cells are listed.
    """
    table = result_dict["tables"][index_table]

    # Start from an all-NaN grid with the table's dimensions
    grid = pd.DataFrame(
        index=range(table["row_count"]),
        columns=range(table["column_count"]),
    )

    cell_texts = []
    for cell in table["cells"]:
        text = cell['content']
        grid.iloc[cell['row_index'], cell['column_index']] = text
        cell_texts.append(text)

    return grid, cell_texts

In [22]:
# Build the DataFrame and the flat list of cell texts for the selected table
df, df_content = create_table(index_tbl)
df

Unnamed: 0,0,1,2,3,4
0,In millions of dollars,Hedging instruments under ASC 815,,Trading derivative instruments,
1,,"September 30, 2022","December 31, 2021","September 30, 2022","December 31, 2021"
2,Interest rate contracts,,,,
3,Swaps,"$ 241,324","$ 267,035","$ 23,601,980","$ 21,873,538"
4,Futures and forwards,—,—,2646937,2383702
5,Written options,—,—,1804687,1584451
6,Purchased options,—,—,1722756,1428376
7,Total interest rate contracts,"$ 241,324","$ 267,035","$ 29,776,360","$ 27,270,067"
8,Foreign exchange contracts,,,,
9,Swaps,"$ 44,981","$ 47,298","$ 6,486,067","$ 6,288,193"


In [23]:
def find_text(page, exclude=None):
    """Return the role-less paragraphs on `page` that are not table cell text.

    Reads paragraphs from the module-level ``result_dict['paragraphs']``. A
    paragraph is kept when its first bounding region is on `page`, its role is
    None, and its content is not one of the excluded strings.

    Args:
        page: Page number to extract text from.
        exclude: Optional iterable of strings to leave out (typically the cell
            texts of the table on that page). When omitted, the module-level
            ``df_content`` is used, which matches the original behaviour.

    Returns:
        List of paragraph contents, in document order.
    """
    if exclude is None:
        exclude = df_content
    # A set makes the membership test O(1) per paragraph instead of scanning the list
    excluded = set(exclude)
    return [
        para['content']
        for para in result_dict['paragraphs']
        if para['bounding_regions'][0]['page_number'] == page
        and para['role'] is None
        and para['content'] not in excluded
    ]

In [31]:
# Paragraphs on `page` that have no role and whose text is not one of the table's cell texts
text_content = find_text(page)
text_content

["In the ordinary course of business, Citigroup enters into various types of derivative transactions. All derivatives are recorded in Trading account assets/Trading account liabilities on the Consolidated Balance Sheet. For additional information regarding Citi's use of and accounting for derivatives, see Note 22 to the Consolidated Financial Statements in Citi's 2021 Form 10-K.",
 "Information pertaining to Citigroup's derivatives activities, based on notional amounts, is presented in the table below. Derivative notional amounts are reference amounts from which contractual payments are derived and do not represent a complete measure of Citi's exposure to derivative transactions. Citi's derivative exposure arises primarily from",
 'market fluctuations (i.e., market risk), counterparty failure (i.e., credit risk) and/or periods of high volatility or financial stress (i.e., liquidity risk), as well as any market valuation adjustments that may be required on the transactions. Moreover, no

In [25]:
# Same extraction for the following page.
# NOTE(review): find_text filters against df_content, which was built from the
# table on `page`, not on page+1 — confirm this is intended.
find_text(page+1)

["The following tables present the gross and net fair values of the Company's derivative transactions and the related offsetting amounts as of September 30, 2022 and December 31, 2021. Gross positive fair values are offset against gross negative fair values by counterparty, pursuant to enforceable master netting agreements. Under ASC 815-10-45, payables and receivables in respect of cash collateral received from or paid to a given counterparty pursuant to a credit support annex are included in the offsetting amount if a legal opinion supporting the enforceability of netting and collateral rights has been obtained. GAAP does not permit similar offsetting for security collateral.",
 'In addition, the following tables reflect rule changes adopted by clearing organizations that require or allow entities to treat certain derivative assets, liabilities and the related variation margin as settlement of the related derivative fair values for legal and accounting purposes, as opposed to present

# Ask Chatbot from Azure OpenAI

In [1]:
# Note: The openai-python library support for Azure OpenAI is in preview.
# Note: This code sample requires OpenAI Python library version 0.28.1 or lower.
import os
import openai

# Point the module-level openai settings at the Azure OpenAI resource;
# the API key is read from the OPENAI_API_KEY environment variable.
openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")


In [32]:
# Summarisation prompt: the instruction followed by the string form of the extracted paragraph list
prompt = 'consider the following text and summarize key information ' + str(text_content)


In [33]:
# The summarisation prompt is sent as the only message, with the system role
message_text = [{"role": "system", "content": prompt}]

completion = openai.ChatCompletion.create(
    engine="10qs_poc",
    messages=message_text,
    temperature=0.7,
    max_tokens=464,
    top_p=0.95,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)

In [57]:
# Text of the first choice returned by the model
answer = completion.choices[0].message.content
answer

"Key Information:\n\n- Citigroup engages in various derivative transactions as part of its normal business operations.\n- Derivatives are recorded on the Consolidated Balance Sheet as either Trading account assets or Trading account liabilities.\n- Further details on Citi's derivative practices and accounting can be found in Note 22 of the Consolidated Financial Statements in Citi's 2021 Form 10-K.\n- Notional amounts of derivatives are used as reference for contractual payments but do not fully represent Citigroup's exposure to derivative transactions.\n- Citigroup's derivative exposure is affected by market risk, credit risk, liquidity risk, and required market valuation adjustments.\n- Notional amounts do not account for netting of offsetting trades, meaning that if Citigroup has offsetting positions, the notional reported is the sum of both, though the actual market risk may be minimal.\n- The total notional amounts of derivatives can vary over time due to changes in market share, 

# Content for certain topics
In this case, the topic is Capital.

In [37]:
# Collect one entry per paragraph: position, role, text and the page of its first bounding region
paragraphs = result_dict["paragraphs"]
idx_list = list(range(len(paragraphs)))
role_list = [entry['role'] for entry in paragraphs]
content_list = [entry['content'] for entry in paragraphs]
page_list = [entry['bounding_regions'][0]['page_number'] for entry in paragraphs]

In [38]:
# Assemble the per-paragraph lists into one DataFrame (one row per paragraph)
df_paragraph = pd.DataFrame({'idx':idx_list,
                             'role':role_list,
                             'content':content_list,
                             'page':page_list
                             })
df_paragraph

Unnamed: 0,idx,role,content,page
0,0,title,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,1
1,1,,(Mark One),1
2,2,,☒ :selected:,1
3,3,,QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(...,1
4,4,,"For the quarterly period ended September 30, 2...",1
...,...,...,...,...
24430,24430,,104,234
24431,24431,,See the cover page of this Quarterly Report on...,234
24432,24432,,The total amount of securities authorized purs...,234
24433,24433,,* Denotes a management contract or compensator...,234


In [69]:
def topic_content(topic):
    """Return the role-less paragraphs between a section heading and the next one.

    Reads the module-level ``df_paragraph`` (columns 'idx', 'role', 'content').
    The section starts at the first row whose role is 'sectionHeading' and whose
    content equals `topic`, and ends at the next 'sectionHeading' row. When the
    topic is the last section, everything up to the end of the frame is used.

    Args:
        topic: Exact text of the section heading to look up.

    Returns:
        List of the 'content' values of the rows with no role inside the section.

    Raises:
        ValueError: If no section heading equals `topic`.
    """
    headings = df_paragraph[df_paragraph['role'] == 'sectionHeading']

    matches = headings.loc[headings['content'] == topic, 'idx']
    if matches.empty:
        raise ValueError(f"No section heading {topic!r} found")
    start_idx = matches.iloc[0]

    # Rows with no role located after the heading ...
    in_section = df_paragraph['role'].isna() & df_paragraph['idx'].gt(start_idx)

    # ... and before the next heading, if there is one
    later_headings = headings.loc[headings['idx'].gt(start_idx), 'idx']
    if not later_headings.empty:
        in_section &= df_paragraph['idx'].lt(later_headings.iloc[0])

    return df_paragraph.loc[in_section, 'content'].tolist()

In [70]:
# Collect the role-less paragraphs that follow the 'Capital' section heading
topic = 'Capital'
asked_topic = topic_content(topic)
asked_topic

["Citigroup's CET1 Capital ratio was 12.3% as of September 30, 2022, compared to 11.7% as of September 30, 2021, based on the Basel III Standardized Approach for determining risk- weighted assets (RWA). The increase was primarily driven by net income, the impacts related to the closing of the Australia and Philippines consumer business sales, and business actions, including a reduction in RWA, partially offset by interest rate impacts on Citigroup's investment portfolio and the return of capital to common shareholders. The increase in Citi's CET1 Capital ratio was also partially offset by the impact of adopting the Standardized Approach for Counterparty Credit Risk (SA-CCR) on January 1, 2022.",
 'Citigroup\'s Supplementary Leverage ratio as of September 30, 2022 was 5.7%, compared to 5.8% as of September 30, 2021. The decrease was driven by lower Tier 1 Capital, partially offset by a decrease in Total Leverage Exposure. For additional information on Citi\'s capital ratios and related 

In [71]:
# prompt = f'consider the following text: {content_btw_section}. Write JSON file for Capital metrics'

# message_text = [{"role":"system","content":prompt}]
# System message: instructions and the JSON fields the model must return.
# User message: the string form of the paragraph list collected for the topic.
message_text = [{"role":"system","content":
           """You are a financial analysis AI assistant that helps people find information. 
           You will receive a text extracted from a firm's quarterly report and retrieve some metricts from it.
           You will structure your answer in a JSON format with the following fields:
           CET1 Ratio: the value of the metric CET1 ratio for the latest period.
           Previous CET1 Ratio: the value of the metric CET1 ratio for the previous period.
           Direction: the direction of the change either up, down or no change.
           Driver: a one sentence summary of the reason for the change, if not found fill with None.
           """},
    {"role":"user",
     "content":f'{asked_topic}'}]

# Request the completion from the "10qs_poc" deployment
completion = openai.ChatCompletion.create(
  engine="10qs_poc",
  messages = message_text,
  temperature=0.7,
  max_tokens=464,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None
)

In [72]:
answer = completion.choices[0].message.content

# The model may wrap its JSON in a Markdown code fence (```json ... ```).
# str.strip("```json\n") treats its argument as a set of characters rather than
# a prefix/suffix, so the fence lines are removed explicitly instead.
payload = answer.strip()
if payload.startswith("```"):
    payload = payload.partition("\n")[2]          # drop the opening fence line
    body, fence, _ = payload.rpartition("```")    # drop the closing fence, if present
    if fence:
        payload = body

# Store the parsed metrics under their own name: `result` keeps holding the
# document analysis result, which `result.to_dict()` needs when that cell is re-run.
capital_metrics = json.loads(payload)
capital_metrics

{'CET1 Ratio': '12.3%',
 'Previous CET1 Ratio': '11.7%',
 'Direction': 'up',
 'Driver': "The increase was primarily driven by net income, the impacts related to the closing of the Australia and Philippines consumer business sales, and business actions, including a reduction in RWA, partially offset by interest rate impacts on Citigroup's investment portfolio and the return of capital to common shareholders."}

# Test 

In [None]:
# Ask GPT
# Same Azure OpenAI settings as in the configuration cell earlier in the notebook
openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")

# NOTE(review): the prompt says "consider the following text" but interpolates `page`;
# elsewhere in this notebook `page` holds a page number, not page text — confirm which
# variable was meant to be sent.
message_text = [
    {"role":"system","content":"You are a financial analysis AI assistant that helps people find information."},
    {"role":"user",
     "content":f'consider the following text, ignore text that is not part of a table. What is the total value of derivative notinals  under ASC 815? answer only with the metric requested. {page}'}]

completion = openai.ChatCompletion.create(
  engine="10qs_poc",
  messages = message_text,
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None
)

In [18]:
# Note: The openai-python library support for Azure OpenAI is in preview.
# Note: This code sample requires OpenAI Python library version 0.28.1 or lower.
import os
import openai

openai.api_type = "azure"
openai.api_base = "https://ascent-hackathon.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")

# The prompt is sent on its own: no document text or table is included in the messages
input_prompt = 'Print REVENUE table as JSON file'
message_text = [{"role":"system","content":input_prompt}]

completion = openai.ChatCompletion.create(
  engine="10qs_poc",
  messages = message_text,
  temperature=0.7,
  max_tokens=800,
  top_p=0.95,
  frequency_penalty=0,
  presence_penalty=0,
  stop=None
)

In [19]:
# Show the full completion object returned by the service
print(completion)

{
  "id": "chatcmpl-8pIVIM8DvLIVvV1OWy3yxR0miZqnb",
  "object": "chat.completion",
  "created": 1707236548,
  "model": "gpt-4",
  "prompt_filter_results": [
    {
      "prompt_index": 0,
      "content_filter_results": {
        "hate": {
          "filtered": false,
          "severity": "safe"
        },
        "self_harm": {
          "filtered": false,
          "severity": "safe"
        },
        "sexual": {
          "filtered": false,
          "severity": "safe"
        },
        "violence": {
          "filtered": false,
          "severity": "safe"
        }
      }
    }
  ],
  "choices": [
    {
      "index": 0,
      "finish_reason": "stop",
      "message": {
        "role": "assistant",
        "content": "To provide you with a JSON representation of a hypothetical \"REVENUE\" table, I'll need to make some assumptions about the structure and data contained within the table. Since I don't have access to an actual database or table, I'll create an example with fictio

In [29]:
# Convert the analysis result into a plain dict
result_dict = result.to_dict()

# Find the page holding the "Derivative Notionals" section heading (last match wins)
for i, para in enumerate(result_dict['paragraphs']):
    if para['role'] == 'sectionHeading' and para['content'] == 'Derivative Notionals':
        page = para['bounding_regions'][0]['page_number']
page

174

In [None]:
# Record the position and page of the 'Capital' section heading (last match wins)
for idx, paragraph in enumerate(result_dict["paragraphs"]):
    if paragraph['role'] != 'sectionHeading' or paragraph['content'] != 'Capital':
        continue
    start_idx = idx
    start_page = paragraph['bounding_regions'][0]['page_number']

In [26]:
# Inspect the structure of the first paragraph entry
result_dict["paragraphs"][0]

{'role': 'title',
 'content': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM 10-Q',
 'bounding_regions': [{'page_number': 1,
   'polygon': [{'x': 1.8468, 'y': 0.725},
    {'x': 6.6465, 'y': 0.7121},
    {'x': 6.6492, 'y': 1.7235},
    {'x': 1.8495, 'y': 1.7364}]}],
 'spans': [{'offset': 0, 'length': 81}]}

In [52]:
# Page value of the row labelled 155
df_paragraph.loc[155]['page']

6

In [46]:
# Look at the rows labelled 140 through 160
df_paragraph.loc[140:160]

Unnamed: 0,idx,role,content,page
140,140,sectionHeading,Expenses,6
141,141,,Citigroup's operating expenses of $12.7 billio...,6
142,142,,· Approximately 2% was driven by transformatio...,6
143,143,,· Approximately 1% was driven by business-led ...,6
144,144,,· Approximately 1% was driven by higher volume...,6
145,145,,· Approximately 3% was driven by other risk an...,6
146,146,,"As previously disclosed, Citi expects to conti...",6
147,147,,Cost of Credit,6
148,148,,Citi's total provisions for credit losses and ...,6
149,149,,Net credit losses of $0.9 billion decreased 8%...,6


In [25]:
# Find the index where role='sectionHeading' and content='Capital'
start_idx = df_paragraph[(df_paragraph['role'] == 'sectionHeading') & (df_paragraph['content'] == 'Capital')]['idx'].values[0]

# Find the index of the next 'sectionHeading' after start_idx
# (.iloc[0] raises IndexError when there is no later heading)
next_section_idx = df_paragraph[df_paragraph['role'] == 'sectionHeading'].loc[df_paragraph['idx'].gt(start_idx)].iloc[0]['idx']

In [26]:
# Check find_table against a literal page number
find_table(174)

225

In [20]:
# Page number of the first bounding region of the first table
result_dict['tables'][0]['bounding_regions'][0]['page_number']

2

In [57]:
# Print the first paragraph's text when its role is 'title'
if result_dict['paragraphs'][0]['role']=='title':
    print(result_dict['paragraphs'][0]['content'])

UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM 10-Q


In [None]:
# NOTE: `i` is not defined in this cell; it relies on a loop variable left over from another cell
result_dict['paragraphs'][i]['bounding_regions'][0]['page_number']

In [25]:

# Scan every paragraph for the "Derivative Notionals" section heading and remember its page
for i in range(0, len(result_dict['paragraphs'])):
    para = result_dict['paragraphs'][i]
    # Debug trace: prints one line per paragraph index
    print(i)
    if (para['role']=='sectionHeading') & (para['content']=='Derivative Notionals'):
        page = para['bounding_regions'][0]['page_number']
        

0
1
2
3
4
5
6
7
8
9


In [31]:
def create_table(index_table):
    """Rebuild table `index_table` from ``result_dict["tables"]`` as a DataFrame.

    NOTE: this redefines the ``create_table`` declared earlier in the notebook,
    which returns ``(DataFrame, content_list)``; this version returns only the
    DataFrame. Whichever definition ran last is the one in effect.

    Args:
        index_table: Position of the table in ``result_dict["tables"]``.

    Returns:
        DataFrame sized ``row_count`` x ``column_count`` with each listed cell's
        content at its (row_index, column_index) position and NaN elsewhere.

    Raises:
        IndexError: If `index_table` is not a valid position (previously this
            surfaced as an UnboundLocalError on the ``return`` line).
    """
    tables = result_dict["tables"]
    if not 0 <= index_table < len(tables):
        raise IndexError(f"table index {index_table!r} out of range (0..{len(tables) - 1})")

    # Index the table directly instead of scanning every table for a matching position
    atable = tables[index_table]

    # create empty cell grid
    tmp_df = pd.DataFrame(index=range(atable["row_count"]), columns=range(atable["column_count"]))

    # replace each cell
    for info in atable["cells"]:
        tmp_df.iloc[info['row_index'], info['column_index']] = info['content']

    return tmp_df

7


In [48]:
def table_content(index_table):
    """Return the content of every cell of table `index_table`.

    The table is read from the module-level ``result_dict["tables"]``; the
    texts are returned in the order the cells are listed.
    """
    cells = result_dict["tables"][index_table]["cells"]
    return [cell['content'] for cell in cells]

In [50]:
# Cell texts of the table selected earlier via index_tbl
table_content(index_tbl)

['In millions of dollars',
 'Hedging instruments under ASC 815',
 'Trading derivative instruments',
 'September 30, 2022',
 'December 31, 2021',
 'September 30, 2022',
 'December 31, 2021',
 'Interest rate contracts',
 '',
 '',
 '',
 '',
 'Swaps',
 '$ 241,324',
 '$ 267,035',
 '$ 23,601,980',
 '$ 21,873,538',
 'Futures and forwards',
 '—',
 '—',
 '2,646,937',
 '2,383,702',
 'Written options',
 '—',
 '—',
 '1,804,687',
 '1,584,451',
 'Purchased options',
 '—',
 '—',
 '1,722,756',
 '1,428,376',
 'Total interest rate contracts',
 '$ 241,324',
 '$ 267,035',
 '$ 29,776,360',
 '$ 27,270,067',
 'Foreign exchange contracts',
 '',
 '',
 '',
 '',
 'Swaps',
 '$ 44,981',
 '$ 47,298',
 '$ 6,486,067',
 '$ 6,288,193',
 'Futures, forwards and spot',
 '40,271',
 '50,926',
 '3,877,382',
 '4,316,242',
 'Written options',
 '—\n:unselected:',
 '—\n:unselected:',
 '951,315',
 '664,942',
 'Purchased options',
 '—\n:unselected:',
 '—\n:unselected:',
 '938,775',
 '651,958',
 'Total foreign exchange contracts',


In [22]:
# Collect the cell texts of the table at position 3.
# `l` was never initialised in this cell, so it raised NameError on a fresh kernel
# (or kept growing across re-runs); start from an empty list every time.
l = []
for idx, atable in enumerate(result_dict["tables"]):
    if idx == 3:
        row_count = atable["row_count"]
        column_count = atable["column_count"]

        for aval in atable["cells"]:
            l.append(aval["content"])
        print(len(l),l)

44 ['In millions of dollars', 'Third Quarter', '% Change', 'Nine Months', '% Change', '2022', '2021', '2022', '2021', 'Institutional Clients Group', '$ 9,468', '$ 9,991', '(5)%', '$ 32,047', '$ 30,928', '4 %', 'Personal Banking and Wealth Management', '6,187', '5,852', '6', '18,121', '17,542', '3', 'Legacy Franchises', '2,554', '1,536', '66', '6,420', '6,058', '6', 'Corporate/Other', '299', '68', 'NM', '744', '339', 'NM', 'Total Citigroup net revenues', '$ 18,508', '$ 17,447', '6 %', '$ 57,332', '$ 54,867', '4 %']


In [34]:
# Inspect the raw cell entries of the table at position 3
result_dict["tables"][3]['cells']

[{'kind': 'columnHeader',
  'row_index': 0,
  'column_index': 0,
  'row_span': 2,
  'column_span': 1,
  'content': 'In millions of dollars',
  'bounding_regions': [{'page_number': 12,
    'polygon': [{'x': 0.4898, 'y': 1.225},
     {'x': 3.2012, 'y': 1.225},
     {'x': 3.1935, 'y': 1.6213},
     {'x': 0.4898, 'y': 1.6213}]}],
  'spans': [{'offset': 33683, 'length': 22}]},
 {'kind': 'columnHeader',
  'row_index': 0,
  'column_index': 1,
  'row_span': 1,
  'column_span': 2,
  'content': 'Third Quarter',
  'bounding_regions': [{'page_number': 12,
    'polygon': [{'x': 3.2012, 'y': 1.225},
     {'x': 5.0581, 'y': 1.225},
     {'x': 5.0581, 'y': 1.4348},
     {'x': 3.2012, 'y': 1.4348}]}],
  'spans': [{'offset': 33706, 'length': 13}]},
 {'kind': 'columnHeader',
  'row_index': 0,
  'column_index': 3,
  'row_span': 2,
  'column_span': 1,
  'content': '% Change',
  'bounding_regions': [{'page_number': 12,
    'polygon': [{'x': 5.0581, 'y': 1.225},
     {'x': 5.7728, 'y': 1.225},
     {'x': 5.7

In [19]:
# NOTE: `atable` is not defined in this cell; it is a loop variable left over from a table loop in another cell
atable

{'row_count': 7,
 'column_count': 7,
 'cells': [{'kind': 'columnHeader',
   'row_index': 0,
   'column_index': 0,
   'row_span': 2,
   'column_span': 1,
   'content': 'In millions of dollars',
   'bounding_regions': [{'page_number': 12,
     'polygon': [{'x': 0.4898, 'y': 1.225},
      {'x': 3.2012, 'y': 1.225},
      {'x': 3.1935, 'y': 1.6213},
      {'x': 0.4898, 'y': 1.6213}]}],
   'spans': [{'offset': 33683, 'length': 22}]},
  {'kind': 'columnHeader',
   'row_index': 0,
   'column_index': 1,
   'row_span': 1,
   'column_span': 2,
   'content': 'Third Quarter',
   'bounding_regions': [{'page_number': 12,
     'polygon': [{'x': 3.2012, 'y': 1.225},
      {'x': 5.0581, 'y': 1.225},
      {'x': 5.0581, 'y': 1.4348},
      {'x': 3.2012, 'y': 1.4348}]}],
   'spans': [{'offset': 33706, 'length': 13}]},
  {'kind': 'columnHeader',
   'row_index': 0,
   'column_index': 3,
   'row_span': 2,
   'column_span': 1,
   'content': '% Change',
   'bounding_regions': [{'page_number': 12,
     'polygo

In [38]:
def get_tables(result):
    """Convert every table of an analysis-result dict into a DataFrame.

    Cells are placed by their ``row_index`` / ``column_index`` rather than by
    reshaping the flat cell list: a table with spanning cells lists fewer cells
    than ``row_count * column_count``, which made the reshape raise ValueError.
    Positions not covered by any listed cell are left as None.

    Args:
        result: Dict with a "tables" list. Each table provides "row_count",
            "column_count" and "cells"; each cell provides "row_index",
            "column_index" and "content".

    Returns:
        List with one DataFrame per table. The first table row is used as the
        column labels and removed from the body.
    """
    all_tables = []
    for atable in result["tables"]:
        row_count = atable["row_count"]
        column_count = atable["column_count"]

        # Empty grid, then drop each cell at its own coordinates
        grid = [[None] * column_count for _ in range(row_count)]
        for cell in atable["cells"]:
            grid[cell["row_index"]][cell["column_index"]] = cell["content"]

        df = pd.DataFrame(grid, index=range(row_count), columns=range(column_count))
        df.columns = df.iloc[0]      # promote the first row to column labels
        df = df.drop(df.index[0])    # and remove it from the body
        all_tables.append(df)
    return all_tables

In [24]:
# Create an empty DataFrame with shape (7, 7)
empty_df = pd.DataFrame(index=range(7), columns=range(7))

# Set a single cell by position to check positional assignment on the empty frame
empty_df.iloc[0, 0] = 1

empty_df


Unnamed: 0,0,1,2,3,4,5,6
0,1.0,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,


In [55]:
# Inspect a paragraph entry whose role is None
result_dict['paragraphs'][1]

{'role': None,
 'content': '(Mark One)',
 'bounding_regions': [{'page_number': 1,
   'polygon': [{'x': 0.4871, 'y': 1.9002},
    {'x': 1.0697, 'y': 1.9002},
    {'x': 1.0697, 'y': 2.0243},
    {'x': 0.4871, 'y': 2.0243}]}],
 'spans': [{'offset': 82, 'length': 10}]}

In [58]:
# This cell was a copy of the body of find_text with the page number hard-coded;
# call the helper instead so the filtering logic lives in one place.
text_list = find_text(174)
text_list

["In the ordinary course of business, Citigroup enters into various types of derivative transactions. All derivatives are recorded in Trading account assets/Trading account liabilities on the Consolidated Balance Sheet. For additional information regarding Citi's use of and accounting for derivatives, see Note 22 to the Consolidated Financial Statements in Citi's 2021 Form 10-K.",
 "Information pertaining to Citigroup's derivatives activities, based on notional amounts, is presented in the table below. Derivative notional amounts are reference amounts from which contractual payments are derived and do not represent a complete measure of Citi's exposure to derivative transactions. Citi's derivative exposure arises primarily from",
 'market fluctuations (i.e., market risk), counterparty failure (i.e., credit risk) and/or periods of high volatility or financial stress (i.e., liquidity risk), as well as any market valuation adjustments that may be required on the transactions. Moreover, no