In [1]:
import os
import sys
sys.path.append('..')

import argparse
from typing import Dict, List

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import numpy as np
import pandas as pd

from dotenv import load_dotenv
load_dotenv() # load environment variables from .env (add to .gitignore)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)

In [2]:
def parse_pdf(pdf_file: str) -> List[Dict[str, str]]:

    endpoint = os.environ["DOCUMENT_ENDPOINT"]
    key = os.environ["DOCUMENT_KEY"]
    document_analysis_client = DocumentAnalysisClient(
      endpoint=endpoint, credential=AzureKeyCredential(key))

    if pdf_file.startswith("http"):
      poller = document_analysis_client.begin_analyze_document_from_url(
          "prebuilt-layout", pdf_file)
    else:
      with open(pdf_file, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f)

    result = poller.result()
    # Returns a dict representation of AnalyzeResult.
    result_dict = result.to_dict()
    return result_dict

In [3]:
result_dict = parse_pdf("../pdfs/MFC_QPR_2023_Q4_EN.pdf")

In [4]:
print('Result dict keys():\n', result_dict.keys())
print('\nResult dict.documents keys():\n', result_dict.get('pages')[0].keys())
print('\nResult dict.pages keys():\n', result_dict.get('pages')[0].keys())
print('\nResult dict.paragraphs keys():\n', result_dict.get('paragraphs')[0].keys())
print('\nResult dict.tables keys()\n:', result_dict.get('tables')[0].keys())

Result dict keys():
 dict_keys(['api_version', 'model_id', 'content', 'languages', 'pages', 'paragraphs', 'tables', 'key_value_pairs', 'styles', 'documents'])

Result dict.documents keys():
 dict_keys(['page_number', 'angle', 'width', 'height', 'unit', 'lines', 'words', 'selection_marks', 'spans', 'barcodes', 'formulas'])

Result dict.pages keys():
 dict_keys(['page_number', 'angle', 'width', 'height', 'unit', 'lines', 'words', 'selection_marks', 'spans', 'barcodes', 'formulas'])

Result dict.paragraphs keys():
 dict_keys(['role', 'content', 'bounding_regions', 'spans'])

Result dict.tables keys()
: dict_keys(['row_count', 'column_count', 'cells', 'bounding_regions', 'spans'])


In [5]:
# result_dict.get('pages')
# for i, dict in enumerate(result_dict.get('pages')):
#     print('Page number:', dict.get('page_number'), '\n')
#     for j, line in enumerate(dict.get('lines')):
#         print('Line:', j, line.get('content'))

# for paragraph in result_dict.get('paragraphs'):
#     # print(paragraph)
#     print("\nParagraph:\n", paragraph.get('content'))
#     print("Bounding regions:\n", paragraph.get('bounding_regions'))

In [6]:
def page_text_and_tables(result_dict):

    page_contents = {}
    # tables = []
    # paragraphs = []

    for i, paragraph in enumerate(result_dict.get('paragraphs')):
        # print(paragraph.get('bounding_regions')[0].get('page_number'))
        # print(page_contents.keys())
        if paragraph.get('bounding_regions')[0].get('page_number') in page_contents.keys():
            # print(paragraph.get('bounding_regions')[0].get('page_number'))
            # print(i, page_contents[i])
            page_contents[paragraph.get('bounding_regions')[0].get('page_number')].\
                            get('text').append(paragraph.get('content'))
        else:
            page_contents[paragraph.get('bounding_regions')[0].get('page_number')] = \
                {'tables': [], 'text': [paragraph.get('content')]}
            # print(paragraph.get('bounding_regions')[0].get('page_number'))
            # print(page_contents.keys())

    for idx, atable in enumerate(result_dict["tables"]):
        # print(idx)
        # if idx == -1:
        #   continue
        row_count = atable["row_count"]
        column_count = atable["column_count"]
        arr = np.empty((row_count, column_count), dtype=object)
        arr[-1][:] = ""
        for aval in atable["cells"]:
        # Handles complex headers
            if aval["kind"] == "columnHeader":
                arr[-1][aval["column_index"]:aval["column_index"] +
                    aval["column_span"]] += str(aval["content"])
            else:
                arr[aval["row_index"]][aval["column_index"]] = aval["content"]

        df = pd.DataFrame(arr)
        # print(df)
        df.columns = df.iloc[0]
        df = df.drop(df.index[0:2])
        df.reset_index(inplace=True, drop=True)
        df.dropna(inplace=True)
        page_contents[atable.get('bounding_regions')[0].get('page_number')].get('tables').append(df)
        # tables.append(df)

    return page_contents

In [7]:
page_contents = page_text_and_tables(result_dict)

In [8]:
for page_num, item in page_contents.items():
    print('Page number:', page_num)
    print('Number of tables:', len(item['tables']))
    print('Lenght of text:', len(item['text']))
    print('Text:', item['text'])
    if item['tables']:
        for i, table in enumerate(item['tables']):
            print('Table', i+1, 'in page', page_num, '\n', table, '\n')
    print('')
    if page_num == 3:
        break

Page number: 1
Number of tables: 0
Lenght of text: 23
Text: ['Manulife', 'Manulife Reports Full Year and Fourth Quarter 2023 Results', 'TSX/NYSE/PSE: MFC SEHK: 945', 'C$ unless otherwise stated', 'TORONTO, ON - February 14, 2024 - Manulife Financial Corporation ("Manulife" or the "Company") reported its full year and fourth quarter results for the period ended December 31, 2023, during which we delivered double-digit growth in core EPS, and today declared a common share dividend increase of 9.6%.', 'Key highlights for full year 2023 and the fourth quarter ("4Q23") include:', '• Net income attributed to shareholders of $5.1 billion in 2023, up $1.6 billion from 2022 transitional net income attributed to shareholders ("Transitional Net Income")1, and $1.7 billion in 4Q23, up $0.4 billion from Transitional Net Income in the fourth quarter of 2022 ("4Q22")', '. Net income attributed to shareholders of $5.1 billion in 2023, up $7.0 billion from 2022, and $1.7 billion in 4Q23, up $0.7 billio