In [1]:
#!/usr/bin/python3
import argparse
import os
from typing import Dict, List

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import numpy as np
import pandas as pd

from dotenv import load_dotenv

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)

In [2]:
load_dotenv() # load environment variables from .env (add to .gitignore) 

True

In [3]:
def parse_pdf(pdf_file: str) -> List[Dict[str, str]]:

    endpoint = os.environ["DOCUMENT_ENDPOINT"]
    key = os.environ["DOCUMENT_KEY"]
    document_analysis_client = DocumentAnalysisClient(
      endpoint=endpoint, credential=AzureKeyCredential(key))

    if pdf_file.startswith("http"):
      poller = document_analysis_client.begin_analyze_document_from_url(
          "prebuilt-layout", pdf_file)
    else:
      with open(pdf_file, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f)

    result = poller.result()
    # Returns a dict representation of AnalyzeResult.
    result_dict = result.to_dict()
    return result_dict

In [4]:
result_dict = parse_pdf("pdfs/MFC_QPR_2023_Q4_EN.pdf")

In [5]:
print('Result dict keys():\n', result_dict.keys())
print('\nResult dict.documents keys():\n', result_dict.get('pages')[0].keys())
print('\nResult dict.pages keys():\n', result_dict.get('pages')[0].keys())
print('\nResult dict.paragraphs keys():\n', result_dict.get('paragraphs')[0].keys())
print('\nResult dict.tables keys()\n:', result_dict.get('tables')[0].keys())
# print('Result dict.tables keys():', result_dict.get('tables')[0]['cells'][20]['kind'])

Result dict keys():
 dict_keys(['api_version', 'model_id', 'content', 'languages', 'pages', 'paragraphs', 'tables', 'key_value_pairs', 'styles', 'documents'])

Result dict.documents keys():
 dict_keys(['page_number', 'angle', 'width', 'height', 'unit', 'lines', 'words', 'selection_marks', 'spans', 'barcodes', 'formulas'])

Result dict.pages keys():
 dict_keys(['page_number', 'angle', 'width', 'height', 'unit', 'lines', 'words', 'selection_marks', 'spans', 'barcodes', 'formulas'])

Result dict.paragraphs keys():
 dict_keys(['role', 'content', 'bounding_regions', 'spans'])

Result dict.tables keys()
: dict_keys(['row_count', 'column_count', 'cells', 'bounding_regions', 'spans'])


In [6]:
# result_dict.get('pages')
# for i, dict in enumerate(result_dict.get('pages')):
#     print('Page number:', dict.get('page_number'), '\n')
#     for j, line in enumerate(dict.get('lines')):
#         print('Line:', j, line.get('content'))

# for paragraph in result_dict.get('paragraphs'):
#     # print(paragraph)
#     print("\nParagraph:\n", paragraph.get('content'))
#     print("Bounding regions:\n", paragraph.get('bounding_regions'))

In [7]:
def page_text_and_tables(result_dict):

    page_contents = {}
    # tables = []
    # paragraphs = []

    for i, paragraph in enumerate(result_dict.get('paragraphs')):
        # print(paragraph.get('bounding_regions')[0].get('page_number'))
        # print(page_contents.keys())
        if paragraph.get('bounding_regions')[0].get('page_number') in page_contents.keys():
            # print(paragraph.get('bounding_regions')[0].get('page_number'))
            # print(i, page_contents[i])
            page_contents[paragraph.get('bounding_regions')[0].get('page_number')].get('text').append(paragraph.get('content'))
        else:
            page_contents[paragraph.get('bounding_regions')[0].get('page_number')] = \
                {'tables': [], 'text': [paragraph.get('content')]}
            # print(paragraph.get('bounding_regions')[0].get('page_number'))
            # print(page_contents.keys())

    for idx, atable in enumerate(result_dict["tables"]):
        # print(idx)
        # if idx == -1:
        #   continue
        row_count = atable["row_count"]
        column_count = atable["column_count"]
        arr = np.empty((row_count, column_count), dtype=object)
        arr[-1][:] = ""
        for aval in atable["cells"]:
        # Handles complex headers
            if aval["kind"] == "columnHeader":
                arr[-1][aval["column_index"]:aval["column_index"] +
                    aval["column_span"]] += str(aval["content"])
            else:
                arr[aval["row_index"]][aval["column_index"]] = aval["content"]

        df = pd.DataFrame(arr)
        # print(df)
        df.columns = df.iloc[0]
        df = df.drop(df.index[0:2])
        df.reset_index(inplace=True, drop=True)
        df.dropna(inplace=True)
        page_contents[atable.get('bounding_regions')[0].get('page_number')].get('tables').append(df)
        # tables.append(df)

    return page_contents

In [8]:
page_contents = page_text_and_tables(result_dict)

In [9]:
# for page_no, item in page_contents.items():
#     print('Page number:', page_no)
#     print('Number of tables:', len(item['tables']))
#     print('Lenght of text:', len(item['text']))
#     print('Text:', item['text'])
#     if item['tables']:
#         for i, table in enumerate(item['tables']):
#             print('Table:', i+1, 'in page', page_no, '\n', table, '\n')
#     print('')