In [3]:
!pip -q install google-cloud-documentai

In [4]:
from google.cloud import documentai
import pandas as pd

In [5]:
# Configuration for the notebook. The project and processor values look like
# placeholders and presumably must be replaced before running.

# Project identifier; passed to processor_path() below.
PROJECT_ID = "YOUR PROJECT"
# Used both to build the API endpoint host name and in the processor path.
LOCATION = "us"
# Processor identifier; passed to processor_path() below.
PROCESSOR_ID = "YOUR PROCESSOR ID"
# Path of the local file that is read and submitted for processing.
FILE_PATH = "form_with_tables.pdf"
# MIME type declared for the content of FILE_PATH in the RawDocument.
MIME_TYPE = "application/pdf"

In [6]:
# Client options: the endpoint host name is derived from LOCATION.
opts = dict(api_endpoint=f"{LOCATION}-documentai.googleapis.com")

# Service client that later sends the processing request.
documentai_client = documentai.DocumentProcessorServiceClient(
    client_options=opts,
)

# Resource name of the processor, built from the configuration values.
resource_name = documentai_client.processor_path(
    PROJECT_ID,
    LOCATION,
    PROCESSOR_ID,
)

In [8]:
# Read the input file fully, then release the handle right away: nothing
# below needs the file to stay open while the request is in flight.
with open(FILE_PATH, "rb") as image:
    image_content = image.read()

# Wrap the raw bytes together with their declared MIME type.
raw_document = documentai.RawDocument(
    content=image_content,
    mime_type=MIME_TYPE,
)

# Address the request to the processor identified by resource_name.
request = documentai.ProcessRequest(
    name=resource_name,
    raw_document=raw_document,
)

# `result.document` is what the table-extraction cell reads afterwards.
result = documentai_client.process_document(request=request)

In [9]:
def text_anchor_to_text(text_anchor, text):
    """
    Resolve a text anchor to the string it points at.

    Each entry of ``text_anchor.text_segments`` carries a ``start_index`` and
    an ``end_index`` into ``text``; the referenced slices are concatenated in
    segment order. Content that spans several lines may arrive as more than
    one segment.

    Args:
        text_anchor: Object exposing an iterable ``text_segments`` attribute.
        text: The full text that the segment offsets refer to.

    Returns:
        The concatenated slices with leading/trailing whitespace stripped and
        every remaining newline replaced by a single space.
    """
    pieces = (
        text[int(segment.start_index):int(segment.end_index)]
        for segment in text_anchor.text_segments
    )
    return "".join(pieces).strip().replace("\n", " ")

In [16]:
def get_table_data(rows, text):
    """
    Extract the cell texts of a sequence of table rows.

    Args:
        rows: Iterable of row objects, each exposing a ``cells`` iterable
            whose items have a ``layout.text_anchor``.
        text: The full text that the cells' text anchors index into.

    Returns:
        A list with one inner list per row, holding that row's cell strings
        in cell order, as produced by ``text_anchor_to_text``.
    """
    return [
        [
            text_anchor_to_text(cell.layout.text_anchor, text)
            for cell in row.cells
        ]
        for row in rows
    ]

In [17]:
document = result.document

header_row_values = []
body_row_values = []

# Write every detected table to its own CSV file. Using a single fixed file
# name would make each table overwrite the previous one, leaving only the
# last table of the last page on disk.
for page_number, page in enumerate(document.pages, start=1):
    for table_number, table in enumerate(page.tables, start=1):
        header_row_values = get_table_data(table.header_rows, document.text)
        body_row_values = get_table_data(table.body_rows, document.text)

        # MultiIndex.from_arrays() rejects an empty list, so only build the
        # column index when the table actually has header rows.
        has_header = bool(header_row_values)
        columns = (
            pd.MultiIndex.from_arrays(header_row_values) if has_header else None
        )

        df = pd.DataFrame(data=body_row_values, columns=columns)

        # Without header rows the frame only has auto-generated integer
        # column labels; leave those out of the file.
        df.to_csv(
            f"output_page{page_number}_table{table_number}.csv",
            index=False,
            header=has_header,
        )