Merge pull request #429 from microsoft/geearl/6323-large-tables
Geearl/6323 large tables
georearl committed Jan 20, 2024
2 parents a1a0b02 + 6934088 commit aa712e3
Showing 2 changed files with 219 additions and 47 deletions.
2 changes: 1 addition & 1 deletion functions/requirements.txt
@@ -18,4 +18,4 @@ azure-ai-vision == 0.15.1b1
unstructured[csv,doc,docx,email,html,md,msg,ppt,pptx,text,xlsx,xml] == 0.10.27
pyoo == 1.4
azure-search-documents == 11.4.0b11

beautifulsoup4 == 4.12.2
264 changes: 218 additions & 46 deletions functions/shared_code/utilities.py
@@ -15,6 +15,7 @@
import nltk
# Try to download using nltk.download
nltk.download('punkt')
from bs4 import BeautifulSoup

punkt_dir = os.path.join(nltk.data.path[0], 'tokenizers/punkt')

@@ -62,6 +63,7 @@ class MediaType:
MEDIA = "media"

class Utilities:

""" Class to hold utility functions """
def __init__(self,
azure_blob_storage_account,
@@ -105,30 +107,97 @@ def get_blob_and_sas(self, blob_path):
""" Function to retrieve the uri and sas token for a given blob in azure storage"""
return self.utilities_helper.get_blob_and_sas(blob_path)

# def table_to_html(self, table):
# """ Function to take an output FR table json structure and convert to HTML """
# header_processing_complete = False
# table_html = "<table>"
# rows = [sorted([cell for cell in table["cells"] if cell["rowIndex"] == i],
# key=lambda cell: cell["columnIndex"]) for i in range(table["rowCount"])]
# for row_cells in rows:
# is_row_a_header = False
# row_html = "<tr>"
# for cell in row_cells:
# tag = "td"
# #if hasattr(cell, 'kind'):
# if 'kind' in cell:
# if (cell["kind"] == "columnHeader" or cell["kind"] == "rowHeader"):
# tag = "th"
# if (cell["kind"] == "columnHeader"):
# is_row_a_header = True
# else:
# # we have encountered a cell that isn't tagged as a header,
# # so assume we have now reached regular table cells
# header_processing_complete = True
# cell_spans = ""
# #if hasattr(cell, 'columnSpan'):
# if 'columnSpan' in cell:
# if cell["columnSpan"] > 1:
# cell_spans += f" colSpan={cell['columnSpan']}"
# #if hasattr(cell, 'rowSpan'):
# if 'rowSpan' in cell:
# if cell["rowSpan"] > 1:
# cell_spans += f" rowSpan={cell['rowSpan']}"
# row_html += f"<{tag}{cell_spans}>{html.escape(cell['content'])}</{tag}>"
# row_html += "</tr>"

# if is_row_a_header and header_processing_complete == False:
# row_html = "<thead>" + row_html + "</thead>"
# table_html += row_html
# table_html += "</table>"
# return table_html






def table_to_html(self, table):
""" Function to take an output FR table json structure and convert to HTML """
table_html = "<table>"
rows = [sorted([cell for cell in table["cells"] if cell["rowIndex"] == i],
key=lambda cell: cell["columnIndex"]) for i in range(table["rowCount"])]
thead_open_added = False
thead_closed_added = False

for i, row_cells in enumerate(rows):
is_row_a_header = False
row_html = "<tr>"
for cell in row_cells:
tag = "td"
if 'kind' in cell:
if (cell["kind"] == "columnHeader" or cell["kind"] == "rowHeader"):
tag = "th"
if (cell["kind"] == "columnHeader"):
is_row_a_header = True
cell_spans = ""
if 'columnSpan' in cell:
if cell["columnSpan"] > 1:
cell_spans += f" colSpan={cell['columnSpan']}"
if 'rowSpan' in cell:
if cell["rowSpan"] > 1:
cell_spans += f" rowSpan={cell['rowSpan']}"
row_html += f"<{tag}{cell_spans}>{html.escape(cell['content'])}</{tag}>"
row_html += "</tr>"

# add the opening thead if this is the first row and the first header row encountered
if is_row_a_header and i == 0 and not thead_open_added:
row_html = "<thead>" + row_html
thead_open_added = True

# add the closing thead if we have added an opening thead and if this is not a header row
if not is_row_a_header and thead_open_added and not thead_closed_added:
row_html = "</thead>" + row_html
thead_closed_added = True

table_html += row_html
table_html += "</table>"
return table_html
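
As a hedged illustration of the rewritten function (not part of this diff), here is a minimal Form Recognizer style table dict and the HTML shape the new loop should produce; the field names rowCount, cells, rowIndex, columnIndex, kind, and content match the structure the code above reads:

```python
# Illustrative only: a 2x2 table with one columnHeader row, fed through table_to_html.
sample_table = {
    "rowCount": 2,
    "cells": [
        {"rowIndex": 0, "columnIndex": 0, "kind": "columnHeader", "content": "Name"},
        {"rowIndex": 0, "columnIndex": 1, "kind": "columnHeader", "content": "Qty"},
        {"rowIndex": 1, "columnIndex": 0, "content": "Widget"},
        {"rowIndex": 1, "columnIndex": 1, "content": "3"},
    ],
}

# Assuming `utilities` is a constructed Utilities instance:
# utilities.table_to_html(sample_table)
# -> '<table><thead><tr><th>Name</th><th>Qty</th></tr></thead>'
#    '<tr><td>Widget</td><td>3</td></tr></table>'
# i.e. the header row is now wrapped in <thead>, which chunk_table_with_headers relies on.
```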






def build_document_map_pdf(self, myblob_name, myblob_uri, result, azure_blob_log_storage_container, enable_dev_code):
""" Function to build a json structure representing the paragraphs in a document,
including metadata such as section heading, title, page number, etc.
@@ -313,7 +382,58 @@ def build_chunk_filepath (self, file_directory, file_name, file_extension, file_number):
folder_set = file_directory + file_name + file_extension + "/"
output_filename = file_name + f'-{file_number}' + '.json'
return f'{folder_set}{output_filename}'


previous_table_header = ""

def chunk_table_with_headers(self, prefix_text, table_html, standard_chunk_target_size,
previous_paragraph_element_is_a_table):
soup = BeautifulSoup(table_html, 'html.parser')
# soup.find() returns None when the table has no <thead>; fall back to an empty string so the checks below behave as intended
thead_tag = soup.find('thead')
thead = str(thead_tag) if thead_tag else ""

# check if this table is a continuation of a table on a previous page.
# If yes then apply the header row from the previous table
if previous_paragraph_element_is_a_table:
if thead != "":
# update thead to include the main table header
thead = thead.replace("<thead>", "<thead>"+self.previous_table_header)

else:
# just use the previous thead
thead = "<thead>"+self.previous_table_header+"</thead>"

def add_current_table_chunk(chunk):
# Close the table tag for the current chunk and add it to the chunks list
if chunk.strip() and not chunk.endswith("<table>"):
chunk = '<table>' + chunk + '</table>'
chunks.append(chunk)
# Start a new chunk with header if it exists

# Initialize chunks list
chunks = []
current_chunk = prefix_text
# set the target size of the first chunk
chunk_target_size = standard_chunk_target_size - self.token_count(prefix_text)
rows = soup.find_all('tr')
# Filter out rows that are part of thead block
filtered_rows = [row for row in rows if row.parent.name != "thead"]

for i, row in enumerate(filtered_rows):
row_html = str(row)

# If adding this row to the current chunk exceeds the target size, start a new chunk
if self.token_count(current_chunk + row_html) > chunk_target_size:
add_current_table_chunk(current_chunk)
current_chunk = thead
chunk_target_size = standard_chunk_target_size

# Add the current row to the chunk
current_chunk += row_html

# Add the final chunk if there's any content left
add_current_table_chunk(current_chunk)

return chunks
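
A usage sketch of the new helper follows; the table string and token budget are invented for illustration, and `utilities` stands in for a constructed Utilities instance:

```python
# Illustrative only: a table far larger than one chunk, with a <thead> produced by table_to_html.
table_html = (
    "<table><thead><tr><th>Col A</th><th>Col B</th></tr></thead>"
    + "".join(f"<tr><td>row {i}</td><td>value {i}</td></tr>" for i in range(200))
    + "</table>"
)

# chunks = utilities.chunk_table_with_headers(
#     prefix_text="",                            # text already accumulated in the current chunk
#     table_html=table_html,
#     standard_chunk_target_size=500,            # hypothetical token budget
#     previous_paragraph_element_is_a_table=False,
# )
# Each returned chunk is wrapped in <table>...</table>, and every chunk after the first
# starts with the <thead> block, so sliced table rows keep their column headings.
```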

def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size):
""" Function to build chunk outputs based on the document map """

@@ -339,47 +459,83 @@ def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size)
#if the collected tokens in the current in-memory chunk + the next paragraph
# will be larger than the allowed chunk size prepare to write out the total chunk
if (chunk_size + paragraph_size >= chunk_target_size) or section_name != previous_section_name or title_name != previous_title_name or subtitle_name != previous_subtitle_name:

# If the current paragraph just by itself is larger than CHUNK_TARGET_SIZE,
# then we need to split this up and treat each slice as a new in-memory chunk
# that fall under the max size and ensure the first chunk,
# which will be added to the current
if paragraph_size >= chunk_target_size:

# We will process tables and regular text differently as text can be split by sentence boundaries,
# but tables fail as the code sees a table as a single sentence.
# We need a special way of splitting a table that is greater than
# our target chunk size
if paragraph_element["type"] == "table":
# table processing & splitting
table_chunks = self.chunk_table_with_headers(chunk_text,
paragraph_text,
chunk_target_size,
previous_paragraph_element_is_a_table)

for i, table_chunk in enumerate(table_chunks):

# write out each table chunk, apart from the last, as each will be less than or
# equal to CHUNK_TARGET_SIZE; the last chunk will be processed like
# a regular paragraph
if i < len(table_chunks) - 1:
self.write_chunk(myblob_name, myblob_uri,
f"{file_number}.{i}",
self.token_count(table_chunk),
table_chunk, page_list,
previous_section_name, previous_title_name, previous_subtitle_name,
MediaType.TEXT)
chunk_count += 1
else:
# Reset the paragraph token count to just the tokens left in the last
# chunk and leave the remaining text from the large paragraph to be
# combined with the next in the outer loop
paragraph_size = self.token_count(table_chunk)
paragraph_text = table_chunk
chunk_text = ''
file_number += 1

else:
# text processing & splitting
# start by keeping the existing in-memory chunk in front of the large paragraph
# and begin to process it on sentence boundaries to break it down into
# sub-chunks that are below the CHUNK_TARGET_SIZE
sentences = sent_tokenize(chunk_text + paragraph_text)
chunks = []
chunk = ""
for sentence in sentences:
temp_chunk = chunk + " " + sentence if chunk else sentence
if self.token_count(temp_chunk) <= chunk_target_size:
chunk = temp_chunk
else:
chunks.append(chunk)
chunk = sentence
if chunk:
chunks.append(chunk)

# Now write out each chunk, apart from the last, as each will be less than or
# equal to CHUNK_TARGET_SIZE; the last chunk will be processed like
# a regular paragraph
for i, chunk_text_p in enumerate(chunks):
if i < len(chunks) - 1:
# Process all but the last chunk in this large para
self.write_chunk(myblob_name, myblob_uri,
f"{file_number}.{i}",
self.token_count(chunk_text_p),
chunk_text_p, page_list,
previous_section_name, previous_title_name, previous_subtitle_name,
MediaType.TEXT)
chunk_count += 1
else:
# Reset the paragraph token count to just the tokens left in the last
# chunk and leave the remaining text from the large paragraph to be
# combined with the next in the outer loop
paragraph_size = self.token_count(chunk_text_p)
paragraph_text = chunk_text_p
chunk_text = ''
file_number += 1
else:
# if this para is not large by itself but will put us over the max token count
# or it is a new section, then write out the chunk text we have to this point
Expand All @@ -404,7 +560,23 @@ def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size)
# add paragraph to the chunk
chunk_size = chunk_size + paragraph_size
chunk_text = chunk_text + "\n" + paragraph_text


# store the type of paragraph and content, to be used if a table crosses page boundaries and
# we need to apply the column headings to subsequent pages

if paragraph_element["type"] == "table":
previous_paragraph_element_is_a_table = True
if self.previous_table_header == "":
# Stash the current table's heading to apply to subsequent page tables if they are missing column headings,
# but only for the first page of the multi-page table
soup = BeautifulSoup(paragraph_text, 'html.parser')
# Extract the thead and strip the thead wrapper to just leave the header cells
thead_tag = soup.find('thead')
self.previous_table_header = str(thead_tag).replace("<thead>", "").replace("</thead>", "") if thead_tag else ""
else:
previous_paragraph_element_is_a_table = False
self.previous_table_header = ""

# If this is the last paragraph then write the chunk
if index == len(document_map['structure'])-1:
self.write_chunk(myblob_name, myblob_uri, file_number, chunk_size,
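
For readers following the multi-page table handling above, here is a minimal sketch of the thead stash that build_chunks performs; the table string is illustrative, and beautifulsoup4 is the dependency this PR adds to requirements.txt:

```python
from bs4 import BeautifulSoup

# Illustrative output of table_to_html for the first page of a multi-page table.
page_one_table = (
    "<table><thead><tr><th>Item</th><th>Total</th></tr></thead>"
    "<tr><td>Alpha</td><td>10</td></tr></table>"
)

# Mirror of the stashing logic above: keep only the header cells, stripped of the
# <thead> wrapper, so they can be re-applied to continuation tables on later pages.
soup = BeautifulSoup(page_one_table, "html.parser")
previous_table_header = str(soup.find("thead")).replace("<thead>", "").replace("</thead>", "")
print(previous_table_header)  # <tr><th>Item</th><th>Total</th></tr>
```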
