Merge pull request #429 from microsoft/geearl/6323-large-tables
Geearl/6323 large tables
georearl committed Jan 20, 2024
2 parents a1a0b02 + 6934088 commit aa712e3
Showing 2 changed files with 219 additions and 47 deletions.
2 changes: 1 addition & 1 deletion functions/requirements.txt
@@ -18,4 +18,4 @@ azure-ai-vision == 0.15.1b1
unstructured[csv,doc,docx,email,html,md,msg,ppt,pptx,text,xlsx,xml] == 0.10.27
pyoo == 1.4
azure-search-documents == 11.4.0b11

beautifulsoup4 == 4.12.2
264 changes: 218 additions & 46 deletions functions/shared_code/utilities.py
@@ -15,6 +15,7 @@
import nltk
# Try to download using nltk.download
nltk.download('punkt')
from bs4 import BeautifulSoup

punkt_dir = os.path.join(nltk.data.path[0], 'tokenizers/punkt')

@@ -62,6 +63,7 @@ class MediaType:
MEDIA = "media"

class Utilities:

""" Class to hold utility functions """
def __init__(self,
azure_blob_storage_account,
@@ -105,30 +107,97 @@ def get_blob_and_sas(self, blob_path):
""" Function to retrieve the uri and sas token for a given blob in azure storage"""
return self.utilities_helper.get_blob_and_sas(blob_path)

# def table_to_html(self, table):
# """ Function to take an output FR table json structure and convert to HTML """
# header_processing_complete = False
# table_html = "<table>"
# rows = [sorted([cell for cell in table["cells"] if cell["rowIndex"] == i],
# key=lambda cell: cell["columnIndex"]) for i in range(table["rowCount"])]
# for row_cells in rows:
# is_row_a_header = False
# row_html = "<tr>"
# for cell in row_cells:
# tag = "td"
# #if hasattr(cell, 'kind'):
# if 'kind' in cell:
# if (cell["kind"] == "columnHeader" or cell["kind"] == "rowHeader"):
# tag = "th"
# if (cell["kind"] == "columnHeader"):
# is_row_a_header = True
# else:
# # we have encountered a cell that isn't tagged as a header,
# # so assume we have now reached regular table cells
# header_processing_complete = True
# cell_spans = ""
# #if hasattr(cell, 'columnSpan'):
# if 'columnSpan' in cell:
# if cell["columnSpan"] > 1:
# cell_spans += f" colSpan={cell['columnSpan']}"
# #if hasattr(cell, 'rowSpan'):
# if 'rowSpan' in cell:
# if cell["rowSpan"] > 1:
# cell_spans += f" rowSpan={cell['rowSpan']}"
# row_html += f"<{tag}{cell_spans}>{html.escape(cell['content'])}</{tag}>"
# row_html += "</tr>"

# if is_row_a_header and header_processing_complete == False:
# row_html = "<thead>" + row_html + "</thead>"
# table_html += row_html
# table_html += "</table>"
# return table_html






def table_to_html(self, table):
""" Function to take an output FR table json structure and convert to HTML """
table_html = "<table>"
rows = [sorted([cell for cell in table["cells"] if cell["rowIndex"] == i],
key=lambda cell: cell["columnIndex"]) for i in range(table["rowCount"])]
thead_open_added = False
thead_closed_added = False

for i, row_cells in enumerate(rows):
is_row_a_header = False
row_html = "<tr>"
for cell in row_cells:
tag = "td"
if 'kind' in cell:
if (cell["kind"] == "columnHeader" or cell["kind"] == "rowHeader"):
tag = "th"
if (cell["kind"] == "columnHeader"):
is_row_a_header = True
cell_spans = ""
if 'columnSpan' in cell:
if cell["columnSpan"] > 1:
cell_spans += f" colSpan={cell['columnSpan']}"
if 'rowSpan' in cell:
if cell["rowSpan"] > 1:
cell_spans += f" rowSpan={cell['rowSpan']}"
row_html += f"<{tag}{cell_spans}>{html.escape(cell['content'])}</{tag}>"
row_html += "</tr>"

# add the opening thead if this is the first row and the first header row encountered
if is_row_a_header and i == 0 and not thead_open_added:
row_html = "<thead>" + row_html
thead_open_added = True

# add the closing thead if we have added an opening thead and if this is not a header row
if not is_row_a_header and thead_open_added and not thead_closed_added:
row_html = "</thead>" + row_html
thead_closed_added = True

table_html += row_html
table_html += "</table>"
return table_html
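
As a hedged illustration of the rewritten function (not part of this diff), here is a minimal Form Recognizer style table dict and the HTML shape the new loop should produce; the field names rowCount, cells, rowIndex, columnIndex, kind, and content match the structure the code above reads:

```python
# Illustrative only: a 2x2 table with one columnHeader row, fed through table_to_html.
sample_table = {
    "rowCount": 2,
    "cells": [
        {"rowIndex": 0, "columnIndex": 0, "kind": "columnHeader", "content": "Name"},
        {"rowIndex": 0, "columnIndex": 1, "kind": "columnHeader", "content": "Qty"},
        {"rowIndex": 1, "columnIndex": 0, "content": "Widget"},
        {"rowIndex": 1, "columnIndex": 1, "content": "3"},
    ],
}

# Assuming `utilities` is a constructed Utilities instance:
# utilities.table_to_html(sample_table)
# -> '<table><thead><tr><th>Name</th><th>Qty</th></tr></thead>'
#    '<tr><td>Widget</td><td>3</td></tr></table>'
# i.e. the header row is now wrapped in <thead>, which chunk_table_with_headers relies on.
```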






def build_document_map_pdf(self, myblob_name, myblob_uri, result, azure_blob_log_storage_container, enable_dev_code):
""" Function to build a json structure representing the paragraphs in a document,
including metadata such as section heading, title, page number, etc.
@@ -313,7 +382,58 @@ def build_chunk_filepath (self, file_directory, file_name, file_extension, file_number):
folder_set = file_directory + file_name + file_extension + "/"
output_filename = file_name + f'-{file_number}' + '.json'
return f'{folder_set}{output_filename}'


previous_table_header = ""

def chunk_table_with_headers(self, prefix_text, table_html, standard_chunk_target_size,
previous_paragraph_element_is_a_table):
soup = BeautifulSoup(table_html, 'html.parser')
# soup.find() returns None when the table has no <thead>; fall back to an empty string so the checks below behave as intended
thead_tag = soup.find('thead')
thead = str(thead_tag) if thead_tag else ""

# check if this table is a continuation of a table on a previous page.
# If yes then apply the header row from the previous table
if previous_paragraph_element_is_a_table:
if thead != "":
# update thead to include the main table header
thead = thead.replace("<thead>", "<thead>"+self.previous_table_header)

else:
# just use the previous thead
thead = "<thead>"+self.previous_table_header+"</thead>"

def add_current_table_chunk(chunk):
# Close the table tag for the current chunk and add it to the chunks list
if chunk.strip() and not chunk.endswith("<table>"):
chunk = '<table>' + chunk + '</table>'
chunks.append(chunk)
# Start a new chunk with header if it exists

# Initialize chunks list
chunks = []
current_chunk = prefix_text
# set the target size of the first chunk
chunk_target_size = standard_chunk_target_size - self.token_count(prefix_text)
rows = soup.find_all('tr')
# Filter out rows that are part of thead block
filtered_rows = [row for row in rows if row.parent.name != "thead"]

for i, row in enumerate(filtered_rows):
row_html = str(row)

# If adding this row to the current chunk exceeds the target size, start a new chunk
if self.token_count(current_chunk + row_html) > chunk_target_size:
add_current_table_chunk(current_chunk)
current_chunk = thead
chunk_target_size = standard_chunk_target_size

# Add the current row to the chunk
current_chunk += row_html

# Add the final chunk if there's any content left
add_current_table_chunk(current_chunk)

return chunks
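
A usage sketch of the new helper follows; the table string and token budget are invented for illustration, and `utilities` stands in for a constructed Utilities instance:

```python
# Illustrative only: a table far larger than one chunk, with a <thead> produced by table_to_html.
table_html = (
    "<table><thead><tr><th>Col A</th><th>Col B</th></tr></thead>"
    + "".join(f"<tr><td>row {i}</td><td>value {i}</td></tr>" for i in range(200))
    + "</table>"
)

# chunks = utilities.chunk_table_with_headers(
#     prefix_text="",                            # text already accumulated in the current chunk
#     table_html=table_html,
#     standard_chunk_target_size=500,            # hypothetical token budget
#     previous_paragraph_element_is_a_table=False,
# )
# Each returned chunk is wrapped in <table>...</table>, and every chunk after the first
# starts with the <thead> block, so sliced table rows keep their column headings.
```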

def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size):
""" Function to build chunk outputs based on the document map """

@@ -339,47 +459,83 @@ def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size)
#if the collected tokens in the current in-memory chunk + the next paragraph
# will be larger than the allowed chunk size prepare to write out the total chunk
if (chunk_size + paragraph_size >= chunk_target_size) or section_name != previous_section_name or title_name != previous_title_name or subtitle_name != previous_subtitle_name:

# If the current paragraph just by itself is larger than CHUNK_TARGET_SIZE,
# then we need to split this up and treat each slice as a new in-memory chunk
# that fall under the max size and ensure the first chunk,
# which will be added to the current
if paragraph_size >= chunk_target_size:

# We will process tables and regular text differently as text can be split by sentence boundaries,
# but tables fail as the code sees a table as a single sentence.
# We need a special way of splitting a table that is greater than
# our target chunk size
if paragraph_element["type"] == "table":
# table processing & splitting
table_chunks = self.chunk_table_with_headers(chunk_text,
paragraph_text,
chunk_target_size,
previous_paragraph_element_is_a_table)

for i, table_chunk in enumerate(table_chunks):

# write out each table chunk, apart from the last, as each will be less than or
# equal to CHUNK_TARGET_SIZE; the last chunk will be processed like
# a regular paragraph
if i < len(table_chunks) - 1:
self.write_chunk(myblob_name, myblob_uri,
f"{file_number}.{i}",
self.token_count(table_chunk),
table_chunk, page_list,
previous_section_name, previous_title_name, previous_subtitle_name,
MediaType.TEXT)
chunk_count += 1
else:
# Reset the paragraph token count to just the tokens left in the last
# chunk and leave the remaining text from the large paragraph to be
# combined with the next in the outer loop
paragraph_size = self.token_count(table_chunk)
paragraph_text = table_chunk
chunk_text = ''
file_number += 1

else:
# text processing & splitting
# start by keeping the existing in-memory chunk in front of the large paragraph
# and begin to process it on sentence boundaries to break it down into
# sub-chunks that are below the CHUNK_TARGET_SIZE
sentences = sent_tokenize(chunk_text + paragraph_text)
chunks = []
chunk = ""
for sentence in sentences:
temp_chunk = chunk + " " + sentence if chunk else sentence
if self.token_count(temp_chunk) <= chunk_target_size:
chunk = temp_chunk
else:
chunks.append(chunk)
chunk = sentence
if chunk:
chunks.append(chunk)

# Now write out each chunk, apart from the last, as each will be less than or
# equal to CHUNK_TARGET_SIZE; the last chunk will be processed like
# a regular paragraph
for i, chunk_text_p in enumerate(chunks):
if i < len(chunks) - 1:
# Process all but the last chunk in this large para
self.write_chunk(myblob_name, myblob_uri,
f"{file_number}.{i}",
self.token_count(chunk_text_p),
chunk_text_p, page_list,
previous_section_name, previous_title_name, previous_subtitle_name,
MediaType.TEXT)
chunk_count += 1
else:
# Reset the paragraph token count to just the tokens left in the last
# chunk and leave the remaining text from the large paragraph to be
# combined with the next in the outer loop
paragraph_size = self.token_count(chunk_text_p)
paragraph_text = chunk_text_p
chunk_text = ''
file_number += 1
else:
# if this para is not large by itself but will put us over the max token count
# or it is a new section, then write out the chunk text we have to this point
Expand All @@ -404,7 +560,23 @@ def build_chunks(self, document_map, myblob_name, myblob_uri, chunk_target_size)
# add paragraph to the chunk
chunk_size = chunk_size + paragraph_size
chunk_text = chunk_text + "\n" + paragraph_text


# store the type of paragraph and content, to be used if a table crosses page boundaries and
# we need to apply the column headings to subsequent pages

if paragraph_element["type"] == "table":
previous_paragraph_element_is_a_table = True
if self.previous_table_header == "":
# Stash the current table's heading to apply to subsequent page tables if they are missing column headings,
# but only for the first page of the multi-page table
soup = BeautifulSoup(paragraph_text, 'html.parser')
# Extract the thead and strip the thead wrapper to just leave the header cells
thead_tag = soup.find('thead')
self.previous_table_header = str(thead_tag).replace("<thead>", "").replace("</thead>", "") if thead_tag else ""
else:
previous_paragraph_element_is_a_table = False
self.previous_table_header = ""

# If this is the last paragraph then write the chunk
if index == len(document_map['structure'])-1:
self.write_chunk(myblob_name, myblob_uri, file_number, chunk_size,
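
For readers following the multi-page table handling above, here is a minimal sketch of the thead stash that build_chunks performs; the table string is illustrative, and beautifulsoup4 is the dependency this PR adds to requirements.txt:

```python
from bs4 import BeautifulSoup

# Illustrative output of table_to_html for the first page of a multi-page table.
page_one_table = (
    "<table><thead><tr><th>Item</th><th>Total</th></tr></thead>"
    "<tr><td>Alpha</td><td>10</td></tr></table>"
)

# Mirror of the stashing logic above: keep only the header cells, stripped of the
# <thead> wrapper, so they can be re-applied to continuation tables on later pages.
soup = BeautifulSoup(page_one_table, "html.parser")
previous_table_header = str(soup.find("thead")).replace("<thead>", "").replace("</thead>", "")
print(previous_table_header)  # <tr><th>Item</th><th>Total</th></tr>
```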
