In [2]:
import os
from getpass import getpass
from uuid import uuid4
from typing import Optional, Union, List, Tuple
from pathlib import Path

from pydantic import BaseModel
import openparse
from openparse.schemas import TableElement, TextElement, ImageElement, LineElement
import pdfplumber
from IPython.display import HTML, display
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


class TextIOU(BaseModel):
    """A text element paired with a table and the overlap score between the two."""

    # Text element being considered as a possible description of the table.
    element: TextElement
    # Table the text element was scored against.
    target_table: TableElement
    # Overlap score for the (element, target_table) pair.
    iou: float

    @property
    def text(self) -> str:
        """Return the text of the wrapped text element."""
        return self.element.text

def get_bbox(element: Union[TableElement, TextElement]) -> Tuple[float, float, float, float]:
    """Return the element's bounding box as an ``(x0, y0, x1, y1)`` tuple.

    Args:
        element: Any object exposing a ``bbox`` with ``x0``, ``y0``, ``x1``
            and ``y1`` attributes.

    Returns:
        The four box coordinates, in ``(x0, y0, x1, y1)`` order.
    """
    box = element.bbox
    return (box.x0, box.y0, box.x1, box.y1)

def calc_text_iou(table_element: TableElement, text_element: TextElement, y_offsets: Tuple[float, float]) -> float:
    """Return how much of the text box is covered by the vertically padded table box.

    Despite the name, this is not a symmetric intersection-over-union: the
    intersection area is divided by the text element's own area, so a text
    box lying entirely inside the padded table box scores ``1.0``.

    Args:
        table_element: Element whose ``bbox`` is the reference box.
        text_element: Element whose ``bbox`` is scored against the table.
        y_offsets: ``(before, after)`` padding; ``before`` is subtracted from
            the table's ``y0`` and ``after`` is added to its ``y1``.

    Returns:
        ``intersection_area / text_area``, or ``0.0`` when the boxes do not
        overlap or the text box has no area.
    """
    table_box = table_element.bbox
    text_box = text_element.bbox

    # Stretch the table box vertically so nearby text can overlap it.
    table_y0 = table_box.y0 - y_offsets[0]
    table_y1 = table_box.y1 + y_offsets[1]

    left = max(table_box.x0, text_box.x0)
    top = max(table_y0, text_box.y0)
    right = min(table_box.x1, text_box.x1)
    bottom = min(table_y1, text_box.y1)

    if right < left or bottom < top:
        return 0.0

    text_area = (text_box.x1 - text_box.x0) * (text_box.y1 - text_box.y0)
    if text_area <= 0:
        # A zero-width or zero-height text box passes the check above when it
        # touches the table; without this guard it would divide by zero.
        return 0.0

    return (right - left) * (bottom - top) / text_area

def find_description_by_iou(candidates: List[TextIOU]) -> str:
    """Pick the most likely table description from the scored candidates.

    Candidates whose text starts with "table" (case-insensitive) are
    preferred; among them the highest score wins. If no candidate looks like
    a caption, the highest-scoring candidate overall is used.

    Args:
        candidates: Scored text candidates, each exposing ``text`` and ``iou``.

    Returns:
        The chosen text with newlines replaced by spaces, or ``"Not found"``
        when ``candidates`` is empty.
    """
    if not candidates:
        return "Not found"

    table_descriptions = [
        candidate for candidate in candidates if candidate.text.lower().startswith("table")
    ]

    if table_descriptions:
        # Scores are often tied (e.g. several captions fully inside the padded
        # box). Iterating in reverse makes max() keep the last tied caption.
        best = max(reversed(table_descriptions), key=lambda candidate: candidate.iou)
    else:
        # max() keeps the first of several tied candidates.
        best = max(candidates, key=lambda candidate: candidate.iou)

    return best.text.replace("\n", " ")

def refine(table_html: str, window_context: List[str], model_name: str = "gpt-4o") -> str:
    """Ask a chat model to correct a parsed HTML table using surrounding text.

    Args:
        table_html: HTML of the table to correct.
        window_context: Text snippets surrounding the table; they are joined
            with blank lines and passed to the model as context.
        model_name: OpenAI chat model to use. Defaults to ``"gpt-4o"``.

    Returns:
        The model's reply as a string (the prompt asks for HTML only).
    """
    template = """# Role
You're an expert at refining and modifying HTML tables.
Given a table and its surrounding contextual information, you need to fix and correct what is wrong or incorrect about the table's columns and data.
Do not add data by referencing and inferring from the contextual information. The number of rows does not change.
If a description of the table exists in the contextual information, add it as a <h2> tag.
Your answers are formatted as HTML code only (e.g., <html> ... </html>).
# TABLE
{table_html}

# CONTEXT
{context}

html:"""

    prompt = PromptTemplate.from_template(template)
    llm = ChatOpenAI(model_name=model_name)
    output_parser = StrOutputParser()

    # prompt -> chat model -> plain string
    chain = prompt | llm | output_parser
    return chain.invoke({"table_html": table_html, "context": "\n\n".join(window_context)})

In [3]:
# Do not hardcode the API key in the notebook: reuse the key already set in
# the environment and only prompt (without echoing) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Input/output locations, relative to the notebook's working directory.
PDF_PATH = "../data/pdf"
IMAGE_PATH = "../data/images"
pdf_path = Path(PDF_PATH)
image_path = Path(IMAGE_PATH)

In [4]:
# Test PDF file (https://arxiv.org/pdf/2410.10315v1)
filepath = pdf_path.joinpath(
    "EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf"
)

In [5]:
# Table-extraction settings passed to openparse: the "unitable" parsing
# algorithm with a minimum table confidence of 0.8.
table_args = dict(
    parsing_algorithm="unitable",
    min_table_confidence=0.8,
)

# Parse the test PDF once; later cells read `parsed_basic_doc.nodes`.
parser = openparse.DocumentParser(table_args=table_args)
parsed_basic_doc = parser.parse(filepath)

  from .autonotebook import tqdm as notebook_tqdm


Finished loading models. Ready for inference.


In [6]:
# Flatten the parsed nodes into two lists, one per element type, keeping
# document order within each list.
table_elements = [
    element
    for node in parsed_basic_doc.nodes
    for element in node.elements
    if isinstance(element, TableElement)
]
text_elements = [
    element
    for node in parsed_basic_doc.nodes
    for element in node.elements
    if isinstance(element, TextElement)
]

In [7]:
# Vertical padding applied to each table box before scoring: the first value
# is subtracted from the table's y0, the second is added to its y1.
Y_OFFSETS = (50, 400)
# Minimum score a text element needs to be kept as a description candidate.
MIN_IOU = 0.1

for table_element in table_elements:
    # Only text on the same page as the table is considered.
    lookup_elements = [text for text in text_elements if text.page == table_element.page]

    # Find the table description by scoring each TextElement against the TableElement.
    scored_texts = (
        (text_element, calc_text_iou(table_element, text_element, Y_OFFSETS))
        for text_element in lookup_elements
    )
    candidates = [
        TextIOU(element=text_element, target_table=table_element, iou=text_iou)
        for text_element, text_iou in scored_texts
        if text_iou >= MIN_IOU
    ]

    description = find_description_by_iou(candidates)

    print(description)
    display(HTML(table_element.text.replace("[html]", "")))


Not found


id,data,chunk,coarse ranking,re - ranking,fusion,accuracy
0,@,1024.5,"( j ), 3",-,-,57.86
1,( θ ),1024.5,"Q, 8",-,-,68.59
2,@,1024.5,"( 3 ), 8",-,-,69.55
3,@,1024.5,"Q, 192","Q, 8",-,73.73
4,@,1024.5,"Q, 256","Q, 8",-,70.68
5,( θ ),1024.5,"( 2 ), 192","Q, 8",-,69.25
6,( θ ),1024.5,"( T ), 288","(≥), 8",-,77.07
7,Q,1024.5,"Q, 288","(≥), 8",-,77.51
8,( 2 ),1024.5,"( T ), 288","(≥), 8",-,77.92
9,( 2 ),1024.5,"( 1 ), 256","(≥), 8",-,78.49


Table 2: Semi-final experimental results. In the ’Chunk’ column, the two numbers represent chunk_size and  chunk_overlap, respectively. In the ’Coarse Ranking’ and ’Re-ranking’ columns, multiple search paths are separated  by spaces, and within each search path, the components separated by commas represent the retrieval/sorting method,  top-k, and the type of document expansion (if no expansion is applied, it is not listed). 


id,data,chunk,coarse ranking,re - ranking,fusion,image,answer merge,accuracy
0,( 2 ),960.2,3.192,( 3.6,-,-,-,91.53
1,( 2 ),960.2,"( 4 ), 288","( 3 ), 6",-,-,-,88.4
2,( 2 ),960.2,"( 4 ), 288 ( 3 ), 192",( 3.6,(≥),-,-,90.0
3,( 3 ),1024.2,"( 3 ), 192",( 3.6,-,-,-,90.26
4,ð4Þ,1024.2,"( 3 ), 192",3.6,-,-,-,91.38
5,( 3 ),1024.2,"( 3 ), 192,( l )",3.6,-,-,-,92.7
6,( 3 ),1024.2,"( 3 ), 192, Q","( 3.6, 1 )",-,-,-,89.3
7,( 3 ),1024.2,"( 3 ), 192,( l )","3.6, 2",-,-,-,87.12
8,( 3 ),1024.2,"( 3 ), 192,( 2 )",( 3.6,-,-,-,92.43
9,( 3 ),1024.2,"( 3 ), 192,( 2 )","( 3.6,( T )",-,-,-,93.11


Table 3: Rewrite Performance 


Method,Preliminary Accuracy
Original,82.0
Concat,78.2
Summary,79.4


Table 4: HyDE Performance 


Method,| Semifinal Accuracy
Original,92.7
Retrieval + HyDE,89.2
rerank + HyDE,88.2


Table 5: Effects of Different Prompts 


Prompt Type,Semi - final Accuracy
Normal QA Template,94.49
CoT QA Template,89.75
Markdown Format QA Template,92.27
Focused QA Template,93.51


Table 6: Effects of BM25 acceleration on the test set.  Time represents the total search time for 103 questions  related to BM25, and accuracy represents the evaluation  score of the final generated answers. 


Implementation,Time ( s ),Accuracy
BM25Okapi,17.0,94.49
BM25s,0.05,94.24


Table 7: Reranker Acceleration Experiment 


Method,Time ( s ) Similarity (%) Rank,8 - layer,1.67
73,2.5,12 - layer,2.2
88,3.2,20 - layer,3.58
86,4.0,28 - layer,5.25
100,6.0,40 - layer,7.71
100,5.4,Maximum ( 0.1 ),2.59
90,3.7,Maximum ( 0.2 ),3.55
96,4.5,Maximum ( 0.4 ),4.57
97,5.4,Entropy ( 0.2 ),2.74
89,3.4,Entropy ( 0.4 ),3.37
91,3.6,Entropy ( 0.6 ),4.01


Table 7: Reranker Acceleration Experiment 


Compression Algorithm,Compression Rate (%),Tokens Saved,Accuracy,Time ( s )
Original Context,100.0,0,94.49,9.3
LLMLingua ( 0.5 ),62.8,143k,83.44,10.47
LongLLMLingua ( 0.5 ),62.8,143k,80.86,10.52
BM25 - Extract ( 0.5 ),55.92,160k,86.48,7.7
BM25 - Extract ( 0.8 ),83.84,5 %,89.0,8.12


In [8]:
# Keep only table and text elements, in document order.
elements = [
    element
    for node in parsed_basic_doc.nodes
    for element in node.elements
    if isinstance(element, (TableElement, TextElement))
]

# Context window centred on the table: window_size // 2 text elements on
# each side of it.
window_size = 9
half_window = window_size // 2

table_elements = [(idx, element) for idx, element in enumerate(elements) if isinstance(element, TableElement)]
text_elements = [element for element in elements if isinstance(element, TextElement)]

for i, (element_index, table) in enumerate(table_elements):
    # Strip the "[html]" marker once so BEFORE, the prompt and AFTER all use
    # the same clean markup.
    table_html = table.text.replace("[html]", "")

    print("BEFORE")
    display(HTML(table_html))

    # `i` tables precede this one in `elements`, so subtracting `i` converts
    # its index into a position within `text_elements`.
    text_position = element_index - i
    # Clamp at 0: a negative slice start would wrap to the end of the list
    # and give tables near the start of the document empty or wrong context.
    window_start = max(0, text_position - half_window)
    window_elements = text_elements[window_start:text_position + half_window]
    window_contexts = [element.text for element in window_elements]

    refined = refine(table_html, window_contexts)
    print("AFTER")
    display(HTML(refined.replace("[html]", "")))

BEFORE


id,data,chunk,coarse ranking,re - ranking,fusion,accuracy
0,@,1024.5,"( j ), 3",-,-,57.86
1,( θ ),1024.5,"Q, 8",-,-,68.59
2,@,1024.5,"( 3 ), 8",-,-,69.55
3,@,1024.5,"Q, 192","Q, 8",-,73.73
4,@,1024.5,"Q, 256","Q, 8",-,70.68
5,( θ ),1024.5,"( 2 ), 192","Q, 8",-,69.25
6,( θ ),1024.5,"( T ), 288","(≥), 8",-,77.07
7,Q,1024.5,"Q, 288","(≥), 8",-,77.51
8,( 2 ),1024.5,"( T ), 288","(≥), 8",-,77.92
9,( 2 ),1024.5,"( 1 ), 256","(≥), 8",-,78.49


AFTER


id,data,chunk,coarse ranking,re-ranking,fusion,accuracy
0,@,1024.5,"( j ), 3",-,-,57.86
1,( θ ),1024.5,"Q, 8",-,-,68.59
2,@,1024.5,"( 3 ), 8",-,-,69.55
3,@,1024.5,"Q, 192","Q, 8",-,73.73
4,@,1024.5,"Q, 256","Q, 8",-,70.68
5,( θ ),1024.5,"( 2 ), 192","Q, 8",-,69.25
6,( θ ),1024.5,"( T ), 288","(≥), 8",-,77.07
7,Q,1024.5,"Q, 288","(≥), 8",-,77.51
8,( 2 ),1024.5,"( T ), 288","(≥), 8",-,77.92
9,( 2 ),1024.5,"( 1 ), 256","(≥), 8",-,78.49


BEFORE


id,data,chunk,coarse ranking,re - ranking,fusion,image,answer merge,accuracy
0,( 2 ),960.2,3.192,( 3.6,-,-,-,91.53
1,( 2 ),960.2,"( 4 ), 288","( 3 ), 6",-,-,-,88.4
2,( 2 ),960.2,"( 4 ), 288 ( 3 ), 192",( 3.6,(≥),-,-,90.0
3,( 3 ),1024.2,"( 3 ), 192",( 3.6,-,-,-,90.26
4,ð4Þ,1024.2,"( 3 ), 192",3.6,-,-,-,91.38
5,( 3 ),1024.2,"( 3 ), 192,( l )",3.6,-,-,-,92.7
6,( 3 ),1024.2,"( 3 ), 192, Q","( 3.6, 1 )",-,-,-,89.3
7,( 3 ),1024.2,"( 3 ), 192,( l )","3.6, 2",-,-,-,87.12
8,( 3 ),1024.2,"( 3 ), 192,( 2 )",( 3.6,-,-,-,92.43
9,( 3 ),1024.2,"( 3 ), 192,( 2 )","( 3.6,( T )",-,-,-,93.11


AFTER


id,data,chunk,coarse ranking,re-ranking,fusion,image,answer merge,accuracy
0,(2),"960, 200","3, 192",(3.6),-,-,-,91.53
1,(2),"960, 200","(4), 288","(3), 6",-,-,-,88.4
2,(2),"960, 200","(4), 288 (3), 192",(3.6),(≥),-,-,90.0
3,(3),"1024, 200","(3), 192",(3.6),-,-,-,90.26
4,(4),"1024, 200","(3), 192",3.6,-,-,-,91.38
5,(3),"1024, 200","(3), 192",3.6,-,-,-,92.7
6,(3),"1024, 200","(3), 192","(3.6, 1)",-,-,-,89.3
7,(3),"1024, 200","(3), 192","3.6, 2",-,-,-,87.12
8,(3),"1024, 200","(3), 192",(3.6),-,-,-,92.43
9,(3),"1024, 200","(3), 192",(3.6),-,-,-,93.11


BEFORE


Method,Preliminary Accuracy
Original,82.0
Concat,78.2
Summary,79.4


AFTER


Method,Preliminary Accuracy (%)
Original,82.0
Concat,78.2
Summary,79.4


BEFORE


Method,| Semifinal Accuracy
Original,92.7
Retrieval + HyDE,89.2
rerank + HyDE,88.2


AFTER


Method,Semifinal Accuracy
Original,92.7
Retrieval + HyDE,89.2
Rerank + HyDE,88.2


BEFORE


Prompt Type,Semi - final Accuracy
Normal QA Template,94.49
CoT QA Template,89.75
Markdown Format QA Template,92.27
Focused QA Template,93.51


AFTER


Prompt Type,Semi-final Accuracy
Normal QA Template,94.49
CoT QA Template,89.75
Markdown Format QA Template,92.27
Focused QA Template,93.51


BEFORE


Implementation,Time ( s ),Accuracy
BM25Okapi,17.0,94.49
BM25s,0.05,94.24


AFTER


Implementation,Time (s),Accuracy (%)
BM25Okapi,17.0,94.49
BM25s,0.05,94.24


BEFORE


Method,Time ( s ) Similarity (%) Rank,8 - layer,1.67
73,2.5,12 - layer,2.2
88,3.2,20 - layer,3.58
86,4.0,28 - layer,5.25
100,6.0,40 - layer,7.71
100,5.4,Maximum ( 0.1 ),2.59
90,3.7,Maximum ( 0.2 ),3.55
96,4.5,Maximum ( 0.4 ),4.57
97,5.4,Entropy ( 0.2 ),2.74
89,3.4,Entropy ( 0.4 ),3.37
91,3.6,Entropy ( 0.6 ),4.01


AFTER


Method,Time (s),Similarity (%),Rank
8 - layer,1.67,73,2.5
12 - layer,2.2,88,3.2
20 - layer,3.58,86,4.0
28 - layer,5.25,100,6.0
40 - layer,7.71,100,5.4
Maximum (0.1),2.59,90,3.7
Maximum (0.2),3.55,96,4.5
Maximum (0.4),4.57,97,5.4
Entropy (0.2),2.74,89,3.4
Entropy (0.4),3.37,91,3.6


BEFORE


Compression Algorithm,Compression Rate (%),Tokens Saved,Accuracy,Time ( s )
Original Context,100.0,0,94.49,9.3
LLMLingua ( 0.5 ),62.8,143k,83.44,10.47
LongLLMLingua ( 0.5 ),62.8,143k,80.86,10.52
BM25 - Extract ( 0.5 ),55.92,160k,86.48,7.7
BM25 - Extract ( 0.8 ),83.84,5 %,89.0,8.12


AFTER


Compression Algorithm,Compression Rate (%),Tokens Saved,Accuracy (%),Time (s)
Original Context,100.0,0,94.49,9.3
LLMLingua (0.5),62.8,143k,83.44,10.47
LongLLMLingua (0.5),62.8,143k,80.86,10.52
BM25 - Extract (0.5),55.92,160k,86.48,7.7
BM25 - Extract (0.8),83.84,80k,89.0,8.12
