In [None]:
from azformrec import AzureDocumentTool, EnhancePage
# import devtools

# Input PDF and the directory that holds its cached analysis output.
doc_file = "../dataset/rhb_dataset/openai_policy_search/01. S07_SM773_MY.V2.0.pdf"
cache_dir = "../dataset/rhb_dataset/openai_policy_search/01. S07_SM773_MY.V2.0.pdf-cache"
analyze_path = f"{cache_dir}/analyzed-result.json"
# Load the analysis result stored at `analyze_path`
# (presumably a cached Azure Form Recognizer response -- confirm in azformrec).
result = AzureDocumentTool.load_cache_form_recognizer(
    analyze_path
)

# Parsed pages; later cells index this sequence by page number.
results = EnhancePage.parse_enhance_pages(result)



In [None]:
# Inspect the bounding polygon of the first paragraph on the first parsed page.
results[0].paragraphs[0].bounding_regions[0].polygon

# Vertex order of the polygon as sketched by the author (1-based). Later cells
# read index 0 as the top-left corner and index 2 as the bottom-right corner.
# 1 ---- 2
# |      |
# |      |
# 4 ---- 3

In [None]:
from IPython.display import display, HTML
# Show the plain-text form of the last parsed page, rendered as HTML output.
HTML(results[-1].to_plain_text())

In [None]:
import math
from typing import Tuple
from azformrec import DocumentPage
import fitz
from fitz.fitz import Page, Document
import io
from PIL import Image


def acoord2fcoord(
        fitz_page: Page,
        az_page: DocumentPage,
        acoord: Tuple[float, float]
    ) -> Tuple[float, float]:
    """Scale a point from the Azure page's coordinate space to the fitz page's.

    Each axis is scaled independently by the ratio between the size of the
    pixmap rendered from ``fitz_page`` and the ``width``/``height`` reported
    by ``az_page``.

    Args:
        fitz_page: Page whose ``get_pixmap()`` supplies the target size.
        az_page: Object exposing the ``width`` and ``height`` of the source space.
        acoord: ``(x, y)`` point expressed in the source space.

    Returns:
        The ``(x, y)`` point expressed in the target space.
    """
    # NOTE(review): the pixmap is rendered only to read its size; the page
    # rectangle might provide the same numbers more cheaply -- confirm before
    # changing, since pixmap dimensions are whole pixels.
    pixmap = fitz_page.get_pixmap()
    x_scale = float(pixmap.width) / float(az_page.width)
    y_scale = float(pixmap.height) / float(az_page.height)
    return (
        x_scale * acoord[0],
        y_scale * acoord[1]
    )
    
def get_information_region(
        fitz_page: Page,
        top_right_text: str = "ALS-BDS User Manual",
        bottom_left_text: str = "Strictly",
    ):
    """Measure the region spanned between two marker texts on a page.

    The region runs from the bottom-right corner of the first hit for
    ``top_right_text`` to the top-left corner of the first hit for
    ``bottom_left_text``.

    Args:
        fitz_page: Page exposing ``search_for(text)``; each hit must expose
            ``tl`` and ``br`` points with ``x``/``y`` attributes.
        top_right_text: Marker text bounding the region on one corner.
        bottom_left_text: Marker text bounding the region on the opposite corner.

    Returns:
        ``(width, height)`` of the region, both non-negative.

    Raises:
        IndexError: If either marker text has no hit on the page.
    """
    top_right_hits = fitz_page.search_for(top_right_text)
    bottom_left_hits = fitz_page.search_for(bottom_left_text)

    # Fail with a message naming the missing marker instead of a bare
    # "list index out of range".
    for marker, hits in (
        (top_right_text, top_right_hits),
        (bottom_left_text, bottom_left_hits),
    ):
        if not hits:
            raise IndexError(f"marker text {marker!r} not found on page")

    top_right = top_right_hits[0].br
    bottom_left = bottom_left_hits[0].tl
    informative_width = abs(top_right.x - bottom_left.x)
    informative_height = abs(top_right.y - bottom_left.y)
    return (informative_width, informative_height)
    
def format_link(link: dict, fitz_page=None):
    """Parse the text under a link into section and page-number parts.

    The text inside the link's ``'from'`` rectangle is read from the page,
    pairs of dots and newlines are removed, and the result is matched against
    ``<1-2 digits>.<1-2 digits><anything><1-2 digits>-<1-2 digits>``.

    Args:
        link: Mapping with a ``'from'`` entry holding the link rectangle.
        fitz_page: Page exposing ``get_text(clip=..., textpage=...)``. When
            omitted, the notebook-level ``page`` variable is used, which is
            what the original code read.

    Returns:
        The ``re.Match`` with five groups (section major, section minor,
        middle text, page major, page minor), or ``None`` when the text does
        not have that shape.
    """
    import re

    source_page = page if fitz_page is None else fitz_page
    text = source_page.get_text(clip=link['from'], textpage=None)
    text = text.replace("..", "").replace("\n", "").strip()
    # The lookbehind stops the greedy middle group from swallowing the first
    # digit of a two-digit page number.
    return re.match(
        r"(\d{1,2})\.(\d{1,2})(.*)(?<!\d)(\d{1,2})-(\d{1,2})",
        text,
    )
    
def is_table_of_content_page(fitz_page: Page, threshold=0.5, **overrides):
    """Tell whether links of kind ``fitz.LINK_GOTO`` cover most of the page.

    The areas of all such link rectangles are summed and divided by the
    reference area; the page is reported as a table-of-contents page when
    that ratio exceeds ``threshold``.

    Args:
        fitz_page: Page exposing ``rect`` and ``links(kinds=...)``.
        threshold: Minimum covered fraction (exclusive) for a positive result.
        **overrides: Optional ``w`` and ``h`` replacing the page width and
            height as the reference size; falsy values fall back to
            ``fitz_page.rect``.

    Returns:
        ``True`` when the covered fraction is above ``threshold``, else
        ``False``. A non-positive reference area yields ``False``.
    """
    width = overrides.get('w', None) or fitz_page.rect.width
    height = overrides.get('h', None) or fitz_page.rect.height
    area = width * height
    if area <= 0:
        # Nothing to compare against; avoid dividing by zero.
        return False

    total_link_area = 0
    for link in fitz_page.links(kinds=[fitz.LINK_GOTO]):
        rect = link['from']
        total_link_area += abs(rect.x0 - rect.x1) * abs(rect.y0 - rect.y1)

    return total_link_area / area > threshold

# Page index used for both the fitz document and the parsed Azure results.
page_num = 5
doc = fitz.open(doc_file) # open a document
page: Page = doc[page_num]
# Size of the region between the two marker texts; passed below as the
# reference width/height.
w, h = get_information_region(page)
# NOTE(review): the return value of this call is discarded.
is_table_of_content_page(page, w=w, h=h)
az_page = results[page_num]

# Draw a rectangle annotation around every paragraph returned by
# `get_non_overlap_paragraph`.
for paragraph in az_page.get_non_overlap_paragraph:
    # if paragraph.role is not None: continue
    p = paragraph.bounding_regions[0].polygon
    # Vertices 0 and 2 are used as opposite corners of the paragraph box.
    p1 = (p[0].x, p[0].y)
    p3 = (p[2].x, p[2].y)
    # Convert both corners from Azure coordinates to fitz coordinates.
    f1 = acoord2fcoord(page, az_page.page, p1)
    f3 = acoord2fcoord(page, az_page.page, p3)
    r = fitz.Rect(f1[0], f1[1], f3[0], f3[1])
    try:
        annot = page.add_rect_annot(r)
        if paragraph.role is not None:
            # Label the box with the paragraph role, offset by (0, -5) from
            # the rectangle's top-left corner.
            annot.parent.insert_text(
                annot.rect.tl - (0, 5), 
                paragraph.role,
                color=(1, 0, 0), 
                fontsize=10
            )
    except Exception as msg:
        # Best effort: report the paragraph that could not be annotated and go on.
        print(
            f"Exception for content: {paragraph.content} with {msg} -> {r}"
            )
    
    pass

# Print the text under every link of kind fitz.LINK_GOTO and outline the link
# rectangle with stroke colour (0, 0, 1).
for link in page.links(kinds=[fitz.LINK_GOTO]):
    r = link['from']
    text = page.get_text(clip=r, textpage=None)
    # Drop pairs of dots and newlines before printing.
    text = text.replace("..", "").replace("\n", "").strip()
    # print(f"Text: {text:} -> Page {link['page'] + 1}")
    print(text)
    annot = page.add_rect_annot(r)
    annot.set_colors(stroke=(0, 0, 1))
    annot.update()
    
    pass

# for bbox in page.get_bboxlog():
#     if bbox[0] != 'fill-text':
#         continue
#     r = bbox[1]
    
#     try:
#         annot = page.add_rect_annot(r)
#         annot.set_colors(stroke=(0, 0, 1))
#         annot.update()
#     except Exception as msg:
#         print(msg)
    
    
    # r = link['from']
    # print(r)
    # annot = page.add_rect_annot(r)
    # annot.set_colors(stroke=(0, 0, 1))
    # annot.update()
    
    # pass

# print("start saving")
# doc.save(f"{doc_file}-annotate.pdf")

# Render the current page to PNG bytes and display the image inline.
display(
    Image.open(
        io.BytesIO(page.get_pixmap().pil_tobytes(format="PNG"))
    )
)

In [None]:
import json
import re
from typing import List, Optional, Tuple
class TOCTreeBuilder:
    """Build a nested table-of-contents tree from TOC line texts.

    Each tree node is a dict with three keys: ``'data'`` (the parsed line, see
    ``extract_info_from_text``), ``'link'`` (the ``'page'`` value of the
    entry's link) and ``'children'`` (a dict of child nodes keyed by the last
    number of their section numbering).
    """

    def __init__(self) -> None:
        pass

    def make_level_of_dot_number(self, pattern: List[int]):
        """Build a regex for dot-separated numbers from a 0/1 mask.

        pattern = [1, 1, 0, 1, 1]
            -> 1: the position requires one or more digits
            -> 0: the position accepts zero or more digits

        Returns the per-position capture groups joined by an escaped dot.
        """
        more_than_zero_num = r"(\d{1,})"
        any_number = r"(\d{0,})"
        pattern_list = [any_number, more_than_zero_num]
        return r"\.".join(pattern_list[p] for p in pattern)

    def match_leading_number(self, text) -> Tuple[list, str]:
        """Strip the leading section numbering from ``text``.

        Repeatedly consumes a run of digits (optionally followed by a dot)
        from the start of the text.

        Returns:
            ``(numbers, rest)`` where ``numbers`` is the list of digit
            strings in order and ``rest`` is the unconsumed remainder.
        """
        pattern = r"^(\d{1,}\.|\d{1,})"
        leading_num = []
        while True:
            result = re.search(pattern, text)
            if result is None:
                break
            matched = result.group(1)
            text = text[len(matched):]
            leading_num.append(matched.strip(".").strip())

        return leading_num, text

    def extract_info_from_text(self, text: str) -> dict:
        """Parse one TOC line into numbering, title and page signature.

        Returns:
            A dict with keys ``'level_info'`` (list of digit strings),
            ``'content'`` (title text) and ``'page_signature'``. On a match
            the signature is the tuple ``(digits before the dash, digits
            after the dash, trailing text)``; otherwise it is ``""``.
            Any text containing "glossary" (case-insensitive) is reported as
            an unnumbered ``'Glossary'`` entry.
        """
        if 'glossary' in text.lower():
            return {
                'level_info': [],
                'content': 'Glossary',
                'page_signature': ""
            }

        level_info, text = self.match_leading_number(text)
        # The lookbehind keeps the greedy title group from swallowing the
        # leading digits of a multi-digit number before the dash.
        result = re.fullmatch(r"(.*)(?<!\d)(\d{1,})-(\d{0,})(.*)", text)
        if result is None:
            print(f"Exception: no page signature found -> {text}")
            # NOTE(review): as in the original fallback, the numbering that
            # was stripped above is not reported for such lines -- confirm
            # this is intended.
            return {
                'level_info': [],
                'content': text,
                'page_signature': "",
            }

        groups = result.groups()
        return {
            'level_info': level_info,
            'content': groups[0].strip().strip(".").strip(),
            'page_signature': groups[1:],
        }

    def build(self, toc_list: list, tree: Optional[dict] = None):
        """Insert the entries of ``toc_list`` into ``tree`` and return it.

        Args:
            toc_list: Items with a ``'text'`` string and a ``'link'`` mapping
                that has a ``'page'`` entry.
            tree: Existing tree to extend (its top-level keys must be integer
                strings). A new empty tree is created when omitted.

        Returns:
            The tree, mutated in place when one was supplied.

        Placement rules:
            * No numbering: stored at top level under the next free counter.
            * One number: stored at top level under that number; children
              already attached to that key are kept.
            * Several numbers: stored under the parent chain. Missing parents
              are created as placeholders with ``'data'`` and ``'link'`` set
              to ``None``. Descent stops at the first parent that has no
              children yet.
        """
        if tree is None:
            tree = {}

        # Counter used to key unnumbered entries; resumes after the highest
        # existing top-level key.
        pointer = max((int(sig) for sig in tree), default=0)
        print(f"Starting pointer -> {pointer}")

        for toc in toc_list:
            data = self.extract_info_from_text(toc['text'])
            node = {
                'data': data,
                'link': toc['link']['page'],
                'children': {},
            }
            level_info: list = data['level_info']
            print(level_info)

            if len(level_info) == 0:
                # Skip over keys that are already taken so nothing is overwritten.
                while str(pointer) in tree:
                    pointer += 1
                tree[str(pointer)] = node
                pointer += 1
            elif len(level_info) == 1:
                key = level_info[0]
                existing = tree.get(key)
                if existing is not None:
                    node['children'] = existing['children']
                tree[key] = node
                # Keep the counter numeric; the key itself stays the raw string.
                pointer = int(key) + 1
            else:
                # Force the pointer to be up-to-date
                pointer = max(pointer, int(level_info[0]))
                sub_tree = tree
                for level_num in level_info[:-1]:
                    parent = sub_tree.setdefault(
                        level_num,
                        {'data': None, 'link': None, 'children': {}},
                    )
                    sub_tree = parent['children']
                    if not sub_tree:
                        break

                sub_tree[level_info[-1]] = node

        return tree

def collect_toc_entries(fitz_page):
    """Collect one entry per link of kind ``fitz.LINK_GOTO`` on the page.

    Each entry is ``{'text': <text under the link rectangle with pairs of
    dots and newlines removed>, 'link': <the link itself>}``.
    """
    entries = []
    for link in fitz_page.links(kinds=[fitz.LINK_GOTO]):
        text = fitz_page.get_text(clip=link['from'], textpage=None)
        text = text.replace("..", "").replace("\n", "").strip()
        entries.append({
            'text': text,
            'link': link,
        })
    return entries


page_num = 5
doc = fitz.open(doc_file) # open a document

builder = TOCTreeBuilder()
tree = {}
# Feed page `page_num` and the page after it into the same tree. After the
# loop, `page` and `toc_list` refer to the second of the two pages.
for page_offset in range(2):
    page: Page = doc[page_num + page_offset]
    toc_list = collect_toc_entries(page)
    tree = builder.build(toc_list=toc_list, tree=tree)

# Persist the resulting tree as JSON in the working directory.
with open('toc.json', 'w') as out:
    json.dump(tree, out)


In [None]:
def get_iou(bb1, bb2):
    """
    Intersection over Union (IoU) of two axis-aligned bounding boxes.

    Parameters
    ----------
    bb1, bb2 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        (x1, y1) is the top left corner and (x2, y2) the bottom right
        corner; x1 < x2 and y1 < y2 are asserted for both boxes.

    Returns
    -------
    float
        in [0, 1]; 0.0 when the boxes do not overlap.
    """
    # Each box must have strictly positive width and height.
    for box in (bb1, bb2):
        assert box['x1'] < box['x2']
        assert box['y1'] < box['y2']

    # Edges of the candidate overlap rectangle.
    left, right = max(bb1['x1'], bb2['x1']), min(bb1['x2'], bb2['x2'])
    top, bottom = max(bb1['y1'], bb2['y1']), min(bb1['y2'], bb2['y2'])

    # Inverted edges mean the boxes are disjoint.
    if right < left or bottom < top:
        return 0.0

    def _area(box):
        return (box['x2'] - box['x1']) * (box['y2'] - box['y1'])

    overlap = (right - left) * (bottom - top)
    # Union = sum of both areas minus the part counted twice.
    union = _area(bb1) + _area(bb2) - overlap
    iou = overlap / float(union)
    assert iou >= 0.0
    assert iou <= 1.0
    return iou

In [None]:
from typing import Any
from azformrec import DocumentParagraph

class LinkDocumentParagraph(DocumentParagraph):
    """A ``DocumentParagraph`` that also carries an optional ``link`` attribute."""

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Stays None unless a "link" keyword was supplied.
        self.link : Any = kwargs.get("link")

    def match_link(self, fitz_page: Page, az_page: DocumentPage):
        """Print the IoU between this paragraph's box and each GOTO link.

        The paragraph box is taken from vertices 0 and 2 of its first
        bounding region and converted with ``acoord2fcoord``. The converted
        corners are printed first, then one line per link with its index,
        rectangle and IoU score. Nothing is returned or stored.
        """
        polygon = self.bounding_regions[0].polygon
        corner_a, corner_b = polygon[0], polygon[2]
        f1 = acoord2fcoord(fitz_page, az_page, (corner_a.x, corner_a.y))
        f3 = acoord2fcoord(fitz_page, az_page, (corner_b.x, corner_b.y))
        print(f1, f3)

        paragraph_box = {"x1": f1[0], "x2": f3[0], "y1": f1[1], "y2": f3[1]}
        for idx, link in enumerate(fitz_page.links([fitz.LINK_GOTO])):
            rect = link['from']
            link_box = {"x1": rect.x0, "x2": rect.x1, "y1": rect.y0, "y2": rect.y1}
            iou_score = get_iou(bb1=paragraph_box, bb2=link_box)
            print(f"{idx} -> {rect} -> {iou_score}")
    


In [None]:
# Rebuild the paragraph at index 3 as a LinkDocumentParagraph via a dict round-trip.
link_p = LinkDocumentParagraph.from_dict(
    {**az_page.paragraphs[3].to_dict()}
)
# NOTE(review): `page` and `az_page` are globals left by earlier cells; make
# sure they refer to the same page index before trusting the printed scores.
link_p.match_link(az_page=az_page.page, fitz_page=page)


In [None]:
# Dump every link of kind fitz.LINK_GOTO on the current page.
for link in page.links(kinds=[fitz.LINK_GOTO]):
    print(link)
    

In [None]:
# For every link on the current page, print its 'from' rectangle and that
# rectangle's type.
for link in page.links():
    print(link['from'])
    print(type(link['from']))

In [None]:
# Show the plain-text form of the current Azure page, rendered as HTML.
display(HTML(az_page.to_plain_text()))

In [None]:
# NOTE(review): `r` is whatever rectangle an earlier cell last assigned, so
# the outcome depends on which cells ran before this one.
annot = page.add_rect_annot(r)

# Render the current page to PNG bytes and display the image inline.
display(
    Image.open(
        io.BytesIO(page.get_pixmap().pil_tobytes(format="PNG"))
    )
)

In [None]:
# Show what `find_tables()` reports for the current page.
page.find_tables()

In [None]:
# Width and height recorded for the first page of the loaded analysis result.
result.pages[0].width, result.pages[0].height