In [None]:
# Optional (this cell can be skipped): enable IPython's autoreload extension so that
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of a cell's last statement even when that statement is an assignment
# (by default IPython only echoes a trailing bare expression).
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [None]:
# Update this path to the folder where the PDFs are stored.
# As written it is a relative path, so it is resolved against the current working directory.
pdf_folder="../bem_pdfs"

## Get some basic info about the PDF

In [None]:
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser

# Sample PDF explored by the cells below
my_pdf=f"{pdf_folder}/100.pdf"
# NOTE(review): fp is never closed in this cell; presumably it has to stay open for as
# long as `doc` is being read by later cells — confirm before adding a close().
fp = open(my_pdf, "rb")
doc = PDFDocument(PDFParser(fp))

# Show the document's `info` attribute as the cell output
doc.info

In [None]:
import pdf_tagextractor
pdf_info = pdf_tagextractor.get_pdf_info(doc)

## Extract the very useful outline (i.e., heading hierarchy)

In [None]:
outline = pdf_tagextractor.extract_outline(doc)

## Out-of-the-box results using pdfminer's built-in TagExtractor

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import TagExtractor
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import xml.dom.minidom as minidom

# Extracted from pdfminer.high_level.py:extract_text_to_fp()
def interpreter_for_builtin_tag_extractor(output_io, output_codec: str = "utf-8"):
    """Build a page interpreter whose device is pdfminer's stock TagExtractor.

    Args:
        output_io: Stream handed to TagExtractor as ``outfp``.
        output_codec: Codec name handed to TagExtractor as ``codec``.

    Returns:
        A PDFPageInterpreter that shares one caching PDFResourceManager with
        the TagExtractor device.
    """
    resource_manager = PDFResourceManager(caching=True)
    tag_device = TagExtractor(resource_manager, outfp=output_io, codec=output_codec)
    interpreter = PDFPageInterpreter(resource_manager, tag_device)
    return interpreter

def extract_xml(doc: PDFDocument, validate_xml: bool = False):
    """Run every page of ``doc`` through the built-in TagExtractor and return its output.

    Args:
        doc: The PDFDocument whose pages are processed.
        validate_xml: When True, the result is additionally parsed with minidom,
            so malformed XML surfaces as a parse error.

    Returns:
        The extracted markup as a string, wrapped in a single ``<pdf>`` root element.
    """
    buffer = BytesIO()
    page_interpreter = interpreter_for_builtin_tag_extractor(buffer)
    # While a page is processed, the interpreter calls methods on its device,
    # and the device writes the tags into `buffer`.
    for pdf_page in PDFPage.create_pages(doc):
        page_interpreter.process_page(pdf_page)

    # Take everything that was written and wrap it in one root tag
    extracted = buffer.getvalue().decode()
    xml_string = "<pdf>" + extracted + "</pdf>"

    # To diagnose XML validation issues, paste the string into
    # https://jsonformatter.org/xml-formatter and click "Format"

    if validate_xml:
        minidom.parseString(xml_string) # nosec

    return xml_string

orig_xml_string = extract_xml(doc, validate_xml=True)
len(orig_xml_string.splitlines())

In [None]:
import xml.dom.minidom
dom = xml.dom.minidom.parseString(orig_xml_string) # nosec
print(dom.toprettyxml(indent="  "))

The following PDFs produce errors when processed with the original TagExtractor:
```
Error in extracted xml for 101.pdf: mismatched tag: line 2, column 9032
Error in extracted xml for 105.pdf: mismatched tag: line 5, column 2360
Error in extracted xml for 203.pdf: mismatched tag: line 4, column 3184
Error in extracted xml for 210.pdf: 13
Error in extracted xml for 225.pdf: mismatched tag: line 34, column 4550
Error in extracted xml for 230A.pdf: 7
Error in extracted xml for 400.pdf: mismatched tag: line 72, column 5552
Error in extracted xml for 554.pdf: 28
```

## Use custom BemTagExtractor and postprocess XML into JSON

In [None]:
bem_parser = pdf_tagextractor.BemPdfParser(my_pdf)
# validate_xml is left off for this run; switch it to True to have the parser validate
xml_string = bem_parser.extract_xml(validate_xml=False)
# Line count of the XML produced by the custom extractor, for comparison with the
# count reported for the built-in TagExtractor's output above
len(xml_string.splitlines())

In [None]:
xml_string

In [None]:
import xml.dom.minidom
dom = xml.dom.minidom.parseString(xml_string) # nosec
print(dom.toprettyxml(indent="  "))

In [None]:
import pprint
ann_texts = bem_parser.to_annotated_texts(xml_string)
pprint.pprint(ann_texts)

In [None]:
bem_parser.close()


## Test all PDFs and save JSON to file

In [None]:
# Capturing notes about problematic PDFs

# PDFs that the test loop skips when it exercises the original TagExtractor
# (it checks `file in pdfs_with_issues`). The commented XML snippets show the
# problematic output for each file.
pdfs_with_issues = [
    # BEM-specific PDFs with issues
    # Manually fixed by removing duplicate consecutive headings
    "106.pdf", # the outline has multiple "MSA waiver service agents" headings (due to table overflow?) that do not appear in the TEXT

    ## PDFs with issues using the original TagExtractor

    # The following have missing closing tags
    "101.pdf", # missing closing P tag in table on page 2
    #   <P MCID="55">166</P>
    #   <P MCID="56">
    #       <P MCID="57"></P>
    #       <P MCID="58"></P>

    ## Fixed by not allowing nested Span tags
    "105.pdf", # missing closing SPAN tag (nested SPAN in numbered list on page 5)
    #   <Span Lang="en-US" MCID="2">1. BEM 150 addresses MA for SSI recipients and persons appealing an SSI disability termination. The other SSI-related categories must be considered in the following order: BEM 154, Special Disabled Children 
    #       <Span Lang="en-US" MCID="3">2. Special categories: </Span>

    "203.pdf", # missing closing SPAN tag (nested SPAN in list on page 4)
    #   <Span Lang="en-US" MCID="18">•
    #       <Span Lang="en-US" MCID="19"> Above individual&#x27;s ID, date of birth, race, sex and SSN. </Span>

    "225.pdf", # missing closing SPAN tag (due to nested SPAN in table on page 34?)
    #   <Span Lang="en-US" MCID="43">Yes 
    #       <Span Lang="en-US" MCID="44">Yes </Span>

    "400.pdf", # missing closing SPAN tag (due to nested SPAN in table on page 72? plus table within table!)
    #   <P MCID="6"></P>
    #   <Span Lang="en-US" MCID="7">Client has: 
    #       <P MCID="8"></P>
    #       <Span Lang="en-US" MCID="9">$2,500 Savings Account </Span>
    ]
# PDFs for which an unmatched end tag trips `assert self._stack` in the original extractor.
# NOTE(review): this list is not referenced by any other cell visible here — it appears
# to be kept purely as notes; confirm before removing.
pdfs_with_extra_end_tag = [
    # Fixed by ignoring call to end_tag()
    "210.pdf", # page 14 of 20: assert self._stack, str(self.pageno)
    "230A.pdf", # page 8: assert self._stack, str(self.pageno)
    "554.pdf", # page 29: assert self._stack, str(self.pageno)
    ]


In [None]:
# Set to True to run every PDF in `pdf_folder` through the extractor and write
# one "<name>.pdf.json" file per PDF into the current working directory.
TEST_ALL_PDFS = False

import os
if TEST_ALL_PDFS:
    import jsonpickle

# Set to True to exercise pdfminer's built-in TagExtractor instead of BemPdfParser
test_original_tagextractor = False

# Only touch the folder when the full run is requested
pdf_files = sorted(os.listdir(pdf_folder)) if TEST_ALL_PDFS else []

for file in pdf_files:
    if not file.endswith(".pdf"):
        continue
    pdf_filename = os.path.join(pdf_folder, file)
    print(file)
    try:
        if test_original_tagextractor:
            if file in pdfs_with_issues:
                continue
            # The context manager closes the PDF even when extraction raises
            with open(pdf_filename, "rb") as pdf_fp:
                doc = PDFDocument(PDFParser(pdf_fp))
                orig_xml_string = extract_xml(doc, validate_xml=True)
        elif not os.path.exists(f"{file}.json"):
            # Pass the full path: the bare file name only resolves when the
            # working directory happens to be `pdf_folder`
            bem_parser = pdf_tagextractor.BemPdfParser(pdf_filename)
            try:
                if file == "106.pdf":
                    # Drop the three duplicate headings noted for 106.pdf
                    for _ in range(3):
                        print("Removed duplicate heading: ", bem_parser.parsing_context.heading_stack.pop(0))
                xml_string = bem_parser.extract_xml(validate_xml=True)
                ann_texts = bem_parser.to_annotated_texts(xml_string)
                with open(f"{file}.json", 'w') as json_fp:
                    json_fp.write(jsonpickle.encode(ann_texts, indent=2, make_refs=False, unpicklable=False))
            finally:
                # Close only the parser created in this iteration, success or failure
                bem_parser.close()
    except Exception as e:
        # Stop at the first failing PDF so it can be investigated
        print(f"Error in extracted xml for {file}: {e}")
        break


### TODOs:
- merge text, spans, and lists that overflow onto the next page or next TEXT element
    - handle sublist -- [Slack](https://nava.slack.com/archives/C06DP498D1D/p1724182273941319?thread_ts=1723826732.335659&cid=C06DP498D1D)
- parse tables (large/medium effort)
- remove stop words from `tags`
- test pdfminer's image extraction (e.g., 105.pdf page 7)
- extract weblinks for hyperlinked text: https://stackoverflow.com/questions/27744210/extract-hyperlinks-from-pdf-in-python

### Exploring hyperlink identification

In [None]:
my_pdf=f"{pdf_folder}/100.pdf"
fp = open(my_pdf, "rb")
doc = PDFDocument(PDFParser(fp))
doc.catalog

In [None]:
from pdfminer.pdftypes import dict_value

entry=doc.catalog['Outlines']
entry_d = dict_value(entry)
print(entry_d)
entry_d2 = dict_value(entry_d['Last'])
print(entry_d2)
dest = dict_value(entry_d2['Dest'][0])

In [None]:
entry=doc.catalog['Pages']
entry_d = dict_value(entry)
print(entry_d)
print(len(entry_d['Kids']))

In [None]:
for i, k in enumerate(entry_d['Kids']):
    print(i, pprint.pformat(dict_value(k)))

In [None]:
page_ref=entry_d['Kids'][9]
page=dict_value(page_ref)

In [None]:
dict_value(page['Annots'][0])

In [None]:
from pdfminer.pdftypes import PDFObjRef

def get_page_links(page: dict):
    """Collect the URI hyperlinks annotated on one page.

    Args:
        page: The resolved page dictionary (e.g. ``dict_value(kid)`` for an entry
            of the page tree's ``Kids``), not an unresolved ``PDFObjRef``.

    Returns:
        A list of ``(position, uri)`` tuples, one per link annotation that has a
        URI action. ``position`` is the annotation's ``Rect`` entry and ``uri`` is
        a ``str`` with spaces percent-encoded. The list is empty when the page has
        no ``Annots`` entry.
    """
    page_links = []
    for annotation_ref in page.get('Annots') or []:
        annotation = dict_value(annotation_ref)
        if str(annotation.get("Subtype")) != "/'Link'":
            # Skip over any annotations that are not links
            continue
        action_ref = annotation.get("A")
        if action_ref is None:
            # A link without an action dictionary (e.g. one that only has a
            # "Dest" entry) carries no URI
            continue
        action = dict_value(action_ref)
        if str(action.get("S")) != "/'URI'":
            # Only URI actions point at a web address; skip other action types
            continue
        uri = action["URI"]
        # The URI may arrive as bytes; str() on bytes would yield "b'...'"
        if isinstance(uri, bytes):
            uri = uri.decode("utf-8", errors="replace")
        else:
            uri = str(uri)
        # Some URIs contain literal spaces
        uri = uri.replace(" ", "%20")
        page_links.append((annotation["Rect"], uri))
    return page_links

entry=doc.catalog['Pages']
for p, k in  enumerate(dict_value(entry)['Kids'], start=1):
    page = dict_value(k)
    print(p, get_page_links(page))
