In [1]:
# This can be skipped
%load_ext autoreload
%autoreload 1
%aimport pdf_tagextractor

In [2]:
# Update this path to the folder where the PDFs are stored
pdf_folder="/Users/yoom/dev/bem"

import os
os.chdir(pdf_folder)

In [3]:
# Capturing notes about problematic PDFs

pdfs_with_issues = [
    # BEM-specific PDFs with issues
    # Manually fixed by removing duplicate consecutive headings
    "106.pdf", # outline has multiple "MSA waiver service agents" headings in outline (due to table overflow?) but not in the TEXT

    ## PDFs with issues using the original TagExtractor

    # The following have missing closing tags
    "101.pdf", # missing closing P tag in table on page 2
#		<P MCID="55">166</P>
#		<P MCID="56">
#			<P MCID="57"></P>
#			<P MCID="58"></P>

    ## Fixed by not allowing nested Span tags
    "105.pdf", # missing closing SPAN tag (nested SPAN in numbered list on page 5)
#		<Span Lang="en-US" MCID="2">1. BEM 150 addresses MA for SSI recipients and persons appealing an SSI disability termination. The other SSI-related categories must be considered in the following order: BEM 154, Special Disabled Children 
#			<Span Lang="en-US" MCID="3">2. Special categories: </Span>

    "203.pdf", # missing closing SPAN tag (nested SPAN in list on page 4)
# 		<Span Lang="en-US" MCID="18">•
#			<Span Lang="en-US" MCID="19"> Above individual&#x27;s ID, date of birth, race, sex and SSN. </Span>

    "225.pdf", # missing closing SPAN tag (due to nested SPAN in table on page 34?)
# 		<Span Lang="en-US" MCID="43">Yes 
#			<Span Lang="en-US" MCID="44">Yes </Span>

    "400.pdf", # missing closing SPAN tag (due to nested SPAN in table on page 72? plus table within table!)
# 		<P MCID="6"></P>
#		<Span Lang="en-US" MCID="7">Client has: 
#			<P MCID="8"></P>
#			<Span Lang="en-US" MCID="9">$2,500 Savings Account </Span>
    ]
pdfs_with_extra_end_tag = [
    # Fixed by ignoring call to end_tag()
    "210.pdf", # page 14 of 20: assert self._stack, str(self.pageno)
    "230A.pdf", # page 8: assert self._stack, str(self.pageno)
    "554.pdf", # page 29: assert self._stack, str(self.pageno)
    ]


['210.pdf', '230A.pdf', '554.pdf']

## Get some basic info about the pdf

In [4]:
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser

my_pdf="100.pdf"
fp = open(my_pdf, "rb")
doc = PDFDocument(PDFParser(fp))

doc.info

[{'Title': b'INTRODUCTION',
  'Author': b'KaleJ',
  'Creator': b'\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005',
  'CreationDate': b"D:20230301140652-05'00'",
  'ModDate': b"D:20230301140652-05'00'",
  'Producer': b'\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005'}]

In [5]:
import pdf_tagextractor
pdf_info = pdf_tagextractor.get_pdf_info(doc)

{'pdf': {'creation_date': "D:20230301140652-05'00'",
  'mod_date': "D:20230301140652-05'00'",
  'producer': 'Microsoft® Word for Microsoft 365',
  'page_count': 18},
 'title': 'INTRODUCTION'}

## Extract the very useful outline (i.e., heading hierarchy)

In [6]:
outline = pdf_tagextractor.extract_outline(doc)

[Heading(title='Overview', level=1, pageno=1),
 Heading(title='Family Independence Program (FIP)', level=2, pageno=1),
 Heading(title='Program Goal', level=2, pageno=1),
 Heading(title='Medical Assistance Program', level=2, pageno=2),
 Heading(title='Program Goal', level=2, pageno=2),
 Heading(title='Food Assistance Program (FAP)', level=2, pageno=2),
 Heading(title='Program Goal', level=2, pageno=3),
 Heading(title='Authorized Purchases', level=2, pageno=3),
 Heading(title='Refugee Assistance Programs', level=1, pageno=4),
 Heading(title='Program Goal', level=2, pageno=4),
 Heading(title='Child Development and Care (CDC)', level=2, pageno=5),
 Heading(title='Program Goal', level=2, pageno=5),
 Heading(title='State Disability Assistance (SDA)', level=2, pageno=5),
 Heading(title='Program Goal', level=2, pageno=5),
 Heading(title='Policy manuals', level=1, pageno=6),
 Heading(title='Bridges Eligibility Manual (BEM)', level=2, pageno=6),
 Heading(title='Bridges Administrative Manual (BAM

## Out-of-the-box results using their TagExtractor

In [7]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import TagExtractor
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import xml.dom.minidom as minidom

# Extracted from pdfminer.high_level.py:extract_text_to_fp()
def interpreter_for_builtin_tag_extractor(output_io, output_codec: str = "utf-8"):
    rsrcmgr = PDFResourceManager(caching=True)
    pdf_device = TagExtractor(rsrcmgr, outfp=output_io, codec=output_codec)
    return PDFPageInterpreter(rsrcmgr, pdf_device)

def extract_xml(doc: PDFDocument, validate_xml: bool = False):
    output_io = BytesIO()
    interpreter = interpreter_for_builtin_tag_extractor(output_io)
    for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1):
        # print("page", pageno, page.pageid)
        # As the interpreter reads the PDF, it will call methods on interpreter.device,
        # which will write to output_io
        interpreter.process_page(page)

    # After done writing to output_io, go back to the beginning so we can read() it
    output_io.seek(0)
    # Wrap all tags in a root tag
    xml_string = "<pdf>" + output_io.read().decode() + "</pdf>"

    # Paste this string into https://jsonformatter.org/xml-formatter
    # and click "Format" to diagnose any XML validation issues
    print(xml_string)

    if validate_xml:
        minidom.parseString(xml_string)

    return xml_string

orig_xml_string = extract_xml(doc, validate_xml=True)

<pdf><page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"><Artifact Attached="[/'Top']" Subtype="/'Header'" Type="/'Pagination'">BEM 100 1 of 18 INTRODUCTION BPB 2023-006 4-1-2023 <Artifact></Artifact> </Artifact><Artifact Attached="[/'Bottom']" Subtype="/'Footer'" Type="/'Pagination'"> BRIDGES ELIGIBILITY MANUAL STATE OF MICHIGAN DEPARTMENT OF HEALTH &amp; HUMAN SERVICES <Artifact></Artifact> </Artifact><P MCID="0">OVERVIEW   </P><P MCID="1">Family Independence Program (FIP) </P><P MCID="2">Temporary Assistance to Needy Families (TANF), called the Family Independence Program (FIP) in Michigan, is a block grant that was established by the Social Security Act. Public Act (P.A.) 223 of 1995 amended P.A. 280 of 1939 and provides a state legal base for FIP. FIP policies are also authorized by the Code of Federal Regulations (CFR), Michigan Compiled Laws (MCL), Michigan Administrative Code (MAC), and federal court orders. Amendments to the Social Security Act by the U.S. Congress affe

'<pdf><page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"><Artifact Attached="[/\'Top\']" Subtype="/\'Header\'" Type="/\'Pagination\'">BEM 100 1 of 18 INTRODUCTION BPB 2023-006 4-1-2023 <Artifact></Artifact> </Artifact><Artifact Attached="[/\'Bottom\']" Subtype="/\'Footer\'" Type="/\'Pagination\'"> BRIDGES ELIGIBILITY MANUAL STATE OF MICHIGAN DEPARTMENT OF HEALTH &amp; HUMAN SERVICES <Artifact></Artifact> </Artifact><P MCID="0">OVERVIEW   </P><P MCID="1">Family Independence Program (FIP) </P><P MCID="2">Temporary Assistance to Needy Families (TANF), called the Family Independence Program (FIP) in Michigan, is a block grant that was established by the Social Security Act. Public Act (P.A.) 223 of 1995 amended P.A. 280 of 1939 and provides a state legal base for FIP. FIP policies are also authorized by the Code of Federal Regulations (CFR), Michigan Compiled Laws (MCL), Michigan Administrative Code (MAC), and federal court orders. Amendments to the Social Security Act by the U.S. 

The following have errors using original TagExtractor:
```
Error in extracted xml for 101.pdf: mismatched tag: line 2, column 9032
Error in extracted xml for 105.pdf: mismatched tag: line 5, column 2360
Error in extracted xml for 203.pdf: mismatched tag: line 4, column 3184
Error in extracted xml for 210.pdf: 13
Error in extracted xml for 225.pdf: mismatched tag: line 34, column 4550
Error in extracted xml for 230A.pdf: 7
Error in extracted xml for 400.pdf: mismatched tag: line 72, column 5552
Error in extracted xml for 554.pdf: 28
```

## Use custom BemTagExtractor and postprocess XML into JSON

In [8]:
bem_parser = pdf_tagextractor.BemPdfParser(my_pdf)
xml_string = bem_parser.extract_xml(validate_xml=not True)

>>>>> begin_tag /'Artifact' {'Attached': [/'Top'], 'Type': /'Pagination', 'Subtype': /'Header'} []
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T  BEM
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T   
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T  100
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T   
>>>>>>>>  <PDFTrueTypeFont: basefont='ArialMT'> 0 [/'Artifact']
   >>>>>T  1
>>>>>>>>  <PDFTrueTypeFont: basefont='ArialMT'> 0 [/'Artifact']
   >>>>>T   
>>>>>>>>  <PDFTrueTypeFont: basefont='ArialMT'> 0 [/'Artifact']
   >>>>>T  of 
>>>>>>>>  <PDFTrueTypeFont: basefont='ArialMT'> 0 [/'Artifact']
   >>>>>T  18
>>>>>>>>  <PDFTrueTypeFont: basefont='ArialMT'> 0 [/'Artifact']
   >>>>>T   
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T  INTRODUCTION
>>>>>>>>  <PDFTrueTypeFont: basefont='Arial-BoldMT'> 0 [/'Artifact']
   >>>>>T   
>>>>>>

'<pdf><page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0"><Artifact Attached="[/\'Top\']" Subtype="/\'Header\'" Type="/\'Pagination\'"><BOLD>BEM 100 </BOLD>1 of 18 <BOLD>INTRODUCTION </BOLD>BPB 2023-006 4-1-2023 <Artifact></Artifact> </Artifact><Artifact Attached="[/\'Bottom\']" Subtype="/\'Footer\'" Type="/\'Pagination\'"> <BOLD>BRIDGES ELIGIBILITY MANUAL STATE OF MICHIGAN DEPARTMENT OF HEALTH &amp; HUMAN SERVICES <Artifact></Artifact></BOLD> </Artifact><P MCID="0"><BOLD>OVERVIEW   </BOLD></P><P MCID="1"><BOLD>Family Independence Program (FIP) </BOLD></P><P MCID="2"><BOLD>Temporary Assistance to Needy Families (TANF), called the Family Independence Program (FIP) in Michigan, </BOLD>is a block grant that was established by the Social Security Act. Public Act (P.A.) 223 of 1995 amended P.A. 280 of 1939 and provides a state legal base for FIP. FIP policies are also authorized by the Code of Federal Regulations (CFR), Michigan Compiled Laws (MCL), Michigan Administrative Code (MAC)

In [9]:
import xml.dom.minidom
dom = xml.dom.minidom.parseString(xml_string)
print(dom.toprettyxml(indent="  "))

<?xml version="1.0" ?>
<pdf>
  <page id="0" bbox="0.000,0.000,612.000,792.000" rotate="0">
    <Artifact Attached="[/'Top']" Subtype="/'Header'" Type="/'Pagination'">
      <BOLD>BEM 100 </BOLD>
      1 of 18 
      <BOLD>INTRODUCTION </BOLD>
      BPB 2023-006 4-1-2023 
      <Artifact/>
       
    </Artifact>
    <Artifact Attached="[/'Bottom']" Subtype="/'Footer'" Type="/'Pagination'">
       
      <BOLD>
        BRIDGES ELIGIBILITY MANUAL STATE OF MICHIGAN DEPARTMENT OF HEALTH &amp; HUMAN SERVICES 
        <Artifact/>
      </BOLD>
       
    </Artifact>
    <P MCID="0">
      <BOLD>OVERVIEW   </BOLD>
    </P>
    <P MCID="1">
      <BOLD>Family Independence Program (FIP) </BOLD>
    </P>
    <P MCID="2">
      <BOLD>Temporary Assistance to Needy Families (TANF), called the Family Independence Program (FIP) in Michigan, </BOLD>
      is a block grant that was established by the Social Security Act. Public Act (P.A.) 223 of 1995 amended P.A. 280 of 1939 and provides a state legal

In [10]:
import pprint
ann_texts = bem_parser.to_annotated_texts(xml_string)
pprint.pprint(ann_texts)

Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
[AnnotatedText(parano=1,
               text='Temporary Assistance to Needy Families (TANF), called the '
                    'Family Independence Program (FIP) in Michigan, ',
               bolded=True,
               span=False,
               page=PageInfo(pageno=1,
                             doc_title='INTRODUCTION',
                             doc_pageno='1',
                             header_title='INTRODUCTION',
                             bem_section='100',
                             text_date='4-1-2023'),
               headings=[Heading(title='Overview', level=1, pageno=1),
                         Heading(title='Family Independence Program

In [11]:
bem_parser.close()


## Test all PDFs and save JSON to file

In [12]:
TEST_ALL_PDFS = False
import jsonpickle

for file in sorted(os.listdir(".")):
    if not TEST_ALL_PDFS:
        break
    if file.endswith(".pdf"):
        pdf_filename = os.path.join(pdf_folder, file)
        print(file)
        fp = open(pdf_filename, "rb")
        try:
            test_original_tagextractor = False
            if test_original_tagextractor:
                if file in pdfs_with_issues:
                    continue
                doc = PDFDocument(PDFParser(fp))
                orig_xml_string = extract_xml(doc, validate_xml=True)
            else:
                if not os.path.exists(f"{file}.json"):
                    bem_parser = pdf_tagextractor.BemPdfParser(file)
                    if file == "106.pdf":
                        for i in range(3):
                            print("Removed duplicate heading: ", bem_parser.parsing_context.heading_stack.pop(0))                    
                    xml_string = bem_parser.extract_xml(validate_xml=True)
                    ann_texts = bem_parser.to_annotated_texts(xml_string)
                    with open(f"{file}.json", 'w') as fp:
                        fp.write(jsonpickle.encode(ann_texts, indent=2, make_refs=False, unpicklable=False))
                bem_parser.close()
                # break
        except Exception as e:
            print(f"Error in extracted xml for {file}: {e}")
            break
        fp.close()


### TODOs:
- merge text overflow:
    - If prev TEXT matches (BOLD vs. non-BOLD)
    - and ...sentence is continuing,
    - then merge text.
- merge SPANs
- handle list items https://nava.slack.com/archives/C06DP498D1D/p1724182273941319?thread_ts=1723826732.335659&cid=C06DP498D1D
- remove stop words from tags
- find hyperlinks; https://stackoverflow.com/questions/27744210/extract-hyperlinks-from-pdf-in-python


### Exploring hyperlink identification

In [13]:
my_pdf="100.pdf"
fp = open(my_pdf, "rb")
doc = PDFDocument(PDFParser(fp))
doc.catalog

{'Type': /'Catalog',
 'Pages': <PDFObjRef:2>,
 'Lang': b'en-US',
 'StructTreeRoot': <PDFObjRef:129>,
 'Outlines': <PDFObjRef:91>,
 'MarkInfo': {'Marked': True},
 'Metadata': <PDFObjRef:684>,
 'ViewerPreferences': <PDFObjRef:685>}

In [14]:
from pdfminer.pdftypes import dict_value

entry=doc.catalog['Outlines']
entry_d = dict_value(entry)
print(entry_d)
entry_d2 = dict_value(entry_d['Last'])
print(entry_d2)
dest = dict_value(entry_d2['Dest'][0])

{'Type': /'Outlines', 'First': <PDFObjRef:92>, 'Last': <PDFObjRef:128>}
{'Title': b'LEGAL BASE', 'Parent': <PDFObjRef:91>, 'Dest': [<PDFObjRef:84>, /'XYZ', 51, 651, 0], 'Prev': <PDFObjRef:127>}


{'Type': /'Page',
 'Parent': <PDFObjRef:2>,
 'Resources': {'Font': {'F1': <PDFObjRef:5>,
   'F2': <PDFObjRef:9>,
   'F3': <PDFObjRef:11>},
  'ExtGState': {'GS7': <PDFObjRef:7>, 'GS8': <PDFObjRef:8>},
  'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI']},
 'Annots': [<PDFObjRef:86>, <PDFObjRef:87>],
 'MediaBox': [0, 0, 612, 792],
 'Contents': <PDFObjRef:85>,
 'Group': {'Type': /'Group', 'S': /'Transparency', 'CS': /'DeviceRGB'},
 'Tabs': /'S',
 'StructParents': 43}

In [15]:
entry=doc.catalog['Pages']
entry_d = dict_value(entry)
print(entry_d)
print(len(entry_d['Kids']))

{'Type': /'Pages', 'Count': 18, 'Kids': [<PDFObjRef:3>, <PDFObjRef:13>, <PDFObjRef:16>, <PDFObjRef:23>, <PDFObjRef:32>, <PDFObjRef:34>, <PDFObjRef:36>, <PDFObjRef:39>, <PDFObjRef:41>, <PDFObjRef:45>, <PDFObjRef:49>, <PDFObjRef:53>, <PDFObjRef:55>, <PDFObjRef:57>, <PDFObjRef:68>, <PDFObjRef:77>, <PDFObjRef:84>, <PDFObjRef:88>]}
18


In [16]:
for i, k in enumerate(entry_d['Kids']):
    print(i, pprint.pformat(dict_value(k)))

0 {'Contents': <PDFObjRef:4>,
 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'},
 'MediaBox': [0, 0, 612, 792],
 'Parent': <PDFObjRef:2>,
 'Resources': {'ExtGState': {'GS7': <PDFObjRef:7>, 'GS8': <PDFObjRef:8>},
               'Font': {'F1': <PDFObjRef:5>,
                        'F2': <PDFObjRef:9>,
                        'F3': <PDFObjRef:11>},
               'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI']},
 'StructParents': 0,
 'Tabs': /'S',
 'Type': /'Page'}
1 {'Annots': [<PDFObjRef:15>],
 'Contents': <PDFObjRef:14>,
 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'},
 'MediaBox': [0, 0, 612, 792],
 'Parent': <PDFObjRef:2>,
 'Resources': {'ExtGState': {'GS7': <PDFObjRef:7>, 'GS8': <PDFObjRef:8>},
               'Font': {'F1': <PDFObjRef:5>,
                        'F2': <PDFObjRef:9>,
                        'F3': <PDFObjRef:11>},
               'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI']},
 'StructParents': 

In [17]:
page_ref=entry_d['Kids'][9]
page=dict_value(page_ref)

{'Type': /'Page',
 'Parent': <PDFObjRef:2>,
 'Resources': {'Font': {'F1': <PDFObjRef:5>,
   'F2': <PDFObjRef:9>,
   'F3': <PDFObjRef:11>,
   'F6': <PDFObjRef:30>},
  'ExtGState': {'GS7': <PDFObjRef:7>, 'GS8': <PDFObjRef:8>},
  'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI']},
 'Annots': [<PDFObjRef:47>, <PDFObjRef:48>],
 'MediaBox': [0, 0, 612, 792],
 'Contents': <PDFObjRef:46>,
 'Group': {'Type': /'Group', 'S': /'Transparency', 'CS': /'DeviceRGB'},
 'Tabs': /'S',
 'StructParents': 11}

In [18]:
dict_value(page['Annots'][0])

{'Subtype': /'Link',
 'Rect': [255.76, 251.14, 402.49, 264.93],
 'BS': {'W': 0},
 'F': 4,
 'A': {'Type': /'Action',
  'S': /'URI',
  'URI': b'mailto:Policy-FAP@michigan.gov'},
 'StructParent': 12}

In [19]:
from pdfminer.pdftypes import PDFObjRef

def get_page_links(page: PDFObjRef):
    annotationList = []
    if 'Annots' in page:
        for annotation_ref in page['Annots']:
            annotationDict = dict_value(annotation_ref)
            # print(annotationDict)
            # print(annotationDict["Subtype"])
            if str(annotationDict["Subtype"]) != "/'Link'":
                # Skip over any annotations that are not links
                continue
            position = annotationDict["Rect"]
            uriDict = dict_value(annotationDict["A"])
            # This has always been true so far.
            # print(uriDict)
            assert str(uriDict["S"]) == "/'URI'"
            # Some of my URI's have spaces.
            uri = str(uriDict["URI"]).replace(" ", "%20")
            # print(uri)
            annotationList.append((position, uri))
    return annotationList

entry=doc.catalog['Pages']
for p, k in  enumerate(dict_value(entry)['Kids'], start=1):
    page = dict_value(k)
    print(p, get_page_links(page))


1 []
2 [([259.79, 206.94, 411.66, 220.74], "b'http://www.michigan.gov/MDHHS'")]
3 []
4 []
5 []
6 []
7 [([229.78, 220.5, 362.31, 234.3], "b'http://www.michigan.gov/mdhhs'")]
8 []
9 []
10 [([255.76, 251.14, 402.49, 264.93], "b'mailto:Policy-FAP@michigan.gov'"), ([256.17, 110.54, 462.95, 146.34], "b'mailto:LEO-RefugeeServices@michigan.gov'")]
11 [([195.75, 265.07, 365.61, 289.87], "b'mailto:eligibilitypolicy@michigan.gov'"), ([0, 792, 0, 792], "b'mailto:eligibilitypolicy@michigan.gov'")]
12 []
13 []
14 [([219.75, 365.07, 369.14, 379.68], "b'mailto:Policy-CDC@michigan.gov'"), ([219.75, 325.67, 409.83, 340.27], "b'mailto:Policy-Employment@michigan.gov'"), ([267.74, 286.26, 483.17, 300.87], "b'mailto:Policy-FAPemployment@michigan.gov%20-'"), ([219.75, 235.85, 366.48, 261.46], "b'mailto:Policy-FAP@michigan.gov'"), ([219.75, 221.25, 390.48, 235.85], "b'mailto:Policy-FIP-SDA@michigan.gov'"), ([219.75, 181.84, 426.53, 196.45], "b'mailto:LEO-RefugeeServices@michigan.gov'"), ([219.75, 142.43, 385.