# Extracting Markdown and PDF from FHIR Orders/Reports

In [1]:

import fhirclient.models.binary as bin
import json

from pypdf import PdfReader
import pymupdf
import base64
import io
import pymupdf4llm
import pathlib
import tabula

# Full set of report file names. The extraction loop appends '.json' to each
# name to locate the FHIR bundle under 'Output/FHIR/R01/'.
report_list = [
    'ORU_R01_DLIMS.txt',
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_R125.1_RBS.txt',
    'ORU_R01_R125.1_REP.txt',
    'ORU_R01_R125.1_RR8.txt',
    'ORU_R01_R125.1_RX1.txt',
    'ORU_R01_R125.1_SG9.txt',
    'ORU_R01_R125.1_ZT001.txt',
    'ORU_R01_R125.1_7A3.txt',
    'ORU_R01_R125.1_RPY.txt',
    'ORU_R01_R125.1_RXK.txt',
    'SHIRE_ORU_R01_RM3.txt',
    'WALES_ORU_R01_TX.txt',
]

# For testing: this second assignment REPLACES the full list above, so only
# these two reports are processed. Remove it to run over every report.
report_list = [
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_DLIMS.txt',
]

# Extractor switches. Each flag turns one PDF-reading backend on or off.
pdfReaderEnabled = False       # pypdf PdfReader text extraction
pdfPymupdfEnabled = False      # PyMuPDF page-by-page plain text
pdfpymupdf4llmEnabled = True   # pymupdf4llm Markdown conversion
pdfTableEnabled = False        # tabula table extraction


# For every report bundle: find each embedded Binary resource, base64-decode
# its payload, run whichever extractors are enabled, and save the raw PDF.
for file in report_list:
    # Read the bundle fully, then release the file handle before the
    # (potentially slow) PDF processing starts.
    with open('Output/FHIR/R01/' + file + '.json', 'rb') as g:
        rawJson = g.read()
    jsonBundle = json.loads(rawJson)

    # A bundle with no entries may omit the 'entry' key entirely.
    for entry in jsonBundle.get('entry', []):
        if entry['resource']['resourceType'] != 'Binary':
            continue

        binary = bin.Binary(entry['resource'])
        encoded = binary.data
        if not encoded:
            # Nothing to decode; b64decode(None) would raise TypeError.
            print('Skipping Binary without data in ' + file)
            continue
        decode = base64.b64decode(encoded)
        pdf_stream = io.BytesIO(decode)

        # NOTE(review): output names are derived from `file` only, so a bundle
        # holding several Binary entries overwrites the same .md/.pdf each
        # time -- confirm bundles carry exactly one Binary.
        # NOTE(review): the payload is assumed to be a PDF; binary.contentType
        # is not checked -- confirm against the source data.

        if pdfpymupdf4llmEnabled:
            # Rewind first: an earlier extractor may have left the shared
            # stream positioned somewhere other than the start.
            pdf_stream.seek(0)
            doc = pymupdf.open("pdf", pdf_stream)
            try:
                md_text = pymupdf4llm.to_markdown(doc)
            finally:
                # Guarded so a document already closed elsewhere is not
                # closed twice.
                if not doc.is_closed:
                    doc.close()
            md_path = pathlib.Path('Output/Markdown/R01/' + file + '.md')
            md_path.parent.mkdir(parents=True, exist_ok=True)
            md_path.write_bytes(md_text.encode())

        if pdfPymupdfEnabled:
            # https://pymupdf.readthedocs.io/en/latest/the-basics.html

            # Initial Tests indicate this is the best method to use.

            pdf_stream.seek(0)
            doc = pymupdf.open("pdf", pdf_stream)
            try:
                print('Pages: ' + str(len(doc)))
                print('============ pymupdf ' + file + ' ==============')
                for page in doc:  # iterate the document pages
                    # Encoded to UTF-8 bytes, so print() shows the bytes repr.
                    text = page.get_text().encode("utf8")
                    print(text)
            finally:
                if not doc.is_closed:
                    doc.close()

        if pdfReaderEnabled:
            pdf_stream.seek(0)
            pdf = PdfReader(pdf_stream)
            print('Pages: ' + str(len(pdf.pages)))
            print('============ PDF Reader ' + file + ' ==============')
            for page in pdf.pages:
                text = page.extract_text()
                # print(text)

        if pdfTableEnabled:
            pdf_stream.seek(0)
            tables = tabula.read_pdf(pdf_stream, pages='all', multiple_tables=True)
            print('============ Tabula ' + file + ' ==============')
            for table in tables:
                tab1 = table
                # print(table)

        # Always keep a copy of the decoded PDF next to the other outputs.
        # Uses its own name so it does not shadow the PdfReader in `pdf`.
        pdf_path = pathlib.Path('Output/PDF/R01/' + file + '.pdf')
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        pdf_path.write_bytes(decode)




Consider using the pymupdf_layout package for a greatly improved page layout analysis.


# Extracting Markdown from PDF

In [2]:
def _pdf_file_to_markdown(pdf_path, md_path):
    """Convert one PDF file to Markdown and write the result to disk.

    Parameters
    ----------
    pdf_path : str or pathlib.Path
        Location of the PDF to read.
    md_path : str or pathlib.Path
        Destination of the Markdown file. Its parent directory is created
        if it does not exist yet.

    Returns
    -------
    str
        The Markdown text produced by ``pymupdf4llm.to_markdown``.
    """
    # Read the bytes up front so no file handle stays open during conversion.
    pdf_bytes = pathlib.Path(pdf_path).read_bytes()
    doc = pymupdf.open("pdf", pdf_bytes)
    try:
        markdown = pymupdf4llm.to_markdown(doc)
    finally:
        # Guarded so a document already closed elsewhere is not closed twice.
        if not doc.is_closed:
            doc.close()

    out_path = pathlib.Path(md_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_bytes(markdown.encode())
    return markdown


# (sub-folder, PDF file names). The sub-folder is used under both
# 'Output/PDF/' (input) and 'Output/Markdown/' (output).
pdf_batches = [
    ('R01', ['R24-0WH7-1.pdf', 'R24-1A4W-1.pdf']),
    ('O21', ['genomic-medicine-device-test-order-form-cancer-v1.22.pdf']),
]

for folder, pdf_list in pdf_batches:
    for file in pdf_list:
        # Relies on the pdfpymupdf4llmEnabled flag set in the first code cell.
        # When it is off, the PDF is not read at all.
        if pdfpymupdf4llmEnabled:
            md_text = _pdf_file_to_markdown(
                'Output/PDF/' + folder + '/' + file,
                'Output/Markdown/' + folder + '/' + file + '.md',
            )
            print(md_text)



NHS Genomic Medicine Service, WGS Test Request Cancer, July 2024 v1.22 to be used for WGS go-live.
This document is subject to version control and is regularly updated. Please confirm you are using the current version by
contacting your local Genomic Laboratory Hub

~~**Genomic Medicine Service**~~








|Whole Genome Sequencing (WGS) Test Request PLEASE DO NOT USE FOR NON-WGS TESTS CANCER|Col2|
|---|---|
|**Requesting organisation:**<br> <br>|**Requesting organisation:**<br> <br>|
|<br>~~**GLH laboratory to receive sample:**~~<br>North West|~~Test Required~~<br>**Whole Genome Sequencing **|







































|GLH laboratory to receive sample: North West|Col2|Col3|Col4|Test Required Whole Genome Sequencing|Col6|Col7|
|---|---|---|---|---|---|---|
|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Ethnicity~~<br>|~~Ethnicity~~<br>|~~Ethnicity~~<br>|
|~~P