In [13]:

import fhirclient.models.binary as bin
import json
import spacy
from spacy import displacy
from spacypdfreader.spacypdfreader import pdf_reader
from spacy import displacy
from pypdf import PdfReader
import pymupdf
import tabula
import base64
import io
import pymupdf4llm
import pathlib

# Full set of converted ORU^R01 reports. Each name is combined with a
# '.json' suffix when the matching FHIR bundle is read further down.
report_list = [
    'ORU_R01_DLIMS.txt',
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_R125.1_RBS.txt',
    'ORU_R01_R125.1_REP.txt',
    'ORU_R01_R125.1_RR8.txt',
    'ORU_R01_R125.1_RX1.txt',
    'ORU_R01_R125.1_SG9.txt',
    'ORU_R01_R125.1_ZT001.txt',
    'ORU_R01_R125.1_7A3.txt',
    'ORU_R01_R125.1_RPY.txt',
    'ORU_R01_R125.1_RXK.txt',
    'SHIRE_ORU_R01_RM3.txt',
    'WALES_ORU_R01_TX.txt',
]

# For testing: this reassignment replaces the full list above, so only these
# two reports are processed. Remove it to run against every report.
report_list = [
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_DLIMS.txt',
]

# Feature switches selecting which PDF text-extraction back ends run, and
# whether the spaCy cells later in the notebook do anything.
pdfReaderEnabled = False        # pypdf PdfReader
pdfPymupdfEnabled = False       # PyMuPDF page-by-page text
pdfpymupdf4llmEnabled = True    # pymupdf4llm Markdown export
pdfTableEnabled = False         # tabula table extraction
spacyEnabled = True             # spaCy tokenisation / entity cells

# For every report bundle: pull each embedded PDF out of its FHIR ``Binary``
# resource, run the enabled text extractors over it, and save the PDF itself.
#
# Each extractor gets its own view of the PDF bytes. Sharing one BytesIO
# between them leaves the read position wherever the previous reader stopped,
# which breaks consumers that read from the current position.
markdown_dir = pathlib.Path('Output/Markdown/R01')
pdf_dir = pathlib.Path('Output/PDF/R01')
# Create the output folders up front so a fresh checkout can run the cell.
markdown_dir.mkdir(parents=True, exist_ok=True)
pdf_dir.mkdir(parents=True, exist_ok=True)

for file in report_list:
    # Parse the bundle and release the file handle before the slow PDF work.
    with open('Output/FHIR/R01/' + file + '.json', 'rb') as bundle_file:
        jsonBundle = json.load(bundle_file)

    # A bundle without an 'entry' key is treated as having no entries.
    for entry in jsonBundle.get('entry', []):
        if entry['resource']['resourceType'] != 'Binary':
            continue

        binary = bin.Binary(entry['resource'])
        if not binary.data:
            # Nothing to decode for this Binary resource.
            continue
        pdf_bytes = base64.b64decode(binary.data)

        if pdfpymupdf4llmEnabled:
            doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
            try:
                md_text = pymupdf4llm.to_markdown(doc)
            finally:
                # Always release the PyMuPDF document, even if conversion fails.
                doc.close()
            # NOTE(review): the output name depends only on `file`, so a bundle
            # with several Binary entries overwrites the same .md — confirm
            # that one PDF per bundle is expected.
            (markdown_dir / (file + '.md')).write_bytes(md_text.encode())

        if pdfPymupdfEnabled:
            # https://pymupdf.readthedocs.io/en/latest/the-basics.html
            # Initial tests indicate this is the best method to use.
            doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
            try:
                print('Pages: ' + str(len(doc)))
                print('============ pymupdf ' + file + ' ==============')
                for page in doc:  # iterate the document pages
                    # Kept as a global so the last page can be inspected interactively.
                    text = page.get_text().encode("utf8")
            finally:
                doc.close()

        if pdfReaderEnabled:
            reader = PdfReader(io.BytesIO(pdf_bytes))
            print('Pages: ' + str(len(reader.pages)))
            print('============ PDF Reader ' + file + ' ==============')
            for page in reader.pages:
                text = page.extract_text()

        if pdfTableEnabled:
            tables = tabula.read_pdf(io.BytesIO(pdf_bytes),
                                     pages='all', multiple_tables=True)
            print('============ Tabula ' + file + ' ==============')
            for table in tables:
                # Kept as a global so the last table can be inspected interactively.
                tab1 = table

        # Always persist the decoded PDF, whichever extractors ran.
        (pdf_dir / (file + '.pdf')).write_bytes(pdf_bytes)


def _pdfs_to_markdown(pdf_dir, markdown_dir, file_names):
    """Convert PDFs on disk to Markdown files using pymupdf4llm.

    Args:
        pdf_dir: Directory that contains the source PDFs.
        markdown_dir: Directory that receives the Markdown output; created
            if it does not exist yet.
        file_names: PDF file names inside ``pdf_dir``. Each result is
            written to ``markdown_dir`` as ``<file name>.md``.

    Raises:
        OSError: If a source PDF cannot be read or an output cannot be written.
    """
    source_dir = pathlib.Path(pdf_dir)
    target_dir = pathlib.Path(markdown_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for name in file_names:
        pdf_bytes = (source_dir / name).read_bytes()
        doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
        try:
            md_text = pymupdf4llm.to_markdown(doc)
        finally:
            # Always release the PyMuPDF document, even if conversion fails.
            doc.close()
        (target_dir / (name + '.md')).write_bytes(md_text.encode())


# Genomics laboratory reports (R01).
pdf_list = ['R24-0WH7-1.pdf','R24-1A4W-1.pdf']
if pdfpymupdf4llmEnabled:
    _pdfs_to_markdown('Output/PDF/R01', 'Output/Markdown/R01', pdf_list)


# Test order form (O21).
pdf_list = ['genomic-medicine-device-test-order-form-cancer-v1.22.pdf']
if pdfpymupdf4llmEnabled:
    _pdfs_to_markdown('Output/PDF/O21', 'Output/Markdown/O21', pdf_list)



## Preparing Textual Data for Statistics and Machine Learning

### Tokens

In [18]:
# Tokenise one report PDF with spaCy and print its tokens on a single stream,
# each followed by a '|' separator. `doc` is reused by the cells that follow.
file = 'R24-0WH7-1.pdf'
if spacyEnabled:
    print(f'============ Spacy {file} ==============')
    nlp = spacy.load('en_core_web_sm')
    # TODO use pdf_stream instead
    doc = pdf_reader(f'Output/PDF/R01/{file}', nlp)

    # Build the whole token string first, then emit it with no trailing newline.
    print(''.join(f'{token}|' for token in doc), end='')

North|West|Genomic|Laboratory|Hub|(|Manchester|Site|)|
|Manchester|Centre|for|Genomic|Medicine|
|6th|Floor|,|St|Mary|'s|Hospital|,|Manchester|,|M13|9WL|
|Scientific|Operational|Director|:|Dr|E.|Howard|
|https://mft.nhs.uk/nwglh/| |mft.genomics@nhs.net| |Tel|+44(0|)|161|276|6122|

|Forename|:|
|Surname|:|
|Sex|:|
|DoB|:|
|NHS|No|:|

|ACCESSIBILITY|
|ZZZTESTA|
|Unspecified|
|21/10/1978|
|999|001|2563|

|Report|No|:|
|R24|-|0WH7|-|1|
|Hospital|No|:|Not|provided|
|Not|provided|
|Other|No|:|
|Blood|
|Sample|:|

|Sample2|:|

|Blood|

|Collected|:|
|Collected2|:|

|Not|provided|
|25/06/2024|

|Received|:|
|Received2|:|

|08/10/2023|08:18|
|26/06/2024|14:21|

|Activated|:|
|Reported|:|

|26/06/2024|

|Referred|by|:|Dr|Anna|Castleton|,|Haematology|,|The|Christie|Hospital|,|Manchester|,|M20|4BX|,|(|chn-tr.Haematology.Results@nhs.net|)|

|Genomics|Laboratory|Report|

|Reason|for|Testing|
|MRD|follow|up|.|

|RESULT|SUMMARY|:|

|BCR::ABL1|Major|(|e14a2|/|e13a2|)|–|Detected|and|quantitated|.|See|gra

### Extracting Named Entities

In [19]:
 if spacyEnabled:
     for ent in doc.ents:
        print(ent.text, ent.label_)

North West Genomic Laboratory Hub ORG
Manchester Site PERSON
Manchester Centre PERSON
6th Floor ORG
St Mary's PERSON
Manchester GPE
M13 PERSON
Scientific Operational ORG
E. Howard PERSON
161 CARDINAL
276 6122 DATE
999 CARDINAL
001 2563

Report DATE
Anna Castleton PERSON
Haematology ORG
The Christie Hospital ORG
Manchester PERSON
M20 PERSON
Major (e14a2 ORG
XX GPE
XX GPE
MRX ORG
Hannah Reed
Pre-Registration Scientist PERSON
1 CARDINAL
2 CARDINAL
the International Scale (IS ORG
IRIS ORG
TKI ORG
CML ORG
ABL1 PERSON
2 CARDINAL
MMR ORG
3 CARDINAL
MR3 GPE
0.1% PERCENT
CML ORG
two CARDINAL
0.0001% PERCENT
0.00001% PERCENT
at least 5ml CARDINAL
1ml ORDINAL
24 hours TIME
48 hours TIME
Baccarani PERSON
Blood GPE
122 CARDINAL
872 CARDINAL
Maxwell RSC simplyRNA PERSON
NWGLH ORG
2.1 CARDINAL
3B CARDINAL
BlackBio Biotech India Ltd ORG
TRUPCR® Europe Ltd ORG
RNA ORG
Applied Biosystems ORG
QuantStudio PERSON
6 CARDINAL
six CARDINAL
European NORP
ABL1 PERSON
10 CARDINAL
7 CARDINAL
WHO International Sta

In [21]:
# Visualise the entities of `doc` with displaCy's entity view.
if spacyEnabled:
    displacy.render(docs=doc, style='ent')