In this notebook the first section shows different libraries for reading PDF files.

This section was influenced by chapter 3, `Extracting Data`, of [Blueprints for Text Analytics Using Python](https://www.oreilly.com/library/view/blueprints-for-text/9781492074076/).

The remaining sections are influenced by chapter 4, `Preparing Textual Data for Statistics and Machine Learning`.

# Extracting Data from PDF files

In [58]:

import fhirclient.models.binary as bin
import json

from pypdf import PdfReader
import pymupdf
import tabula
import base64
import io
import pymupdf4llm
import pathlib

# Report names to process. The extraction loop reads each one from
# 'Output/FHIR/R01/<name>.json'.
report_list = [
    'ORU_R01_DLIMS.txt',
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_R125.1_RBS.txt',
    'ORU_R01_R125.1_REP.txt',
    'ORU_R01_R125.1_RR8.txt',
    'ORU_R01_R125.1_RX1.txt',
    'ORU_R01_R125.1_SG9.txt',
    'ORU_R01_R125.1_ZT001.txt',
    'ORU_R01_R125.1_7A3.txt',
    'ORU_R01_R125.1_RPY.txt',
    'ORU_R01_R125.1_RXK.txt',
    'SHIRE_ORU_R01_RM3.txt',
    'WALES_ORU_R01_TX.txt',
]

# For testing: this override replaces the full list above with two reports.
# Remove it to process every report.
report_list = [
    'ORU_R01_R125.1_R0A.txt',
    'ORU_R01_DLIMS.txt',
]

# Switches selecting which PDF extraction library is exercised.
pdfReaderEnabled = False        # pypdf PdfReader
pdfPymupdfEnabled = False       # pymupdf page-by-page text
pdfpymupdf4llmEnabled = True    # pymupdf4llm Markdown conversion
pdfTableEnabled = False         # tabula table extraction


# Create the output folders up front so the writes below cannot fail on a
# fresh checkout where they do not exist yet.
for out_dir in ('Output/Markdown/R01/', 'Output/PDF/R01/'):
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)

for file in report_list:
    # Load the JSON bundle for this report; the file handle is only needed
    # for the read, so it is closed before any PDF processing starts.
    with open('Output/FHIR/R01/' + file + '.json', 'rb') as g:
        rawJson = g.read()
    jsonBundle = json.loads(rawJson)

    # A bundle without an 'entry' key is treated as having no entries.
    for entry in jsonBundle.get('entry', []):
        # Only Binary resources are processed: their base64 `data` is decoded
        # and treated as a PDF.
        if entry['resource']['resourceType'] != 'Binary':
            continue

        binary = bin.Binary(entry['resource'])
        encoded = binary.data
        if not encoded:
            print('Skipping Binary without data in ' + file)
            continue
        decode = base64.b64decode(encoded)

        # Every reader below gets the decoded bytes (or its own fresh BytesIO)
        # rather than one shared stream, so enabling several readers at once
        # cannot hand a later reader a partially consumed stream.
        if pdfpymupdf4llmEnabled:
            doc = pymupdf.open(stream=decode, filetype="pdf")
            md_text = pymupdf4llm.to_markdown(doc)
            # NOTE: the output name depends only on `file`, so a bundle with
            # several Binary entries overwrites the same .md/.pdf.
            pathlib.Path('Output/Markdown/R01/' + file + '.md').write_bytes(md_text.encode())
        if pdfPymupdfEnabled:
            # https://pymupdf.readthedocs.io/en/latest/the-basics.html
            # Initial Tests indicate this is the best method to use.
            doc = pymupdf.open(stream=decode, filetype="pdf")
            print('Pages: ' + str(len(doc)))
            print('============ pymupdf ' + file + ' ==============')
            for page in doc:  # iterate the document pages
                # Print the text itself; encoding it first would print an
                # unreadable b'...' bytes repr.
                print(page.get_text())
        if pdfReaderEnabled:
            pdf = PdfReader(io.BytesIO(decode))
            print('Pages: ' + str(len(pdf.pages)))
            print('============ PDF Reader ' + file + ' ==============')
            for page in pdf.pages:
                text = page.extract_text()
                # print(text)
        if pdfTableEnabled:
            tables = tabula.read_pdf(io.BytesIO(decode), pages='all', multiple_tables=True)
            print('============ Tabula ' + file + ' ==============')
            for table in tables:
                tab1 = table
                # print(table)

        # Always keep a copy of the decoded PDF, whichever readers are enabled.
        # A distinct handle name avoids shadowing the PdfReader in `pdf`.
        with open('Output/PDF/R01/' + file + '.pdf', 'wb') as pdf_file:
            pdf_file.write(decode)


def _pdfs_to_markdown(pdf_dir, markdown_dir, pdf_names):
    """Convert PDFs to Markdown with pymupdf4llm, printing and saving each result.

    Args:
        pdf_dir: Folder prefix (with trailing slash) the PDFs are read from.
        markdown_dir: Folder prefix (with trailing slash) the '.md' files are
            written to; created if it does not exist.
        pdf_names: File names inside ``pdf_dir``. Each output is written as
            ``markdown_dir + name + '.md'``.
    """
    pathlib.Path(markdown_dir).mkdir(parents=True, exist_ok=True)
    for name in pdf_names:
        pdf_bytes = pathlib.Path(pdf_dir + name).read_bytes()
        doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
        md_text = pymupdf4llm.to_markdown(doc)
        print(md_text)
        pathlib.Path(markdown_dir + name + '.md').write_bytes(md_text.encode())


# Both conversions previously lived in two copy-pasted loops; they now share
# the helper above. Nothing is read or written when the flag is off.
pdf_list = ['R24-0WH7-1.pdf', 'R24-1A4W-1.pdf']
if pdfpymupdf4llmEnabled:
    _pdfs_to_markdown('Output/PDF/R01/', 'Output/Markdown/R01/', pdf_list)

pdf_list = ['genomic-medicine-device-test-order-form-cancer-v1.22.pdf']
if pdfpymupdf4llmEnabled:
    _pdfs_to_markdown('Output/PDF/O21/', 'Output/Markdown/O21/', pdf_list)



NHS Genomic Medicine Service, WGS Test Request Cancer, July 2024 v1.22 to be used for WGS go-live.
This document is subject to version control and is regularly updated. Please confirm you are using the current version by
contacting your local Genomic Laboratory Hub

~~**Genomic Medicine Service**~~








|Whole Genome Sequencing (WGS) Test Request PLEASE DO NOT USE FOR NON-WGS TESTS CANCER|Col2|
|---|---|
|**Requesting organisation:**<br> <br>|**Requesting organisation:**<br> <br>|
|<br>~~**GLH laboratory to receive sample:**~~<br>North West|~~Test Required~~<br>**Whole Genome Sequencing **|







































|GLH laboratory to receive sample: North West|Col2|Col3|Col4|Test Required Whole Genome Sequencing|Col6|Col7|
|---|---|---|---|---|---|---|
|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Ethnicity~~<br>|~~Ethnicity~~<br>|~~Ethnicity~~<br>|
|~~P

## Preparing Textual Data for Statistics and Machine Learning

### Extracting Data from a Laboratory Report

In [59]:
import markdown

# Report that the following cells work on.
folder = 'Output/PDF/R01/'
file = 'R24-0WH7-1.pdf'

# Read the raw PDF bytes; the handle is closed as soon as the read finishes.
with open(folder + file, 'rb') as g:
    pdf_stream = g.read()

# PDF bytes -> Markdown (pymupdf4llm) -> HTML (markdown package).
doc = pymupdf.open("pdf", pdf_stream)
md_text = pymupdf4llm.to_markdown(doc)
html_text = markdown.markdown(md_text)


### Cleaning Text Data

In [60]:
from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that drops all markup and accumulates only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second reset() is
        # needed. convert_charrefs=True makes the parser pass character
        # references (e.g. &amp;) to handle_data already converted.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with tags removed and character references decoded.

    Args:
        html: The HTML string to clean.

    Returns:
        The concatenated text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs=True the parser holds back trailing text that
    # contains an unterminated '&' (it could be a split character reference)
    # until close() is called. Without this call such text is silently lost.
    s.close()
    return s.get_data()

# Plain-text version of the report: html_text with all HTML tags removed.
clean_text = strip_tags(html_text)

### [spaCy](https://spacy.io/)

In [61]:
import spacy
from spacy import displacy
from spacypdfreader.spacypdfreader import pdf_reader
from spacy import displacy

print(f'============ Spacy {file} ==============')

# Load the en_core_web_sm pipeline and run it over the cleaned report text.
nlp = spacy.load('en_core_web_sm')

# Alternative left disabled: build the doc straight from the PDF.
#doc = pdf_reader(folder + file, nlp)
doc = nlp(clean_text)

# Write every token followed by "|" so token boundaries are visible.
print("".join(f"{token}|" for token in doc), end="")

North|West|Genomic|Laboratory|Hub|(|Manchester|Site|)|
|Manchester|Centre|for|Genomic|Medicine|
|6th|Floor|,|St|Mary|'s|Hospital|,|Manchester|,|M13|9WL|
|Scientific|Operational|Director|:|Dr|E.|Howard|
|https://mft.nhs.uk/nwglh/|mft.genomics@nhs.net|Tel|+44(0|)|161|276|6122|
|Forename|:|ACCESSIBILITY|
|Surname|:|ZZZTESTA|
|Report|No|:|R24|-|0WH7|-|1|
|Hospital|No|:|Not|provided|
|Other|No|:|Not|provided|
|Sex|:|Unspecified|Other|No|:|Not|provided|
|DoB|:|21/10/1978|Sample|:|Blood|Sample2|:|Blood|
|DoB|:|21/10/1978|Sample|:|Blood|
|NHS|No|:|999|001|2563|
|Collected|:|Not|provided|
|Collected2|:|25/06/2024|
|Received|:|
|Received2|:|
|08/10/2023|08:18|
|26/06/2024|14:21|
|Activated|:|26/06/2024|
|Reported|:|
|Referred|by|:|Dr|Anna|Castleton|,|Haematology|,|The|Christie|Hospital|,|Manchester|,|M20|4BX|,|(|chn-tr.Haematology.Results@nhs.net|)|
|Genomics|Laboratory|Report|
|Reason|for|Testing|
|MRD|follow|up|.|
|RESULT|SUMMARY|:|
|BCR::ABL1|Major|(|e14a2|/|e13a2|)|–|Detected|and|quantitated

### Extracting Named Entities

In [62]:

 # One line per named entity in doc: its text, a space, then its label.
 for ent in doc.ents:
     print(f"{ent.text} {ent.label_}")

North West Genomic Laboratory Hub ORG
Manchester Site PERSON
Manchester Centre PERSON
6th Floor ORG
St Mary's PERSON
Manchester GPE
M13 PERSON
Scientific Operational ORG
E. Howard PERSON
Tel PERSON
161 CARDINAL
276 6122 DATE
Sex: Unspecified Other No: Not WORK_OF_ART
DoB ORG
999 CARDINAL
001 2563 DATE
08/10/2023 GPE
26/06/2024 DATE
Anna Castleton PERSON
Haematology ORG
The Christie Hospital ORG
Manchester PERSON
M20 PERSON
Genomics Laboratory Report
Reason for Testing ORG
XX GPE
XX GPE
Hannah Reed
Pre-Registration Scientist PERSON
1 CARDINAL
2 CARDINAL
21/10/1978 CARDINAL
the International Scale (IS ORG
IRIS ORG
TKI ORG
CML ORG
ABL1 PERSON
2 CARDINAL
MMR ORG
3 CARDINAL
MR3 GPE
0.1% PERCENT
CML ORG
two CARDINAL
0.0001% PERCENT
0.00001% PERCENT
at least 5ml CARDINAL
1ml ORDINAL
24 hours TIME
48 hours TIME
Baccarani PERSON
Blood GPE
122 CARDINAL
872 CARDINAL
Maxwell RSC simplyRNA PERSON
NWGLH ORG
2.1 CARDINAL
3B CARDINAL
BlackBio Biotech India Ltd ORG
TRUPCR® Europe Ltd ORG
RNA ORG
Applie

In [63]:

# Visualise the entities of `doc` using displaCy's 'ent' style.
displacy.render(doc, style='ent')

## [scispaCy](https://github.com/allenai/scispacy)

A full spaCy pipeline and models for scientific/biomedical documents. Note: For clinical documents consider using [medspaCy](https://spacy.io/universe/project/medspacy)

In [64]:
import scispacy
from scispacy.abbreviation import AbbreviationDetector

import pyobo

from scispacy.linking import EntityLinker
from tabulate import tabulate



nlp = spacy.load("en_core_sci_sm")

# The abbreviation detector populates doc._.abbreviations, read below.
nlp.add_pipe("abbreviation_detector")

doc = nlp(clean_text)

# One row per detected abbreviation occurrence: short form, (start, end) of the
# span, then the long form. The header names all three columns.
print("Abbreviation \t (start, end) Definition")
for abrv in doc._.abbreviations:
    # The long form is taken from the document text and can contain line
    # breaks; collapse whitespace so every abbreviation stays on one row.
    long_form = " ".join(str(abrv._.long_form).split())
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {long_form}")


Abbreviation 	 Definition
IS 	 (497, 498) International Scale
IS 	 (890, 891) International Scale
IS 	 (221, 222) International Scale
IS 	 (453, 454) International Scale
IS 	 (331, 332) International Scale
TKI 	 (354, 355) tyrosine kinase inhibitor
CCyR 	 (418, 419) complete cytogenetic remission
MMR 	 (436, 437) major
molecular response
DMR 	 (476, 477) deep molecular responses
Ltd 	 (743, 744) Ltd
Ltd 	 (750, 751) Ltd
RT-qPCR 	 (781, 782) Reverse transcription quantitative polymerase chain reaction
RT-qPCR 	 (315, 316) Reverse transcription quantitative polymerase chain reaction
MRD 	 (171, 172) Minimal/Measurable Residual Disease
MRD 	 (968, 969) Minimal/Measurable Residual Disease


Named Entities

In [65]:
 # One line per entity in doc: its text, a space, then its label.
 for ent in doc.ents:
     print(f"{ent.text} {ent.label_}")

North West Genomic Laboratory Hub ENTITY
Manchester Site ENTITY
Genomic Medicine
6th Floor ENTITY
St Mary's Hospital ENTITY
Manchester ENTITY
M13 9WL ENTITY
E. Howard
https://mft.nhs.uk/nwglh/ mft.genomics@nhs.net Tel +44(0 ENTITY
Forename ENTITY
ACCESSIBILITY
Surname ENTITY
ZZZTESTA ENTITY
R24 ENTITY
0WH7 ENTITY
Sex ENTITY
DoB ENTITY
Blood Sample2 ENTITY
Blood
DoB ENTITY
Blood
NHS No ENTITY
Collected ENTITY
Collected2 ENTITY
Received2 ENTITY
Activated ENTITY
Referred ENTITY
Dr Anna Castleton ENTITY
Haematology ENTITY
Christie Hospital ENTITY
Manchester ENTITY
M20 4BX ENTITY
Haematology ENTITY
Results@nhs.net ENTITY
Laboratory Report
 ENTITY
Testing ENTITY
MRD ENTITY
follow up ENTITY
e13a2 ENTITY
Detected ENTITY
quantitated ENTITY
graph overleaf ENTITY
Result ENTITY
Interpretation
 ENTITY
bone ENTITY
marrow/peripheral blood sample ENTITY
patient ENTITY
XX/XX/XX ENTITY
detectable ENTITY
BCR::ABL1 ENTITY
transcripts ENTITY
X%
BCR::ABL1 [IS ENTITY
TESTING ENTITY
COPY ENTITY
REPORTING REFE

In [66]:
# Visualise the entities of `doc` using displaCy's 'ent' style; jupyter=True
# explicitly requests notebook rendering.
displacy.render(doc, jupyter=True, style='ent')

In [67]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

# Importing this module registers the "abbreviation_detector" factory with
# spaCy, which add_pipe() below needs.
from scispacy.abbreviation import AbbreviationDetector  # noqa: F401

nlp = spacy.load("en_core_sci_sm")

# resolve_abbreviations=True (below) requires the AbbreviationDetector pipe to
# already be in the pipeline. This pipeline is freshly loaded, so add it here;
# otherwise the option has no effect.
nlp.add_pipe("abbreviation_detector")

# This line takes a while, because we have to download ~1GB of data
# and load a large JSON file (the knowledge base). Be patient!
# Thankfully it should be faster after the first time you use it, because
# the downloads are cached.
# NOTE: The resolve_abbreviations parameter is optional. Together with the
# AbbreviationDetector pipe added above, it means linking is performed on the
# long form of abbreviations.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp(clean_text)

linker = nlp.get_pipe("scispacy_linker")

# Look at one example entity (the second one in the document). Guard the
# index so a document with fewer than two entities does not raise IndexError.
if len(doc.ents) > 1:
    entity = doc.ents[1]

    print("Name: ", entity)

    # Each entity is linked to UMLS with a score
    # (currently just char-3gram matching). The first item of each kb_ents
    # tuple is the key used to look up the knowledge-base entry.
    for umls_ent in entity._.kb_ents:
        print(linker.kb.cui_to_entity[umls_ent[0]])
else:
    print("Fewer than two entities found; nothing to link.")

Name:  Manchester Site
