In this notebook the first section shows different libraries for reading PDF files.

This was influenced by [Blueprints for Text Analytics Using Python](https://www.oreilly.com/library/view/blueprints-for-text/9781492074076/) chapters 3 `Extracting Data`

The remaining sections are influenced by chapter 4 `Preparing Textual Data for Statistics and Machine Learning`

# Extracting Data from PDF files

In [17]:

import fhirclient.models.binary as bin
import json

from pypdf import PdfReader
import pymupdf
import tabula
import base64
import io
import pymupdf4llm
import pathlib

report_list = ['ORU_R01_DLIMS.txt','ORU_R01_R125.1_R0A.txt','ORU_R01_R125.1_RBS.txt','ORU_R01_R125.1_REP.txt','ORU_R01_R125.1_RR8.txt','ORU_R01_R125.1_RX1.txt','ORU_R01_R125.1_SG9.txt','ORU_R01_R125.1_ZT001.txt','ORU_R01_R125.1_7A3.txt','ORU_R01_R125.1_RPY.txt','ORU_R01_R125.1_RXK.txt', 'SHIRE_ORU_R01_RM3.txt', 'WALES_ORU_R01_TX.txt']

# For testing

report_list = ['ORU_R01_R125.1_R0A.txt','ORU_R01_DLIMS.txt']


pdfReaderEnabled = False
pdfPymupdfEnabled = False
pdfpymupdf4llmEnabled = True
pdfTableEnabled = False


for file in report_list:
    with open('Output/FHIR/R01/' + file+ '.json', 'rb') as g:
        rawJson = g.read()
        jsonBundle = json.loads(rawJson)

        #bundle = b.Bundle(jsonBundle)
        for entry in jsonBundle['entry']:
            #print(entry['resource']['resourceType'])
            if entry['resource']['resourceType'] == 'Binary':
                binary = bin.Binary(entry['resource'])
                encoded = binary.data
                #print(encoded)
                decode = base64.b64decode(encoded)
                pdf_stream = io.BytesIO(decode)
                if pdfpymupdf4llmEnabled:
                    doc = pymupdf.open("pdf", pdf_stream)
                    md_text = pymupdf4llm.to_markdown(doc)
                    #print(md_text)
                    pathlib.Path('Output/Markdown/R01/' + file+ '.md').write_bytes(md_text.encode())
                if pdfPymupdfEnabled:
                    # https://pymupdf.readthedocs.io/en/latest/the-basics.html

                    # Initial Tests indicate this is the best method to use.

                    doc = pymupdf.open("pdf", pdf_stream)
                    print('Pages: '+ str(len(doc)))
                    print('============ pymupdf ' + file + ' ==============')
                    for page in doc: # iterate the document pages
                        text = page.get_text().encode("utf8")
                        print(text)
                if pdfReaderEnabled:
                    pdf = PdfReader(pdf_stream)
                    print('Pages: '+ str(len(pdf.pages)))
                    print('============ PDF Reader ' + file + ' ==============')
                    for page in pdf.pages:
                        text = page.extract_text()
                        # print(text)
                if pdfTableEnabled:
                    tables = tabula.read_pdf(pdf_stream
                                             , pages='all', multiple_tables=True)
                    print('============ Tabula ' + file + ' ==============')
                    for table in tables:
                        tab1 = table
                        #print(table)
                with open('Output/PDF/R01/' + file+ '.pdf', 'wb') as pdf:
                    pdf.write(decode)


pdf_list = ['R24-0WH7-1.pdf','R24-1A4W-1.pdf']

for file in pdf_list:
    with open('Output/PDF/R01/' + file, 'rb') as g:
        pdf_stream = g.read()
        if pdfpymupdf4llmEnabled:
            doc = pymupdf.open("pdf", pdf_stream)
            md_text = pymupdf4llm.to_markdown(doc)
            print(md_text)
            pathlib.Path('Output/Markdown/R01/' + file+ '.md').write_bytes(md_text.encode())


pdf_list = ['genomic-medicine-device-test-order-form-cancer-v1.22.pdf']

for file in pdf_list:
    with open('Output/PDF/O21/' + file, 'rb') as g:
        pdf_stream = g.read()
        if pdfpymupdf4llmEnabled:
            doc = pymupdf.open("pdf", pdf_stream)
            md_text = pymupdf4llm.to_markdown(doc)
            print(md_text)
            pathlib.Path('Output/Markdown/O21/' + file+ '.md').write_bytes(md_text.encode())



NHS Genomic Medicine Service, WGS Test Request Cancer, July 2024 v1.22 to be used for WGS go-live.
This document is subject to version control and is regularly updated. Please confirm you are using the current version by
contacting your local Genomic Laboratory Hub

~~**Genomic Medicine Service**~~








|Whole Genome Sequencing (WGS) Test Request PLEASE DO NOT USE FOR NON-WGS TESTS CANCER|Col2|
|---|---|
|**Requesting organisation:**<br> <br>|**Requesting organisation:**<br> <br>|
|<br>~~**GLH laboratory to receive sample:**~~<br>North West|~~Test Required~~<br>**Whole Genome Sequencing **|







































|GLH laboratory to receive sample: North West|Col2|Col3|Col4|Test Required Whole Genome Sequencing|Col6|Col7|
|---|---|---|---|---|---|---|
|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Patient first name~~<br> <br>Myrcella|~~Ethnicity~~<br>|~~Ethnicity~~<br>|~~Ethnicity~~<br>|
|~~P

## Preparing Textual Data for Statistics and Machine Learning

### Extracting Data from a Laboratory Report

In [18]:
import markdown

# Karotype testing
file = 'R24-0WH7-1.pdf'

# this is typical of reports coming at present from iGene
file = 'ORU_R01_R125.1_R0A.txt.pdf'

folder = 'Output/PDF/R01/'

with open(folder + file, 'rb') as g:
    pdf_stream = g.read()

    doc = pymupdf.open("pdf", pdf_stream)
    md_text = pymupdf4llm.to_markdown(doc)
    html_text = markdown.markdown(md_text)


### Cleaning Text Data

In [41]:
from io import StringIO
from html.parser import HTMLParser
import re

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def getItem(field_name, text):
    pattern = rf"{re.escape(field_name)}:*([^\r\n]+)"
    match = re.search(pattern, text)
    if match:
        report_no = match.group(1)
        return report_no.strip()
    return ""

def getItemWithStopWord(field_name, stop_word, text):

    pattern = rf"{re.escape(field_name)}:*([^\r\n]+?)({re.escape(stop_word)})"
    match = re.search(pattern, text)
    if match:
        report_no = match.group(1)
        return report_no.strip()

    return getItem(field_name, text)

clean_text = strip_tags(html_text)

print("Report No = "+getItem('Report No', clean_text))
print("Hospital No = "+getItem('Hospital No', clean_text))
print("NHS No = "+getItemWithStopWord('NHS No','Hive', clean_text))
print("Order Number (HIVE) = "+getItemWithStopWord('Hive Order ID','Path', clean_text))
print("Sample type = "+getItemWithStopWord('Sample','Sample', clean_text))
print("Sample2 type = "+getItem('Sample2', clean_text))
print("Pathology No. = "+getItem('Path. No', clean_text))
print("Specimen Collected = "+getItemWithStopWord('Collected','Activated', clean_text))
print("Specimen Collected2 = "+getItemWithStopWord('Collected2','Received2', clean_text))
print("Specimen Received2 = "+getItemWithStopWord('Received2','Received2', clean_text))

Report No = R25-00EK-3
Hospital No = Not provided
NHS No = :
Order Number (HIVE) = 1000152866
Sample type = Blood
Sample2 type = 
Pathology No. = ACC1810A1
Specimen Collected = Not provided
Specimen Collected2 = 
Specimen Received2 = 


### [spaCy](https://spacy.io/)

In [20]:
import spacy


print('============ Spacy ' + file + ' ==============')
nlp = spacy.load('en_core_web_sm')

#doc = pdf_reader(folder + file, nlp)
doc = nlp(clean_text)

for token in doc:
    print(token,end="|")

North|West|Genomic|Laboratory|Hub|(|Manchester|Site|)|
|Manchester|Centre|for|Genomic|Medicine|
|6th|Floor|,|St|Mary|'s|Hospital|,|Manchester|,|M13|9WL|
|Scientific|Operational|Director|:|Dr|E.|Howard|
|https://mft.nhs.uk/nwglh/|mft.genomics@nhs.net|Tel|+44(0|)|161|276|6122|
|Forename|:|Jack|
|Surname|:|HIM|
|Sex|:|
|Report|No|:|R25|-|00EK-3|
|Hospital|No|:|Not|provided|
|Other|No|:|Not|provided|
|DoB|:|18/02/1987|Sample|:|Blood|
|NHS|No|:|
|Hive|Order|ID|:|1000152866|Path|.|No|:|ACC1810A1|
|Collected|:|Not|provided|Activated|:|14/10/2025|
|Reported|:|14/10/2025|
|Referred|by|:|Jonathan|Edgerley|,|Clinical|Genetics|,|North|West|Genomic|Laboratory|Hub|(|Manchester|)|,|Manchester|,|M13|9WL|,|(|jonathan|.|
|edgerley@mft.nhs.uk|)|
|Genomics|Laboratory|Report|
|Reason|for|Testing|
|[|REASON|FOR|TESTING|-|COPY|BELOW|TEXT|TO|'|REPORTING|REFERRAL|INFORMATION|TEXT|'|BOX|THEN|DELETE|]|
|[|Insert|referring|clinician|notes|]|
|Clinical|Indication|:|[|Insert|Clinical|Indication|Name|(|RXXX|)|]|.|
|

### Extracting Named Entities

In [21]:

 for ent in doc.ents:
    print(ent.text, ent.label_)

North West Genomic Laboratory Hub ORG
Manchester Site PERSON
Manchester Centre PERSON
6th Floor ORG
St Mary's PERSON
Manchester GPE
M13 PERSON
Scientific Operational ORG
E. Howard PERSON
Tel PERSON
161 CARDINAL
276 6122 DATE
Jack
Surname PERSON
Hive Order PERSON
1000152866 CARDINAL
Jonathan Edgerley PERSON
Clinical Genetics PERSON
North West Genomic Laboratory Hub ORG
Manchester PERSON
Manchester PERSON
M13 PERSON
jonathan PERSON
Genomics Laboratory Report
Reason for Testing ORG
Insert PERSON
Insert Clinical Indication Name ORG
Appendix PERSON
Jonathan Edgerley
Clinical Scientist PERSON
Jonathan Edgerley
Clinical Scientist
Page PERSON
1 CARDINAL
2 CARDINAL
18/02/1987 DATE
Thoracic ORG
R125.1 NORP
R125.2 GPE
Chemagic ORG
NWGLH
Enrichment ORG
550 CARDINAL
Congenica NORP
Sentieon PERSON
CNV ORG
Exome Depth PERSON
Thoracic ORG
R125.1 NORP
R125.2 GPE
ABL1 PERSON
ACTA2 PERSON
BGN ORG
CBS ORG
EFEMP2 PRODUCT
FBN1 ORG
FBN2 ORG
FLNA ORG
FOXE3 ORG
LOX ORG
MFAP5 GPE
MYLK ORG
NOTCH1 ORG
PLOD1 ORG
P

In [22]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=False)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    North West Genomic Laboratory Hub\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n (\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Manchester Site\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n)<br>\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Manchester Centre\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin

## [scispaCy](https://github.com/allenai/scispacy)

A full spaCy pipeline and models for scientific/biomedical documents. Note: For clinical documents consider using [medspaCy](https://spacy.io/universe/project/medspacy)

In [23]:
import scispacy
from scispacy.abbreviation import AbbreviationDetector

import pyobo

from scispacy.linking import EntityLinker




nlp = spacy.load("en_core_sci_sm")

nlp.add_pipe("abbreviation_detector")

doc = nlp(clean_text)

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")


Abbreviation 	 Definition
SNVs 	 (483, 484) single nucleotide variants
SNVs 	 (797, 798) single nucleotide variants
INDELs 	 (813, 814) insertions and deletions
INDELs 	 (491, 492) insertions and deletions
mtDNA 	 (641, 642) mitochondrial DNA


Named Entities

In [24]:
 for ent in doc.ents:
    print(ent.text,  ent.label_)

North West Genomic Laboratory Hub ENTITY
Manchester Site ENTITY
Genomic Medicine
6th Floor ENTITY
St Mary's Hospital ENTITY
Manchester ENTITY
M13 9WL ENTITY
E. Howard
https://mft.nhs.uk/nwglh/ mft.genomics@nhs.net Tel +44(0 ENTITY
Forename ENTITY
Jack
Surname ENTITY
HIM ENTITY
Sex ENTITY
DoB ENTITY
Sample ENTITY
Blood
NHS No ENTITY
ID ENTITY
ACC1810A1
Collected ENTITY
Activated ENTITY
Reported ENTITY
Referred ENTITY
Clinical Genetics ENTITY
North West Genomic Laboratory Hub ENTITY
Manchester ENTITY
Manchester ENTITY
M13 9WL ENTITY
Laboratory Report
 ENTITY
Testing
 ENTITY
TESTING ENTITY
COPY ENTITY
REPORTING REFERRAL INFORMATION TEXT' ENTITY
BOX ENTITY
DELETE ENTITY
clinician ENTITY
notes ENTITY
Clinical Indication ENTITY
RXXX ENTITY
Genetic ENTITY
Result ENTITY
Interpretation
 ENTITY
pathogenic ENTITY
variants ENTITY
genetic ENTITY
individual's ENTITY
clinical symptoms ENTITY
identified ENTITY
diagnosis ENTITY
genetic disorder ENTITY
DNA abnormalities ENTITY
technology
 ENTITY
Appendi

In [43]:
displacy.render(doc, jupyter=False, style='ent')

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    North West Genomic Laboratory Hub\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ENTITY</span>\n</mark>\n (\n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Manchester Site\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ENTITY</span>\n</mark>\n)<br>Manchester Centre for \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Genomic Medicine\n6th Floor\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vert

In [27]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

nlp = spacy.load("en_core_sci_sm")

# This line takes a while, because we have to download ~1GB of data
# and load a large JSON file (the knowledge base). Be patient!
# Thankfully it should be faster after the first time you use it, because
# the downloads are cached.
# NOTE: The resolve_abbreviations parameter is optional, and requires that
# the AbbreviationDetector pipe has already been added to the pipeline. Adding
# the AbbreviationDetector pipe and setting resolve_abbreviations to True means
# that linking will only be performed on the long form of abbreviations.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp(clean_text)

# Let's look at a random entity!
entity = doc.ents[1]

print("Name: ", entity)

# Each entity is linked to UMLS with a score
# (currently just char-3gram matching).
linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
    print(linker.kb.cui_to_entity[umls_ent[0]])

Name:  Manchester Site
