<a href="https://colab.research.google.com/github/osman-mo94/Sarcopenia-NLP-project/blob/main/synthetic_letters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import packages

In [32]:
pip install docx2txt



In [33]:
pip install spacy




In [34]:
pip install negspacy



In [35]:
pip install scispacy



In [36]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz (120.2 MB)


In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import docx2txt
import spacy
from spacy.matcher import PhraseMatcher
from negspacy.negation import Negex
from negspacy.termsets import termset
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from spacy import displacy

In [38]:
#Initialize nlp pipeline with scispacy model (for processing biomedical, scientific and clinical text)
nlp = spacy.load("en_ner_bc5cdr_md")
#Add abbreviation detector for medical abbreviations
nlp.add_pipe("abbreviation_detector")

<scispacy.abbreviation.AbbreviationDetector at 0x7fc242d31ad0>

In [39]:
#View components of nlp pipeline
nlp.component_names

['tok2vec',
 'tagger',
 'attribute_ruler',
 'lemmatizer',
 'parser',
 'ner',
 'abbreviation_detector']

In [40]:
#Mount google drive so that colab can access files in my google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import letters for analysis

In [41]:
#Import letters (note that these letters do not refer to real patients)
letter_A = docx2txt.process('/content/drive/MyDrive/NLP projects/Dummy letters/dummy letters/Letter A.docx')
letter_B = docx2txt.process('/content/drive/MyDrive/NLP projects/Dummy letters/dummy letters/Letter B.docx')

In [42]:
print(letter_A)

Mr A Smith

567 Ghengis Khan Drive, Newcastle NE4 5XX



Diagnoses:

Poor mobility due to chronic pain, low confidence and previous falls 

Weight loss, anaemia and raised inflammatory markers of unknown aetiology 

Low mood secondary to poor mobility 

Breathlessness and elevated BNP awaiting echo 

Chronic back pain with degenerative changes on MRI 

Urinary frequency and incontinence 



Other diagnoses:

Complex partial epileptic seizures

Hypertension 

Osteoarthritis with bilateral total hip replacements

Atrial fibrillation 

Asthma 

Patent foramen ovale 

Previous cerebellar stroke 



Medications:

Atorvastatin

Docusate

Ferrous fumarate

Vitamin-D3

Furosemide

Gaviscon

Flutiform inhaler

Salbutamol inhaler

Lansoprazole

Losartan 

Paracetamol 

Phenytoin

Tegretol slow release

Codeine

Warfarin



Suggested changes to medication 

Reduce codeine 15 mg dose but try and use regularly 2-3 times per day 



Follow up arrangements

 I will organise ultrasound of the abdomen 

In [43]:
print(letter_B)

Mrs B Smith

Flat 1, Farringdon Road, Newcastle NE2 5DH

Date of Birth: 01/01/1932





Diagnoses: 

Falls due to gait and balance disorder 

Hyperthyroidism due to thyroxine over-replacement

Sarcopenia 

Orthostatic hypotension 



Existing diagnoses: 

Hypertension 

Hypothyroidism 

Vitamin B12 deficiency 

Previous fractured wrist

Visual impairment due to cataracts 



Medications: 

Alendronic acid 

Calcium and vitamin-D 

Bendroflumethiazide 

Vitamin B12 

Simvastatin 

Ramipril 



Medication changes: 

Please reduce thyroxine dose to 75mcg once daily



Follow up arrangements: 

I will write back when I see the results of her 24 hour electrocardiogram



For Primary care 

Could you please forward me a copy of the 24 hour blood pressure monitor that she says she had in your surgery recently? 



I saw Mrs Smith for a face-to-face appointment at the Belsay Clinic today; she was accompanied by her son. She gives a history of two falls over the last few months and feels unstea

In [44]:
#Apply nlp pipeline to letter A
doc_A = nlp(letter_A)

  global_matches = self.global_matcher(doc)


In [45]:
#Apply nlp pipeline to letter B
doc_B = nlp(letter_B)

  global_matches = self.global_matcher(doc)


In [46]:
#Visualise entities in Letter A
displacy.render(doc_A, style='ent', jupyter = True)

# Build PhraseMatcher

In [47]:
#Define a list of terms indicative of muscle weakness
weakness_list = ["muscle weakness", "weak", "uses a stick", "uses a walking stick", 
                    "uses a frame", "uses a zimmer frame", "uses a walker", "uses a walking aid",
                    "furniture walks", "difficulty mobilising", "difficulty walking", "wheelchair"
                    "difficulty standing", "difficulty climbing stairs", "cannot climb stairs", "housebound",
                    "bedbound", "hoist transfer", "slowed up", "limited mobility", "poor mobility"
                    "needs assistance", "difficulty carrying", "falls", "fallen",
                    "found on floor", "long lie"]

In [48]:
#Initialize matcher
matcher = PhraseMatcher(nlp.vocab)

#Apply spaCy nlp pipeline to list of weakness terms
weakness_terms = [nlp(i) for i in weakness_list]


  global_matches = self.global_matcher(doc)


In [49]:
#Add weakness terms to PhraseMatcher
matcher.add("WEAKNESS TERM", weakness_terms)

In [50]:
#Apply matcher to letter A
matchesA = matcher(doc_A)

for match_id, start, end in matchesA: 
  span = doc_A[start:end]
  match_id_string = nlp.vocab.strings[match_id]
  print("Match:",match_id_string, "-", span.text, "( Location = ", start, end, ")")

Match: WEAKNESS TERM - falls ( Location =  27 28 )
Match: WEAKNESS TERM - falls ( Location =  249 250 )
Match: WEAKNESS TERM - fallen ( Location =  297 298 )
Match: WEAKNESS TERM - housebound ( Location =  315 316 )


In [51]:
#Apply matcher to letter B
matchesB = matcher(doc_B)

for match_id, start, end in matchesB: 
  span = doc_B[start:end]
  match_id_string = nlp.vocab.strings[match_id]
  print("Match:",match_id_string, "-", span.text, "( Location = ", start, end, ")")

Match: WEAKNESS TERM - falls ( Location =  172 173 )
Match: WEAKNESS TERM - falls ( Location =  203 204 )
Match: WEAKNESS TERM - falls ( Location =  664 665 )
Match: WEAKNESS TERM - falls ( Location =  720 721 )


# Set-up negation detection

In [52]:
#Define termset as clinical
ts = termset("en_clinical")

#Add negex to nlp pipeline
nlp.add_pipe("negex", config={
    "neg_termset":ts.get_patterns()
})

<negspacy.negation.Negex at 0x7fc2121d90d0>

In [53]:
#View termset patterns in use
print(ts.get_patterns())

{'pseudo_negations': ['no further', 'not able to be', 'not certain if', 'not certain whether', 'not necessarily', 'without any further', 'without difficulty', 'without further', 'might not', 'not only', 'no increase', 'no significant change', 'no change', 'no definite change', 'not extend', 'not cause', 'gram negative', 'not rule out', 'not ruled out', 'not been ruled out', 'not drain', 'no suspicious change', 'no interval change', 'no significant interval change'], 'preceding_negations': ['absence of', 'declined', 'denied', 'denies', 'denying', 'no sign of', 'no signs of', 'not', 'not demonstrate', 'symptoms atypical', 'doubt', 'negative for', 'no', 'versus', 'without', "doesn't", 'doesnt', "don't", 'dont', "didn't", 'didnt', "wasn't", 'wasnt', "weren't", 'werent', "isn't", 'isnt', "aren't", 'arent', 'cannot', "can't", 'cant', "couldn't", 'couldnt', 'never', 'patient was not', 'without indication of', 'without sign of', 'without signs of', 'without any reactions or signs of', 'no comp