In [1]:
import spacy
import medspacy
from medspacy.visualization import visualize_ent

# Overview
In this notebook, we'll look at how to extract clinical concepts and attributes from text.
- Target matching
- Section detection
- Context analysis

In [2]:
with open("./discharge_summary.txt") as f:
    text = f.read()

In [3]:
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Target extraction
In this step, we'll write rules to extract the main concepts we're interested in.

In this example, we'll use two utilities provided in `medspacy.ner` for rule-based matching: the `TargetMatcher` and `TargetRule`. However, you can use any spaCy components for adding spans to `doc.ents`, including pre-trained NER models or other [spaCy rule-based matching components](https://spacy.io/usage/rule-based-matching/).

## Target concepts
In our text, we'll extract the following concepts:
- Diagnoses 
- Medications
In addition, we'll show a few examples of how to add a custom spaCy attribute to a target rule to add an ICD-10 diagnosis code as an attribute of an entity.

In [4]:
from medspacy.ner import TargetMatcher, TargetRule

In [5]:
target_matcher = TargetMatcher(nlp)

In [6]:
nlp.add_pipe(target_matcher)

In [7]:
target_rules1 = [
    TargetRule(literal="abdominal pain", category="PROBLEM"),
    TargetRule("stroke", "PROBLEM"),
    TargetRule("hemicolectomy", "TREATMENT"),
    TargetRule("Hydrochlorothiazide", "TREATMENT"),
    TargetRule("colon cancer", "PROBLEM"),
    TargetRule("radiotherapy", "PROBLEM",
              pattern=[{"LOWER": "xrt"}]),
    TargetRule("metastasis", "PROBLEM"),
    
]

In [8]:
target_matcher.add(target_rules1)

In [9]:
doc = nlp(text)

In [10]:
visualize_ent(doc)

In [11]:
for ent in doc.ents:
    print(ent, ent.label_, ent._.target_rule, sep="  |  ")
    print()

Hydrochlorothiazide  |  TREATMENT  |  TargetRule(literal="Hydrochlorothiazide", category="TREATMENT", pattern=None, attributes=None, on_match=None)

Abdominal pain  |  PROBLEM  |  TargetRule(literal="abdominal pain", category="PROBLEM", pattern=None, attributes=None, on_match=None)

stroke  |  PROBLEM  |  TargetRule(literal="stroke", category="PROBLEM", pattern=None, attributes=None, on_match=None)

abdominal pain  |  PROBLEM  |  TargetRule(literal="abdominal pain", category="PROBLEM", pattern=None, attributes=None, on_match=None)

metastasis  |  PROBLEM  |  TargetRule(literal="metastasis", category="PROBLEM", pattern=None, attributes=None, on_match=None)

Colon cancer  |  PROBLEM  |  TargetRule(literal="colon cancer", category="PROBLEM", pattern=None, attributes=None, on_match=None)

hemicolectomy  |  TREATMENT  |  TargetRule(literal="hemicolectomy", category="TREATMENT", pattern=None, attributes=None, on_match=None)

XRT  |  PROBLEM  |  TargetRule(literal="radiotherapy", category="PR

In [12]:
from spacy.tokens import Span

In [13]:
Span.set_extension("icd10", default="")

In [14]:
target_rules2 = [
    TargetRule("Type II Diabetes Mellitus", "PROBLEM", 
              pattern=[
                  {"LOWER": "type"},
                  {"LOWER": {"IN": ["2", "ii", "two"]}},
                  {"LOWER": {"IN": ["dm", "diabetes"]}},
                  {"LOWER": "mellitus", "OP": "?"}
              ],
              attributes={"icd10": "E11.9"}),
    TargetRule("Hypertension", "PROBLEM",
              pattern=[{"LOWER": {"IN": ["htn", "hypertension"]}}],
              attributes={"icd10": "I10"}),
    
    
]

In [15]:
target_matcher.add(target_rules2)

In [16]:
doc = nlp(text)

In [17]:
for ent in doc.ents:
    if ent._.icd10 != "":
        print(ent, ent._.icd10)

type 2 dm E11.9
Type II Diabetes Mellitus E11.9
Hypertension I10
Type 2 DM E11.9
HTN I10


# Context

In [18]:
from medspacy.context import ConTextComponent, ConTextItem

In [19]:
context = ConTextComponent(nlp, rules="default")

In [20]:
nlp.add_pipe(context)

In [21]:
nlp.pipe_names

['tagger', 'parser', 'target_matcher', 'context']

In [22]:
doc = nlp("Mother with stroke at age 82.")

In [23]:
visualize_ent(doc)

In [24]:
from medspacy.visualization import visualize_dep

In [25]:
visualize_dep(doc)

In [26]:
short_doc = nlp("Colon cancer diagnosed in 2012")

In [27]:
item_data = [
    ConTextItem("diagnosed in <YEAR>", "HISTORICAL", 
               pattern=[
                   {"LOWER": "diagnosed"},
                   {"LOWER": "in"},
                   {"LOWER": {"REGEX": "^[\d]{4}$"}}
               ])
]

In [28]:
context.add(item_data)

In [29]:
short_doc = nlp("Colon cancer diagnosed in 2012")

In [30]:
visualize_ent(short_doc)

In [31]:
visualize_dep(short_doc)

In [32]:
for ent in doc.ents:
    if any([ent._.is_negated, ent._.is_uncertain, ent._.is_historical, ent._.is_family, ent._.is_hypothetical, ]):
        print("'{0}' modified by {1} in: '{2}'".format(ent, ent._.modifiers, ent.sent))
        print()

'stroke' modified by (<TagObject> [Mother, FAMILY],) in: 'Mother with stroke at age 82.'



# Section detection

In [33]:
from medspacy.section_detection import Sectionizer

In [34]:
sectionizer = Sectionizer(nlp, patterns="default")

In [35]:
nlp.add_pipe(sectionizer)

In [36]:
doc = nlp(text)

In [51]:
doc._.section_titles

[None,
 'allergy',
 'chief_complaint',
 'other',
 'present_illness',
 'past_medical_history',
 'sexual_and_social_history',
 'family_history',
 'hospital_course',
 'medication',
 'observation_and_plan',
 'patient_instructions',
 'signature']

In [37]:
visualize_ent(doc)

In [54]:
print(text)

Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-7-1**]

Date of Birth:  [**2498-8-19**]             Sex:   F

Service: SURGERY

Allergies:
Hydrochlorothiazide

Attending:[**First Name3 (LF) 1893**]
Chief Complaint:
Abdominal pain

Major Surgical or Invasive Procedure:
PICC line [**6-25**]
ERCP w/ sphincterotomy [**5-31**]


History of Present Illness:
74y female with type 2 dm and a recent stroke affecting her
speech, who presents with 2 days of abdominal pain. Imaging shows no evidence of metastasis.

Past Medical History:
1. Colon cancer dx'd in [**2554**], tx'd with hemicolectomy, XRT,
chemo. Last colonoscopy showed: Last CEA was in the 8 range
(down from 9)
2. Type II Diabetes Mellitus
3. Hypertension

Social History:
Married, former tobacco use. No alcohol or drug use.

Family History:
Mother with stroke at age 82. no early deaths.
2 daughters- healthy


Brief Hospital Course:
Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound

In [48]:
section_patterns = [
    {"section_title": "hospital_course", "pattern": "Brief Hospital Course:"}
]

In [49]:
sectionizer.add(section_patterns)

In [55]:
visualize_ent(nlp("""
Brief Hospital Course:
Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound at the time of
admission demonstrated pancreatic duct dilitation and an
edematous gallbladder. She was admitted to the ICU.
"""))

For each section detected in the note, we'll print out the **normalized section title**, **section header**, and **the first 25 tokens of the section**:

In [56]:
for (section_title, section_header, section) in doc._.sections:
    print(section_title, section_header)
    print(section[:25])
    print("----------------")

None None
Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-
----------------
allergy Allergies:
Allergies:
Hydrochlorothiazide

Attending:[**First Name3 (LF) 1893**]

----------------
chief_complaint Chief Complaint:
Chief Complaint:
Abdominal pain

Major Surgical or Invasive
----------------
other Procedure:
Procedure:
PICC line [**6-25**]
ERCP w/ sphincterotomy [**5-31*
----------------
present_illness History of Present Illness:
History of Present Illness:
74y female with type 2 dm and a recent stroke affecting her
speech, who presents with 2
----------------
past_medical_history Past Medical History:
Past Medical History:
1. Colon cancer dx'd in [**2554**], tx'd with hemicolectomy, XRT,
----------------
sexual_and_social_history Social History:
Social History:
Married, former tobacco use. No alcohol or drug use.


----------------
family_history Family History:
Family History:
Mother with stroke at age 82. no early deaths.
2 daughters- healthy



------------

Each entity now has a section assigned to it as well:

In [39]:
for ent in doc.ents:
    print(ent, ent._.section_title)
    print()

Hydrochlorothiazide allergy

Abdominal pain chief_complaint

type 2 dm present_illness

stroke present_illness

abdominal pain present_illness

metastasis present_illness

Colon cancer past_medical_history

hemicolectomy past_medical_history

XRT past_medical_history

Type II Diabetes Mellitus past_medical_history

Hypertension past_medical_history

stroke family_history

Type 2 DM observation_and_plan

HTN observation_and_plan

abdominal pain patient_instructions

abdominal pain patient_instructions



# Postprocessing

In [40]:
from medspacy.postprocess import Postprocessor, PostprocessingRule, PostprocessingPattern
from medspacy.postprocess import postprocessing_functions

In [41]:
postprocessor = Postprocessor(debug=False)

In [42]:
nlp.add_pipe(postprocessor)

In [43]:
postprocess_rules = [
    PostprocessingRule(
        patterns=[
            PostprocessingPattern(condition=lambda ent: ent._.section_title == "patient_instructions"),
        ],
        action=postprocessing_functions.remove_ent,
        description="Remove any entities from the instructions section."
    ),
    
]

In [44]:
print("Before:")
print(doc.ents[-5:])

Before:
(stroke, Type 2 DM, HTN, abdominal pain, abdominal pain)


In [45]:
postprocessor.add(postprocess_rules)

In [46]:
doc = nlp(text)

In [47]:
print("After:")
print(doc.ents[-5:])

After:
(Type II Diabetes Mellitus, Hypertension, stroke, Type 2 DM, HTN)
