In [1]:
import sys

In [2]:
sys.path.insert(0, "..")

In [3]:
import spacy
from spacy.tokens import Span

import medspacy
from medspacy.preprocess import PreprocessingRule, Preprocessor
from medspacy.ner import TargetRule
from medspacy.context import ConTextRule
from medspacy.section_detection import Sectionizer
from medspacy.postprocess import PostprocessingRule, PostprocessingPattern, Postprocessor
from medspacy.postprocess import postprocessing_functions
from medspacy.visualization import visualize_ent, visualize_dep


import re

# Overview
In this notebook, we'll show how to use a pretrained model for target concept extraction instead of defining rules. We'll then add our additional components to show how medSpaCy can be used to combine statistical NLP with other rule-based components.

As an example, we'll download the [med7](https://github.com/kormilitzin/med7) transformers model which can be used with spacy 3. This won't get all the concepts we're interested in, but will extract drug-related information like names and doses.

We can install this model with `pip` using this GitHub link:
```bash
pip install https://med7.s3.eu-west-2.amazonaws.com/en_core_med7_trf.tar.gz
```

We'll also need to install `spacy-transformers`:
```bash
pip install spacy-transformers
```

In [6]:
# !pip install https://med7.s3.eu-west-2.amazonaws.com/en_core_med7_trf.tar.gz

In [63]:
# !pip install spacy-transformers

In [7]:
with open("./discharge_summary.txt") as f:
    text = f.read()

This model now can be loaded as any other spaCy model. We'll use `medspacy.load()` and pass in this model name.

In [49]:
nlp = medspacy.load("en_core_med7_trf")

In [64]:
nlp.pipe_names

['transformer',
 'ner',
 'medspacy_pyrush',
 'medspacy_target_matcher',
 'medspacy_context']

In [65]:
ner = nlp.get_pipe("ner")

In [66]:
ner.labels

('DOSAGE', 'DRUG', 'DURATION', 'FORM', 'FREQUENCY', 'ROUTE', 'STRENGTH')

In [55]:
doc = nlp(text)

In [56]:
doc.ents

(Hydrochlorothiazide,
 chemo,
 Miconazole Nitrate,
 2 %,
 Powder,
 One (1),
 Appl,
 Topical,
 BID,
 Heparin Sodium (Porcine),
 5,000 unit/mL,
 Solution,
 One (1),
 Injection,
 TID (3 times a day),
 Acetaminophen,
 160 mg/5 mL,
 Elixir,
 One (1),
 PO)

## Process our text
Similar to the last notebook, we'll add new rules to some of our components. Let's first look at what our model extracts out of the box:

In [17]:
visualize_ent(doc)

### Preprocessing

In [18]:
preprocessor = Preprocessor(nlp.tokenizer)

In [19]:
nlp.tokenizer = preprocessor

In [20]:
preprocess_rules = [
    
    PreprocessingRule(
        re.compile("\[\*\*[\d]{1,4}-[\d]{1,2}(-[\d]{1,2})?\*\*\]"),
        repl="01-01-2010",
        desc="Replace MIMIC date brackets with a generic date."
    ),
    
    PreprocessingRule(
        re.compile("\[\*\*[\d]{4}\*\*\]"),
        repl="2010",
        desc="Replace MIMIC year brackets with a generic year."
    ),
    
    PreprocessingRule(
        re.compile("dx'd"), repl="Diagnosed", 
                  desc="Replace abbreviation"
    ),
    
    PreprocessingRule(
        re.compile("tx'd"), repl="Treated", 
                  desc="Replace abbreviation"
    ),
    
        PreprocessingRule(
        re.compile("\[\*\*[^\]]+\]"), 
        desc="Remove all other bracketed placeholder text from MIMIC"
    )
]

  re.compile("\[\*\*[\d]{1,4}-[\d]{1,2}(-[\d]{1,2})?\*\*\]"),
  re.compile("\[\*\*[\d]{4}\*\*\]"),
  re.compile("\[\*\*[^\]]+\]"),


In [21]:
preprocessor.add(preprocess_rules)

### Target Rules
The trained NER will add some new concepts that we weren't getting before, but we can customize with rules.

In [57]:
target_rules = [
    TargetRule(literal="abdominal pain", category="PROBLEM"),
    TargetRule("stroke", "PROBLEM"),
    TargetRule("hemicolectomy", "TREATMENT"),
    TargetRule("Hydrochlorothiazide", "TREATMENT"),
    TargetRule("colon cancer", "PROBLEM"),
    TargetRule("radiotherapy", "PROBLEM",
              pattern=[{"LOWER": "xrt"}]),
    TargetRule("metastasis", "PROBLEM"),
    
    TargetRule("Type II Diabetes Mellitus", "PROBLEM", 
              pattern=[
                  {"LOWER": "type"},
                  {"LOWER": {"IN": ["2", "ii", "two"]}},
                  {"LOWER": {"IN": ["dm", "diabetes"]}},
                  {"LOWER": "mellitus", "OP": "?"}
              ],
              attributes={"icd10": "E11.9"}),
    TargetRule("Hypertension", "PROBLEM",
              pattern=[{"LOWER": {"IN": ["htn", "hypertension"]}}],
              attributes={"icd10": "I10"}),
    
]

In [61]:
target_matcher = nlp.get_pipe("medspacy_target_matcher")

In [62]:
target_matcher.add(target_rules)

### Context

In [23]:
context = nlp.get_pipe("medspacy_context")

In [24]:
context_rules = [
    ConTextRule("diagnosed in <YEAR>", "HISTORICAL", 
               pattern=[
                   {"LOWER": "diagnosed"},
                   {"LOWER": "in"},
                   {"LOWER": {"REGEX": "^[\d]{4}$"}}
               ])
]

  {"LOWER": {"REGEX": "^[\d]{4}$"}}


In [25]:
context.add(context_rules)

### Section detection

In [28]:
sectionizer = nlp.add_pipe("medspacy_sectionizer")

ValueError: [E007] 'medspacy_sectionizer' already exists in pipeline. Existing names: ['transformer', 'ner', 'medspacy_pyrush', 'medspacy_context', 'medspacy_sectionizer']

In [31]:
from medspacy.section_detection import SectionRule

In [33]:
section_rule = SectionRule("Brief Hospital Course:", "hospital-course")

In [34]:
sectionizer.add(section_rule)

### Postprocessing
Here, we'll show another example of how postprocessing can be used. The NER component extracts **"married"** as a **"TREATMENT"** entity. While some might agree with this in a philosophical sense, it doesn't match our clinical definition very well. This shows a challenge of statistical NLP: we have relatively little control over what concepts are extracted by our model. But we can use some postprocessing rules to clean this up.

Postprocessing can be used to remove or clean up entities which we know are incorrect. In this example, we'll just remove any entity where the text is **"married"**:

In [37]:
postprocessor = nlp.add_pipe("medspacy_postprocessor")

In [38]:
postprocess_rules = [
    PostprocessingRule(
        patterns=[
            PostprocessingPattern(condition=lambda ent: ent.text.lower() == "married"),
        ],
        action=postprocessing_functions.remove_ent,
        description="Remove a specific misclassified span of text."
    ),
    
]

In [39]:
postprocessor.add(postprocess_rules)

# Process our document
Now, let's process the text with our complete pipeline and show the results:

In [40]:
nlp.pipe_names

['transformer',
 'ner',
 'medspacy_pyrush',
 'medspacy_context',
 'medspacy_sectionizer',
 'medspacy_postprocessor']

In [41]:
doc = nlp(text)

In [42]:
visualize_ent(doc)

In [46]:
short_text = "Colon cancer dx'd in [**2554**], tx'd with hemicolectomy, chemo"
short_doc = nlp(short_text)

In [47]:
visualize_ent(short_doc)

In [48]:
visualize_dep(short_doc)