In [1]:
import os, sys, glob
import spacy
import pandas as pd

from spacy.pipeline import EntityRuler

In [2]:
sys.path.append('..')

In [3]:
from cycontext.item_data import ItemData
from cycontext.tag_object import TagObject
from cycontext.context_component import ConTextComponent

In [4]:
%load_ext autoreload
%autoreload 2

In [148]:
nlp = spacy.load("en_core_web_sm")
_ = nlp.remove_pipe("ner")

# Overview
- Read in data
- Read in modifiers

# I. Read in Knowledge Base

## I.I Targets

In [6]:
targets_filepath = os.path.join(".", "pneumonia", "kb", "pneumonia_targets.tsv")
targets_df = pd.read_csv(targets_filepath, sep="\t")

In [7]:
targets_df.head()

Unnamed: 0,Lex,Type,Regex,Direction,Codes
0,pneumonia,EVIDENCE_OF_PNEUMONIA,pneumonias?,,
1,consolidation,EVIDENCE_OF_PNEUMONIA,,,
2,infiltrate,EVIDENCE_OF_PNEUMONIA,infiltrat(e|es|ion),,
3,effusion,EVIDENCE_OF_PNEUMONIA,effusion,,


In [8]:
target_patterns = []
for i, row in targets_df.fillna('').iterrows():
    literal = row["Lex"]
    category = row["Type"]
    regex = row["Regex"]
    
    if regex == '':
        pattern = [{"LOWER": literal}]
    else:
        pattern = [{"LOWER": {"REGEX": regex}}]
    target_patterns.append({"label": category, "pattern": pattern})
        

In [9]:
# Use the EntityRuler class for rule-based NER
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(target_patterns)

In [149]:
try:
    _ = nlp.remove_pipe("entity_ruler")
except:
    pass
nlp.add_pipe(ruler)

## I.II Modifiers

In [11]:
tsv_mods = pd.read_csv(os.path.join('.', "pneumonia", "kb", "pneumonia_modifiers.tsv"), sep="\t")

In [12]:
mod_patterns = []
for i, row in tsv_mods.fillna("").iterrows():
    d = {"literal": row["Lex"],
        "category": row["Type"],
        "pattern": row["Regex"],
        "rule": row["Direction"]}
    if d["pattern"] == "":
        d["pattern"] = None
    else:
        d["pattern"] = [d["pattern"]]
    mod_patterns.append(d)

In [13]:
import json

In [14]:
data = {"patterns": mod_patterns}

In [125]:
outpath = os.path.join(".", "pneumonia", "kb", "pneumonia_modifiers.json")
# with open(outpath, "w") as f:
#     json.dump(data, f, indent=4)

In [126]:
with open(outpath) as f:
    modifier_data = json.load(f)

In [127]:
modifier_data

{'patterns': [{'literal': 'are ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'be ruled out',
   'category': 'INDICATION',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'being ruled out',
   'category': 'INDICATION',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'can be ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'cannot be excluded',
   'category': 'AMBIVALENT_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'cannot totally be excluded',
   'category': 'PROBABLE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'could be ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'examination',
   'category': 'INDICATION',
   'pattern': [{'LOWER': {'REGEX': '(examination|exam|study)'}}],
   'rule': 'backward'},
  {'literal

In [128]:
item_data = []
for data in modifier_data["patterns"]:
    item = ItemData(**data)
    item_data.append(item)

In [129]:
item_data

[ItemData: [are ruled out, definite_negated_existence, None, backward],
 ItemData: [be ruled out, indication, None, backward],
 ItemData: [being ruled out, indication, None, backward],
 ItemData: [can be ruled out, definite_negated_existence, None, backward],
 ItemData: [cannot be excluded, ambivalent_existence, None, backward],
 ItemData: [cannot totally be excluded, probable_negated_existence, None, backward],
 ItemData: [could be ruled out, definite_negated_existence, None, backward],
 ItemData: [examination, indication, [{'LOWER': {'REGEX': '(examination|exam|study)'}}], backward],
 ItemData: [free, definite_negated_existence, None, backward],
 ItemData: [has been ruled out, definite_negated_existence, None, backward],
 ItemData: [have been ruled out, definite_negated_existence, None, backward],
 ItemData: [is in the differential, ambivalent_existence, [{'LOWER': 'is'}, {'LOWER': 'in'}, {'LOWER': 'the'}, {'LOWER': 'differential'}], backward],
 ItemData: [is negative, definite_negat

# II. Read Data

In [31]:
txt_files = glob.glob(os.path.join(".", "pneumonia", "training_v2", "*.txt"))

In [32]:
texts = []
for file in txt_files:
    texts.append(open(file).read())

# III. Process with NLP

## III.II Add ConText

In [130]:
context = ConTextComponent(item_data, nlp)

In [150]:
try:
    nlp.add_pipe(context)
except:
    nlp.remove_pipe("context")
    nlp.add_pipe(context)

### III.I Custom sentence segmentation
The spaCy default works poorly with MIMIC data. Should eventually replace with PyRUSH.

In [132]:
from nltk.tokenize import PunktSentenceTokenizer

In [133]:
def set_custom_boundaries(doc, tokenizer):
    sent_spans = tokenizer.span_tokenize(doc.text)
    for token in doc:
        token.is_sent_start = False
    for (start, end) in sent_spans:
        sent = doc.char_span(start, end)
        sent[0].is_sent_start = True
    return doc

In [151]:
try:
    nlp.remove_pipe('set_custom_boundaries')
except:
    pass
nlp.add_pipe(set_custom_boundaries, before="parser")

In [135]:
tokenizer = PunktSentenceTokenizer()

In [136]:
cfg = {'set_custom_boundaries': {"tokenizer": tokenizer}}

In [137]:
doc = nlp(texts[12], component_cfg=cfg)

for sent in doc.sents:
    print("---", sent)
    print()

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [Increased, historical]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

--- 


     DATE: [**2919-8-31**] 5:14 PM
     CHEST (PA & LAT)                                                Clip # [**Clip Number (Radiology) 11222**]
     Reason: infiltrate vs.

--- pulmonary edema
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      50 year old man with cough and hemoptysis and now progressive O2 requirement,
      signs of CHF on exam.

--- PMH of MVR.
     

--- REASON FOR THIS EXAMINATION:


## 3.3 Process documents

In [138]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11f85e7f0>),
 ('set_custom_boundaries',
  <function __main__.set_custom_boundaries(doc, tokenizer)>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x12a9aed08>),
 ('context', <cycontext.context_component.ConTextComponent at 0x126868c88>)]

In [139]:
docs = list(nlp.pipe(texts, component_cfg=cfg))

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [increased, historical]

<TagObject> [excluded, definite_negated_existence]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [prior study, historical]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObje

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [No definite evidence of, definite_negated_existence]

<TagObject> [No definite evidence of, definite_negated_existence]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR, conj]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXA

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [increase, historical]

<TagObject> [increase, historical]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [REASON FOR THIS EXAMINATION, indication]

<TagObject> [increase, historical]

<TagObject> [REASON FOR THIS EXAMINATION, indication]



In [66]:
for doc in docs:
    print(doc.ents)
    print(doc._.context_graph.modifiers)
    print(doc._.context_graph.edges)
    print()

(effusion, effusion)
[<TagObject> [old, historical], <TagObject> [REASON FOR, conj], <TagObject> [REASON FOR THIS EXAMINATION, indication], <TagObject> [EXAMINATION, indication], <TagObject> [INDICATION, indication], <TagObject> [examination, indication], <TagObject> [appear, probable_existence], <TagObject> [unchanged, historical], <TagObject> [unchanged, historical], <TagObject> [unchanged, historical], <TagObject> [again, historical], <TagObject> [No, definite_negated_existence], <TagObject> [Improving, historical], <TagObject> [stable, historical], <TagObject> [unchanged, historical]]
[(effusion, <TagObject> [unchanged, historical]), (effusion, <TagObject> [stable, historical])]

(consolidation, effusion, pneumonia)
[<TagObject> [old, historical], <TagObject> [REASON FOR, conj], <TagObject> [REASON FOR THIS EXAMINATION, indication], <TagObject> [EXAMINATION, indication], <TagObject> [INDICATION, indication], <TagObject> [consistent with, definite_existence], <TagObject> [unchanged,

# Visualization

In [140]:
from cycontext import viz

In [141]:
from spacy import displacy

In [142]:
doc = docs[12]

In [143]:
doc._.context_graph.modifiers

[<TagObject> [Reason, indication],
 <TagObject> [old, historical],
 <TagObject> [signs of, probable_existence],
 <TagObject> [exam, indication],
 <TagObject> [REASON, indication],
 <TagObject> [REASON FOR, conj],
 <TagObject> [REASON FOR THIS EXAMINATION, indication],
 <TagObject> [EXAMINATION, indication],
 <TagObject> [previous, historical],
 <TagObject> [prior, historical],
 <TagObject> [increased, historical],
 <TagObject> [increased, historical],
 <TagObject> [involving, conj],
 <TagObject> [history of, historical],
 <TagObject> [worrisome, probable_existence],
 <TagObject> [chronic, historical],
 <TagObject> [evaluation, indication],
 <TagObject> [could represent, probable_existence],
 <TagObject> [no, definite_negated_existence],
 <TagObject> [evidence, definite_existence],
 <TagObject> [No, definite_negated_existence],
 <TagObject> [Increased, historical],
 <TagObject> [Increased, historical],
 <TagObject> [involving, conj]]

In [144]:
doc._.context_graph.edges

[]

In [152]:
doc = list(nlp.pipe(["Reason for this examination: infiltrate"], component_cfg=cfg))[0]

ValueError: [E084] Error assigning label ID 5227382582633743683 to span: not in StringStore.

In [147]:
viz.visualize_targets(doc
                     , colors ={"EVIDENCE_OF_PNEUMONIA": "orange",
                                          "DEFINITE_NEGATED_EXISTENCE": "blue",
                               "indication": "yellow"})

In [97]:
for sent in doc.sents:
    print("---", sent)
    print()

--- 


     DATE: [**2919-8-31**] 5:14 PM
     CHEST (PA & LAT)                                                Clip # [**Clip Number (Radiology) 11222**]
     Reason: infiltrate vs.

--- pulmonary edema
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      50 year old man with cough and hemoptysis and now progressive O2 requirement,
      signs of CHF on exam.

--- PMH of MVR.
     

--- REASON FOR THIS EXAMINATION:
      infiltrate vs.

--- pulmonary edema
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATIONS:  Cough and hemoptysis.
     
     

--- PA AND LATERAL CHEST:  Comparison is made to previous films from  [**2919-8-17**].
     

--- The patient has prior MBR and median sternotomy.
     
     

--- Note is made of increased nodular and linear opacities involving the right
     upper lobe and left lower lobe.

--

In [73]:
viz.visualize_modifiers(
    nlp("The patchy opacity in the left lower lobe could represent bronchopneumonia.",component_cfg=component_cfg))

TypeError: set_custom_boundaries() got an unexpected keyword argument 'batch_size'