In [1]:
import os, sys, glob
import spacy
import pandas as pd

from spacy.pipeline import EntityRuler
import json

In [2]:
# Put the parent directory on the import path (presumably so the local `cycontext`
# checkout imported in the next cell resolves — confirm against the repo layout).
# Guarded so that re-running the cell does not append a duplicate entry.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
from cycontext import item_data
from cycontext.item_data import ItemData
from cycontext.tag_object import TagObject
from cycontext.context_component import ConTextComponent

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Load the "en_core_web_sm" spaCy model as the base pipeline for this notebook.
nlp = spacy.load("en_core_web_sm")
# Drop the model's "ner" pipe; entities are supplied further down by a rule-based
# EntityRuler instead. The return value is bound to `_`, presumably just to keep the
# cell from echoing it.
_ = nlp.remove_pipe("ner")

# Overview
- Read in data
- Read in modifiers

# I. Read in Knowledge Base

## I.I Targets

In [6]:
# Target (entity) patterns are stored as JSON inside the pneumonia knowledge base.
_target_parts = (".", "pneumonia", "kb", "pneumonia_targets.json")
targets_filepath = os.path.join(*_target_parts)

In [7]:
# Parse the knowledge-base file and keep only the list stored under "patterns".
with open(targets_filepath) as kb_file:
    target_patterns = json.load(kb_file)["patterns"]

In [8]:
target_patterns

[{'label': 'EVIDENCE_OF_PNEUMONIA',
  'pattern': [{'LOWER': {'REGEX': 'pneumonias?'}}]},
 {'label': 'EVIDENCE_OF_PNEUMONIA', 'pattern': [{'LOWER': {'REGEX': 'pna'}}]},
 {'label': 'EVIDENCE_OF_PNEUMONIA',
  'pattern': [{'LOWER': {'REGEX': 'consolidations?'}}]},
 {'label': 'EVIDENCE_OF_PNEUMONIA',
  'pattern': [{'LOWER': {'REGEX': 'infiltrat(e|es|ion)'}}]},
 {'label': 'EVIDENCE_OF_PNEUMONIA',
  'pattern': [{'LOWER': {'REGEX': 'effusion'}}]},
 {'label': 'EVIDENCE_OF_PNEUMONIA',
  'pattern': [{'LOWER': {'REGEX': 'opacit(y|ies)'}}]},
 {'label': 'CONDITION',
  'pattern': [{'LOWER': 'congestive'},
   {'LOWER': 'heart'},
   {'LOWER': 'failure'}]},
 {'label': 'CONDITION', 'pattern': [{'LOWER': {'REGEX': 'chf'}}]},
 {'label': 'CONDITION', 'pattern': [{'LOWER': 'svt'}]},
 {'label': 'CONDITION',
  'pattern': [{'LOWER': 'supraventricular'}, {'LOWER': 'tachycardia'}]},
 {'label': 'CONDITION',
  'pattern': [{'LOWER': 'cardiac'}, {'LOWER': 'failure'}]}]

In [9]:
# Use the EntityRuler class for rule-based NER
# NOTE(review): overwrite_ents=True presumably lets ruler matches replace entities
# that are already set on the Doc — confirm against the spaCy EntityRuler docs.
ruler = EntityRuler(nlp, overwrite_ents=True)
# Register the target patterns that were read from the knowledge-base JSON.
ruler.add_patterns(target_patterns)

In [10]:
# Make the cell safe to re-run: if an "entity_ruler" pipe is already registered,
# drop it before adding the freshly built ruler. Checking the pipe names explicitly
# (rather than wrapping remove_pipe in a bare `except: pass`) keeps genuine errors
# visible instead of silently swallowing them.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
nlp.add_pipe(ruler)

## I.II Modifiers

In [11]:
# Modifier definitions live in the same knowledge-base folder as the target patterns.
_modifier_parts = (".", "pneumonia", "kb", "pneumonia_modifiers.json")
filepath = os.path.join(*_modifier_parts)

In [12]:
# Load the modifier rules from the knowledge-base JSON.
# The loader is imported under its own name because this assignment rebinds
# `item_data` from the `cycontext.item_data` module to the loaded rules; calling
# `item_data.from_json(...)` therefore only worked the first time the cell was run.
# The variable name `item_data` is kept because later cells read it.
from cycontext.item_data import from_json as load_item_data

item_data = load_item_data(filepath)

# II. Read Data

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so the
# positional look-ups used further down (texts[2], texts[12], docs[45]) refer to
# the same report on every run and on every machine.
txt_files = sorted(glob.glob(os.path.join(".", "pneumonia", "training_v2", "*.txt")))

In [3]:
# Read every report into memory, one string per file, in the order of `txt_files`.
# The context manager closes each handle as soon as it has been read instead of
# leaving open files for the garbage collector to clean up.
texts = []
for txt_path in txt_files:
    with open(txt_path) as report:
        texts.append(report.read())

In [34]:
texts[2]

'\n\n\n     DATE: [**3106-8-10**] 11:46 AM\n     CHEST (PORTABLE AP)                                             Clip # [**Clip Number (Radiology) 7189**]\n     Reason: pneumonia                                                   \n     ______________________________________________________________________________\n     UNDERLYING MEDICAL CONDITION:\n             70 year old woman S/P MVA                                                \n             \n     REASON FOR THIS EXAMINATION:\n      pneumonia                                                                       \n     ______________________________________________________________________________\n                                     FINAL REPORT\n     PORTABLE CHEST:\n     \n     Compared to 1 day earlier.\n     \n     INDICATION: Pneumonia. S/P motor vehicle accident.\n     \n     An ETT and NG tube remain in satisfactory position. There is widening of the\n     mediastinum with increased soft tissue density noted lateral to t

In [32]:
print(texts[2])




     DATE: [**3106-8-10**] 11:46 AM
     CHEST (PORTABLE AP)                                             Clip # [**Clip Number (Radiology) 7189**]
     Reason: pneumonia                                                   
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
             70 year old woman S/P MVA                                                
             
     REASON FOR THIS EXAMINATION:
      pneumonia                                                                       
     ______________________________________________________________________________
                                     FINAL REPORT
     PORTABLE CHEST:
     
     Compared to 1 day earlier.
     
     INDICATION: Pneumonia. S/P motor vehicle accident.
     
     An ETT and NG tube remain in satisfactory position. There is widening of the
     mediastinum with increased soft tissue density noted lateral to the aortic
     knob ca

In [None]:
# NOTE(review): `t` is not defined in any visible cell and this cell has no
# execution count — it looks like an unfinished leftover. Confirm and remove it;
# as written it would raise NameError on Restart & Run All.
t

# III. Process with NLP

## III.II Add ConText

In [15]:
# Build the ConText component from the modifier rules in `item_data` and the
# shared `nlp` pipeline object.
context = ConTextComponent(item_data, nlp)

In [16]:
# Append ConText as the last pipe. If an earlier run of this cell already
# registered a "context" pipe, remove it first so the cell can be re-executed.
# The explicit check replaces a bare `except`, which treated *any* failure of
# add_pipe as "already registered" and could mask unrelated errors.
if "context" in nlp.pipe_names:
    nlp.remove_pipe("context")
nlp.add_pipe(context, last=True)

## III.I Custom sentence segmentation
spaCy's default sentence segmentation works poorly on MIMIC data, so sentence boundaries are set with NLTK's Punkt tokenizer instead. TODO: eventually replace this with PyRuSH.

In [17]:
from nltk.tokenize import PunktSentenceTokenizer

In [18]:
class SentenceTokenizer:
    """Pipeline component that sets sentence boundaries from NLTK's Punkt tokenizer.

    The component is callable on a spaCy ``Doc``. It must run before the
    dependency parser, because it assigns ``Token.is_sent_start`` on every token.
    """

    def __init__(self):
        # Untrained Punkt tokenizer with NLTK's default parameters.
        self.tokenizer = PunktSentenceTokenizer()

    def __call__(self, doc):
        """Mark the first token of every Punkt sentence as a sentence start.

        Args:
            doc: the document to segment; its ``text`` is passed to Punkt and its
                tokens are updated in place.

        Returns:
            The same ``doc``, with ``is_sent_start`` set to True on each token
            whose character offset begins a Punkt sentence and False elsewhere.

        Only the *start* offset of each Punkt span is used. The previous
        implementation built a span with ``doc.char_span(start, end)`` and
        indexed its first token; ``char_span`` returns None whenever an offset
        does not line up with a token boundary (for example a sentence end that
        falls inside a token, or an empty text), which crashed the pipeline.
        A start offset that falls inside a token is ignored, so that sentence
        is merged with the preceding one instead of raising.
        """
        sent_start_offsets = {
            start for start, _end in self.tokenizer.span_tokenize(doc.text)
        }
        for token in doc:
            # token.idx is the token's character offset within doc.text
            token.is_sent_start = token.idx in sent_start_offsets
        return doc

In [19]:
sent_tokenizer = SentenceTokenizer()

In [20]:
# The component is registered under its class name, "SentenceTokenizer" (see the
# nlp.pipeline listing below), not "set_custom_boundaries". The old removal
# therefore never matched, and re-running the cell tried to add a second pipe with
# the same name. Remove by the real name, and only when it is present.
if "SentenceTokenizer" in nlp.pipe_names:
    nlp.remove_pipe("SentenceTokenizer")
nlp.add_pipe(sent_tokenizer, before="parser")

In [21]:
doc = nlp(texts[12])

## III.III Process documents

In [22]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x11be9cdd8>),
 ('SentenceTokenizer', <__main__.SentenceTokenizer at 0x11e1ae7f0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x11c027a68>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x11bdacba8>),
 ('context', <cycontext.context_component.ConTextComponent at 0x11e1cd160>)]

In [23]:
docs = list(nlp.pipe(texts))

# IV. Visualization

In [24]:
from cycontext import viz

In [25]:
from spacy import displacy

In [26]:
doc = docs[45]

In [27]:
doc._.context_graph.edges

[(effusion, <TagObject> [r/o, indication]),
 (effusion, <TagObject> [r/o, indication]),
 (effusion, <TagObject> [R/O, indication]),
 (cardiac failure,
  <TagObject> [No definite evidence of, definite_negated_existence])]

In [28]:
# Highlight colour for each label passed to the entity visualizer.
label_colors = {
    "EVIDENCE_OF_PNEUMONIA": "orange",
    "DEFINITE_NEGATED_EXISTENCE": "blue",
    "indication": "yellow",
}
viz.visualize_ent(doc, colors=label_colors)

In [29]:
doc = nlp("left lower lobe process suggesting pneumonia")

In [33]:
viz.visualize_dep(doc)