In [1]:
import os, sys, glob
import spacy
import pandas as pd

from spacy.pipeline import EntityRuler

In [2]:
sys.path.append('..')

In [3]:
from cycontext.item_data import ItemData
from cycontext.tag_object import TagObject
from cycontext.context_component import ConTextComponent

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
nlp = spacy.load("en_core_web_sm")
_ = nlp.remove_pipe("ner")

# Overview
- Read in data
- Read in modifiers

# I. Read in Knowledge Base

## I.I Targets

In [6]:
targets_filepath = os.path.join(".", "pneumonia", "kb", "pneumonia_targets.tsv")
targets_df = pd.read_csv(targets_filepath, sep="\t")

In [7]:
targets_df.head()

Unnamed: 0,Lex,Type,Regex,Direction,Codes
0,pneumonia,EVIDENCE_OF_PNEUMONIA,pneumonias?,,
1,pneumonia,EVIDENCE_OF_PNEUMONIA,pna,,
2,consolidation,EVIDENCE_OF_PNEUMONIA,,,
3,infiltrate,EVIDENCE_OF_PNEUMONIA,infiltrat(e|es|ion),,
4,effusion,EVIDENCE_OF_PNEUMONIA,effusion,,


In [8]:
target_patterns = []
for i, row in targets_df.fillna('').iterrows():
    literal = row["Lex"]
    category = row["Type"]
    regex = row["Regex"]
    
    if regex == '':
        pattern = [{"LOWER": literal}]
    else:
        pattern = [{"LOWER": {"REGEX": regex}}]
    target_patterns.append({"label": category, "pattern": pattern})


In [9]:
# Use the EntityRuler class for rule-based NER
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(target_patterns)

In [10]:
try:
    _ = nlp.remove_pipe("entity_ruler")
except:
    pass
nlp.add_pipe(ruler)

## I.II Modifiers

In [11]:
filepath = os.path.join(".", "pneumonia", "kb", "pneumonia_modifiers.json")

In [12]:
import json
with open(filepath) as f:
    modifier_data = json.load(f)

In [13]:
modifier_data["patterns"][:10]

[{'literal': 'are ruled out',
  'category': 'DEFINITE_NEGATED_EXISTENCE',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'be ruled out',
  'category': 'INDICATION',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'being ruled out',
  'category': 'INDICATION',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'can be ruled out',
  'category': 'DEFINITE_NEGATED_EXISTENCE',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'cannot be excluded',
  'category': 'AMBIVALENT_EXISTENCE',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'cannot totally be excluded',
  'category': 'PROBABLE_NEGATED_EXISTENCE',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'could be ruled out',
  'category': 'DEFINITE_NEGATED_EXISTENCE',
  'pattern': None,
  'rule': 'backward'},
 {'literal': 'examination',
  'category': 'INDICATION',
  'pattern': [{'LOWER': {'REGEX': '(examination|exam|study)'}}],
  'rule': 'backward'},
 {'literal': 'free',
  'category': 'DEFINITE_NEGATED_EX

In [14]:
item_data = []
for data in modifier_data["patterns"]:
    item = ItemData(**data)
    item_data.append(item)

In [15]:
item_data[:10]

[ItemData: [are ruled out, definite_negated_existence, None, backward],
 ItemData: [be ruled out, indication, None, backward],
 ItemData: [being ruled out, indication, None, backward],
 ItemData: [can be ruled out, definite_negated_existence, None, backward],
 ItemData: [cannot be excluded, ambivalent_existence, None, backward],
 ItemData: [cannot totally be excluded, probable_negated_existence, None, backward],
 ItemData: [could be ruled out, definite_negated_existence, None, backward],
 ItemData: [examination, indication, [{'LOWER': {'REGEX': '(examination|exam|study)'}}], backward],
 ItemData: [free, definite_negated_existence, None, backward],
 ItemData: [has been ruled out, definite_negated_existence, None, backward]]

# II. Read Data

In [16]:
txt_files = glob.glob(os.path.join(".", "pneumonia", "training_v2", "*.txt"))

In [17]:
texts = []
for file in txt_files:
    texts.append(open(file).read())

# III. Process with NLP

## III.II Add ConText

In [18]:
context = ConTextComponent(item_data, nlp)

In [19]:
try:
    nlp.add_pipe(context, last=True)
except:
    nlp.remove_pipe("context")
    nlp.add_pipe(context, last=True)

### III.I Custom sentence segmentation
The spaCy default works poorly with MIMIC data. Should eventually replace with PyRUSH.

In [20]:
from nltk.tokenize import PunktSentenceTokenizer

In [21]:
class SentenceTokenizer:
    
    def __init__(self):
        self.tokenizer = PunktSentenceTokenizer()
        
    def __call__(self, doc):
        sent_spans = self.tokenizer.span_tokenize(doc.text)
        for token in doc:
            token.is_sent_start = False
        for (start, end) in sent_spans:
            sent = doc.char_span(start, end)
            sent[0].is_sent_start = True
        return doc

In [22]:
sent_tokenizer = SentenceTokenizer()

In [23]:
try:
    nlp.remove_pipe('set_custom_boundaries')
except:
    pass
nlp.add_pipe(sent_tokenizer, before="parser")

In [24]:
doc = nlp(texts[12])

## 3.3 Process documents

In [25]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x110279cf8>),
 ('SentenceTokenizer', <__main__.SentenceTokenizer at 0x116595400>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x110408a68>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x11258b198>),
 ('context', <cycontext.context_component.ConTextComponent at 0x1164d60f0>)]

In [26]:
docs = list(nlp.pipe(texts))

# Visualization

In [27]:
from cycontext import viz

In [28]:
from spacy import displacy

In [29]:
doc = docs[41]

In [30]:
doc._.context_graph.edges

[(infiltrate, <TagObject> [Reason, indication]),
 (PNA, <TagObject> [Reason, indication]),
 (infiltrate, <TagObject> [REASON FOR THIS EXAMINATION, indication]),
 (pneumonia, <TagObject> [consistent with, definite_existence])]

In [31]:
viz.visualize_targets(doc
                     , colors ={"EVIDENCE_OF_PNEUMONIA": "orange",
                                          "DEFINITE_NEGATED_EXISTENCE": "blue",
                               "indication": "yellow"})

In [32]:
doc = nlp("Evaluate for pneumonia and effusion")

In [33]:
# doc = nlp(list(doc.sents)[-3].text)

In [34]:
doc._.context_graph.modifiers

[<TagObject> [Evaluate for, indication]]

In [35]:
doc._.context_graph.edges

[(pneumonia, <TagObject> [Evaluate for, indication]),
 (effusion, <TagObject> [Evaluate for, indication])]

In [36]:
viz.visualize_modifiers(doc)