In [1]:
import os, sys, glob
import spacy
import pandas as pd

from spacy.pipeline import EntityRuler

In [2]:
sys.path.append('..')

In [3]:
from cycontext.item_data import ItemData
from cycontext.tag_object import TagObject
from cycontext.context_component import ConTextComponent

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
nlp = spacy.load("en_core_web_sm")
_ = nlp.remove_pipe("ner")

# Overview
- Read in data
- Read in modifiers

# I. Read in Knowledge Base

## I.I Targets

In [6]:
targets_filepath = os.path.join(".", "pneumonia", "kb", "pneumonia_targets.tsv")
targets_df = pd.read_csv(targets_filepath, sep="\t")

In [7]:
targets_df.head()

Unnamed: 0,Lex,Type,Regex,Direction,Codes
0,pneumonia,EVIDENCE_OF_PNEUMONIA,pneumonias?,,
1,consolidation,EVIDENCE_OF_PNEUMONIA,,,
2,infiltrate,EVIDENCE_OF_PNEUMONIA,infiltrat(e|es|ion),,
3,effusion,EVIDENCE_OF_PNEUMONIA,effusion,,


In [8]:
target_patterns = []
for i, row in targets_df.fillna('').iterrows():
    literal = row["Lex"]
    category = row["Type"]
    regex = row["Regex"]
    
    if regex == '':
        pattern = [{"LOWER": literal}]
    else:
        pattern = [{"LOWER": {"REGEX": regex}}]
    target_patterns.append({"label": category, "pattern": pattern})
        

In [9]:
# Use the EntityRuler class for rule-based NER
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(target_patterns)

In [10]:
try:
    _ = nlp.remove_pipe("entity_ruler")
except:
    pass
nlp.add_pipe(ruler)

## I.II Modifiers

In [35]:
item_data = [
    ItemData("no", "DEFINITE_NEGATED_EXISTENCE", rule="forward"),
    ItemData("no evidence of", "DEFINITE_NEGATED_EXISTENCE", rule="forward"),
    ItemData("evidence of", "POSITIVE_EXISTENCE", rule="forward"),
]


In [36]:
item_data

[ItemData: [no, definite_negated_existence, None, forward],
 ItemData: [no evidence of, definite_negated_existence, None, forward],
 ItemData: [evidence of, positive_existence, None, forward]]

In [37]:
context = ConTextComponent(item_data, nlp)

# II. Read Data

In [16]:
# txt_files = glob.glob(os.path.join(".", "pneumonia", "training_v2", "*.txt"))

In [19]:
texts = ["No evidence of pneumonia."]

# III. Process with NLP

In [38]:
try:
    nlp.add_pipe(context)
except:
    nlp.remove_pipe("context")
    nlp.add_pipe(context)

In [39]:
docs = list(nlp.pipe(texts))

In [40]:
doc = docs[0]

In [41]:
phrase_matcher = context.phrase_matcher
matches = phrase_matcher(doc)

In [53]:
modifiers

[<TagObject> [No, definite_negated_existence],
 <TagObject> [No evidence of, definite_negated_existence],
 <TagObject> [evidence of, positive_existence]]

In [66]:
modifiers = list(doc._.context_graph.modifiers)

unpruned = sorted(modifiers, key=lambda x: (x.start, x.end))

In [67]:
curr_mod = unpruned[0]
next_mod = unpruned[1]

In [68]:
curr_mod.span[0] in next_mod.span

True

In [60]:
next_mod.span

No evidence of

In [79]:
def overlaps(self, other):
    if self.span[0] in other.span:
        return True
    if self.span[-1] in other.span:
        return True
    if other.span[0] in self.span:
        return True
    if other.span[-1] in self.span:
        return True
    return False

In [84]:
unpruned

[]

In [138]:
modifiers = list(doc._.context_graph.modifiers)
modifiers = sorted(modifiers, key=lambda x: (x.start, x.end))

def prune_overlapping_modifiers(modifiers):
    # Don't prune a single modifier
    if len(modifiers) == 1:
        return modifiers
    
    # Make a copy
    unpruned = list(modifiers)
    pruned = []
    num_mods = len(unpruned)
    curr_mod = unpruned.pop(0)
    
    
    while True:
        if len(unpruned) == 0:
            pruned.append(curr_mod)
            break
        if len(unpruned) == 1:
            pruned.append(unpruned.pop(0))
            break
        next_mod = unpruned.pop(0)
        
        # Check if they overlap
        if curr_mod.overlaps(next_mod):
            # Choose the larger
            longer_span = max(curr_mod, next_mod, key=lambda x: (x.end - x.start))
            pruned.append(longer_span)
            curr_mod = next_mod
        else:
            pruned.append(curr_mod)
            curr_mod = unpruned.pop(0)
    
    # Recursion base point
    if len(pruned) == num_mods:
        return pruned
    else:
        return prune_overlapping_modifiers(pruned)

In [139]:
rslt = prune_overlapping_modifiers(modifiers)

In [140]:
rslt

[<TagObject> [evidence of, positive_existence]]

In [113]:
overlaps(rslt[0], rslt[1])

True

In [76]:
pruned

[<TagObject> [No evidence of, definite_negated_existence],
 <TagObject> [evidence of, positive_existence]]

In [44]:
matches

[(746762829127501960, 0, 1),
 (5533571732986600803, 0, 3),
 (15180167692696242062, 1, 3)]

In [32]:
doc._.context_graph.modifiers

[<TagObject> [No, definite_negated_existence],
 <TagObject> [No evidence of, definite_negated_existence]]

In [224]:
for doc in docs:
    print(doc.ents)
    print(doc._.context_graph.modifiers)
    print(doc._.context_graph.edges)
    print()

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(consolidation, pneumonia)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(pneumonia, pneumonia, Pneumonia)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[(pneumonia, <TagObject> [REASON, indication])]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(infiltrate, infiltrate)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[(infiltrate, <TagObject> [Reason, indication])]

()
[<TagObject> [Reason, indication], <TagObject> [but, terminate], <TagObject> [but, terminate], <TagObject> [REASON, indication], <TagObject> [but, terminate], <TagObject> [but, terminate]]
[]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication], <TagObject> [REASON, indication]]
[]

(infiltrate, infiltrate, pneumonia, consolidation)
[<TagObject> [Reaso

In [57]:
pos_docs = [doc for doc in docs if len(doc.ents) > 0]

# Visualization

In [225]:
from cycontext import viz

In [226]:
from spacy import displacy

In [241]:
doc = docs[8]

In [242]:
viz.visualize_targets(doc
                     , colors ={"EVIDENCE_OF_PNEUMONIA": "orange",
                                          "DEFINITE_NEGATED_EXISTENCE": "blue"})

In [190]:
viz.visualize_modifiers(doc)