In [28]:
import os, sys, glob
import spacy
import pandas as pd

from spacy.pipeline import EntityRuler

In [10]:
# Make the parent directory importable (presumably so the local `cycontext`
# package imported in the next cell resolves — confirm).
# Guarded so that re-running this cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [11]:
from cycontext.item_data import ItemData
from cycontext.tag_object import TagObject
from cycontext.context_component import ConTextComponent

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Load the small English pipeline without its statistical NER component;
# entities are supplied by the rule-based EntityRuler added further down.
# Disabling at load time avoids loading the component only to remove it and
# leaves IPython's special `_` variable untouched.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Overview
- Read in the knowledge base (targets and modifiers)
- Read in data
- Process the data with the NLP pipeline
- Visualize the results

# I. Read in Knowledge Base

## I.I Targets

In [15]:
# Load the tab-separated target lexicon from the pneumonia knowledge base.
targets_filepath = os.path.join(
    ".", "pneumonia", "kb", "pneumonia_targets.tsv"
)
targets_df = pd.read_csv(targets_filepath, delimiter="\t")

In [16]:
# Preview the first rows of the target lexicon.
targets_df.head()

Unnamed: 0,Lex,Type,Regex,Direction,Codes
0,pneumonia,EVIDENCE_OF_PNEUMONIA,pneumonias?,,
1,consolidation,EVIDENCE_OF_PNEUMONIA,,,
2,infiltrate,EVIDENCE_OF_PNEUMONIA,infiltrat(e|es|ion),,


In [25]:
# Turn every knowledge-base row into an EntityRuler pattern dict of the form
# {"label": <Type>, "pattern": [<token spec>, ...]}.
target_patterns = []
for i, row in targets_df.fillna('').iterrows():
    literal = row["Lex"]
    category = row["Type"]
    regex = row["Regex"]

    if regex == '':
        # "LOWER" is compared with a single token's lowercased text, so emit
        # one spec per whitespace-separated word and lowercase the literal.
        # NOTE(review): whitespace splitting only approximates spaCy's
        # tokenizer (e.g. hyphenated terms) — confirm for such entries.
        pattern = [{"LOWER": word} for word in literal.lower().split()]
    else:
        # The regex is applied to a single token's lowercased text.
        pattern = [{"LOWER": {"REGEX": regex}}]

    if not pattern:
        # Neither a literal nor a regex: nothing to match, skip the row.
        continue
    target_patterns.append({"label": category, "pattern": pattern})
        

In [26]:
# Rule-based NER: build an EntityRuler preloaded with the target patterns.
# overwrite_ents=True lets its matches replace entities already on the doc.
ruler = EntityRuler(nlp, patterns=target_patterns, overwrite_ents=True)

In [27]:
# Drop a ruler left over from a previous run of this cell before adding the
# fresh one. An explicit membership check replaces the former bare `except`,
# which would also have hidden unrelated errors.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
nlp.add_pipe(ruler)

## I.II Modifiers

In [164]:
# Load the tab-separated modifier lexicon from the pneumonia knowledge base.
modifiers_filepath = os.path.join('.', "pneumonia", "kb", "pneumonia_modifiers.tsv")
tsv_mods = pd.read_csv(modifiers_filepath, delimiter="\t")

In [169]:
# Turn every modifier row into a dict with the keys literal / category /
# pattern / rule (the keyword layout later unpacked into ItemData).
mod_patterns = []
for i, row in tsv_mods.fillna("").iterrows():
    regex = row["Regex"]
    if regex == "":
        # No regex given: leave the pattern unset.
        pattern = None
    else:
        # Wrap the regex in a token spec, the same shape used for the target
        # patterns and shown by the modifier data loaded from JSON; a bare
        # string inside a list is not a token pattern.
        pattern = [{"LOWER": {"REGEX": regex}}]
    mod_patterns.append({
        "literal": row["Lex"],
        "category": row["Type"],
        "pattern": pattern,
        "rule": row["Direction"],
    })

In [170]:
import json

In [171]:
# Wrap the modifier dicts under a "patterns" key, the layout that is read back
# from `modifier_data["patterns"]` further down.
data = {"patterns": mod_patterns}

In [172]:
# NOTE(review): the export of `data` to JSON is disabled here; presumably it
# was run once and the file edited by hand afterwards — confirm before
# re-enabling, since opening with "w" would overwrite the existing file.
# outpath = os.path.join(".", "pneumonia", "kb", "pneumonia_modifiers.json")
# with open(outpath, "w") as f:
#     json.dump(data, f, indent=4)

In [199]:
# Load the modifier definitions from the JSON knowledge-base file.
# The path is assigned here because no active cell defines `outpath`, which
# made this cell fail with NameError on a fresh kernel.
outpath = os.path.join(".", "pneumonia", "kb", "pneumonia_modifiers.json")
with open(outpath) as f:
    modifier_data = json.load(f)

JSONDecodeError: Expecting ',' delimiter: line 1334 column 10 (char 38831)

In [176]:
# Inspect the modifier definitions loaded from JSON.
modifier_data

{'patterns': [{'literal': 'are ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'be ruled out',
   'category': 'INDICATION',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'being ruled out',
   'category': 'INDICATION',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'can be ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'cannot be excluded',
   'category': 'AMBIVALENT_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'cannot totally be excluded',
   'category': 'PROBABLE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'could be ruled out',
   'category': 'DEFINITE_NEGATED_EXISTENCE',
   'pattern': None,
   'rule': 'backward'},
  {'literal': 'examination',
   'category': 'INDICATION',
   'pattern': [{'LOWER': {'REGEX': '(examination|exam|study)'}}],
   'rule': 'backward'}]}

In [177]:
# Build one ItemData per modifier definition loaded from JSON, unpacking each
# dict as keyword arguments.
# A comprehension keeps the loop variable local, so the module-level `data`
# dict built earlier is no longer overwritten by the last pattern entry.
item_data = [ItemData(**entry) for entry in modifier_data["patterns"]]

In [178]:
# Inspect the ItemData objects built above.
item_data

[ItemData: [are ruled out, definite_negated_existence, None, backward],
 ItemData: [be ruled out, indication, None, backward],
 ItemData: [being ruled out, indication, None, backward],
 ItemData: [can be ruled out, definite_negated_existence, None, backward],
 ItemData: [cannot be excluded, ambivalent_existence, None, backward],
 ItemData: [cannot totally be excluded, probable_negated_existence, None, backward],
 ItemData: [could be ruled out, definite_negated_existence, None, backward],
 ItemData: [examination, indication, [{'LOWER': {'REGEX': '(examination|exam|study)'}}], backward]]

In [179]:
# NOTE(review): in the visible cell order `doc` is first assigned further
# down, so this cell raises NameError on a fresh Restart & Run All — confirm
# whether it is a leftover that belongs below the cell creating `doc`.
doc._.context_graph.edges

[(pneumonia, <TagObject> [REASON, indication])]

In [180]:
# Build the ConText component from the modifier items and the pipeline.
# NOTE(review): no visible cell adds `context` to `nlp`, yet later cells read
# `doc._.context_graph` — confirm whether an `nlp.add_pipe(context)` is missing.
context = ConTextComponent(item_data, nlp)

In [197]:
# Probe phrase whose lemmas are printed in the next cell.
doc = nlp("might represent")

In [198]:
# Print the lemma of every token in the probe document, one per line.
for tok in doc:
    print(tok.lemma_)

may
represent


# II. Read Data

In [68]:
# Collect the training reports. glob returns paths in arbitrary order, so sort
# them: later cells pick documents by position (e.g. docs[2]) and should
# select the same report on every run.
txt_files = sorted(glob.glob(os.path.join(".", "pneumonia", "training_v2", "*.txt")))

In [69]:
# Read every report into memory. The context manager closes each file handle
# as soon as it has been read instead of leaving it to the garbage collector.
texts = []
for path in txt_files:
    with open(path) as f:
        texts.append(f.read())

# III. Process with NLP

In [70]:
# Run the pipeline over all report texts and materialize the resulting docs.
docs = list(nlp.pipe(texts))

In [90]:
# For each processed document print its entities, the modifiers on its context
# graph and the graph's (target, modifier) edges, then a blank separator line.
for doc in docs:
    graph = doc._.context_graph
    print(doc.ents)
    print(graph.modifiers)
    print(graph.edges)
    print()

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(consolidation, pneumonia)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(pneumonia, pneumonia, Pneumonia)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[(pneumonia, <TagObject> [REASON, indication])]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

(infiltrate, infiltrate)
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[(infiltrate, <TagObject> [Reason, indication])]

()
[<TagObject> [Reason, indication], <TagObject> [but, terminate], <TagObject> [but, terminate], <TagObject> [REASON, indication], <TagObject> [but, terminate], <TagObject> [but, terminate]]
[]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication]]
[]

()
[<TagObject> [Reason, indication], <TagObject> [REASON, indication], <TagObject> [REASON, indication]]
[]

(infiltrate, infiltrate, pneumonia, consolidation)
[<TagObject> [Reaso

In [57]:
# Keep only the documents in which at least one entity was found
# (`ents` is a tuple, so an empty one is falsy).
pos_docs = [candidate for candidate in docs if candidate.ents]

# IV. Visualization

In [37]:
from cycontext import viz

In [39]:
from spacy import displacy

In [118]:
# Select the third processed document for closer inspection below.
doc = docs[2]

In [185]:
# Render the selected document's entities with displaCy, overriding the
# highlight colour for two labels.
ent_colors = {"EVIDENCE_OF_PNEUMONIA": "orange", "REASON": "yellow"}
displacy.render(doc, style="ent", options={"colors": ent_colors})

In [186]:
# Entities found in the selected document.
doc.ents

(pneumonia,)

In [187]:
# Modifiers recorded on the selected document's context graph.
doc._.context_graph.modifiers

[<TagObject> [examination, indication]]

In [188]:
# (target, modifier) pairs recorded on the selected document's context graph.
doc._.context_graph.edges

[(pneumonia, <TagObject> [examination, indication])]

In [189]:
# Call cycontext's target visualisation for the selected document, passing a
# label -> colour mapping.
target_colors = {
    "EVIDENCE_OF_PNEUMONIA": "orange",
    "DEFINITE_NEGATED_EXISTENCE": "blue",
}
viz.visualize_targets(doc, colors=target_colors)

In [190]:
# Call cycontext's modifier visualisation for the selected document.
viz.visualize_modifiers(doc)