In [1]:
import nltk
import re
from spacy import displacy
from IPython.core.display import display, HTML

from simple_sentence_segment import sentence_segment
from clinical_concept_extraction import clinical_concept_extraction



In [2]:
# Demo note that a later cell passes to parse_text(). The literal begins and
# ends with a newline and has hard line breaks in the middle of sentences.
sample_text = """
This is an 119 year old woman with a history of diabetes 
who has a CT-scan at 2020-20-20. Insulin is prescribed
for the type-2 diabetes. Within the past year, the diabetic
symptoms have progressively gotten worse.
"""

In [3]:
def parse_text(text):
    """Segment ``text`` into sentences and tokens with character offsets.

    Each sentence span returned by ``sentence_segment`` is cut out of
    ``text``, newlines are replaced by spaces, runs of spaces are collapsed
    and the result is stripped. Empty sentences are dropped. The remaining
    sentences are tokenized with NLTK's ``TreebankWordTokenizer``.

    Args:
        text: Raw input string to segment.

    Returns:
        A ``(all_sentences, all_spans, normalized_text)`` tuple where

        * ``all_sentences`` is a list with one list of token strings per
          sentence,
        * ``all_spans`` is a parallel list of ``[start, end]`` character
          offsets, expressed relative to ``normalized_text``,
        * ``normalized_text`` is the cleaned sentences joined by ``'\\n'``.
    """
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    all_sentences = []
    all_spans = []
    normalized_sentences = []
    # Offset of the current sentence inside the normalized text.
    start = 0
    for sent_span in sentence_segment(text):
        # Slice the argument itself; slicing a notebook-level variable here
        # would return the wrong sentences for any other input.
        sentence = text[sent_span[0]:sent_span[1]]
        sentence = re.sub('\n', ' ', sentence)
        sentence = re.sub(r'\ +', ' ', sentence)
        sentence = sentence.strip()

        if not sentence:
            continue

        tokens = []
        spans = []
        for tok_start, tok_end in tokenizer.span_tokenize(sentence):
            tokens.append(sentence[tok_start:tok_end])
            # Shift sentence-relative offsets into normalized-text offsets.
            spans.append([start + tok_start, start + tok_end])

        all_sentences.append(tokens)
        all_spans.append(spans)

        # +1 accounts for the '\n' that separates sentences in the output.
        start += len(sentence) + 1
        normalized_sentences.append(sentence)
    return all_sentences, all_spans, '\n'.join(normalized_sentences)

In [4]:
# Tokenize the demo note; all_spans holds offsets into normalized_text, which
# is printed so the one-sentence-per-line layout can be inspected.
tokenized_sentences, all_spans, normalized_text = parse_text(sample_text)
print(normalized_text)

This is an 119 year old woman with a history of diabetes who has a CT-scan at 2020-20-20.
Insulin is prescribed for the type-2 diabetes.
Within the past year, the diabetic symptoms have progressively gotten worse.


In [5]:
# Tag the tokenized sentences; the result displayed in the next cell is one
# list of B-/I-/O labels per sentence.
all_annotations = clinical_concept_extraction(tokenized_sentences)

Instructions for updating:
Use the `axis` argument instead
USING SKIP CONNECTIONS
Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
INFO:tensorflow:Restoring parameters from /home/henghuiz/Downloads/cce/blstm/model


In [6]:
# Show the raw per-token labels for inspection.
all_annotations

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-problem',
  'O',
  'O',
  'B-test',
  'I-test',
  'O',
  'O',
  'O'],
 ['B-treatment', 'O', 'O', 'O', 'B-problem', 'I-problem', 'I-problem', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-problem',
  'I-problem',
  'I-problem',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [7]:
def build_display_elements(tokens, annotations, spans):
    """Merge per-token B-/I-/O tags into character-level entity dicts.

    Args:
        tokens: List of sentences, each a list of tokens. Only the length of
            each sentence is used.
        annotations: Per-sentence lists of tags, indexed in parallel with
            ``tokens``.
        spans: Per-sentence lists of ``[start, end]`` offsets, indexed in
            parallel with ``tokens``.

    Returns:
        A list of ``{'start', 'end', 'label'}`` dicts, one per run of tokens
        sharing an entity type. A ``B-`` tag always opens a new run, and a
        run never crosses a sentence boundary.
    """
    entities = []

    def emit(tag, begin, finish):
        # ``tag`` is either the normalized 'I-<type>' form or equal to a tag
        # seen in the input; drop its two-character prefix for the label.
        entities.append({'start': begin, 'end': finish, 'label': tag[2:]})

    for sent_idx, sent_tokens in enumerate(tokens):
        open_tag = 'O'
        open_begin = None
        open_finish = None

        for tok_idx in range(len(sent_tokens)):
            tag = annotations[sent_idx][tok_idx]

            if tag == open_tag:
                # Same tag as the running one: just push the end forward.
                open_finish = spans[sent_idx][tok_idx][1]
                continue

            # Tag changed: close whatever entity was open.
            if open_tag != 'O':
                emit(open_tag, open_begin, open_finish)

            # Store B-x as I-x so a following I-x compares equal, while a
            # following B-x compares different and starts a new entity.
            open_tag = tag if tag == 'O' else 'I' + tag[1:]
            open_begin = spans[sent_idx][tok_idx][0]
            open_finish = spans[sent_idx][tok_idx][1]

        # Flush an entity that runs to the end of the sentence.
        if open_tag != 'O':
            emit(open_tag, open_begin, open_finish)

    return entities

In [8]:
# Collapse the token-level tags into entity dicts with character offsets.
ent = build_display_elements(tokenized_sentences, all_annotations, all_spans)

In [9]:
# Input dict for displaCy: the normalized text, the entity dicts whose
# offsets index into that text, and an empty title.
ent_inp = {
    'text': normalized_text,
    'ents': ent,
    'title': ''
}

# NOTE(review): these keys are upper-case while the entity labels built by
# build_display_elements are lower-case; presumably displaCy upper-cases
# labels before the color lookup -- confirm against the installed version.
colors = {'PROBLEM': '#fe4a49', 'TEST': '#fed766', 'TREATMENT': '#2ab7ca'}
options = {'colors': colors}

# manual=True: the first argument is a plain dict rather than a spaCy Doc.
html = displacy.render(ent_inp, style='ent', manual=True, options=options)
display(HTML(html))