# An example for clinical concept extraction with visualization

We highly recommend our [sentence segmentation tool](https://github.com/noc-lab/simple_sentence_segment) for detecting sentence boundaries when the text contains arbitrary line breaks, as the sample text below does. To install this package, just run
```
pip install git+https://github.com/noc-lab/simple_sentence_segment.git
```
Alternatively, you can use the sentence segmentation tools in NLTK or spaCy. You can also use tokenization tools other than NLTK; this example uses NLTK for illustrative purposes.

In [1]:
import nltk
import re
from spacy import displacy
from IPython.core.display import display, HTML

from simple_sentence_segment import sentence_segment
from clinical_concept_extraction import clinical_concept_extraction

In [2]:
# A fabricated (not real) example of a discharge summary that contains
# arbitrary line breaks, i.e. line breaks that fall in the middle of sentences.
sample_text = """
This is an 119 year old woman with a history of diabetes 
who has a CT-scan at 2020-20-20. Insulin is prescribed
for the type-2 diabetes. Within the past year, the diabetic
symptoms have progressively gotten worse.
"""

In [3]:
def parse_text(text):
    """Segment ``text`` into sentences and tokenize each sentence.

    Every sentence found by ``sentence_segment`` is normalized (line breaks
    become spaces, runs of spaces are collapsed, surrounding whitespace is
    stripped) and then tokenized with NLTK's Treebank tokenizer. Sentences
    that are empty after normalization are skipped.

    Args:
        text: The raw text to process.

    Returns:
        A tuple ``(all_sentences, all_spans, normalized_text)`` where

        * ``all_sentences`` is a list with one list of token strings per
          sentence,
        * ``all_spans`` is a parallel list with one ``[start, end]`` pair per
          token; the offsets index into ``normalized_text``,
        * ``normalized_text`` is the normalized sentences joined by ``'\\n'``.
    """
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    all_sentences = []
    all_spans = []
    normalized_sentences = []
    # Offset of the current sentence inside the normalized text.
    start = 0

    for sentence_span in sentence_segment(text):
        # Slice the *argument*, not the notebook-level sample text, so the
        # function works for any input.
        sentence = text[sentence_span[0]:sentence_span[1]]
        sentence = re.sub('\n', ' ', sentence)
        sentence = re.sub(r'\ +', ' ', sentence)
        sentence = sentence.strip()

        if not sentence:
            continue

        tokens = []
        spans = []
        # Distinct name for the token span so the sentence span is not shadowed.
        for token_span in tokenizer.span_tokenize(sentence):
            tokens.append(sentence[token_span[0]:token_span[1]])
            spans.append([start + token_span[0], start + token_span[1]])

        all_sentences.append(tokens)
        all_spans.append(spans)
        normalized_sentences.append(sentence)

        # +1 accounts for the '\n' that separates sentences in the joined text.
        start += len(sentence) + 1

    return all_sentences, all_spans, '\n'.join(normalized_sentences)

In [4]:
# Segment and tokenize the sample report, then show each of the three results.
tokenized_sentences, all_spans, normalized_text = parse_text(sample_text)

print('Variable tokenized_sentences contains token lists for every sentence:')
for sentence_tokens in tokenized_sentences:
    print(sentence_tokens)
print()

print('Variable all_spans contains lists of token spans for every sentence:')
for sentence_spans in all_spans:
    print(sentence_spans)
print()

print(
    'Variable normalized_text contains strings for every sentence concatented by line break:',
    normalized_text,
    sep='\n',
)

Variable tokenized_sentences contains token lists for every sentence:
['This', 'is', 'an', '119', 'year', 'old', 'woman', 'with', 'a', 'history', 'of', 'diabetes', 'who', 'has', 'a', 'CT-scan', 'at', '2020-20-20', '.']
['Insulin', 'is', 'prescribed', 'for', 'the', 'type-2', 'diabetes', '.']
['Within', 'the', 'past', 'year', ',', 'the', 'diabetic', 'symptoms', 'have', 'progressively', 'gotten', 'worse', '.']

Variable all_spans contains lists of token spans for every sentence:
[[0, 4], [5, 7], [8, 10], [11, 14], [15, 19], [20, 23], [24, 29], [30, 34], [35, 36], [37, 44], [45, 47], [48, 56], [57, 60], [61, 64], [65, 66], [67, 74], [75, 77], [78, 88], [88, 89]]
[[90, 97], [98, 100], [101, 111], [112, 115], [116, 119], [120, 126], [127, 135], [135, 136]]
[[137, 143], [144, 147], [148, 152], [153, 157], [157, 158], [159, 162], [163, 171], [172, 180], [181, 185], [186, 199], [200, 206], [207, 212], [212, 213]]

Variable normalized_text contains strings for every sentence concatented by line 

In [5]:
# clinical_concept_extraction takes the per-sentence token lists as input and
# returns the annotations; the following cells index the result as
# all_annotations[sentence_index][token_index], i.e. one annotation per token.
all_annotations = clinical_concept_extraction(tokenized_sentences)

In [6]:
# Show the annotation of every token, with a separator line after each sentence
for sentence_tokens, sentence_annotations in zip(tokenized_sentences, all_annotations):
    token_lines = [
        '%30s %s' % (token, annotation)
        for token, annotation in zip(sentence_tokens, sentence_annotations)
    ]
    for token_line in token_lines:
        print(token_line)
    print('=' * 61)

                          This O
                            is O
                            an O
                           119 O
                          year O
                           old O
                         woman O
                          with O
                             a O
                       history O
                            of O
                      diabetes B-problem
                           who O
                           has O
                             a B-test
                       CT-scan I-test
                            at O
                    2020-20-20 O
                             . O
                       Insulin B-treatment
                            is O
                    prescribed O
                           for O
                           the B-problem
                        type-2 I-problem
                      diabetes I-problem
                             . O
                        Within O
                        

In [7]:
def build_display_elements(tokens, annotations, spans):
    """Merge per-token annotations into entity dicts in displaCy's manual format.

    Args:
        tokens: One list of tokens per sentence; only its lengths are used.
        annotations: Per sentence, one tag per token, e.g. ``'O'``,
            ``'B-problem'`` or ``'I-problem'``.
        spans: Per sentence, one ``[start, end]`` offset pair per token.

    Returns:
        A list of dicts with keys ``'start'``, ``'end'`` and ``'label'``, one
        per run of consecutive tokens that belong to the same entity. The
        label is the tag with its two-character prefix removed. Entities
        never extend across a sentence boundary.
    """
    entities = []

    def emit(tag, begin, finish):
        # Store the currently open entity, if there is one.
        if tag != 'O':
            entities.append({'start': begin, 'end': finish, 'label': tag[2:]})

    for sent_id, sent_tokens in enumerate(tokens):
        # State of the entity being built; 'O' means no entity is open.
        open_tag, open_start, open_end = 'O', None, None

        for token_id in range(len(sent_tokens)):
            tag = annotations[sent_id][token_id]

            if tag == open_tag:
                # Same tag as the open run: just extend its end offset.
                open_end = spans[sent_id][token_id][1]
                continue

            # The tag changed: close the previous run and start a new one.
            emit(open_tag, open_start, open_end)
            # Store non-'O' tags in their 'I-…' form so that a following
            # 'I-…' token compares equal, while a new 'B-…' token does not.
            open_tag = tag if tag == 'O' else 'I' + tag[1:]
            open_start = spans[sent_id][token_id][0]
            open_end = spans[sent_id][token_id][1]

        # Close an entity that runs up to the end of the sentence.
        emit(open_tag, open_start, open_end)

    return entities

In [8]:
# Merge the per-token annotations into entity dicts ('start', 'end', 'label')
# whose offsets index into normalized_text.
ent = build_display_elements(tokenized_sentences, all_annotations, all_spans)

In [9]:
# Input for displaCy's manual mode: the text plus the pre-computed entities.
ent_inp = {
    'text': normalized_text,
    'ents': ent,
    'title': ''
}

# Highlight colour for each entity type.
colors = {'PROBLEM': '#fe4a49', 'TEST': '#fed766', 'TREATMENT': '#2ab7ca'}
options = {'colors': colors}

# jupyter=False makes render() return the markup as a string. Without it,
# spaCy versions that auto-detect a notebook display the markup themselves and
# return None, so the explicit display() below would be handed None.
html = displacy.render(ent_inp, style='ent', manual=True, jupyter=False, options=options)
display(HTML(html))