In [None]:
import urllib.request
import os
import codecs
import zipfile
import pandas as pd
from IPython.display import display, HTML
import sklearn.metrics

# and also our utilities for this class
from nlp_pneumonia_utils import Annotation
from nlp_pneumonia_utils import AnnotatedDocument
from nlp_pneumonia_utils import read_brat_annotations
from nlp_pneumonia_utils import read_doc_annotations
from nlp_pneumonia_utils import read_annotations
from nlp_pneumonia_utils import calculate_prediction_metrics
from nlp_pneumonia_utils import mark_text
from nlp_pneumonia_utils import clearPyConTextRegularExpressions
from nlp_pneumonia_utils import pneumonia_annotation_html_markup

## First, we'll load our dataset. Throughout these notebooks, we rely on a number of utility functions.

### (OPTIONAL) Feel free to look at them here in the repository : <a href="https://github.com/UUDeCART/decart_rule_based_nlp/blob/master/nlp_pneumonia_utils.py">nlp_pneumonia_utils.py</a> 

In [None]:
# Read the annotated training documents from the zipped training set.
# The result is used as a mapping below (we take its .values()).
annotated_doc_map = read_doc_annotations('data/training_v2.zip')

# Keep a flat list of the same documents; it is more convenient to iterate over.
annotated_docs = list(annotated_doc_map.values())
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# Tally the documents whose positive_label is truthy.
total_positives = sum(1 for doc in annotated_docs if doc.positive_label)

print('Total Positive Pneumonia Documents : {0}'.format(total_positives))

In [None]:
# Select the document with the largest number of annotations.
# Ties resolve to the earliest such document in annotated_docs, and the
# result is None when annotated_docs is empty.
most_annotated_doc = max(
    annotated_docs,
    key=lambda doc: len(doc.annotations),
    default=None,
)

## Next, recall the annotations made by our expert.  Note that there are 3 annotation types in this set:
1. **PNEUMONIA_DOC_YES** -> Document shows **active** or **possible** case of pneumonia
2. **PNEUMONIA_DOC_NO** -> Document shows **no evidence** of pneumonia
3. **EVIDENCE_OF_PNEUMONIA** -> Spans of phrases/sentence which show positive or possible evidence of pneumonia which led the expert annotator to the final document-level conclusion

## Let's render one of our annotated documents in HTML.  When using the function `pneumonia_annotation_html_markup`, the annotation types show up in these colors:
1. **PNEUMONIA_DOC_YES** -> RED
2. **PNEUMONIA_DOC_NO** -> GREEN
3. **EVIDENCE_OF_PNEUMONIA** -> RED

In [None]:
# Build the HTML markup for the selected document, then swap newline
# characters for <br> tags before handing it to the notebook's HTML renderer.
doc_markup = pneumonia_annotation_html_markup(most_annotated_doc)
display(HTML(doc_markup.replace('\n', '<br>')))