# Annotation Converter

This notebook converts the original XML file format into plain text, with the markup inserted in its proper place as surrounding tags delimited with '{{' and '}}'.

**To Do:**


In [1]:
import xml.etree.ElementTree as ET
import os
from pathlib import Path
from html import unescape

In [2]:
# Show the kernel's working directory; the relative paths used below resolve against it.
os.getcwd()

'/Users/seth/OneDrive - The University of Colorado Denver/Documents/Development/n2c2/2014 De-identification/training-PHI-Gold-Set1'

In [3]:
# Source directory of the XML annotation files and destination for the converted text.
# NOTE(review): both names are assigned again at the top of the conversion cell
# (Path('.') and Path('./converted')), so the values set here are overridden there.
input_path = Path('../training-PHI-Gold-Set1')
output_path = Path('../data/converted')

In [4]:
def unencode(note):
    """Decode HTML entities in ``note`` and turn curly double quotes into '"'.

    Some files appear to have had HTML escaping applied more than once, so the
    text is unescaped twice.  The curly quotes (U+201C / U+201D) are replaced
    *after* decoding rather than before: that way a quote is normalised no
    matter how it was written -- as ``&#8220;``, as a double-escaped
    ``&amp;#8220;``, as a named entity such as ``&ldquo;``, or as the literal
    character.  (Replacing the ``&#8220;`` entity before decoding misses the
    double-escaped form, which then decodes to a curly quote.)

    Parameters
    ----------
    note : str
        Text that may contain singly or doubly HTML-escaped entities.

    Returns
    -------
    str
        The decoded text with curly double quotes replaced by straight ones.
    """
    decoded = unescape(unescape(note))
    return decoded.replace('\u201c', '"').replace('\u201d', '"')

In [5]:
def insert_tags(text, tag, tag_type, start, end):
    """Wrap ``text[start:end]`` in an opening and a closing ``{{...}}`` marker.

    The marker label is ``tag`` alone when ``tag`` equals ``tag_type``, and
    ``tag:tag_type`` otherwise.  The closing marker carries a leading ``/``.

    Parameters
    ----------
    text : str
        Text to annotate.
    tag, tag_type : str
        Annotation category and sub-type used to build the label.
    start, end : int or value accepted by ``int()``
        Character offsets of the span to wrap.

    Returns
    -------
    tuple
        ``(annotated_text, added_length)`` where ``added_length`` is the
        number of characters the two markers add to the text.
    """
    # Double curly braces do not appear in the original text.
    label = tag if tag == tag_type else ':'.join((tag, tag_type))
    opening = '{{' + label + '}}'
    closing = '{{' + '/' + label + '}}'

    begin, finish = int(start), int(end)
    pieces = [text[:begin], opening, text[begin:finish], closing, text[finish:]]
    return ''.join(pieces), len(opening) + len(closing)

In [6]:
# Convert every XML file in the input directory to a tagged plain-text file.
# NOTE: these two assignments replace the values set in the earlier path
# configuration cell; conversion reads from the current directory and writes
# to ./converted.
input_path = Path('.')
output_path = Path('./converted')

# Create the output directory (with any missing parents); no error if it exists.
output_path.mkdir(parents=True, exist_ok=True)

for file in input_path.glob('*.xml'):
    # Let ElementTree open the file itself so the encoding declared in the XML
    # prolog is honoured rather than the platform's default text encoding.
    tree = ET.parse(file)
    root = tree.getroot()
    note = root[0].text  # the note body; TEXT is the first child in the inspected sample
    offset = 0
    for c in root.iter('TAGS'):
        # The running offset is only valid when tags are applied in ascending
        # start order, so sort explicitly instead of relying on document order.
        # (Overlapping annotations are still not supported.)
        for child in sorted(c, key=lambda el: int(el.attrib['start'])):
            note, new_offset = insert_tags(note,
                                           child.tag,
                                           child.attrib['TYPE'],
                                           int(child.attrib['start']) + offset,
                                           int(child.attrib['end']) + offset)
            offset += new_offset
    # Decode HTML entities only after tagging: the start/end offsets refer to
    # the text as parsed from the XML.
    note = unencode(note)

    # Use only the file's stem so the result always lands inside output_path,
    # even when input_path is not the current directory.
    new_file = file.stem + '.txt'
    with open(output_path / new_file, 'w', encoding='utf-8') as o:
        o.write(note)

## The rest of the notebook inspects one single file to check the processing steps

In [7]:
# Parse one sample file and print its root element.
with open('220-01.xml') as f:
    tree = ET.parse(f)
    root = tree.getroot()
    print(root)

<Element 'deIdi2b2' at 0x108ca9a48>


In [8]:
# Name of the document's root element.
root.tag

'deIdi2b2'

In [9]:
# Attributes carried by the root element.
root.attrib

{}

In [10]:
# List the root's direct children: tag name, attributes and Python type of each.
for child in root:
    print(child.tag, child.attrib)
    print(type(child))

TEXT {}
<class 'xml.etree.ElementTree.Element'>
TAGS {}
<class 'xml.etree.ElementTree.Element'>


In [11]:
# Raw, untagged note text taken from the root's first child.
note = root[0].text

In [12]:
# Print every annotation under TAGS: element name plus its attribute dict.
for c in root.iter('TAGS'):
    for child in c:
        print(child.tag, child.attrib)

DATE {'id': 'P0', 'start': '16', 'end': '26', 'text': '2067-05-03', 'TYPE': 'DATE', 'comment': ''}
AGE {'id': 'P1', 'start': '50', 'end': '52', 'text': '55', 'TYPE': 'AGE', 'comment': ''}
NAME {'id': 'P2', 'start': '290', 'end': '296', 'text': 'Oakley', 'TYPE': 'DOCTOR', 'comment': ''}
DATE {'id': 'P3', 'start': '297', 'end': '303', 'text': '4/5/67', 'TYPE': 'DATE', 'comment': ''}
LOCATION {'id': 'P4', 'start': '343', 'end': '353', 'text': 'Clarkfield', 'TYPE': 'HOSPITAL', 'comment': ''}
DATE {'id': 'P5', 'start': '363', 'end': '367', 'text': '7/67', 'TYPE': 'DATE', 'comment': ''}
AGE {'id': 'P6', 'start': '637', 'end': '639', 'text': '37', 'TYPE': 'AGE', 'comment': ''}
AGE {'id': 'P7', 'start': '694', 'end': '696', 'text': '66', 'TYPE': 'AGE', 'comment': ''}
DATE {'id': 'P8', 'start': '755', 'end': '759', 'text': '2062', 'TYPE': 'DATE', 'comment': ''}
DATE {'id': 'P9', 'start': '899', 'end': '903', 'text': '4/63', 'TYPE': 'DATE', 'comment': ''}
DATE {'id': 'P10', 'start': '940', 'end'

In [13]:
# Angle-bracket delimiter strings.
# NOTE(review): no visible cell references these names; insert_tags defines its
# own '{{' / '}}' delimiters internally, so this cell may be a leftover.
tag_start = '<'
tag_end = '>'
label_end = '/'

In [14]:
# Apply every annotation of the sample file to a copy of its note text.
# `offset` accumulates the characters added by earlier insertions so that the
# start/end positions of later tags still point at the right span.
# NOTE(review): this is only correct if tags come in ascending start order and
# do not overlap -- confirm for the data set.
new_note = root[0].text
offset = 0
for c in root.iter('TAGS'):
    for child in c:
        # print(child.attrib['start'], child.attrib['end'], child.tag, child.attrib['TYPE'])
        new_note, new_offset = insert_tags(new_note,
                                           child.tag,
                                           child.attrib['TYPE'],
                                           int(child.attrib['start']) + offset,
                                           int(child.attrib['end']) + offset)
        offset += new_offset
# Display the tagged result.
new_note

"\n\n\nRecord date: {{DATE}}2067-05-03{{/DATE}}\n\nNarrative History\n\n   {{AGE}}55{{/AGE}} yo woman who presents for f/u \n\n   \n\n   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n\n   \n\n   \n\n   Saw Dr {{NAME:DOCTOR}}Oakley{{/NAME:DOCTOR}} {{DATE}}4/5/67{{/DATE}} - she was happy with results of ETT at {{LOCATION:HOSPITAL}}Clarkfield{{/LOCATION:HOSPITAL}}.  To f/u {{DATE}}7/67{{/DATE}}.  No CP's since last admit.\n\n   \n\n   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.\n\n   \n\n   No smoking for 3 months now!\n\n   \n\n   Still with hotflashes, wakes her up at night.\n\nProblems\n\n      FH breast cancer   {{AGE}}37{{/AGE}} yo s \n\n\n\n      FH myocardial infarction   mother died {{AGE}}66{{/AGE}} yo \n\n\n\n      Hypertension\n\n\n\n      Uterine fibroids   u/s {{DATE}}2062{{/DATE}} \n\n\n\

In [15]:
# Start offset of the tag most recently bound to `child`; relies on the loop
# variable left over from the preceding tagging loop.
int(child.attrib['start'])

2284

In [16]:
# Display the untagged note text for comparison with the tagged version.
note

"\n\n\nRecord date: 2067-05-03\n\nNarrative History\n\n   55 yo woman who presents for f/u \n\n   \n\n   Seen in Cardiac rehab locally last week and BP 170/80.  They called us and we increased her HCTZ to 25 mg from 12.5 mg.  States her BP's were fine there since - 130-140/70-80.\n\n   \n\n   \n\n   Saw Dr Oakley 4/5/67 - she was happy with results of ETT at Clarkfield.  To f/u 7/67.  No CP's since last admit.\n\n   \n\n   Back to work and starting to walk.  No wt loss and discouraged by this, but just starting to exercise.\n\n   \n\n   No smoking for 3 months now!\n\n   \n\n   Still with hotflashes, wakes her up at night.\n\nProblems\n\n      FH breast cancer   37 yo s \n\n\n\n      FH myocardial infarction   mother died 66 yo \n\n\n\n      Hypertension\n\n\n\n      Uterine fibroids   u/s 2062 \n\n\n\n      Smoking\n\n\n\n      hyperlipidemia   CRF mild chol, cigs, HTN, Fhx and known hx CAD in pt. \n\n\n\n      borderline diabetes mellitus   4/63 125 , follow hgbaic \n\n\n\n      VPB 