## Step 1: convert xml to BRAT
Code from Xi Yang (University of Florida)

Modified by Lavender Jiang

In [29]:
import shutil
from pathlib import Path
import xmltodict
import os
import sys
from collections import defaultdict
import numpy as np
import pathlib

In [37]:
# folder for raw files
# Maps each split name to the directory holding the raw i2b2 2012 release.
# NOTE: these are absolute cluster paths; edit them to point at your own copy
# of the corpus before running the notebook elsewhere.
infiles = {"train": "/gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release",
           "test": "/gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml"}

# Output directories (relative to the working directory) for the BRAT files,
# keyed by the same split names as `infiles`.
brat_out = {"train": "brat-train",
            "test": "brat-test"}

def get_paths(d, keys):
    """Build a ``{key: Path}`` mapping for the requested keys.

    Args:
        d: mapping from key to a path string (or anything ``Path`` accepts).
        keys: iterable of keys to look up in ``d``.

    Returns:
        A new dict containing one ``Path`` per entry of ``keys``.

    Raises:
        KeyError: if a requested key is missing from ``d``.
    """
    return {key: Path(d[key]) for key in keys}

def make_dir(d):
    """Create every directory listed as a value of ``d``.

    Args:
        d: mapping whose values are directory paths, either strings or
            ``Path`` objects. The keys are ignored.

    Missing parent directories are created as well, and directories that
    already exist are left untouched.
    """
    print("making dir...")
    for target in d.values():
        # Path() accepts both strings and existing Path objects.
        Path(target).mkdir(parents=True, exist_ok=True)

# Splits shipped with the raw release; a dev split is carved out of "train" in Step 3.
splits = ["train", "test"]
# Convert the configured path strings into Path objects, keyed by split.
in_paths = get_paths(infiles, splits)
out_paths = get_paths(brat_out, splits)
# Make sure the BRAT output directories exist before anything is written to them.
make_dir(out_paths)

making dir...


In [5]:
def copy_text(d_src, d_dest):
    """Copy every ``*.txt`` document of each split into its destination folder.

    The copy is renamed to ``<fid>.txt`` where ``fid`` is the part of the file
    name before the first dot (e.g. ``12.xml.txt`` becomes ``12.txt``).

    Args:
        d_src: mapping from split name to source directory (``Path``).
        d_dest: mapping from split name to destination directory (``Path``).
            Must contain every key of ``d_src``; the directories must exist.

    Raises:
        RuntimeError: if a source directory contains no ``.txt`` file.
    """
    for key in d_src:
        path_src = d_src[key]
        path_dest = d_dest[key]
        print(f"copying text for split {key}. source path {path_src}, dest path {path_dest}")
        # Materialize the glob once so the emptiness check and the copy loop
        # see the same listing (a generator would be exhausted by the check).
        txt_files = list(path_src.glob("*.txt"))
        if not txt_files:
            raise RuntimeError(f"empty directory! please make sure you have .txt file in {path_src}")
        for each in txt_files:
            fid = each.stem.split(".")[0]
            new_file = path_dest / f"{fid}.txt"
            shutil.copyfile(each, new_file)
            print(f"copied doc {fid} to {new_file}")

# Copy the raw .txt documents of both splits into the BRAT output folders.
copy_text(in_paths, out_paths)

copying text for split train. source path /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release, dest path brat-train
copied doc 471 to brat-train/471.txt
copied doc 596 to brat-train/596.txt
copied doc 26 to brat-train/26.txt
copied doc 192 to brat-train/192.txt
copied doc 541 to brat-train/541.txt
copied doc 682 to brat-train/682.txt
copied doc 492 to brat-train/492.txt
copied doc 331 to brat-train/331.txt
copied doc 376 to brat-train/376.txt
copied doc 86 to brat-train/86.txt
copied doc 747 to brat-train/747.txt
copied doc 776 to brat-train/776.txt
copied doc 193 to brat-train/193.txt
copied doc 462 to brat-train/462.txt
copied doc 497 to brat-train/497.txt
copied doc 11 to brat-train/11.txt
copied doc 343 to brat-train/343.txt
copied doc 307 to brat-train/307.txt
copied doc 353 to brat-train/353.txt
copied doc 267 to brat-train/267.txt
copied doc 172 to brat-train/172.txt
copied doc 636 to brat-train/636.txt
copied doc 356 to brat-train/356.txt
copied d

copied doc 358 to brat-test/358.txt
copied doc 402 to brat-test/402.txt
copied doc 562 to brat-test/562.txt
copied doc 607 to brat-test/607.txt
copied doc 261 to brat-test/261.txt
copied doc 361 to brat-test/361.txt
copied doc 368 to brat-test/368.txt
copied doc 128 to brat-test/128.txt
copied doc 383 to brat-test/383.txt
copied doc 32 to brat-test/32.txt
copied doc 293 to brat-test/293.txt
copied doc 542 to brat-test/542.txt
copied doc 617 to brat-test/617.txt
copied doc 581 to brat-test/581.txt
copied doc 347 to brat-test/347.txt
copied doc 806 to brat-test/806.txt
copied doc 712 to brat-test/712.txt
copied doc 221 to brat-test/221.txt
copied doc 687 to brat-test/687.txt
copied doc 416 to brat-test/416.txt
copied doc 263 to brat-test/263.txt
copied doc 132 to brat-test/132.txt
copied doc 327 to brat-test/327.txt
copied doc 657 to brat-test/657.txt
copied doc 208 to brat-test/208.txt
copied doc 138 to brat-test/138.txt
copied doc 171 to brat-test/171.txt
copied doc 238 to brat-test/23

In [6]:
# Template for one BRAT text-bound annotation line; xml2brat fills it with
# (index, type, start offset, end offset, covered text).
BRAT_TEMP = "T{}\t{} {} {}\t{}"
# EVENT types that xml2brat keeps; events of any other type are skipped.
EVENTS = {'PROBLEM', 'TEST', 'TREATMENT', 'CLINICAL_DEPT', 'EVIDENTIAL', 'OCCURRENCE'}
# Placeholder substituted for '&' before XML parsing.
# NOTE(review): not referenced in the visible code; xml2brat hard-codes the same placeholder.
SPEC = {'&': 'AAMMPP'}

def load_file(file, encoding="utf-8"):
    """Read a whole text file and return its content as a string.

    Args:
        file: path of the file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8,
            matching the explicit encoding used by the BRAT/BIO readers later
            in this notebook, instead of the locale-dependent default.

    Returns:
        The full content of the file.
    """
    with open(file, "r", encoding=encoding) as f:
        return f.read()

def write_to_file(data, file, encoding="utf-8"):
    """Write ``data`` to ``file``, replacing any existing content.

    Args:
        data: string to write.
        file: destination path.
        encoding: text encoding of the output. Defaults to UTF-8 because the
            ``.ann`` files produced here are read back with
            ``encoding="utf-8"`` in Step 2; relying on the locale default
            could make the two steps disagree.
    """
    with open(file, "w", encoding=encoding) as f:
        f.write(data)

def xml2brat(p1, p2, verbose=False):
    """Convert every ``*.xml`` annotation file in ``p1`` to a BRAT ``.ann`` file in ``p2``.

    Only ``EVENT`` tags whose ``type`` is listed in ``EVENTS`` are kept. Each
    kept event becomes one ``BRAT_TEMP`` line, numbered from 1 within its file.
    The output file is named after the part of the input file name before the
    first dot, with the ``.ann`` extension.

    Args:
        p1: directory (``Path``) containing the XML files.
        p2: existing directory (``Path``) receiving the ``.ann`` files.
        verbose: when true, print the raw XML of every file.

    Raises:
        AssertionError: if the tags of a file cannot be converted. The
            offending XML and parsed tags are printed first and the original
            exception is chained as the cause.
    """
    # offset need to -1 on the number
    for each in p1.glob("*.xml"):
        brat_anns = []
        idx = 1
        print(f'file name is {each}')
        ofn = p2 / (each.stem.split(".")[0] + ".ann")
        xml = load_file(each)
        if verbose:
            print(xml)
        # Unescaped '&' characters would make the XML parser fail, so swap them
        # for a placeholder and restore them once the annotations are built.
        xml = xml.replace('&', 'AAMMPP')
        tags = xmltodict.parse(xml)['ClinicalNarrativeTemporalAnnotation']['TAGS']
        try:
            # An empty <TAGS/> element is parsed as None: treat it as "no tags".
            events = (tags or {}).get('EVENT') or []
            # xmltodict yields a single dict (not a list) when a tag occurs
            # exactly once; normalize so the loop always sees a list of dicts.
            if not isinstance(events, list):
                events = [events]
            # only keep event tags
            for d in events:
                typ = d['@type']
                if typ in EVENTS:
                    s = int(d['@start']) - 1 # convert from 1-index to 0-index
                    e = int(d['@end']) - 1
                    txt = d['@text']
                    brat_anns.append(BRAT_TEMP.format(idx, typ, s, e, txt))
                    idx += 1
        except Exception as err:
            # Dump the inputs for debugging, then fail loudly. An explicit raise
            # (instead of `assert False`) still fires under `python -O` and keeps
            # the original error attached as the cause.
            print(xml)
            print(tags)
            raise AssertionError(f"failed to convert annotations of {each}") from err
        ot = "\n".join(brat_anns)
        ot = ot.replace('AAMMPP', '&')
        write_to_file(ot, ofn)

def dataset_xml2brat(d_in, d_out):
    """Run ``xml2brat`` once per split.

    Args:
        d_in: mapping from split name to the directory holding the XML files.
        d_out: mapping from split name to the directory receiving the
            ``.ann`` files; must contain every key of ``d_in``.
    """
    for split, src_dir in d_in.items():
        dst_dir = d_out[split]
        print(f"in path is {src_dir}, out path is {dst_dir}")
        xml2brat(src_dir, dst_dir)

In [7]:
# Write one .ann file per XML document into the same folders that received the .txt copies.
dataset_xml2brat(in_paths, out_paths)

in path is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release, out path is brat-train
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/481.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/547.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/11.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/546.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/591.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/122.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/656.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/121.xml
file name is /gpfs/data/oermannlab/public_data/i2b2

file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/801.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/191.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/98.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/156.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/256.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/367.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/366.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/726.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/587.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-

file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/393.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/642.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/28.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/247.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/541.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/291.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/357.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/6.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15.original-annotation.release/468.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-07-15

file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/206.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/317.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/66.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/361.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/288.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/537.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/103.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/287.xml
file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex

file name is /gpfs/data/oermannlab/public_data/i2b2_2012/2012-08-08.test-data.event-timex-groundtruth/xml/32.xml


## Step 2: convert BRAT to BIO
Code from Xi Yang (University of Florida)

Modified by Lavender Jiang

In [8]:
import os
import json
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from shutil import copyfile
import csv

In [9]:
# Blank English pipeline; `nlp` is the tokenizer used by entity_dictionary and brat2bio.
from spacy.lang.en import English
nlp = English()
# NOTE(review): the component returned by create_pipe is not referenced anywhere
# in the visible notebook; add_pipe below registers its own sentencizer by name.
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f4277c80900>

In [38]:
# folder for brat files
# NOTE: this rebinds `infiles`, which held the raw-data directories in Step 1.
# Re-running Step 1 cells after this one would pick up the BRAT folders instead.
infiles = {"train": "brat-train",
           "test": "brat-test"}
# Output directories for the BIO files, keyed by split.
bio_out = {"train": "bio-train",
           "test": "bio-test"}

def get_ann_files(d_path, splits):
    """Collect the document ids that have an ``.ann`` file, per split.

    The id of a file is the part of its name before the first ``.``, further
    cut at the first ``_`` (so ``34_x.ann`` yields ``34``).

    Args:
        d_path: mapping from split name to a directory path.
        splits: iterable of split names to scan.

    Returns:
        A dict mapping each split name to a set of id strings.
    """
    return {
        split: {
            name.split('.')[0].split('_')[0]
            for name in os.listdir(d_path[split])
            if name.endswith('.ann')
        }
        for split in splits
    }

# Document ids (as sets of strings) that have a BRAT annotation file, per split.
ann_files_d = get_ann_files(infiles, ["train", "test"])
# Create the BIO output directories (make_dir accepts plain string paths too).
make_dir(bio_out)

making dir...


In [39]:
# list of entities to retain
# Read as a module-level global by brat2bio, which passes it to get_annotation_entities.
select_types = ['PROBLEM', 'TEST', 'TREATMENT', 'CLINICAL_DEPT', 'EVIDENTIAL', 'OCCURRENCE']

# convert Brat format into BIO format
# function for getting entity annotations from the annotation file
def get_annotation_entities(ann_file, select_types=None):
    """Read the text-bound (``T``) annotations of a BRAT ``.ann`` file.

    Args:
        ann_file: path of the ``.ann`` file (read as UTF-8).
        select_types: optional collection of entity types to keep; when
            ``None`` every type is kept.

    Returns:
        A list of ``(start, end, type)`` tuples sorted by ``(start, end)``.
        Annotations whose end offset is not greater than their start offset
        are dropped.
    """
    found = []
    with open(ann_file, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.startswith('T'):
                continue
            # Second tab-separated column looks like "<type> <start> ... <end>".
            fields = raw.strip().split('\t')[1].split()
            label = fields[0]
            if select_types != None and label not in select_types:
                continue
            end = int(fields[-1])
            begin = int(fields[1])
            if end <= begin:
                continue
            found.append((begin, end, label))
    found.sort(key=lambda ent: ent[:2])
    return found

# function for handling overlap by keeping the entity with largest text span
def remove_overlap_entities(sorted_entities):
    """Resolve overlapping and touching entities in a start-sorted list.

    Each entity is compared with the most recently kept one:

    * it starts before the kept entity ends (overlap): the strictly longer
      span of the two is kept;
    * it starts exactly where the kept entity ends: the kept entity is
      extended to the new end offset and keeps its own type;
    * otherwise it is appended as a new entity.

    Args:
        sorted_entities: ``(start, end, type)`` tuples sorted by start offset.

    Returns:
        A new list of ``(start, end, type)`` tuples.
    """
    kept = []
    for current in sorted_entities:
        if not kept:
            kept.append(current)
            continue
        previous = kept[-1]
        cur_start, cur_end = current[0], current[1]
        prev_start, prev_end = previous[0], previous[1]
        if cur_start > prev_end:
            # Disjoint: start a new entity.
            kept.append(current)
        elif cur_start == prev_end:
            # Touching: merge into the previous entity, keeping its type.
            kept[-1] = (prev_start, cur_end, previous[-1])
        elif cur_end - cur_start > prev_end - prev_start:
            # Overlapping and strictly longer: replace the previous entity.
            kept[-1] = current
    return kept

# inverse index of entity annotations
def entity_dictionary(keep_entities, txt_file):
    """Index the tokens of every entity by their character offset in the document.

    Args:
        keep_entities: ``(start, end, type)`` tuples with character offsets
            into the content of ``txt_file``.
        txt_file: path of the document text (read as UTF-8).

    Returns:
        A dict mapping a document-level character offset to
        ``[token index within the entity, token text, entity type]``. When
        two entities produce the same offset, the first one wins.
    """
    with open(txt_file, "r", encoding="utf-8") as handle:
        text = handle.read()

    f_ann = {}
    for entity in keep_entities:
        span_start, span_end, span_type = entity[0], entity[1], entity[-1]
        # Tokenize the entity text on its own with the module-level pipeline.
        doc = nlp(text[span_start:span_end])
        for position in range(len(doc)):
            # Offset of the token inside the entity, shifted to document level.
            doc_offset = doc[position:].start_char + span_start
            if doc_offset in f_ann:
                continue
            f_ann[doc_offset] = [position, doc[position].text, span_type]
    return f_ann

In [49]:
def brat2bio(inputfiles, inputpath, outputpath, verbose=False):
    """Convert BRAT documents (``.txt`` plus ``.ann``) into token-per-line BIO files.

    For every id in ``inputfiles`` the function reads ``<inputpath>/<id>.ann``
    and ``<inputpath>/<id>.txt`` and writes ``<outputpath>/<id>.bio.txt``.
    Each output line holds: token text, start/end offset within the stripped
    line, start/end offset within the document, and the BIO label. A blank
    line is written after every input line.

    Relies on the module-level ``nlp`` pipeline for tokenization and on the
    module-level ``select_types`` list for the entity types to retain.

    Args:
        inputfiles: iterable of document ids (file names without extension).
        inputpath: directory containing the ``.ann`` and ``.txt`` files.
        outputpath: existing directory receiving the ``.bio.txt`` files.
        verbose: when true, print the input id and output path of every file.

    Raises:
        AssertionError: if a token found at an annotated offset does not
            match the token text recorded for that annotation.
    """
    # Brat -> BIO format conversion
    print(f"converting brat2bio {inputfiles}")
    for infile in inputfiles:
        file = f"{infile}"
        ann_file = f"{inputpath}/{file}.ann"
        txt_file = f"{inputpath}/{file}.txt"
        out_file = f"{outputpath}/{file}.bio.txt"
        if verbose:
            print(f'infile is {infile}')
            print(f'outfile is {out_file}')
        sorted_entities = get_annotation_entities(ann_file, select_types)
        keep_entities = remove_overlap_entities(sorted_entities)
        f_ann = entity_dictionary(keep_entities, txt_file)

        with open(out_file, "w", encoding="utf-8") as f_out, \
                open(txt_file, "r", encoding="utf-8") as f:
            sent_offset = 0  # document offset of the first character of the current line
            prev_label = "O"  # label of the previous token (carried across lines)
            for line in f:
                if '⁄' in line:
                    # Normalize the fraction slash to a plain slash (same
                    # length, so character offsets are unaffected).
                    line = line.replace('⁄', '/')
                # The line is tokenized without its surrounding whitespace, so
                # token offsets are relative to the stripped text. The number of
                # leading whitespace characters removed must be added back when
                # computing document offsets, otherwise every token on an
                # indented line is shifted and its annotations are missed.
                leading_ws = len(line) - len(line.lstrip())
                doc = nlp(line.strip())
                for token in doc:
                    token_sent_offset = token.idx # sentence level local index (stripped line)
                    token_doc_offset = token.idx + leading_ws + sent_offset # document level global index
                    if token_doc_offset in f_ann:
                        ann_text, ann_type = f_ann[token_doc_offset][1], f_ann[token_doc_offset][2]
                        # A token opens an entity (B-) unless the previous token
                        # already carries the same type, in which case it continues it (I-).
                        if prev_label == "O" or not (prev_label.split("-")[1] == ann_type):
                            label = f"B-{ann_type}"
                        else:
                            label = f"I-{ann_type}"
                        if not (ann_text == token.text_with_ws.rstrip()):
                            message = '{} does not match {}'.format(ann_text, token.text_with_ws.rstrip())
                            print(message)
                            # Explicit raise: a bare `assert False` is removed under `python -O`.
                            raise AssertionError(message)
                    else:
                        label = f"O"
                    prev_label = label # update prev_label
                    f_out.write(f"{token.text} {token_sent_offset} {token_sent_offset+len(token.text)} {token_doc_offset} {token_doc_offset+len(token.text)} {label}\n")
                f_out.write('\n')
                sent_offset += (len(line))

def brat2bio_dict(ann_files_d, infiles_d, bio_out_d):
    """Run ``brat2bio`` once per split.

    Args:
        ann_files_d: mapping from split name to the document ids to convert.
        infiles_d: mapping from split name to the BRAT input directory.
        bio_out_d: mapping from split name to the BIO output directory.

    ``infiles_d`` and ``bio_out_d`` must contain every key of ``ann_files_d``.
    """
    for split, file_ids in ann_files_d.items():
        brat_dir = infiles_d[split]
        bio_dir = bio_out_d[split]
        brat2bio(file_ids, brat_dir, bio_dir)

In [50]:
# Convert every annotated document of both splits from BRAT to BIO.
brat2bio_dict(ann_files_d, infiles, bio_out)

converting brat2bio {'273', '247', '308', '722', '252', '366', '422', '676', '96', '462', '721', '42', '577', '302', '472', '726', '346', '337', '321', '496', '491', '72', '313', '291', '426', '386', '116', '212', '152', '517', '612', '756', '36', '791', '236', '458', '123', '191', '681', '272', '482', '173', '156', '237', '201', '332', '86', '666', '168', '502', '107', '411', '1', '582', '432', '193', '407', '433', '717', '576', '636', '427', '417', '188', '786', '167', '126', '697', '512', '248', '348', '622', '777', '707', '166', '186', '522', '481', '6', '16', '757', '637', '492', '311', '367', '801', '642', '471', '256', '438', '192', '611', '682', '271', '357', '307', '807', '81', '388', '413', '547', '38', '242', '647', '92', '23', '2', '596', '318', '177', '316', '736', '151', '93', '47', '343', '203', '396', '182', '51', '8', '352', '373', '121', '153', '218', '452', '546', '701', '656', '141', '98', '521', '797', '497', '331', '26', '336', '382', '68', '163', '353', '437', '3

## Step 3: Combine files to dataset

In [57]:
import random 

random_state = 13
# The ids are stored in sets, whose iteration order for strings changes from
# one interpreter session to the next (hash randomization). Sort them first so
# the seeded split and shuffle below give the same result on every run.
train_pool = sorted(ann_files_d['train'])
train_ids, dev_ids = train_test_split(train_pool, train_size=0.7, random_state=random_state, shuffle=True)
test_ids = sorted(ann_files_d['test'])
random.Random(random_state).shuffle(test_ids)
print(f"train size {len(train_ids)}, val size {len(dev_ids)}, test size {len(test_ids)}")
i2b2_datasets = {"train":train_ids, "dev":dev_ids, "test":test_ids}
# Record the split membership; the context manager closes the file so the
# JSON is fully flushed to disk.
with open("i2b2_2012_datasets.json", "w", encoding="utf-8") as f_json:
    json.dump(i2b2_datasets, f_json)

train size 133, val size 57, test size 120


In [60]:
def make_if_nonexist(dir_s):
    """Create directory ``dir_s`` (and missing parents) unless the path already exists.

    Args:
        dir_s: path of the directory to create.
    """
    if os.path.exists(dir_s):
        return
    os.makedirs(dir_s)

# Root folder of the merged dataset.
data_dir = "dataset"
make_if_nonexist(data_dir)

# Merge BIO format train, validation and test datasets
# For each split: copy the per-document BIO files into dataset/<split>/ and
# concatenate their non-empty contents into dataset/<split>.txt, separated by
# a blank line.
for split in ["train", "dev", "test"]:
    # "train" and "dev" were both carved out of the original training data.
    outputpath = bio_out["test"] if split == "test" else bio_out["train"]
    split_dir = os.path.join(data_dir, split)
    make_if_nonexist(split_dir)
    merged_path = os.path.join(data_dir, f"{split}.txt")
    with open(merged_path, "w", encoding="utf-8") as f:
        for fid in i2b2_datasets[split]:
            bio_file = f"{outputpath}/{fid}.bio.txt"
            copyfile(bio_file, os.path.join(split_dir, f"{fid}.bio.txt"))
            with open(bio_file, "r", encoding="utf-8") as fr:
                txt = fr.read().strip()
            if txt != '':
                f.write(txt + "\n\n")

## Step 4: Convert BIO to NeMo format
Code from NVIDIA NeMo

In [2]:
!python bio2nemo.py --data_file dataset/train.txt

[NeMo I 2022-05-06 13:10:54 bio2nemo:119] Processing dataset/train.txt
[NeMo I 2022-05-06 13:10:54 bio2nemo:124] Processing of the dataset/train.txt is complete


In [5]:
!python bio2nemo.py --data_file dataset/dev.txt

[NeMo I 2022-05-06 13:11:51 bio2nemo:119] Processing dataset/dev.txt
[NeMo I 2022-05-06 13:11:51 bio2nemo:124] Processing of the dataset/dev.txt is complete


In [6]:
!python bio2nemo.py --data_file dataset/test.txt

[NeMo I 2022-05-06 13:11:57 bio2nemo:119] Processing dataset/test.txt
[NeMo I 2022-05-06 13:11:58 bio2nemo:124] Processing of the dataset/test.txt is complete
