In [1]:
import os

In [2]:
# First, make sur to download the corpora, and set the following paths accordingly
# LINNAEUS : https://github.com/BaderLab/Transfer-Learning-BNER-Bioinformatics-2018/blob/master/corpora/LINNAEUS_GSC_brat.tar.gz
# COPIOUS : http://www.nactem.ac.uk/copious/copious_published.zip

LINNAEUS_DIR = "./LINNAEUS_GSC_brat"
S800_DIR = "./S800_GSC_brat"
COPIOUS_DIR = "./COPIOUS_GSC_brat"
BB_DIR = "./BB_GSC_brat"

In [3]:
from corpus_processing import *

In [4]:
PATH_TO_COPIOUS_GSC_BRAT = os.path.join(COPIOUS_DIR, 'copious_published')
PATH_TO_COPIOUS_CLEAN = os.path.join(COPIOUS_DIR, 'copious_clean')
PATH_TO_COPIOUS_ASCII = os.path.join(COPIOUS_DIR, 'copious_ascii')

process_dataset(PATH_TO_COPIOUS_GSC_BRAT, PATH_TO_COPIOUS_CLEAN, clean_corpus)
process_dataset(PATH_TO_COPIOUS_CLEAN, PATH_TO_COPIOUS_ASCII, utf8_to_ascii)

In [5]:
PATH_TO_LINNAEUS_GSC_BRAT = os.path.join(LINNAEUS_DIR, 'LINNAEUS')
PATH_TO_LINNAEUS_CLEAN = os.path.join(LINNAEUS_DIR, 'linnaeus_clean')
PATH_TO_LINNAEUS_ASCII = os.path.join(LINNAEUS_DIR, 'linnaeus_ascii')

process_dataset(PATH_TO_LINNAEUS_GSC_BRAT, PATH_TO_LINNAEUS_CLEAN, clean_corpus, dev="valid")
process_dataset(PATH_TO_LINNAEUS_GSC_BRAT, PATH_TO_LINNAEUS_ASCII, function=utf8_to_ascii, dev="valid")

In [6]:
PATH_TO_BB_CLEAN = os.path.join(BB_DIR, 'bb_clean')
PATH_TO_BB_ASCII = os.path.join(BB_DIR, 'bb_ascii')

process_dataset(PATH_TO_BB_CLEAN, PATH_TO_BB_ASCII, function=utf8_to_ascii)

In [2]:
# Utility function : copy and apply a function to the train, test, and dev subsets

def process_dataset(input_dir, output_dir, function, train="train", test="test", dev="dev"):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        
    if dev:
        shutil.copytree(os.path.join(input_dir, dev), os.path.join(output_dir, 'dev'))
        function(os.path.join(output_dir, 'dev'))
    if train:
        shutil.copytree(os.path.join(input_dir, train), os.path.join(output_dir, 'train'))
        function(os.path.join(output_dir, 'train'))
    if test:
        shutil.copytree(os.path.join(input_dir, test), os.path.join(output_dir, 'test'))
        function(os.path.join(output_dir, 'test'))

    if train and dev:
        shutil.copytree(os.path.join(output_dir, train), os.path.join(output_dir, 'train_dev'))
        for item in os.listdir(os.path.join(output_dir, 'dev')):
            s = os.path.join(output_dir, 'dev', item)
            d = os.path.join(output_dir, 'train_dev', item)
            shutil.copy2(s, d)

### Clean corpora : remove \r\n newline, trim whitespaces, remove overlapping entities

In [11]:
def split_offsets(row):
    items = row["offsets"].split(" ")
    row["type"] = items[0]    
    row["start"] = int(items[1])
    row["end"] = int(items[-1])
    return row

# Read annotations (Standoff format)
def get_ann(ann_filename):
    try:
        doc_ann = pd.read_csv(ann_filename, sep = "\t", header=None)
    except EmptyDataError:
        return pd.DataFrame()
    else:
        if not doc_ann.empty:
            doc_ann.columns = ["index", "offsets", "text"]
            doc_ann = doc_ann.set_index("index")
            doc_ann = doc_ann.dropna()
            doc_ann = doc_ann.apply(split_offsets, axis=1)
            doc_ann = doc_ann.drop(["offsets"], axis=1)
            doc_ann = doc_ann[(doc_ann['type'] == "Taxon") | (doc_ann['type'] == "Microorganism")]
            doc_ann = doc_ann.replace("Taxon", "LIVB")
            doc_ann = doc_ann.replace("Microorganism", "LIVB")
    return doc_ann

In [12]:
# Remove \r\n newlines       
def remove_newlines(txtfile):
    with open(txtfile, "r",  newline="") as f:
        data = f.read().replace("\r\n", "\n")
    with open(txtfile, "w") as f:
        f.write(data)
    return data

In [13]:
# Trim whitespaces from the start and the end of entities
def trim_whitespaces(start, end, text):
    invalid_span_tokens = re.compile(r'\s')
    valid_start = start
    valid_end = end
    ent_start = 0
    ent_end = len(text)
    while ent_start < len(text) and invalid_span_tokens.match(
        text[ent_start]):
        ent_start += 1
    while ent_end > 1 and invalid_span_tokens.match(
        text[ent_end - 1]):
        ent_end -= 1
    return valid_start+ent_start, valid_start+ent_end

In [14]:
# From a list of intervals (boundaries), 
# replace a set of overlapping entities by their union
def get_max_non_overlapping(intervals):
    i = 0
    while i < len(intervals)-1:
        extended = None
        j = i+1
        while j < len(intervals):
            extended = overlap(intervals[i], intervals[j])
            if extended:
                break
            else:
                j += 1
        if extended:
            intervals[i] = extended
            intervals.pop(j)
        else:
            i += 1
    return intervals
            
# Return True if the boundaries of two entities overlap
def overlap(ival, jval):
    if jval[0] <= ival[0] and ival[1] <= jval[1]:
        return jval
    if ival[0] <= jval[0] and jval[1] <= ival[1]:
        return ival
    if ival[0] < jval[0] and jval[0] < ival[1] and ival[1] < jval[1]:
        return (ival[0], jval[1])
    if jval[0] < ival[0] and ival[0] < jval[1] and jval[1] < ival[1]:
        return (jval[0], ival[1])
    return None

In [15]:
def clean_corpus(input_dir):

    data = []
    for document in glob(os.path.join(input_dir, "*.txt")):

        filename = os.path.basename(document)
        ann_filename = document[:-3]+"ann"
        doc_ann = get_ann(ann_filename)
        
        text = remove_newlines(document)
        
        offsets = [(row['start'], row['end']) for _, row in doc_ann.iterrows()]
        offsets = get_max_non_overlapping(offsets)
        
        entities = []
        for offset in offsets:
            start, end = trim_whitespaces(offset[0], offset[1], text[offset[0]:offset[1]])
            entities += [{"offsets":" ".join(["LIVB", str(start), str(end)]), "text":text[start:end].replace("\n"," ")}] 
                
        df = pd.DataFrame(entities)
        df = df.rename('T{}'.format)

        df.to_csv(ann_filename, sep = "\t", header=None, quoting=csv.QUOTE_NONE)

In [16]:
PATH_TO_COPIOUS_GSC_BRAT = '../corpora/COPIOUS_GSC_brat/copious_published'
PATH_TO_COPIOUS_CLEAN = '../corpora/COPIOUS_GSC_brat/copious_clean_no_overlap'

process_dataset(PATH_TO_COPIOUS_GSC_BRAT, PATH_TO_COPIOUS_CLEAN, clean_corpus)

In [69]:
PATH_TO_LINNAEUS_GSC_BRAT = '../corpora/LINNAEUS_GSC_brat/LINNAEUS'
PATH_TO_LINNAEUS_CLEAN = '../corpora/LINNAEUS_GSC_brat/LINNAEUS_clean'

process_dataset(PATH_TO_LINNAEUS_GSC_BRAT, PATH_TO_LINNAEUS_CLEAN, clean_corpus, dev="valid")

In [65]:
PATH_TO_BB_CLEAN = '../corpora/BB_GSC_brat/BB'
PATH_TO_BB_OVERL = '../corpora/BB_GSC_brat/bb_clean_no_overlap'

process_dataset(PATH_TO_BB_CLEAN, PATH_TO_BB_OVERL, clean_corpus, test=None)

[{'offsets': 'LIVB 19 49', 'text': 'L. lactis subsp. cremoris B697'}, {'offsets': 'LIVB 230 268', 'text': 'L. lactis subsp. cremoris strain B1157'}, {'offsets': 'LIVB 458 462', 'text': 'B697'}]
[{'offsets': 'LIVB 52 66', 'text': 'Staphylococcus'}, {'offsets': 'LIVB 68 79', 'text': 'Micrococcus'}, {'offsets': 'LIVB 81 94', 'text': 'Psychrobacter'}, {'offsets': 'LIVB 96 108', 'text': 'Enterococcus'}, {'offsets': 'LIVB 113 127', 'text': 'Brevibacterium'}, {'offsets': 'LIVB 545 559', 'text': 'Staphylococcus'}, {'offsets': 'LIVB 561 572', 'text': 'Micrococcus'}, {'offsets': 'LIVB 574 585', 'text': 'Macrococcus'}, {'offsets': 'LIVB 587 599', 'text': 'Enterococcus'}, {'offsets': 'LIVB 601 614', 'text': 'Lactobacillus'}, {'offsets': 'LIVB 616 630', 'text': 'Carnobacterium'}, {'offsets': 'LIVB 632 643', 'text': 'Leuconostoc'}, {'offsets': 'LIVB 645 659', 'text': 'Brevibacterium'}, {'offsets': 'LIVB 661 676', 'text': 'Corynebacterium'}, {'offsets': 'LIVB 678 689', 'text': 'Brochothrix'}, {'offse

[{'offsets': 'LIVB 217 220', 'text': 'HIV'}, {'offsets': 'LIVB 565 568', 'text': 'HIV'}, {'offsets': 'LIVB 695 716', 'text': 'Chlamydia trachomatis'}, {'offsets': 'LIVB 968 977', 'text': 'chlamydia'}]
[{'offsets': 'LIVB 102 134', 'text': 'Mycobacterium tuberculosis H37Rv'}, {'offsets': 'LIVB 517 543', 'text': 'Mycobacterium tuberculosis'}]
[{'offsets': 'LIVB 68 89', 'text': 'Penicillium crustosum'}, {'offsets': 'LIVB 129 155', 'text': 'Penicillium brevicompactum'}, {'offsets': 'LIVB 258 279', 'text': 'Penicillium crustosum'}, {'offsets': 'LIVB 340 352', 'text': 'P. crustosum'}, {'offsets': 'LIVB 479 495', 'text': 'Escherichia coli'}, {'offsets': 'LIVB 1101 1112', 'text': 'Penicillium'}]
[{'offsets': 'LIVB 42 73', 'text': 'L. lactis subsp. cremoris B1157'}, {'offsets': 'LIVB 166 187', 'text': 'C. ammoniagenes B1506'}]
[{'offsets': 'LIVB 181 198', 'text': 'Coxiella burnetii'}, {'offsets': 'LIVB 250 267', 'text': 'Coxiella burnetii'}, {'offsets': 'LIVB 376 387', 'text': 'C. burnetii'}, {'

[{'offsets': 'LIVB 69 83', 'text': 'Staphylococcus'}, {'offsets': 'LIVB 88 98', 'text': 'micrococci'}, {'offsets': 'LIVB 99 113', 'text': 'corynebacteria'}, {'offsets': 'LIVB 154 164', 'text': 'E. faecium'}, {'offsets': 'LIVB 219 230', 'text': 'enterococci'}, {'offsets': 'LIVB 245 257', 'text': 'Enterococcus'}, {'offsets': 'LIVB 382 394', 'text': 'Lactobacilli'}, {'offsets': 'LIVB 510 521', 'text': 'Pseudomonas'}, {'offsets': 'LIVB 626 637', 'text': 'Pseudomonas'}]
[{'offsets': 'LIVB 68 91', 'text': 'Microbacterium foliorum'}, {'offsets': 'LIVB 93 110', 'text': 'Psychrobacter sp.'}, {'offsets': 'LIVB 115 131', 'text': 'Proteus vulgaris'}]
[{'offsets': 'LIVB 59 81', 'text': 'Listeria monocytogenes'}]
[{'offsets': 'LIVB 9 21', 'text': 'mycobacteria'}, {'offsets': 'LIVB 181 193', 'text': 'mycobacteria'}, {'offsets': 'LIVB 417 446', 'text': 'M. fortuitum-chelonei complex'}, {'offsets': 'LIVB 500 510', 'text': 'M. marinum'}, {'offsets': 'LIVB 530 541', 'text': 'M. ulcerans'}]
[{'offsets': '

[{'offsets': 'LIVB 54 72', 'text': 'Lactococcus lactis'}, {'offsets': 'LIVB 198 219', 'text': 'Debaryomyces hansenii'}, {'offsets': 'LIVB 221 240', 'text': 'Geotrichum candidum'}, {'offsets': 'LIVB 242 259', 'text': 'Kluyveromyces sp.'}, {'offsets': 'LIVB 264 283', 'text': 'Yarrowia lipolytica'}, {'offsets': 'LIVB 630 656', 'text': 'Brevibacterium aurantiacum'}, {'offsets': 'LIVB 658 673', 'text': 'Corynebacterium'}, {'offsets': 'LIVB 675 687', 'text': 'Arthrobacter'}, {'offsets': 'LIVB 692 711', 'text': 'Staphylococcus spp.'}]
[{'offsets': 'LIVB 609 631', 'text': 'Pseudomonas aeruginosa'}, {'offsets': 'LIVB 633 653', 'text': 'Enterobacter cloacae'}, {'offsets': 'LIVB 655 662', 'text': 'E. coli'}, {'offsets': 'LIVB 770 793', 'text': 'Acinetobacter baumannii'}, {'offsets': 'LIVB 902 914', 'text': 'Enterococcus'}, {'offsets': 'LIVB 919 926', 'text': 'Candida'}, {'offsets': 'LIVB 928 940', 'text': 'Enterococcus'}]
[{'offsets': 'LIVB 50 76', 'text': 'Streptococcus thermophilus'}, {'offsets

[{'offsets': 'LIVB 41 60', 'text': 'Lb. plantarum (FH3)'}, {'offsets': 'LIVB 62 76', 'text': 'H. alvei (B16)'}, {'offsets': 'LIVB 78 95', 'text': 'Lc. lactis (D5.3)'}, {'offsets': 'LIVB 136 147', 'text': 'Leuconostoc'}, {'offsets': 'LIVB 149 160', 'text': 'Lactococcus'}, {'offsets': 'LIVB 162 175', 'text': 'Lactobacillus'}, {'offsets': 'LIVB 209 220', 'text': 'Pseudomonas'}, {'offsets': 'LIVB 306 317', 'text': 'Pseudomonas'}]
[{'offsets': 'LIVB 57 70', 'text': 'Campylobacter'}, {'offsets': 'LIVB 214 227', 'text': 'Campylobacter'}, {'offsets': 'LIVB 432 445', 'text': 'Campylobacter'}, {'offsets': 'LIVB 1002 1020', 'text': 'Campylobacter coli'}, {'offsets': 'LIVB 1407 1420', 'text': 'Campylobacter'}, {'offsets': 'LIVB 1558 1571', 'text': 'Campylobacter'}]
[{'offsets': 'LIVB 85 101', 'text': 'Proteus vulgaris'}, {'offsets': 'LIVB 106 126', 'text': 'Alcaligenes faecalis'}, {'offsets': 'LIVB 193 204', 'text': 'P. vulgaris'}, {'offsets': 'LIVB 238 249', 'text': 'P. vulgaris'}, {'offsets': 'L

[{'offsets': 'LIVB 0 12', 'text': 'Methylocella'}, {'offsets': 'LIVB 98 121', 'text': 'Methylocella silvestris'}, {'offsets': 'LIVB 308 320', 'text': 'Methylocella'}]
[{'offsets': 'LIVB 32 46', 'text': 'Actinobacteria'}, {'offsets': 'LIVB 52 73', 'text': 'Corynebacterium casei'}, {'offsets': 'LIVB 81 93', 'text': 'C. variabile'}, {'offsets': 'LIVB 198 225', 'text': 'Brachybacterium alimentarum'}, {'offsets': 'LIVB 235 249', 'text': 'Brevibacterium'}, {'offsets': 'LIVB 326 337', 'text': 'B. permense'}, {'offsets': 'LIVB 376 385', 'text': 'B. linens'}, {'offsets': 'LIVB 427 441', 'text': 'B. aurantiacum'}, {'offsets': 'LIVB 480 500', 'text': 'Brevibacterium  spp.'}, {'offsets': 'LIVB 596 609', 'text': 'staphylococci'}, {'offsets': 'LIVB 628 656', 'text': 'Staphylococcus saprophyticus'}, {'offsets': 'LIVB 698 712', 'text': 'Staph. equorum'}, {'offsets': 'LIVB 812 828', 'text': 'Proteus vulgaris'}, {'offsets': 'LIVB 833 853', 'text': 'Alcaligenes faecalis'}, {'offsets': 'LIVB 1197 1216', '

[{'offsets': 'LIVB 41 64', 'text': 'Serratia marcescens MG1'}, {'offsets': 'LIVB 87 106', 'text': 'Serratia marcescens'}, {'offsets': 'LIVB 199 216', 'text': 'S. marcescens MG1'}, {'offsets': 'LIVB 1513 1526', 'text': 'S. marcescens'}, {'offsets': 'LIVB 1659 1676', 'text': 'S. marcescens MG1'}]
[{'offsets': 'LIVB 16 33', 'text': 'Coxiella burnetii'}, {'offsets': 'LIVB 85 102', 'text': 'Coxiella burnetii'}, {'offsets': 'LIVB 313 324', 'text': 'C. burnetii'}, {'offsets': 'LIVB 347 358', 'text': 'C. burnetii'}, {'offsets': 'LIVB 850 861', 'text': 'C. burnetii'}]
[{'offsets': 'LIVB 28 42', 'text': 'H. alvei (B16)'}]
[{'offsets': 'LIVB 54 63', 'text': 'L. lactis'}, {'offsets': 'LIVB 109 133', 'text': 'Streptococcus pneumoniae'}, {'offsets': 'LIVB 161 184', 'text': 'Lactobacillus rhamnosus'}]
[{'offsets': 'LIVB 58 79', 'text': 'Brevibacterium linens'}, {'offsets': 'LIVB 168 188', 'text': 'Brevibacterium  spp.'}]
[{'offsets': 'LIVB 20 43', 'text': 'Proteus vulgaris  1 M10'}, {'offsets': 'LIVB

### Replace non-ascii characters in corpus

In [4]:
# See : https://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space

def utf8_to_ascii(input_dir):
    for document in glob(os.path.join(input_dir, "*.txt")):
        with open(document, "r") as f:
            data = f.read()
            asciidata = re.sub(r'[^\x00-\x7F]+',' ', data)
        with open(document, "w") as f:
            f.write(asciidata)

In [18]:
PATH_TO_COPIOUS_OVERL = '../corpora/COPIOUS_GSC_brat/copious_clean_no_overlap'
PATH_TO_COPIOUS_ASCII = '../corpora/COPIOUS_GSC_brat/copious_ascii'

process_dataset(PATH_TO_COPIOUS_OVERL, PATH_TO_COPIOUS_ASCII, utf8_to_ascii)

In [5]:
PATH_TO_BB_OVERL = '../corpora/BB_GSC_brat/bb_clean_no_overlap'
PATH_TO_BB_ASCII = '../corpora/BB_GSC_brat/bb_ascii'

process_dataset(PATH_TO_BB_OVERL, PATH_TO_BB_ASCII, function=utf8_to_ascii)

In [None]:
PATH_TO_LINNAEUS_GSC_BRAT = '../corpora/LINNAEUS_GSC_brat/LINNAEUS'
PATH_TO_LINNAEUS_ASCII = '../corpora/LINNAEUS_GSC_brat/LINNAEUS_ascii'

process_dataset(PATH_TO_LINNAEUS_GSC_BRAT, PATH_TO_LINNAEUS_ASCII, function=utf8_to_ascii, dev="valid")

### Build CRAFT corpus

In [34]:
import os
from glob import glob
import shutil
import pandas as pd

In [79]:
path_to_craft = "./CRAFT-4.0.1"
path_to_craft_brat = "./CRAFT_GSC_brat"

In [80]:
output_dir = os.path.join(path_to_craft_brat, "craft")
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
for src in glob(os.path.join(path_to_craft, "articles", "txt", "*.txt")):
    shutil.copy(src, output_dir)

In [81]:
import xml.etree.ElementTree as ET 

def iter_class(annotations):
    class_dict = {}
    for ann in annotations:
        if ann.tag == "classMention":
            yield {"id": ann.attrib["id"], "class": ann[0].attrib["id"]} 

def iter_ann(annotations):
    ann_dict = {}
    for ann in annotations:
        if ann.tag == "annotation":
            for item in ann:
                print(item.tag, item.attrib, item.text)
            yield {"offset": f"LIVB {ann[2].attrib['start']} {ann[2].attrib['end']}", "text": ann[3].text, "id": ann[0].attrib["id"]}

def xml_to_ann(xmlfile):
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    df = pd.DataFrame(list(iter_ann(tree.getroot())))
    df.set_index("id", inplace=True)
    class_df = pd.DataFrame(list(iter_class(tree.getroot())))
    class_df.set_index("id", inplace=True)
    df["id"] = class_df["class"]
    df = df.reset_index(drop=True)
    df = df.rename('T{}'.format)
    return df

In [83]:
for src in glob(os.path.join(path_to_craft, "concept-annotation/NCBITaxon/NCBITaxon/knowtator", "*.xml")):
    df = xml_to_ann(src)
    ann_filename = os.path.basename(src).split(".")[0]+".ann"
    df.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)

mention {'id': 'organism_Instance_100042'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '35036', 'end': '35040'} None
spannedText {} Mice
mention {'id': 'organism_Instance_23274'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '2342', 'end': '2346'} None
spannedText {} mice
mention {'id': 'organism_Instance_23278'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6104', 'end': '6111'} None
spannedText {} animals
mention {'id': 'organism_Instance_23282'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6255', 'end': '6259'} None
spannedText {} mice
mention {'id': 'organism_Instance_23286'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8175', 'end': '8179'} None
spannedText {} mice
mention {'id': 'organism_Instance_23290'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '9136', 'end': '9140'} None
spannedText {} mice
mention {'

spannedText {} mice
mention {'id': 'organism_Instance_52819'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20520', 'end': '20524'} None
spannedText {} mice
mention {'id': 'organism_Instance_52824'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20598', 'end': '20602'} None
spannedText {} mice
mention {'id': 'organism_Instance_52829'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21213', 'end': '21217'} None
spannedText {} mice
mention {'id': 'organism_Instance_52834'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21510', 'end': '21514'} None
spannedText {} mice
mention {'id': 'organism_Instance_52839'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '22026', 'end': '22031'} None
spannedText {} mouse
mention {'id': 'organism_Instance_52844'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '22330', 'end': '22334'} None
spa

mention {'id': 'organism_Instance_270387'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6003', 'end': '6013'} None
spannedText {} adenoviral
mention {'id': 'organism_Instance_270392'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44591', 'end': '44601'} None
spannedText {} retrovirus
mention {'id': 'organism_Instance_270402'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '45273', 'end': '45279'} None
spannedText {} humans
mention {'id': 'organism_Instance_270407'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '53893', 'end': '53898'} None
spannedText {} Mouse
mention {'id': 'organism_Instance_270412'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '53924', 'end': '53928'} None
spannedText {} Mice
mention {'id': 'organism_Instance_270417'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '55996', 'end': '56001'} None
spann

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '28863', 'end': '28867'} None
spannedText {} mice
mention {'id': 'organism_Instance_38304'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '29201', 'end': '29205'} None
spannedText {} mice
mention {'id': 'organism_Instance_38308'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '29536', 'end': '29545'} None
spannedText {} bacterial
mention {'id': 'organism_Instance_38312'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '31070', 'end': '31075'} None
spannedText {} mouse
mention {'id': 'organism_Instance_38316'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '31671', 'end': '31675'} None
spannedText {} mice
mention {'id': 'organism_Instance_250035'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20581', 'end': '20601'} None
spannedText {} Arabidopsis thaliana
mention {'id': 'organism_Ins

span {'start': '70780', 'end': '70784'} None
spannedText {} mice
mention {'id': 'organism_Instance_45210'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '70981', 'end': '70985'} None
spannedText {} mice
mention {'id': 'organism_Instance_45215'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '71158', 'end': '71162'} None
spannedText {} mice
mention {'id': 'organism_Instance_45220'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '71447', 'end': '71451'} None
spannedText {} Mice
mention {'id': 'organism_Instance_45225'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '71547', 'end': '71551'} None
spannedText {} mice
mention {'id': 'organism_Instance_45230'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '71694', 'end': '71698'} None
spannedText {} mice
mention {'id': 'organism_Instance_45235'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span

span {'start': '6609', 'end': '6614'} None
spannedText {} human
mention {'id': 'organism_Instance_39997'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6874', 'end': '6878'} None
spannedText {} mice
mention {'id': 'organism_Instance_40002'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6965', 'end': '6969'} None
spannedText {} mice
mention {'id': 'organism_Instance_40007'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7371', 'end': '7375'} None
spannedText {} mice
mention {'id': 'organism_Instance_40012'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7890', 'end': '7894'} None
spannedText {} mice
mention {'id': 'organism_Instance_40017'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7964', 'end': '7968'} None
spannedText {} mice
mention {'id': 'organism_Instance_40022'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': 

mention {'id': 'organism_Instance_59478'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21115', 'end': '21119'} None
spannedText {} mice
mention {'id': 'organism_Instance_59483'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21245', 'end': '21249'} None
spannedText {} mice
mention {'id': 'organism_Instance_59488'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21315', 'end': '21319'} None
spannedText {} mice
mention {'id': 'organism_Instance_59498'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21481', 'end': '21485'} None
spannedText {} mice
mention {'id': 'organism_Instance_59503'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21598', 'end': '21602'} None
spannedText {} mice
mention {'id': 'organism_Instance_59508'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '21936', 'end': '21940'} None
spannedText {} mice
ment

mention {'id': 'organism_Instance_40837'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5569', 'end': '5573'} None
spannedText {} mice
mention {'id': 'organism_Instance_40842'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5727', 'end': '5732'} None
spannedText {} mouse
mention {'id': 'organism_Instance_40847'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5759', 'end': '5767'} None
spannedText {} organism
mention {'id': 'organism_Instance_40852'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5874', 'end': '5879'} None
spannedText {} mouse
mention {'id': 'organism_Instance_40857'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5969', 'end': '5975'} None
spannedText {} murine
mention {'id': 'organism_Instance_40862'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6115', 'end': '6120'} None
spannedText {} mouse
mention

span {'start': '44835', 'end': '44847'} None
spannedText {} Homo sapiens
mention {'id': 'organism_Instance_141761'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44882', 'end': '44894'} None
spannedText {} Mus musculus
mention {'id': 'organism_Instance_141766'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44929', 'end': '44941'} None
spannedText {} Mus musculus
mention {'id': 'organism_Instance_141771'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44975', 'end': '44987'} None
spannedText {} Mus musculus
mention {'id': 'organism_Instance_141776'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '45007', 'end': '45028'} None
spannedText {} Oryctolagus cuniculus
mention {'id': 'organism_Instance_141781'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '45047', 'end': '45058'} None
spannedText {} Danio rerio
mention {'id': 'organism_Instance_141786'} N

spannedText {} Drosophila
mention {'id': 'organism_Instance_46450'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5326', 'end': '5333'} None
spannedText {} mammals
mention {'id': 'organism_Instance_46455'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5514', 'end': '5519'} None
spannedText {} mouse
mention {'id': 'organism_Instance_46460'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5713', 'end': '5723'} None
spannedText {} Drosophila
mention {'id': 'organism_Instance_46465'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5932', 'end': '5942'} None
spannedText {} Drosophila
mention {'id': 'organism_Instance_46470'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6113', 'end': '6123'} None
spannedText {} Drosophila
mention {'id': 'organism_Instance_46475'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6424', 'end': '

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '56329', 'end': '56334'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34334'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '56510', 'end': '56515'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34338'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '56685', 'end': '56690'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34342'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '56851', 'end': '56856'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34346'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '57020', 'end': '57025'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34354'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '57177', 'end': '57182'} None
spannedText {} mouse
mention {'id': 'organism_Instance_34358'} Non

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11486', 'end': '11490'} None
spannedText {} mice
mention {'id': 'organism_Instance_45520'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11517', 'end': '11521'} None
spannedText {} mice
mention {'id': 'organism_Instance_45525'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11567', 'end': '11571'} None
spannedText {} mice
mention {'id': 'organism_Instance_45530'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11706', 'end': '11710'} None
spannedText {} mice
mention {'id': 'organism_Instance_45535'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11735', 'end': '11739'} None
spannedText {} mice
mention {'id': 'organism_Instance_45540'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '11831', 'end': '11836'} None
spannedText {} mouse
mention {'id': 'organism_Instance_45545'} None
ann

mention {'id': 'organism_Instance_32978'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44784', 'end': '44789'} None
spannedText {} mouse
mention {'id': 'organism_Instance_32982'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44806', 'end': '44811'} None
spannedText {} mouse
mention {'id': 'organism_Instance_32986'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '47396', 'end': '47402'} None
spannedText {} murine
mention {'id': 'organism_Instance_32990'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '47630', 'end': '47635'} None
spannedText {} mouse
mention {'id': 'organism_Instance_32994'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49115', 'end': '49120'} None
spannedText {} mouse
mention {'id': 'organism_Instance_33002'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '50947', 'end': '50953'} None
spannedText {} ani

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '33735', 'end': '33739'} None
spannedText {} mice
mention {'id': 'organism_Instance_160360'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '34053', 'end': '34057'} None
spannedText {} Mice
mention {'id': 'organism_Instance_160367'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '34589', 'end': '34596'} None
spannedText {} animals
mention {'id': 'organism_Instance_160387'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '38306', 'end': '38310'} None
spannedText {} Mice
mention {'id': 'organism_Instance_160393'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '38711', 'end': '38715'} None
spannedText {} Mice
mention {'id': 'organism_Instance_160398'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '38913', 'end': '38917'} None
spannedText {} mice
mention {'id': 'organism_Instance_160403'} 

mention {'id': 'organism_Instance_58868'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '62727', 'end': '62733'} None
spannedText {} animal
mention {'id': 'organism_Instance_58883'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '62969', 'end': '62973'} None
spannedText {} mice
mention {'id': 'organism_Instance_58903'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '64205', 'end': '64210'} None
spannedText {} mouse
mention {'id': 'organism_Instance_58913'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '64679', 'end': '64683'} None
spannedText {} mice
mention {'id': 'organism_Instance_58918'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '64754', 'end': '64758'} None
spannedText {} mice
mention {'id': 'organism_Instance_58923'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '64809', 'end': '64813'} None
spannedText {} mice
m

mention {'id': 'organism_Instance_63971'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '16095', 'end': '16099'} None
spannedText {} mice
mention {'id': 'organism_Instance_63976'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '16360', 'end': '16364'} None
spannedText {} mice
mention {'id': 'organism_Instance_63981'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '16501', 'end': '16505'} None
spannedText {} mice
mention {'id': 'organism_Instance_63986'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '16667', 'end': '16671'} None
spannedText {} mice
mention {'id': 'organism_Instance_63991'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '16899', 'end': '16903'} None
spannedText {} mice
mention {'id': 'organism_Instance_63996'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '17138', 'end': '17142'} None
spannedText {} mice
ment

mention {'id': 'organism_Instance_270705'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '37014', 'end': '37025'} None
spannedText {} A. thaliana
mention {'id': 'organism_Instance_270710'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '37248', 'end': '37259'} None
spannedText {} Arabidopsis
mention {'id': 'organism_Instance_270735'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '42639', 'end': '42644'} None
spannedText {} human
mention {'id': 'organism_Instance_270740'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '42916', 'end': '42923'} None
spannedText {} chicken
mention {'id': 'organism_Instance_270745'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44771', 'end': '44776'} None
spannedText {} mouse
mention {'id': 'organism_Instance_270750'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '44782', 'end': '44787'} None

span {'start': '8631', 'end': '8635'} None
spannedText {} mice
mention {'id': 'organism_Instance_61856'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8669', 'end': '8673'} None
spannedText {} mice
mention {'id': 'organism_Instance_61861'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8766', 'end': '8770'} None
spannedText {} mice
mention {'id': 'organism_Instance_61866'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8803', 'end': '8807'} None
spannedText {} mice
mention {'id': 'organism_Instance_61871'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8853', 'end': '8857'} None
spannedText {} mice
mention {'id': 'organism_Instance_61876'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '9114', 'end': '9118'} None
spannedText {} mice
mention {'id': 'organism_Instance_61881'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '

span {'start': '6838', 'end': '6845'} None
spannedText {} animals
mention {'id': 'organism_Instance_32170'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7001', 'end': '7006'} None
spannedText {} mouse
mention {'id': 'organism_Instance_32174'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7102', 'end': '7106'} None
spannedText {} mice
mention {'id': 'organism_Instance_32178'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7398', 'end': '7402'} None
spannedText {} mice
mention {'id': 'organism_Instance_32182'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7548', 'end': '7552'} None
spannedText {} mice
mention {'id': 'organism_Instance_32186'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7565', 'end': '7569'} None
spannedText {} mice
mention {'id': 'organism_Instance_32190'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start

span {'start': '43294', 'end': '43298'} None
spannedText {} mice
mention {'id': 'organism_Instance_40652'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '43399', 'end': '43406'} None
spannedText {} animals
mention {'id': 'organism_Instance_40657'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '45961', 'end': '45966'} None
spannedText {} mouse
mention {'id': 'organism_Instance_40662'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '48816', 'end': '48822'} None
spannedText {} rabbit
mention {'id': 'organism_Instance_40667'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '48906', 'end': '48911'} None
spannedText {} mouse
mention {'id': 'organism_Instance_40672'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49009', 'end': '49013'} None
spannedText {} goat
mention {'id': 'organism_Instance_40677'} None
annotator {'id': 'organism_Instance_10001'} Mike Ba

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '36222', 'end': '36226'} None
spannedText {} mice
mention {'id': 'organism_Instance_28570'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '36353', 'end': '36360'} None
spannedText {} animals
mention {'id': 'organism_Instance_28574'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '36608', 'end': '36612'} None
spannedText {} mice
mention {'id': 'organism_Instance_28578'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '36791', 'end': '36797'} None
spannedText {} Murine
mention {'id': 'organism_Instance_28582'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '36939', 'end': '36943'} None
spannedText {} mice
mention {'id': 'organism_Instance_28586'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '37442', 'end': '37446'} None
spannedText {} mice
mention {'id': 'organism_Instance_28590'} None

mention {'id': 'NCBITAXON_2015_03_02_Instance_110'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '19589', 'end': '19595'} None
spannedText {} limpet
mention {'id': 'organism_Instance_120178'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '203', 'end': '209'} None
spannedText {} humans
mention {'id': 'organism_Instance_120190'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '1880', 'end': '1886'} None
spannedText {} humans
mention {'id': 'organism_Instance_120197'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '2000', 'end': '2010'} None
spannedText {} C. elegans
mention {'id': 'organism_Instance_120202'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '2016', 'end': '2031'} None
spannedText {} D. melanogaster
mention {'id': 'organism_Instance_120232'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '6841', 'end': '6851'} Non

span {'start': '18758', 'end': '18762'} None
spannedText {} mice
mention {'id': 'organism_Instance_48682'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '19267', 'end': '19271'} None
spannedText {} mice
mention {'id': 'organism_Instance_48691'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '19997', 'end': '20001'} None
spannedText {} mice
mention {'id': 'organism_Instance_140584'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '87', 'end': '91'} None
spannedText {} mice
mention {'id': 'organism_Instance_140618'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '9503', 'end': '9507'} None
spannedText {} Mice
mention {'id': 'organism_Instance_140641'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '12585', 'end': '12589'} None
spannedText {} Mice
mention {'id': 'organism_Instance_26970'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'st

span {'start': '23337', 'end': '23341'} None
spannedText {} mice
mention {'id': 'organism_Instance_31966'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '23538', 'end': '23542'} None
spannedText {} mice
mention {'id': 'organism_Instance_31970'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '24460', 'end': '24467'} None
spannedText {} animals
mention {'id': 'organism_Instance_31974'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '24505', 'end': '24512'} None
spannedText {} animals
mention {'id': 'organism_Instance_31990'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '24995', 'end': '24999'} None
spannedText {} mice
mention {'id': 'organism_Instance_31994'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '25148', 'end': '25155'} None
spannedText {} animals
mention {'id': 'organism_Instance_32002'} None
annotator {'id': 'organism_Instance_10001'} Mike 

mention {'id': 'organism_Instance_65296'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4080', 'end': '4085'} None
spannedText {} human
mention {'id': 'organism_Instance_65301'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4144', 'end': '4149'} None
spannedText {} Mouse
mention {'id': 'organism_Instance_65306'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4383', 'end': '4388'} None
spannedText {} human
mention {'id': 'organism_Instance_65311'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4431', 'end': '4439'} None
spannedText {} nematode
mention {'id': 'organism_Instance_65316'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4513', 'end': '4518'} None
spannedText {} Human
mention {'id': 'organism_Instance_65321'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '4770', 'end': '4774'} None
spannedText {} mice
mention 

mention {'id': 'NCBITAXON_2015_03_02_Instance_17'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5146', 'end': '5148'} None
spannedText {} sp
mention {'id': 'organism_Instance_21824'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '25', 'end': '29'} None
spannedText {} mice
mention {'id': 'organism_Instance_21828'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '252', 'end': '261'} None
spannedText {} organisms
mention {'id': 'organism_Instance_21832'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '335', 'end': '344'} None
spannedText {} jellyfish
mention {'id': 'organism_Instance_21836'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '388', 'end': '393'} None
spannedText {} mouse
mention {'id': 'organism_Instance_21840'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '792', 'end': '796'} None
spannedText {} mice
mention {'

span {'start': '16830', 'end': '16834'} None
spannedText {} mice
mention {'id': 'organism_Instance_51910'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '17481', 'end': '17485'} None
spannedText {} mice
mention {'id': 'organism_Instance_51915'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20070', 'end': '20074'} None
spannedText {} mice
mention {'id': 'organism_Instance_51920'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20312', 'end': '20316'} None
spannedText {} mice
mention {'id': 'organism_Instance_51925'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20394', 'end': '20398'} None
spannedText {} mice
mention {'id': 'organism_Instance_51930'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20584', 'end': '20588'} None
spannedText {} mice
mention {'id': 'organism_Instance_51935'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span

mention {'id': 'organism_Instance_31454'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '30051', 'end': '30058'} None
spannedText {} animals
mention {'id': 'organism_Instance_31458'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '30088', 'end': '30095'} None
spannedText {} animals
mention {'id': 'organism_Instance_31462'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '30284', 'end': '30288'} None
spannedText {} mice
mention {'id': 'organism_Instance_31466'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '30665', 'end': '30669'} None
spannedText {} mice
mention {'id': 'organism_Instance_31470'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '32252', 'end': '32257'} None
spannedText {} human
mention {'id': 'organism_Instance_31474'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '33943', 'end': '33950'} None
spannedText {} an

mention {'id': 'organism_Instance_22526'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '5443', 'end': '5447'} None
spannedText {} mice
mention {'id': 'organism_Instance_22530'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7641', 'end': '7646'} None
spannedText {} human
mention {'id': 'organism_Instance_22534'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '7973', 'end': '7978'} None
spannedText {} human
mention {'id': 'organism_Instance_22538'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8003', 'end': '8008'} None
spannedText {} human
mention {'id': 'organism_Instance_22542'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8576', 'end': '8580'} None
spannedText {} mice
mention {'id': 'organism_Instance_22546'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '8736', 'end': '8741'} None
spannedText {} human
mention {'id

mention {'id': 'organism_Instance_61601'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49210', 'end': '49213'} None
spannedText {} rat
mention {'id': 'organism_Instance_61606'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49267', 'end': '49270'} None
spannedText {} rat
mention {'id': 'organism_Instance_61611'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49328', 'end': '49334'} None
spannedText {} rabbit
mention {'id': 'organism_Instance_61616'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49400', 'end': '49405'} None
spannedText {} mouse
mention {'id': 'organism_Instance_61621'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49472', 'end': '49478'} None
spannedText {} rabbit
mention {'id': 'organism_Instance_61626'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '49558', 'end': '49563'} None
spannedText {} mouse


annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '20475', 'end': '20479'} None
spannedText {} mice
mention {'id': 'organism_Instance_35330'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '22240', 'end': '22244'} None
spannedText {} mice
mention {'id': 'organism_Instance_35334'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '24860', 'end': '24864'} None
spannedText {} mice
mention {'id': 'organism_Instance_35338'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '25934', 'end': '25944'} None
spannedText {} Drosophila
mention {'id': 'organism_Instance_35342'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '26223', 'end': '26233'} None
spannedText {} Drosophila
mention {'id': 'organism_Instance_35346'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '26238', 'end': '26249'} None
spannedText {} vertebrates
mention {'id': 'organism_Instanc

annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '831', 'end': '836'} None
spannedText {} human
mention {'id': 'organism_Instance_49846'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '873', 'end': '877'} None
spannedText {} Mice
mention {'id': 'organism_Instance_49851'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '953', 'end': '959'} None
spannedText {} animal
mention {'id': 'organism_Instance_49856'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '1082', 'end': '1087'} None
spannedText {} mouse
mention {'id': 'organism_Instance_49861'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '1135', 'end': '1139'} None
spannedText {} mice
mention {'id': 'organism_Instance_49866'} None
annotator {'id': 'organism_Instance_10001'} Mike Bada
span {'start': '1374', 'end': '1379'} None
spannedText {} human
mention {'id': 'organism_Instance_49871'} None
annotator {'id': 

In [85]:
from corpus_processing import *

split_brat_standoff(output_dir, train_size=0.7, test_size=0.2, valid_size=0.1, random_seed=42)

[INFO] Moving to directory: ./CRAFT_GSC_brat/craft
[INFO] Getting all filenames in dataset... DONE
[INFO] Splitting corpus into 70.0% train, 20.0% test, 10.0% valid... Done.
[INFO] Creating train/test/valid directories at ./CRAFT_GSC_brat/craft if they do not already exist...


In [5]:
PATH_TO_CRAFT_GSC_BRAT = '../corpora/CRAFT_GSC_brat/craft'
PATH_TO_CRAFT_ASCII = '../corpora/CRAFT_GSC_brat/craft_ascii'

process_dataset(PATH_TO_CRAFT_GSC_BRAT, PATH_TO_CRAFT_ASCII, function=utf8_to_ascii, dev="valid")