In [14]:
from taxonerd import TaxoNERD

In [15]:
import logging
# Configure the root logger at ERROR level so that lower-severity records
# (DEBUG/INFO/WARNING) are not shown in the notebook output.
logging.basicConfig(level=logging.ERROR)

### Build the TaxoNERD Docker image

In [16]:
import docker

In [17]:
# Relative path passed to Docker as the build `path` for the TaxoNERD image.
DOCKERFILE_PATH = "../images/TaxoNERD"

# Docker client configured from the environment.
client = docker.from_env()
# Build the image and tag it "taxonerd:latest" (the tag run_taxonerd looks up).
# The call is the cell's last expression, so its return value — an
# (image, build-log iterator) pair — is what the cell displays.
client.images.build(path=DOCKERFILE_PATH, tag="taxonerd:latest")

(<Image: 'taxonerd:latest'>, <itertools._tee at 0x7f65d4a29c80>)

### Run TaxoNERD and parse results

In [26]:
import shutil
from glob import glob

def run_taxonerd(input_dir, output_dir):
    """Run the TaxoNERD container on a corpus directory and return its output.

    The corpus in ``input_dir`` is copied to a scratch directory
    (``output_dir + "_temp"``), every ``*.ann`` file at the top level of that
    copy is deleted, and the copy is bind-mounted into the ``taxonerd:latest``
    container as ``/home/taxonerd/input``. ``output_dir`` is bind-mounted as
    ``/home/taxonerd/output``. The original ``input_dir`` is never modified.

    Args:
        input_dir: Directory containing the documents to process.
        output_dir: Directory that receives the predictions. If it already
            exists it is deleted before the run.

    Returns:
        The value returned by ``client.containers.run`` decoded as UTF-8.

    Raises:
        Any exception raised while copying the corpus or by the Docker SDK
        propagates unchanged; the scratch directory is removed in every case.
    """
    # Function-scope import: no visible cell imports `os` explicitly, so the
    # function must not depend on another module leaking it into the namespace.
    import os

    # Start from a clean output directory so stale predictions are never scored.
    # NOTE(review): the directory is not recreated here; presumably the Docker
    # daemon creates the bind-mount source on the host — confirm ownership.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    temp_dir = output_dir+"_temp"
    # A previously interrupted run may have left the scratch copy behind;
    # copytree refuses to write into an existing directory.
    if os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)

    try:
        shutil.copytree(input_dir, temp_dir)
        # Strip the annotation files from the copy so the container only sees
        # the remaining files of the corpus.
        for ann in glob(os.path.join(temp_dir, "*.ann")):
            os.remove(ann)
        volume = {os.path.abspath(temp_dir): {'bind': '/home/taxonerd/input'}, os.path.abspath(output_dir): {'bind': '/home/taxonerd/output'}}
        client = docker.from_env()
        image = client.images.get("taxonerd:latest")
        response = client.containers.run(image, "taxonerd ask --focus-on speed -i /home/taxonerd/input -o /home/taxonerd/output", volumes=volume, remove=True)
    finally:
        # Always drop the scratch copy, including when the container run fails.
        if os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)
    return response.decode("utf-8")

### Evaluate TaxoNERD on the test corpora

In [19]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [20]:
# LINNAEUS test split: passed to run_taxonerd as the input corpus and to the
# scoring helpers as the reference ("GT") annotations.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test'
# Directory that receives TaxoNERD's predictions for LINNAEUS.
PATH_TO_LINNAEUS_PRED = "./output/TAXONERD_MD/LINNAEUS_pred"

In [21]:
# Run TaxoNERD on the LINNAEUS test documents; any existing content of
# PATH_TO_LINNAEUS_PRED is deleted by run_taxonerd before the run.
run_taxonerd(PATH_TO_LINNAEUS_GT, PATH_TO_LINNAEUS_PRED)

{'/home/leguilln/workspace/INFORMATION_EXTRACTION/snr_tools_and_methods/eval/output/TAXONERD_MD/LINNAEUS_pred_temp': {'bind': '/home/taxonerd/input'}, '/home/leguilln/workspace/INFORMATION_EXTRACTION/snr_tools_and_methods/eval/output/TAXONERD_MD/LINNAEUS_pred': {'bind': '/home/taxonerd/output'}}


''

In [22]:
# Precision / recall / F1 of the LINNAEUS predictions against the reference
# annotations, using the `exact` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$51.75$ & $26.56$ & $35.10$


Unnamed: 0,precision,recall,f1-score
Taxon,0.517471,0.265585,0.351016


In [23]:
# Same scoring as above, but with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$56.57$ & $29.08$ & $38.42$


Unnamed: 0,precision,recall,f1-score
Taxon,0.565724,0.290847,0.384181


In [25]:
# Split the LINNAEUS results into three lists under the `approximate` criterion
# (presumably false negatives, false positives and true positives, per the
# helper's name), then display FP for manual error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)
FP

[{'start': 4711, 'end': 4713, 'text': 'OT'},
 {'start': 5636, 'end': 5640, 'text': 'SLPS'},
 {'start': 6494, 'end': 6508, 'text': 'ministries)[23'},
 {'start': 1554, 'end': 1562, 'text': 'ABSTRACT'},
 {'start': 3432, 'end': 3444, 'text': 'tetrazolides'},
 {'start': 3860, 'end': 3897, 'text': 'oligonucleoside methylphosphophonates'},
 {'start': 5682, 'end': 5692, 'text': 'y-32P]-ATP'},
 {'start': 7142,
  'end': 7186,
  'text': 'Protected Oligonucleoside Methylphosphonates'},
 {'start': 10379, 'end': 10383, 'text': 'MSNT'},
 {'start': 14194, 'end': 14200, 'text': 'buffer'},
 {'start': 18949, 'end': 18958, 'text': 'y-32P]ATP'},
 {'start': 19513, 'end': 19532, 'text': 'Gel Electrophoresis'},
 {'start': 21177,
  'end': 21221,
  'text': 'Protected Oligonucleoside Methylphosphonates'},
 {'start': 22083, 'end': 22088, 'text': 'DMTrO'},
 {'start': 32586, 'end': 32596, 'text': 'Ol igomers'},
 {'start': 32684, 'end': 32694, 'text': 'Ol igomers'},
 {'start': 33082, 'end': 33091, 'text': 'AAAGCAAGc

#### Eval on S800 GSC

In [24]:
# S800 test split: passed to run_taxonerd as the input corpus and to the
# scoring helpers as the reference ("GT") annotations.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Directory that receives TaxoNERD's predictions for S800.
PATH_TO_S800_PRED = "./output/TAXONERD_MD/S800_pred"

In [25]:
# Run TaxoNERD on the S800 test documents; any existing content of
# PATH_TO_S800_PRED is deleted by run_taxonerd before the run.
run_taxonerd(PATH_TO_S800_GT, PATH_TO_S800_PRED)

{'/home/leguilln/workspace/INFORMATION_EXTRACTION/snr_tools_and_methods/eval/output/TAXONERD_MD/S800_pred_temp': {'bind': '/home/taxonerd/input'}, '/home/leguilln/workspace/INFORMATION_EXTRACTION/snr_tools_and_methods/eval/output/TAXONERD_MD/S800_pred': {'bind': '/home/taxonerd/output'}}


''

In [27]:
# Precision / recall / F1 of the S800 predictions against the reference
# annotations, using the `exact` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$45.74$ & $45.50$ & $45.62$


Unnamed: 0,precision,recall,f1-score
Taxon,0.457405,0.45502,0.456209


In [28]:
# Same scoring as above, but with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$54.26$ & $54.05$ & $54.15$


Unnamed: 0,precision,recall,f1-score
Taxon,0.542595,0.54047,0.54153


In [31]:
# Split the S800 results into three lists under the `approximate` criterion
# (presumably false negatives, false positives and true positives, per the
# helper's name), then display FP for manual error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)
FP

[{'start': 0, 'end': 25, 'text': 'Methanobacterium arcticum'},
 {'start': 586, 'end': 602, 'text': 'Methanobacterium'},
 {'start': 928, 'end': 953, 'text': 'Methanobacterium arcticum'},
 {'start': 1003, 'end': 1016, 'text': 'VKM B-2371(T)'},
 {'start': 760, 'end': 767, 'text': 'Paridae'},
 {'start': 910, 'end': 921, 'text': 'Apicomplexa'},
 {'start': 1190, 'end': 1202, 'text': 'colpodellids'},
 {'start': 82, 'end': 100, 'text': 'Peridiniopsis spp.'},
 {'start': 102, 'end': 113, 'text': 'Dinophyceae'},
 {'start': 194, 'end': 212, 'text': 'Peridiniopsis spp.'},
 {'start': 456,
  'end': 496,
  'text': 'Peridiniopsis penardii var. robusta var.'},
 {'start': 517, 'end': 528, 'text': 'Dinophyceae'},
 {'start': 618, 'end': 629, 'text': 'P. penardii'},
 {'start': 791, 'end': 802, 'text': 'P. penardii'},
 {'start': 807, 'end': 831, 'text': 'P. penardii var. robusta'},
 {'start': 858, 'end': 866, 'text': 'P. kevei'},
 {'start': 980, 'end': 991, 'text': 'P. penardii'},
 {'start': 996, 'end': 1004

#### Eval on COPIOUS GSC

In [10]:
# COPIOUS test split: passed to run_taxonerd as the input corpus and to the
# scoring helpers as the reference ("GT") annotations.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_clean/test'
# Directory that receives TaxoNERD's predictions for COPIOUS.
PATH_TO_COPIOUS_PRED = "./output/TAXONERD_MD/COPIOUS_pred"

In [11]:
# Run TaxoNERD on the COPIOUS test documents; any existing content of
# PATH_TO_COPIOUS_PRED is deleted by run_taxonerd before the run.
run_taxonerd(PATH_TO_COPIOUS_GT, PATH_TO_COPIOUS_PRED)

In [12]:
# Precision / recall / F1 of the COPIOUS predictions against the reference
# annotations, using the `exact` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$75.77$ & $67.45$ & $71.37$


Unnamed: 0,precision,recall,f1-score
Taxon,0.757709,0.67451,0.713693


In [13]:
# Same scoring as above, but with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$82.49$ & $74.09$ & $78.06$


Unnamed: 0,precision,recall,f1-score
Taxon,0.82489,0.740851,0.780615


In [37]:
# Split the COPIOUS results into three lists under the `approximate` criterion
# (presumably false negatives, false positives and true positives, per the
# helper's name), then display FP for manual error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)
FP

[{'start': 0, 'end': 3, 'text': 'MAI'},
 {'start': 346, 'end': 366, 'text': 'S. parvlfolius Bastw'},
 {'start': 393, 'end': 411, 'text': 'S. tetonensis Nels'},
 {'start': 562, 'end': 580, 'text': "Stotiiach ?'ecords"},
 {'start': 2315, 'end': 2340, 'text': 'Taxodinm ascendens Brongn'},
 {'start': 184, 'end': 188, 'text': 'deer'},
 {'start': 1544, 'end': 1548, 'text': 'deer'},
 {'start': 4, 'end': 12, 'text': 'BARBICAN'},
 {'start': 284, 'end': 305, 'text': 'Black-throated Barbet'},
 {'start': 1533, 'end': 1548, 'text': 'Bucco rufifrons'},
 {'start': 382, 'end': 397, 'text': 'Saipan Okinawan'},
 {'start': 2419, 'end': 2434, 'text': 'Salpan Okinawan'},
 {'start': 42, 'end': 70, 'text': 'Archamia bleekeri (Glinther)'},
 {'start': 1409, 'end': 1424, 'text': 'Ambassis lung-i'},
 {'start': 1679, 'end': 1691, 'text': 'fruit thrush'},
 {'start': 1758, 'end': 1777, 'text': 'Cerithium macrotoma'},
 {'start': 1912, 'end': 1929, 'text': 'Oolina pupiformis'},
 {'start': 2286, 'end': 2300, 'text': '

#### Eval on BB task corpus

In [38]:
# BB test split: passed to run_taxonerd as the input corpus and to the
# scoring helpers as the reference ("GT") annotations.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_clean/test'
# Directory that receives TaxoNERD's predictions for BB.
PATH_TO_BB_PRED = "./output/TAXONERD_MD/BB_pred"

In [39]:
# Run TaxoNERD on the BB test documents; any existing content of
# PATH_TO_BB_PRED is deleted by run_taxonerd before the run.
run_taxonerd(PATH_TO_BB_GT, PATH_TO_BB_PRED)

In [40]:
# Precision / recall / F1 of the BB predictions against the reference
# annotations, using the `exact` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$73.11$ & $77.50$ & $75.24$


Unnamed: 0,precision,recall,f1-score
Taxon,0.731132,0.775,0.752427


In [41]:
# Same scoring as above, but with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$76.42$ & $81.20$ & $78.74$


Unnamed: 0,precision,recall,f1-score
Taxon,0.764151,0.81203,0.787363


In [42]:
# Split the BB results into three lists under the `approximate` criterion
# (presumably false negatives, false positives and true positives, per the
# helper's name), then display FP for manual error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)
FP

[{'start': 19, 'end': 44, 'text': 'L. lactis subsp. cremoris'},
 {'start': 230, 'end': 255, 'text': 'L. lactis subsp. cremoris'},
 {'start': 0, 'end': 9, 'text': 'Serotypes'},
 {'start': 513, 'end': 522, 'text': 'serotypes'},
 {'start': 41, 'end': 66, 'text': 'Burkholderia pseudomallei'},
 {'start': 258, 'end': 270, 'text': 'Burkholderia'},
 {'start': 272, 'end': 297, 'text': 'Pseudomonas) pseudomallei'},
 {'start': 521, 'end': 529, 'text': 'biotypes'},
 {'start': 1003, 'end': 1007, 'text': 'LPSs'},
 {'start': 1021, 'end': 1029, 'text': 'biotypes'},
 {'start': 1379, 'end': 1396, 'text': 'hyperimmune mouse'},
 {'start': 365, 'end': 373, 'text': 'Scimudin'},
 {'start': 46, 'end': 56, 'text': 'Gorgonzola'},
 {'start': 61, 'end': 69, 'text': 'Scimudin'},
 {'start': 239, 'end': 249, 'text': 'Gorgonzola'},
 {'start': 254, 'end': 262, 'text': 'Scimudin'},
 {'start': 194, 'end': 203, 'text': 'pneumonia'},
 {'start': 105, 'end': 115, 'text': 'Gorgonzola'},
 {'start': 130, 'end': 138, 'text': 'S