In [1]:
import os

### Build TAXONFINDER docker image

In [2]:
import docker

In [3]:
# Build-context directory handed to the Docker build below (relative path).
DOCKERFILE_PATH = "../images/TaxonFinder"

# Create a Docker client from the environment and build the image under the
# tag that run_taxonfinder() later looks up. As the last expression of the
# cell, the value returned by build() is displayed as the cell output.
client = docker.from_env()
client.images.build(path=DOCKERFILE_PATH, tag="taxonfinder:latest")

(<Image: 'taxonfinder:latest'>, <itertools._tee at 0x7f15aa65fc00>)

### Run TAXONFINDER and parse results

In [4]:
import shutil

def run_taxonfinder(input_dir, output_dir):
    """Run the TaxonFinder container over a corpus directory.

    Parameters
    ----------
    input_dir : str
        Directory holding the corpus; it is bind-mounted read-only at
        ``/home/taxonfinder/corpus`` inside the container.
    output_dir : str
        Directory that is wiped (if present) and recreated empty. This
        function itself writes nothing into it.

    Returns
    -------
    list of str
        The container output decoded as UTF-8 and split on newlines, with
        the first two lines dropped.
    """
    # Start from a clean, empty output directory.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Docker requires an absolute host path for bind mounts.
    corpus_mount = {
        os.path.abspath(input_dir): {'bind': '/home/taxonfinder/corpus', 'mode': 'ro'}
    }

    docker_client = docker.from_env()
    taxonfinder_image = docker_client.images.get("taxonfinder:latest")
    raw_output = docker_client.containers.run(
        taxonfinder_image,
        "nodejs taxonfinder.js",
        volumes=corpus_mount,
        remove=True,
    )

    output_lines = raw_output.decode("utf-8").split("\n")
    # The first two lines are skipped (presumably a preamble printed by the
    # script -- TODO confirm against taxonfinder.js).
    return output_lines[2:]

In [5]:
import yaml
import pandas as pd
import shutil

def taxonfinder_tags_to_df(tags):
    """Turn the textual tag list of one document into a DataFrame.

    Parameters
    ----------
    tags : str
        Text that YAML-parses to a sequence of mappings, each with an
        ``offsets`` entry (at least two elements) and a ``name`` entry.

    Returns
    -------
    pandas.DataFrame
        One row per tag with the keys ``start``, ``end`` and ``name``.
        Values are whatever ``yaml.BaseLoader`` yields for the scalars.
    """
    entries = yaml.load(tags, Loader=yaml.BaseLoader)

    rows = []
    for entry in entries:
        offsets = entry["offsets"]
        rows.append({"start": offsets[0], "end": offsets[1], "name": entry["name"]})

    return pd.DataFrame(rows)

def parse_taxonfinder(response, output_dir):
    """Write one ``.ann`` file per document from TaxonFinder's output lines.

    The lines are scanned in order:

    * a line ending in ``.txt`` names the current document (only its base
      name is kept);
    * every other line is appended to the tag text of the current document;
    * a line ending in ``]`` closes the tag list, which is then parsed with
      ``taxonfinder_tags_to_df`` and written out.

    Each output file is tab-separated without a header. Rows are labelled
    ``T0``, ``T1``, ... and hold the columns ``type`` (always ``"Taxon"``),
    ``start``, ``end`` and ``name``.

    Parameters
    ----------
    response : iterable of str
        Output lines, e.g. as returned by ``run_taxonfinder``.
    output_dir : str
        Directory that is wiped (if present), recreated, and filled with
        the ``.ann`` files.

    Raises
    ------
    ValueError
        If a tag list is closed before any ``.txt`` document line was seen.
    """
    # Start from a clean, empty output directory.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    document = None
    tags = ""
    for item in response:
        if item.endswith(".txt"):
            document = os.path.basename(item)
        else:
            tags += item
        if item.endswith("]"):
            if document is None:
                # Fail with a clear message instead of an unbound-variable error.
                raise ValueError(
                    "TaxonFinder output contains a tag list before any "
                    "'.txt' document path"
                )
            doc_ann = taxonfinder_tags_to_df(tags)
            # Relabel the default integer index as T0, T1, ...
            doc_ann = doc_ann.rename('T{}'.format)
            doc_ann.insert(0, "type", ["Taxon"]*doc_ann.shape[0])
            # Strip only the final extension so that stems containing dots
            # (e.g. "pmc.123.txt") keep their full name.
            ann_filename = os.path.splitext(document)[0] + ".ann"
            doc_ann.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)
            tags = ""

### Eval TAXONFINDER on test corpora

In [6]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [7]:
# LINNAEUS test split: used both as TaxonFinder's input corpus and as the
# ground truth ("GT") in the evaluation calls below.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test'
# Directory that receives the predicted .ann files for LINNAEUS.
PATH_TO_LINNAEUS_PRED = "./output/TAXONFINDER/LINNAEUS_pred"

In [8]:
# Run TaxonFinder on the LINNAEUS test documents, then convert its output
# lines into per-document .ann files under PATH_TO_LINNAEUS_PRED.
response = run_taxonfinder(PATH_TO_LINNAEUS_GT, PATH_TO_LINNAEUS_PRED)
parse_taxonfinder(response, PATH_TO_LINNAEUS_PRED)

In [9]:
# Precision / recall / F1 of the LINNAEUS predictions against the ground truth
# under the `exact` criterion (supplied by eval_utils; semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$86.83$ & $20.84$ & $33.61$


Unnamed: 0,precision,recall,f1-score
Taxon,0.868327,0.208369,0.336088


In [10]:
# Same evaluation under the `approximate` criterion (supplied by eval_utils;
# semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$86.83$ & $20.84$ & $33.61$


Unnamed: 0,precision,recall,f1-score
Taxon,0.868327,0.208369,0.336088


In [11]:
# Collect the FN / FP / TP lists for LINNAEUS (presumably false negatives,
# false positives and true positives -- confirm in eval_utils) and display FP.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)
FP

[{'start': 0, 'end': 9, 'text': 'Cytotoxic'},
 {'start': 786, 'end': 795, 'text': 'Cytotoxic'},
 {'start': 3682, 'end': 3691, 'text': 'Cytotoxic'},
 {'start': 6831, 'end': 6840, 'text': 'Cytotoxic'},
 {'start': 10442, 'end': 10450, 'text': 'Capsicum'},
 {'start': 10814, 'end': 10823, 'text': 'Echinacea'},
 {'start': 2814, 'end': 2819, 'text': 'Magos'},
 {'start': 20832, 'end': 20837, 'text': 'Magos'},
 {'start': 26118, 'end': 26123, 'text': 'Magos'},
 {'start': 194, 'end': 201, 'text': 'Diptera'},
 {'start': 203, 'end': 214, 'text': 'Psychodidae'},
 {'start': 1478, 'end': 1488, 'text': 'Drosophila'},
 {'start': 1663, 'end': 1676, 'text': 'Phlebotominae'},
 {'start': 1998, 'end': 2008, 'text': 'Leishmania'},
 {'start': 3028, 'end': 3038, 'text': 'Drosophila'},
 {'start': 3506, 'end': 3516, 'text': 'Drosophila'},
 {'start': 3521, 'end': 3531, 'text': 'Bractocera'},
 {'start': 3653, 'end': 3663, 'text': 'Drosophila'},
 {'start': 3801, 'end': 3811, 'text': 'Drosophila'},
 {'start': 8156, '

#### Eval on S800 GSC

In [12]:
# S800 test split: used both as TaxonFinder's input corpus and as the
# ground truth ("GT") in the evaluation calls below.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Directory that receives the predicted .ann files for S800.
PATH_TO_S800_PRED = "./output/TAXONFINDER/S800_pred"

In [13]:
# Run TaxonFinder on the S800 test documents, then convert its output lines
# into per-document .ann files under PATH_TO_S800_PRED.
response = run_taxonfinder(PATH_TO_S800_GT, PATH_TO_S800_PRED)
parse_taxonfinder(response, PATH_TO_S800_PRED)

In [14]:
# Precision / recall / F1 of the S800 predictions against the ground truth
# under the `exact` criterion (supplied by eval_utils; semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$57.04$ & $40.68$ & $47.49$


Unnamed: 0,precision,recall,f1-score
Taxon,0.570384,0.40678,0.474886


In [15]:
# Same evaluation under the `approximate` criterion (supplied by eval_utils;
# semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$57.04$ & $40.68$ & $47.49$


Unnamed: 0,precision,recall,f1-score
Taxon,0.570384,0.40678,0.474886


In [16]:
# Collect the FN / FP / TP lists for S800 (presumably false negatives,
# false positives and true positives -- confirm in eval_utils) and display FN.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)
FN

[{'start': 35, 'end': 64, 'text': 'porcine endogenous retrovirus'},
 {'start': 166, 'end': 197, 'text': 'Porcine endogenous retroviruses'},
 {'start': 199, 'end': 204, 'text': 'PERVs'},
 {'start': 314, 'end': 319, 'text': 'human'},
 {'start': 381, 'end': 385, 'text': 'pigs'},
 {'start': 394, 'end': 399, 'text': 'PERVs'},
 {'start': 630, 'end': 640, 'text': 'retroviral'},
 {'start': 665, 'end': 669, 'text': 'PERV'},
 {'start': 787, 'end': 793, 'text': 'PERV-B'},
 {'start': 909, 'end': 919, 'text': 'retroviral'},
 {'start': 984, 'end': 990, 'text': 'PERV-B'},
 {'start': 1135, 'end': 1141, 'text': 'PERV-A'},
 {'start': 173, 'end': 178, 'text': 'M2(T)'},
 {'start': 327, 'end': 332, 'text': 'M2(T)'},
 {'start': 550, 'end': 555, 'text': 'M2(T)'},
 {'start': 635, 'end': 666, 'text': 'Methanobacterium veterum MK4(T)'},
 {'start': 697, 'end': 707, 'text': 'DSM 863(T)'},
 {'start': 827, 'end': 832, 'text': 'M2(T)'},
 {'start': 880, 'end': 885, 'text': 'M2(T)'},
 {'start': 981, 'end': 986, 'text'

#### Eval on COPIOUS GSC

In [17]:
# COPIOUS test split: used both as TaxonFinder's input corpus and as the
# ground truth ("GT") in the evaluation calls below.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_clean/test'
# Directory that receives the predicted .ann files for COPIOUS.
PATH_TO_COPIOUS_PRED = "./output/TAXONFINDER/COPIOUS_pred"

In [18]:
# Run TaxonFinder on the COPIOUS test documents, then convert its output
# lines into per-document .ann files under PATH_TO_COPIOUS_PRED.
response = run_taxonfinder(PATH_TO_COPIOUS_GT, PATH_TO_COPIOUS_PRED)
parse_taxonfinder(response, PATH_TO_COPIOUS_PRED)

In [19]:
# Precision / recall / F1 of the COPIOUS predictions against the ground truth
# under the `exact` criterion (supplied by eval_utils; semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$48.63$ & $29.51$ & $36.73$


Unnamed: 0,precision,recall,f1-score
Taxon,0.486268,0.295098,0.367297


In [20]:
# Same evaluation under the `approximate` criterion (supplied by eval_utils;
# semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$48.95$ & $29.76$ & $37.02$


Unnamed: 0,precision,recall,f1-score
Taxon,0.489499,0.297642,0.370189


In [21]:
# Collect the FN / FP / TP lists for COPIOUS (presumably false negatives,
# false positives and true positives -- confirm in eval_utils) and display FN.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)
FN

[{'start': 0, 'end': 8, 'text': 'MAI RANG'},
 {'start': 366, 'end': 374, 'text': 'Mai Teng'},
 {'start': 506, 'end': 510, 'text': 'Yang'},
 {'start': 640, 'end': 648, 'text': 'Mai Rang'},
 {'start': 779, 'end': 787, 'text': 'MAI RANG'},
 {'start': 789, 'end': 807, 'text': 'Pentacme Siamensis'},
 {'start': 826, 'end': 840, 'text': 'Dipterocarpeae'},
 {'start': 948, 'end': 956, 'text': 'Mai Teng'},
 {'start': 21, 'end': 47, 'text': 'H. polyalthoides Symington'},
 {'start': 211, 'end': 232, 'text': 'H. resinosa Symington'},
 {'start': 590, 'end': 611, 'text': 'Shorea acuminata Dyer'},
 {'start': 916, 'end': 935, 'text': 'S. bracteolata Dyer'},
 {'start': 1184, 'end': 1205, 'text': 'S. exelliptica Meijer'},
 {'start': 1445, 'end': 1465, 'text': 'S. ? elliptica Burck'},
 {'start': 1499, 'end': 1522, 'text': 'S. foxworthii Symington'},
 {'start': 2009, 'end': 2014, 'text': 'balau'},
 {'start': 2073, 'end': 2097, 'text': 'S. lepidota (Korth.) Bl.'},
 {'start': 2282, 'end': 2299, 'text': 'S. l

#### Eval on BB task corpus

In [22]:
# BB test split: used both as TaxonFinder's input corpus and as the
# ground truth ("GT") in the evaluation calls below.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_clean/test'
# Directory that receives the predicted .ann files for BB.
PATH_TO_BB_PRED = "./output/TAXONFINDER/BB_pred"

In [23]:
# Run TaxonFinder on the BB test documents, then convert its output lines
# into per-document .ann files under PATH_TO_BB_PRED.
response = run_taxonfinder(PATH_TO_BB_GT, PATH_TO_BB_PRED)
parse_taxonfinder(response, PATH_TO_BB_PRED)

In [24]:
# Precision / recall / F1 of the BB predictions against the ground truth
# under the `exact` criterion (supplied by eval_utils; semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$79.56$ & $63.25$ & $70.47$


Unnamed: 0,precision,recall,f1-score
Taxon,0.795597,0.6325,0.704735


In [25]:
# Same evaluation under the `approximate` criterion (supplied by eval_utils;
# semantics not shown here).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$80.19$ & $63.75$ & $71.03$


Unnamed: 0,precision,recall,f1-score
Taxon,0.801887,0.6375,0.710306


In [26]:
# Collect the FN / FP / TP lists for BB (presumably false negatives,
# false positives and true positives -- confirm in eval_utils).
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)