In [1]:
import os

import pandas as pd

### Build SPECIES docker image

In [2]:
import docker

In [3]:
# Build-context directory handed to the image build
# (presumably holds the SPECIES Dockerfile — confirm against the repo layout).
DOCKERFILE_PATH = "../images/SPECIES"

# Build the image and tag it "species:latest", the tag that run_species() looks up.
# The build call is the last expression of the cell, so its return value is displayed.
client = docker.from_env()
client.images.build(path=DOCKERFILE_PATH, tag="species:latest")

(<Image: 'species:latest'>, <itertools._tee at 0x7f150a94be80>)

### Run SPECIES and parse results

In [15]:
import shutil

def run_species(input_dir, output_dir):
    """Run the SPECIES tagger container over a corpus and save its raw output.

    ``input_dir`` is bind-mounted read-only at ``/home/species/corpus`` inside a
    container started from the ``species:latest`` image. The bytes returned by
    the container run are decoded as UTF-8 and written to
    ``<output_dir>/species.tags``.

    Warning: ``output_dir`` is deleted (with everything in it) and recreated on
    every call.

    Args:
        input_dir: Existing directory holding the corpus documents to tag.
        output_dir: Directory that is wiped and recreated to receive the result.

    Returns:
        Path of the written ``species.tags`` file.

    Raises:
        FileNotFoundError: If ``input_dir`` is not an existing directory.
    """
    # Validate before touching output_dir: a typo in the input path must not
    # destroy earlier results, and Docker typically creates a missing bind-mount
    # source as an empty directory, which would silently yield an empty run.
    if not os.path.isdir(input_dir):
        raise FileNotFoundError("Input corpus directory not found: {}".format(input_dir))

    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Docker requires an absolute host path for bind mounts.
    volume = {os.path.abspath(input_dir): {'bind': '/home/species/corpus', 'mode': 'ro'}}

    client = docker.from_env()
    try:
        image = client.images.get("species:latest")
        response = client.containers.run(image, "species /home/species/corpus", volumes=volume, remove=True)
    finally:
        # Release the client's connections even when the container run fails.
        client.close()

    tags_filename = os.path.join(output_dir, "species.tags")
    # Write with the same encoding used to decode the container output instead
    # of relying on the platform default.
    with open(tags_filename, "w", encoding="utf-8") as f:
        f.write(response.decode("utf-8"))
    return tags_filename

In [16]:
from glob import glob

def parse_species(input_dir, tags_filename, output_dir):
    """Convert a raw SPECIES tags file into one ``.ann`` file per corpus document.

    The tags file is read as header-less, tab-separated rows of
    ``document, start, end, text, species id``. For every ``*.txt`` file in
    ``input_dir`` the matching rows are reduced to unique
    ``(start, end, text)`` mentions and written to
    ``<output_dir>/<document stem>.ann`` as tab-separated lines of the form
    ``T<n>  LIVB  start  end+1  text``. Documents without any tag still get an
    (empty) ``.ann`` file. The tags file is deleted afterwards.

    Args:
        input_dir: Directory containing the corpus ``*.txt`` documents.
        tags_filename: Path to the tags file produced by ``run_species``.
        output_dir: Directory receiving the ``.ann`` files (created if missing).
    """
    columns = ["document", "start", "end", "text", "#species id"]
    try:
        # keep_default_na=False keeps mention text such as "NA" or "null" as a
        # literal string instead of turning it into NaN (written out as "").
        tags = pd.read_csv(tags_filename, sep="\t", header=None, keep_default_na=False)
        tags.columns = columns
    except pd.errors.EmptyDataError:
        # The tagger found nothing at all: still emit an empty .ann per document.
        tags = pd.DataFrame(columns=columns)

    os.makedirs(output_dir, exist_ok=True)

    for document in glob(os.path.join(input_dir, "*.txt")):
        document = os.path.basename(document)
        doc_tags = tags[tags["document"] == document]

        # The same span can be reported once per candidate species id; keep one
        # row per (start, end, text).
        doc_ann = doc_tags.drop(columns=["#species id", "document"])
        doc_ann = doc_ann.astype({'start': 'int32', 'end': 'int32'})
        doc_ann = doc_ann.drop_duplicates().reset_index(drop=True)
        doc_ann = doc_ann.rename('T{}'.format)  # index labels become T0, T1, ...

        doc_ann["end"] = doc_ann["end"] + 1  # To align with LINNAEUS and COPIOUS char offsets
        doc_ann.insert(0, "type", ["LIVB"] * len(doc_ann))

        # splitext strips only the final extension, so "a.b.txt" -> "a.b.ann"
        # (split(".")[0] would give "a.ann" and could collide with "a.txt").
        ann_filename = os.path.splitext(document)[0] + ".ann"
        doc_ann.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)

    os.remove(tags_filename)

### Eval SPECIES on test corpora

In [17]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [18]:
# LINNAEUS test split: passed to run_species() as the corpus to tag and to the
# evaluation helpers as the reference annotations.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_ascii/test'
# Directory that receives the SPECIES predictions (.ann files) for LINNAEUS.
PATH_TO_LINNAEUS_PRED = './output/SPECIES/LINNAEUS_pred'

In [19]:
# Tag the corpus with SPECIES, then convert the raw tags file into per-document .ann files.
# Note: run_species() deletes and recreates PATH_TO_LINNAEUS_PRED before writing.
tags_filename = run_species(PATH_TO_LINNAEUS_GT, PATH_TO_LINNAEUS_PRED)
parse_species(PATH_TO_LINNAEUS_GT, tags_filename, PATH_TO_LINNAEUS_PRED)

In [20]:
# Precision / recall / F1 of the predictions against the reference annotations
# under the `exact` criterion (`exact` is not defined in this notebook —
# presumably provided by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$86.04$ & $64.22$ & $73.55$


Unnamed: 0,precision,recall,f1-score
Taxon,0.860412,0.642186,0.735452


In [21]:
# Same scores under the `approximate` criterion (presumably provided by the
# eval_utils star import; its matching rule is not visible here).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$86.73$ & $64.73$ & $74.13$


Unnamed: 0,precision,recall,f1-score
Taxon,0.867277,0.64731,0.74132


In [22]:
# Split the predictions into FN / FP / TP lists (judging by the names: false
# negatives, false positives, true positives) under the `exact` criterion,
# then display FP for error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)
FP

[{'start': 5504, 'end': 5508, 'text': 'ml V'},
 {'start': 6765, 'end': 6769, 'text': 'ml V'},
 {'start': 11772, 'end': 11779, 'text': 'M tetra'},
 {'start': 36346, 'end': 36350, 'text': 'CH-P'},
 {'start': 13728, 'end': 13732, 'text': 'mray'},
 {'start': 32702, 'end': 32709, 'text': 'E. coli'},
 {'start': 35329, 'end': 35336, 'text': 'E. coli'},
 {'start': 42459, 'end': 42464, 'text': 'human'},
 {'start': 43668, 'end': 43673, 'text': 'human'},
 {'start': 6344, 'end': 6350, 'text': 'Murine'},
 {'start': 36279, 'end': 36282, 'text': 'SSP'},
 {'start': 36660, 'end': 36663, 'text': 'SSP'},
 {'start': 2218, 'end': 2231, 'text': 'salmonid fish'},
 {'start': 2570, 'end': 2583, 'text': 'salmonid fish'},
 {'start': 3351, 'end': 3364, 'text': 'salmonid fish'},
 {'start': 29504, 'end': 29519, 'text': 'Atlantic Salmon'},
 {'start': 22, 'end': 25, 'text': 'GH1'},
 {'start': 106, 'end': 109, 'text': 'GH1'},
 {'start': 292, 'end': 295, 'text': 'GH1'},
 {'start': 437, 'end': 440, 'text': 'GH1'},
 {'st

#### Eval on S800 GSC

In [23]:
# S800 test split: passed to run_species() as the corpus to tag and to the
# evaluation helpers as the reference annotations.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Directory that receives the SPECIES predictions (.ann files) for S800.
PATH_TO_S800_PRED = "./output/SPECIES/S800_pred"

In [24]:
# Tag the corpus with SPECIES, then convert the raw tags file into per-document .ann files.
# Note: run_species() deletes and recreates PATH_TO_S800_PRED before writing.
tags_filename = run_species(PATH_TO_S800_GT, PATH_TO_S800_PRED)
parse_species(PATH_TO_S800_GT, tags_filename, PATH_TO_S800_PRED)

In [25]:
# Precision / recall / F1 of the predictions against the reference annotations
# under the `exact` criterion (`exact` is not defined in this notebook —
# presumably provided by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$75.31$ & $72.36$ & $73.80$


Unnamed: 0,precision,recall,f1-score
Taxon,0.753053,0.723598,0.738032


In [26]:
# Same scores under the `approximate` criterion (presumably provided by the
# eval_utils star import; its matching rule is not visible here).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$77.20$ & $74.19$ & $75.66$


Unnamed: 0,precision,recall,f1-score
Taxon,0.772049,0.741851,0.756649


In [27]:
# Split the predictions into FN / FP / TP lists (judging by the names: false
# negatives, false positives, true positives) under the `exact` criterion,
# then display FP for error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)
FP

[{'start': 65, 'end': 69, 'text': 'PERV'},
 {'start': 149, 'end': 153, 'text': 'PERV'},
 {'start': 450, 'end': 455, 'text': 'MoMLV'},
 {'start': 456, 'end': 460, 'text': 'PERV'},
 {'start': 620, 'end': 623, 'text': 'MLV'},
 {'start': 787, 'end': 791, 'text': 'PERV'},
 {'start': 874, 'end': 878, 'text': 'PERV'},
 {'start': 902, 'end': 908, 'text': 'murine'},
 {'start': 984, 'end': 988, 'text': 'PERV'},
 {'start': 1135, 'end': 1139, 'text': 'PERV'},
 {'start': 1243, 'end': 1247, 'text': 'PERV'},
 {'start': 729, 'end': 740, 'text': 'Parus major'},
 {'start': 456, 'end': 478, 'text': 'Peridiniopsis penardii'},
 {'start': 618, 'end': 629, 'text': 'P. penardii'},
 {'start': 791, 'end': 802, 'text': 'P. penardii'},
 {'start': 807, 'end': 818, 'text': 'P. penardii'},
 {'start': 980, 'end': 991, 'text': 'P. penardii'},
 {'start': 70, 'end': 92, 'text': 'Pseudomonas aeruginosa'},
 {'start': 175, 'end': 197, 'text': 'Pseudomonas aeruginosa'},
 {'start': 313, 'end': 318, 'text': 'sheep'},
 {'start

#### Eval on COPIOUS GSC

In [28]:
# COPIOUS test split: passed to run_species() as the corpus to tag and to the
# evaluation helpers as the reference annotations.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_ascii/test'
# Directory that receives the SPECIES predictions (.ann files) for COPIOUS.
PATH_TO_COPIOUS_PRED = "./output/SPECIES/COPIOUS_pred"

In [29]:
# Tag the corpus with SPECIES, then convert the raw tags file into per-document .ann files.
# Note: run_species() deletes and recreates PATH_TO_COPIOUS_PRED before writing.
tags_filename = run_species(PATH_TO_COPIOUS_GT, PATH_TO_COPIOUS_PRED)
parse_species(PATH_TO_COPIOUS_GT, tags_filename, PATH_TO_COPIOUS_PRED)

In [30]:
# Precision / recall / F1 of the predictions against the reference annotations
# under the `exact` criterion (`exact` is not defined in this notebook —
# presumably provided by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$54.86$ & $13.82$ & $22.08$


Unnamed: 0,precision,recall,f1-score
Taxon,0.548638,0.138235,0.22083


In [31]:
# Same scores under the `approximate` criterion (presumably provided by the
# eval_utils star import; its matching rule is not visible here).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$55.25$ & $13.92$ & $22.24$


Unnamed: 0,precision,recall,f1-score
Taxon,0.552529,0.139216,0.222396


In [32]:
# Split the predictions into FN / FP / TP lists (judging by the names: false
# negatives, false positives, true positives) under the `exact` criterion,
# then display FP for error analysis.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)
FP

[{'start': 0, 'end': 3, 'text': 'MAI'},
 {'start': 779, 'end': 782, 'text': 'MAI'},
 {'start': 1903, 'end': 1910, 'text': 'silvery'},
 {'start': 1957, 'end': 1962, 'text': 'kapur'},
 {'start': 2073, 'end': 2084, 'text': 'S. lepidota'},
 {'start': 2282, 'end': 2298, 'text': 'S. leprosula Miq'},
 {'start': 2404, 'end': 2413, 'text': 'S. ovalis'},
 {'start': 2543, 'end': 2552, 'text': 'chocolate'},
 {'start': 3023, 'end': 3041, 'text': 'S. palembanica Miq'},
 {'start': 270, 'end': 298, 'text': 'Symphoricarpos rotundifolius'},
 {'start': 316, 'end': 325, 'text': 'snowberry'},
 {'start': 327, 'end': 337, 'text': 'S. glaucus'},
 {'start': 582, 'end': 595, 'text': 'Varied thrush'},
 {'start': 651, 'end': 660, 'text': 'mule deer'},
 {'start': 857, 'end': 866, 'text': 'snowberry'},
 {'start': 1067, 'end': 1086, 'text': 'Symplocos tinctoria'},
 {'start': 2301, 'end': 2313, 'text': 'fox squirrel'},
 {'start': 1662, 'end': 1669, 'text': 'Sarawak'},
 {'start': 2178, 'end': 2184, 'text': 'salmon'},


#### Eval on BB task corpus

In [33]:
# BB task test split: passed to run_species() as the corpus to tag and to the
# evaluation helpers as the reference annotations.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_ascii/test'
# Directory that receives the SPECIES predictions (.ann files) for the BB corpus.
PATH_TO_BB_PRED = "./output/SPECIES/BB_pred"

In [34]:
# Tag the corpus with SPECIES, then convert the raw tags file into per-document .ann files.
# Note: run_species() deletes and recreates PATH_TO_BB_PRED before writing.
tags_filename = run_species(PATH_TO_BB_GT, PATH_TO_BB_PRED)
parse_species(PATH_TO_BB_GT, tags_filename, PATH_TO_BB_PRED)

In [35]:
# Precision / recall / F1 of the predictions against the reference annotations
# under the `exact` criterion (`exact` is not defined in this notebook —
# presumably provided by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$67.47$ & $49.25$ & $56.94$


Unnamed: 0,precision,recall,f1-score
Taxon,0.674658,0.4925,0.569364


In [36]:
# Same scores under the `approximate` criterion (presumably provided by the
# eval_utils star import; its matching rule is not visible here).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$67.47$ & $49.25$ & $56.94$


Unnamed: 0,precision,recall,f1-score
Taxon,0.674658,0.4925,0.569364


In [37]:
# Split the predictions into FN / FP / TP lists (judging by the names: false
# negatives, false positives, true positives) under the `exact` criterion.
# NOTE(review): this cell displays FN, whereas the LINNAEUS, S800 and COPIOUS
# cells display FP — confirm this is intentional.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)
FN

[{'start': 783, 'end': 796, 'text': 'Campylobacter'},
 {'start': 19, 'end': 49, 'text': 'L. lactis subsp. cremoris B697'},
 {'start': 230, 'end': 268, 'text': 'L. lactis subsp. cremoris strain B1157'},
 {'start': 458, 'end': 462, 'text': 'B697'},
 {'start': 173, 'end': 195, 'text': 'Vibro parahaemolyticus'},
 {'start': 44, 'end': 55, 'text': 'Pseudomonas'},
 {'start': 98, 'end': 107, 'text': 'Halomonas'},
 {'start': 127, 'end': 145, 'text': 'Enterobacteriaceae'},
 {'start': 36, 'end': 66, 'text': 'Ara+ Burkholderia pseudomallei'},
 {'start': 151, 'end': 155, 'text': 'Ara-'},
 {'start': 258, 'end': 297, 'text': 'Burkholderia (Pseudomonas) pseudomallei'},
 {'start': 1180, 'end': 1184, 'text': 'Ara-'},
 {'start': 1198, 'end': 1202, 'text': 'Ara-'},
 {'start': 1212, 'end': 1216, 'text': 'Ara+'},
 {'start': 1489, 'end': 1493, 'text': 'Ara-'},
 {'start': 1498, 'end': 1502, 'text': 'Ara+'},
 {'start': 132, 'end': 146, 'text': 'Staphylococcus'},
 {'start': 1101, 'end': 1112, 'text': 'Penicilli