In [2]:
import os

### Build LINNAEUS docker image

In [3]:
import docker

In [4]:
# Directory holding the Dockerfile for the LINNAEUS image (relative path).
DOCKERFILE_PATH = "../images/LINNAEUS"

# Create a Docker client from the environment configuration and build the image.
# The "linnaeus:latest" tag is the one run_linnaeus looks up with images.get().
client = docker.from_env()
client.images.build(path=DOCKERFILE_PATH, tag="linnaeus:latest")

(<Image: 'linnaeus:latest'>, <itertools._tee at 0x7f9540722c40>)

### Run LINNAEUS and parse results

In [5]:
import subprocess

import shutil
from glob import glob
import json

def run_linnaeus(input_dir, output_dir):
    """Run the LINNAEUS tagger in Docker over a fresh copy of ``input_dir``.

    ``output_dir`` is deleted if it already exists, recreated as a copy of
    ``input_dir``, and stripped of every ``*.ann`` file before tagging
    (presumably so that existing annotations are not mixed with the tagger
    output -- confirm). The copy is then mounted into a container started
    from the ``linnaeus:latest`` image and used as both the text directory
    and the output directory of the LINNAEUS jar.

    Parameters
    ----------
    input_dir : str
        Directory to copy the corpus from; it is not modified.
    output_dir : str
        Working directory that is wiped and rebuilt on every call.

    Returns
    -------
    None
        The value returned by ``containers.run`` is printed, not returned.
    """
    # Rebuild the working directory from scratch.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    shutil.copytree(input_dir, output_dir)

    # Drop the annotation files that were copied along with the corpus.
    for copied_annotation in glob(os.path.join(output_dir, "*.ann")):
        os.remove(copied_annotation)

    # Path inside the container where the working directory is mounted.
    container_corpus = "/home/linnaeus/corpus"
    mounts = {os.path.abspath(output_dir): {'bind': container_corpus}}

    # Same directory is passed as --textDir and --outDir.
    command = "java -jar {} --properties {} --textDir {} --outDir {}".format(
        "/home/linnaeus/linnaeus/bin/linnaeus-2.0.jar",
        "/home/linnaeus/species-proxy/properties.conf",
        container_corpus,
        container_corpus,
    )

    docker_client = docker.from_env()
    linnaeus_image = docker_client.images.get("linnaeus:latest")
    container_output = docker_client.containers.run(
        linnaeus_image,
        command,
        volumes=mounts,
        remove=True,  # delete the container once it has finished
    )
    print(container_output)

In [6]:
import pandas as pd
from glob import glob

def parse_linnaeus(output_dir):
    """Convert every ``*.tags`` file in ``output_dir`` into a ``.ann`` file.

    Each tags file is read as a tab-separated table with a header row. The
    ``#entity id``, ``document`` and ``comment`` columns are dropped, a
    constant ``Taxon`` type column is inserted first, and the rows are
    labelled ``T0``, ``T1``, ... The result is written tab-separated and
    without a header, so each output line is
    ``T<n>\\tTaxon\\t<remaining columns in file order>`` (the remaining
    columns are presumably start, end and text -- confirm against the
    LINNAEUS output). The source ``.tags`` file is deleted afterwards.

    Parameters
    ----------
    output_dir : str
        Directory containing the ``*.tags`` files; the ``.ann`` files are
        written to the same directory.

    Raises
    ------
    KeyError
        If one of the dropped columns is missing from a tags file.
    """
    for tags_filename in glob(os.path.join(output_dir, "*.tags")):
        # na_filter=False keeps every field verbatim. With pandas' default NA
        # detection a mention whose text is literally "NA", "null", "NaN", ...
        # is parsed as missing and ends up as an empty text field in the
        # .ann file.
        tags = pd.read_csv(tags_filename, sep="\t", na_filter=False)

        annotations = (
            tags
            .rename('T{}'.format)  # row labels 0, 1, ... -> "T0", "T1", ...
            .drop(columns=["#entity id", "document", "comment"])
        )
        # A scalar is broadcast to every row (and is valid for zero rows).
        annotations.insert(0, "type", "Taxon")

        # NOTE(review): everything after the FIRST dot of the file name is
        # discarded, so "a.b.tags" becomes "a.ann". Kept as-is because the
        # naming of LINNAEUS output files is not visible here.
        ann_filename = os.path.basename(tags_filename).split(".")[0] + ".ann"
        annotations.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)
        os.remove(tags_filename)

### Eval LINNAEUS on test corpora

In [7]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [9]:
# Corpus directory used as run_linnaeus input and as the reference in the evaluation calls.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test'
# Working directory that run_linnaeus wipes and refills with the tagger output.
PATH_TO_LINNAEUS_PRED = "./output/LINNAEUS/LINNAEUS_pred"

In [11]:
# Tag a fresh copy of the corpus with LINNAEUS, then convert its .tags output into .ann files.
run_linnaeus(PATH_TO_LINNAEUS_GT, PATH_TO_LINNAEUS_PRED)
parse_linnaeus(PATH_TO_LINNAEUS_PRED)

b'2021-03-22 09:54:42: Abbreviation resolution mode set to: true.\n2021-03-22 09:54:42: Disambiguation mode set to: ON_WHOLE.\nLoading postprocessing data files... \n done (s: 17, a: 340, f: 0, c: 0).\n2021-03-22 09:54:42: Loading variantMatcher from /home/linnaeus/species-proxy//dict-species-proxy.tsv, ignoreCase = false...\n2021-03-22 09:54:59: Completed.\n'


In [10]:
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$95.71$ & $78.14$ & $86.04$


Unnamed: 0,precision,recall,f1-score
Taxon,0.957113,0.781383,0.860367


In [11]:
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$96.03$ & $78.39$ & $86.32$


Unnamed: 0,precision,recall,f1-score
Taxon,0.960251,0.783945,0.863188


In [12]:
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)
FN

[{'start': 7712, 'end': 7719, 'text': 'peoples'},
 {'start': 16761, 'end': 16772, 'text': 'Participant'},
 {'start': 17033, 'end': 17045, 'text': 'participants'},
 {'start': 18069, 'end': 18081, 'text': 'participants'},
 {'start': 27828, 'end': 27836, 'text': 'patients'},
 {'start': 28273, 'end': 28280, 'text': 'persons'},
 {'start': 28679, 'end': 28691, 'text': 'participants'},
 {'start': 40235, 'end': 40247, 'text': 'participants'},
 {'start': 40407, 'end': 40419, 'text': 'participants'},
 {'start': 2070, 'end': 2080, 'text': 'Drosophila'},
 {'start': 5130, 'end': 5134, 'text': 'Calf'},
 {'start': 130, 'end': 138, 'text': 'patients'},
 {'start': 998, 'end': 1006, 'text': 'patients'},
 {'start': 3083, 'end': 3091, 'text': 'patients'},
 {'start': 4615, 'end': 4623, 'text': 'patients'},
 {'start': 5499, 'end': 5507, 'text': 'patients'},
 {'start': 7557, 'end': 7565, 'text': 'patients'},
 {'start': 17007, 'end': 17015, 'text': 'patients'},
 {'start': 21336, 'end': 21344, 'text': 'patient

#### Eval on S800 GSC

In [13]:
# Corpus directory used as run_linnaeus input and as the reference in the evaluation calls.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Working directory that run_linnaeus wipes and refills with the tagger output.
PATH_TO_S800_PRED = "./output/LINNAEUS/S800_pred"

In [14]:
# Tag a fresh copy of the corpus with LINNAEUS, then convert its .tags output into .ann files.
run_linnaeus(PATH_TO_S800_GT, PATH_TO_S800_PRED)
parse_linnaeus(PATH_TO_S800_PRED)

b'2021-03-22 10:00:02: Abbreviation resolution mode set to: true.\n2021-03-22 10:00:02: Disambiguation mode set to: ON_WHOLE.\nLoading postprocessing data files... \n done (s: 17, a: 340, f: 0, c: 0).\n2021-03-22 10:00:02: Loading variantMatcher from /home/linnaeus/species-proxy//dict-species-proxy.tsv, ignoreCase = false...\n2021-03-22 10:00:19: Completed.\n'


In [15]:
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$77.41$ & $70.14$ & $73.60$


Unnamed: 0,precision,recall,f1-score
Taxon,0.774101,0.701434,0.735978


In [16]:
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$77.84$ & $70.53$ & $74.01$


Unnamed: 0,precision,recall,f1-score
Taxon,0.778417,0.705346,0.740082


In [17]:
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)
FN

[{'start': 35, 'end': 64, 'text': 'porcine endogenous retrovirus'},
 {'start': 166, 'end': 197, 'text': 'Porcine endogenous retroviruses'},
 {'start': 199, 'end': 204, 'text': 'PERVs'},
 {'start': 394, 'end': 399, 'text': 'PERVs'},
 {'start': 630, 'end': 640, 'text': 'retroviral'},
 {'start': 665, 'end': 669, 'text': 'PERV'},
 {'start': 787, 'end': 793, 'text': 'PERV-B'},
 {'start': 909, 'end': 919, 'text': 'retroviral'},
 {'start': 984, 'end': 990, 'text': 'PERV-B'},
 {'start': 1135, 'end': 1141, 'text': 'PERV-A'},
 {'start': 173, 'end': 178, 'text': 'M2(T)'},
 {'start': 327, 'end': 332, 'text': 'M2(T)'},
 {'start': 550, 'end': 555, 'text': 'M2(T)'},
 {'start': 635, 'end': 666, 'text': 'Methanobacterium veterum MK4(T)'},
 {'start': 697, 'end': 707, 'text': 'DSM 863(T)'},
 {'start': 827, 'end': 832, 'text': 'M2(T)'},
 {'start': 880, 'end': 885, 'text': 'M2(T)'},
 {'start': 981, 'end': 986, 'text': 'M2(T)'},
 {'start': 427, 'end': 433, 'text': 'fungal'},
 {'start': 717, 'end': 727, 'tex

#### Eval on COPIOUS GSC

In [18]:
# Corpus directory used as run_linnaeus input and as the reference in the evaluation calls.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_clean/test'
# Working directory that run_linnaeus wipes and refills with the tagger output.
PATH_TO_COPIOUS_PRED = "./output/LINNAEUS/COPIOUS_pred"

In [19]:
# Tag a fresh copy of the corpus with LINNAEUS, then convert its .tags output into .ann files.
run_linnaeus(PATH_TO_COPIOUS_GT, PATH_TO_COPIOUS_PRED)
parse_linnaeus(PATH_TO_COPIOUS_PRED)

b'2021-03-22 10:00:41: Abbreviation resolution mode set to: true.\n2021-03-22 10:00:41: Disambiguation mode set to: ON_WHOLE.\nLoading postprocessing data files... \n done (s: 17, a: 340, f: 0, c: 0).\n2021-03-22 10:00:41: Loading variantMatcher from /home/linnaeus/species-proxy//dict-species-proxy.tsv, ignoreCase = false...\n2021-03-22 10:00:59: Completed.\n'


In [20]:
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$50.20$ & $12.55$ & $20.08$


Unnamed: 0,precision,recall,f1-score
Taxon,0.501961,0.12549,0.200784


In [21]:
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

In [22]:
FP

[{'start': 0, 'end': 3, 'text': 'MAI'},
 {'start': 779, 'end': 782, 'text': 'MAI'},
 {'start': 1957, 'end': 1962, 'text': 'kapur'},
 {'start': 2073, 'end': 2084, 'text': 'S. lepidota'},
 {'start': 2282, 'end': 2294, 'text': 'S. leprosula'},
 {'start': 2404, 'end': 2413, 'text': 'S. ovalis'},
 {'start': 3023, 'end': 3037, 'text': 'S. palembanica'},
 {'start': 270, 'end': 298, 'text': 'Symphoricarpos rotundifolius'},
 {'start': 316, 'end': 325, 'text': 'snowberry'},
 {'start': 327, 'end': 337, 'text': 'S. glaucus'},
 {'start': 582, 'end': 595, 'text': 'Varied thrush'},
 {'start': 651, 'end': 660, 'text': 'mule deer'},
 {'start': 857, 'end': 866, 'text': 'snowberry'},
 {'start': 1067, 'end': 1086, 'text': 'Symplocos tinctoria'},
 {'start': 1088, 'end': 1090, 'text': 'L.'},
 {'start': 1092, 'end': 1094, 'text': "L'"},
 {'start': 2301, 'end': 2313, 'text': 'fox squirrel'},
 {'start': 1042, 'end': 1046, 'text': 'lime'},
 {'start': 1968, 'end': 1971, 'text': 'men'},
 {'start': 485, 'end': 497

In [23]:
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$50.59$ & $12.65$ & $20.24$


Unnamed: 0,precision,recall,f1-score
Taxon,0.505882,0.126471,0.202353


#### Eval on BB task corpus

In [24]:
# Corpus directory used as run_linnaeus input and as the reference in the evaluation calls.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_clean/test'
# Working directory that run_linnaeus wipes and refills with the tagger output.
PATH_TO_BB_PRED = "./output/LINNAEUS/BB_pred"

In [25]:
# Tag a fresh copy of the corpus with LINNAEUS, then convert its .tags output into .ann files.
run_linnaeus(PATH_TO_BB_GT, PATH_TO_BB_PRED)
parse_linnaeus(PATH_TO_BB_PRED)

b'2021-03-22 10:01:14: Abbreviation resolution mode set to: true.\n2021-03-22 10:01:14: Disambiguation mode set to: ON_WHOLE.\nLoading postprocessing data files... \n done (s: 17, a: 340, f: 0, c: 0).\n2021-03-22 10:01:14: Loading variantMatcher from /home/linnaeus/species-proxy//dict-species-proxy.tsv, ignoreCase = false...\n2021-03-22 10:01:31: Completed.\n'


In [26]:
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$60.91$ & $46.75$ & $52.90$


Unnamed: 0,precision,recall,f1-score
Taxon,0.609121,0.4675,0.528996


In [27]:
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$60.91$ & $46.75$ & $52.90$


Unnamed: 0,precision,recall,f1-score
Taxon,0.609121,0.4675,0.528996


In [28]:
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)
FN

[{'start': 712, 'end': 720, 'text': 'H pylori'},
 {'start': 783, 'end': 796, 'text': 'Campylobacter'},
 {'start': 1271, 'end': 1279, 'text': 'H pylori'},
 {'start': 1419, 'end': 1427, 'text': 'H pylori'},
 {'start': 1627, 'end': 1635, 'text': 'H pylori'},
 {'start': 1651, 'end': 1659, 'text': 'H pylori'},
 {'start': 2358, 'end': 2366, 'text': 'H pylori'},
 {'start': 2539, 'end': 2547, 'text': 'H pylori'},
 {'start': 19, 'end': 49, 'text': 'L. lactis subsp. cremoris B697'},
 {'start': 230, 'end': 268, 'text': 'L. lactis subsp. cremoris strain B1157'},
 {'start': 458, 'end': 462, 'text': 'B697'},
 {'start': 173, 'end': 195, 'text': 'Vibro parahaemolyticus'},
 {'start': 44, 'end': 55, 'text': 'Pseudomonas'},
 {'start': 98, 'end': 107, 'text': 'Halomonas'},
 {'start': 127, 'end': 145, 'text': 'Enterobacteriaceae'},
 {'start': 36, 'end': 66, 'text': 'Ara+ Burkholderia pseudomallei'},
 {'start': 151, 'end': 155, 'text': 'Ara-'},
 {'start': 258, 'end': 297, 'text': 'Burkholderia (Pseudomonas)