In [1]:
import os

### Build the MERPY Docker image

In [2]:
import docker

In [3]:
# Directory handed to the Docker build as `path=` (relative to the notebook's working directory).
DOCKERFILE_PATH = "../images/MER"

# Create a Docker client configured from the environment.
client = docker.from_env()
# Build the image and tag it "merpy:latest"; run_merpy later looks the image up by this tag.
# As the last expression of the cell, the (image, build-log iterator) result is displayed.
client.images.build(path=DOCKERFILE_PATH, tag="merpy:latest")

(<Image: 'merpy:latest'>, <itertools._tee at 0x7f21eb1b22c0>)

### Run MERPY and parse results

In [4]:
import shutil

def run_merpy(input_dir, output_dir):
    """Run the ``merpy:latest`` container on ``input_dir`` and return its output.

    ``output_dir`` is deleted (if it exists) and recreated empty before the
    container starts, so each call begins from a clean prediction directory.
    ``input_dir`` is mounted read-only at ``/home/mer/input`` and
    ``output_dir`` at ``/home/mer/output`` inside the container, which then
    executes ``python3 run_merpy.py input output``.

    Args:
        input_dir: Existing directory with the documents to process.
        output_dir: Directory that receives the container's output files.
            Its previous contents are removed.

    Returns:
        str: The bytes returned by the container run, decoded as UTF-8.

    Raises:
        FileNotFoundError: If ``input_dir`` is not an existing directory.
        ValueError: If ``input_dir`` is ``output_dir`` or lies inside it,
            because wiping the output directory would delete the input.
    """
    input_path = os.path.abspath(input_dir)
    output_path = os.path.abspath(output_dir)

    # Validate *before* touching the filesystem: the output directory is wiped
    # below, so a bad call must fail while nothing has been deleted yet.
    if not os.path.isdir(input_path):
        raise FileNotFoundError(
            "input_dir is not an existing directory: {!r}".format(input_dir)
        )
    output_prefix = output_path.rstrip(os.sep) + os.sep
    if input_path == output_path or input_path.startswith(output_prefix):
        raise ValueError(
            "input_dir {!r} is the same as, or inside, output_dir {!r}; "
            "clearing the output directory would delete the input".format(
                input_dir, output_dir
            )
        )

    # Start from an empty output directory so stale predictions never mix
    # with the results of this run.
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    # Host path -> container mount. The input is read-only; the output entry
    # sets no mode, exactly as before.
    volumes = {
        input_path: {'bind': '/home/mer/input', 'mode': 'ro'},
        output_path: {'bind': '/home/mer/output'},
    }

    client = docker.from_env()
    image = client.images.get("merpy:latest")
    # remove=True asks Docker to delete the container once it has finished.
    response = client.containers.run(
        image,
        "python3 run_merpy.py input output",
        volumes=volumes,
        remove=True,
    )
    return response.decode("utf-8")

### Evaluate MERPY on the test corpora

In [5]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [6]:
# Directory passed to run_merpy as input_dir and to the eval helpers as the reference side.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test/'
# Directory passed to run_merpy as output_dir; run_merpy deletes and recreates it on every call.
PATH_TO_LINNAEUS_PRED = "./output/MERPY/LINNAEUS_pred"

In [7]:
# Run the merpy:latest container over the LINNAEUS test directory; `response` holds the decoded container output.
response = run_merpy(PATH_TO_LINNAEUS_GT, PATH_TO_LINNAEUS_PRED)

In [8]:
# Score the LINNAEUS predictions against the reference directory with the `exact` criterion.
# NOTE(review): `exact` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$27.35$ & $47.40$ & $34.69$


Unnamed: 0,precision,recall,f1-score
Taxon,0.273534,0.473954,0.346875


In [9]:
# Same scoring as the `exact` run on LINNAEUS, but with the `approximate` criterion.
# NOTE(review): `approximate` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$27.38$ & $47.40$ & $34.71$


Unnamed: 0,precision,recall,f1-score
Taxon,0.273804,0.473954,0.347092


In [10]:
# Unpack the three lists returned for LINNAEUS under the `exact` criterion.
# Presumably false negatives, false positives and true positives — confirm in eval_utils.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)
# Display FP as the cell output for manual error inspection.
FP

[{'start': 1920, 'end': 1925, 'text': 'major'},
 {'start': 1926, 'end': 1931, 'text': 'areas'},
 {'start': 6209, 'end': 6214, 'text': 'areas'},
 {'start': 10937, 'end': 10942, 'text': 'areas'},
 {'start': 11029, 'end': 11034, 'text': 'areas'},
 {'start': 12356, 'end': 12360, 'text': 'data'},
 {'start': 17990, 'end': 17996, 'text': 'permit'},
 {'start': 18657, 'end': 18661, 'text': 'none'},
 {'start': 19227, 'end': 19230, 'text': 'fax'},
 {'start': 19277, 'end': 19281, 'text': 'Data'},
 {'start': 26301, 'end': 26306, 'text': 'major'},
 {'start': 26307, 'end': 26312, 'text': 'areas'},
 {'start': 27174, 'end': 27179, 'text': 'areas'},
 {'start': 27427, 'end': 27432, 'text': 'areas'},
 {'start': 28316, 'end': 28323, 'text': 'Alberta'},
 {'start': 28591, 'end': 28596, 'text': 'areas'},
 {'start': 31927, 'end': 31934, 'text': 'unknown'},
 {'start': 32793, 'end': 32798, 'text': 'areas'},
 {'start': 34618, 'end': 34623, 'text': 'areas'},
 {'start': 34660, 'end': 34665, 'text': 'areas'},
 {'sta

#### Eval on S800 GSC

In [11]:
# Directory passed to run_merpy as input_dir and to the eval helpers as the reference side.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Directory passed to run_merpy as output_dir; run_merpy deletes and recreates it on every call.
PATH_TO_S800_PRED = "./output/MERPY/S800_pred"

In [12]:
# Run the merpy:latest container over the S800 test directory; `response` holds the decoded container output.
response = run_merpy(PATH_TO_S800_GT, PATH_TO_S800_PRED)

In [13]:
# Score the S800 predictions against the reference directory with the `exact` criterion.
# NOTE(review): `exact` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$28.82$ & $55.80$ & $38.01$


Unnamed: 0,precision,recall,f1-score
Taxon,0.288215,0.558018,0.380107


In [14]:
# Same scoring as the `exact` run on S800, but with the `approximate` criterion.
# NOTE(review): `approximate` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$29.86$ & $57.63$ & $39.34$


Unnamed: 0,precision,recall,f1-score
Taxon,0.298649,0.576271,0.393413


In [15]:
# Unpack the three lists returned for S800 under the `exact` criterion.
# Presumably false negatives, false positives and true positives — confirm in eval_utils.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)
# Display FN as the cell output for manual error inspection.
FN

[{'start': 166, 'end': 197, 'text': 'Porcine endogenous retroviruses'},
 {'start': 199, 'end': 204, 'text': 'PERVs'},
 {'start': 394, 'end': 399, 'text': 'PERVs'},
 {'start': 630, 'end': 640, 'text': 'retroviral'},
 {'start': 665, 'end': 669, 'text': 'PERV'},
 {'start': 787, 'end': 793, 'text': 'PERV-B'},
 {'start': 909, 'end': 919, 'text': 'retroviral'},
 {'start': 984, 'end': 990, 'text': 'PERV-B'},
 {'start': 1135, 'end': 1141, 'text': 'PERV-A'},
 {'start': 173, 'end': 178, 'text': 'M2(T)'},
 {'start': 327, 'end': 332, 'text': 'M2(T)'},
 {'start': 550, 'end': 555, 'text': 'M2(T)'},
 {'start': 635, 'end': 666, 'text': 'Methanobacterium veterum MK4(T)'},
 {'start': 697, 'end': 707, 'text': 'DSM 863(T)'},
 {'start': 827, 'end': 832, 'text': 'M2(T)'},
 {'start': 880, 'end': 885, 'text': 'M2(T)'},
 {'start': 981, 'end': 986, 'text': 'M2(T)'},
 {'start': 427, 'end': 433, 'text': 'fungal'},
 {'start': 992, 'end': 1005, 'text': 'M. truncatula'},
 {'start': 717, 'end': 727, 'text': 'great ti

#### Eval on COPIOUS GSC

In [16]:
# Directory passed to run_merpy as input_dir and to the eval helpers as the reference side.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_clean/test'
# Directory passed to run_merpy as output_dir; run_merpy deletes and recreates it on every call.
PATH_TO_COPIOUS_PRED = "./output/MERPY/COPIOUS_pred"

In [17]:
# Run the merpy:latest container over the COPIOUS test directory; `response` holds the decoded container output.
response = run_merpy(PATH_TO_COPIOUS_GT, PATH_TO_COPIOUS_PRED)

In [18]:
# Score the COPIOUS predictions against the reference directory with the `exact` criterion.
# NOTE(review): `exact` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$22.63$ & $23.82$ & $23.21$


Unnamed: 0,precision,recall,f1-score
Taxon,0.226257,0.238235,0.232092


In [19]:
# Same scoring as the `exact` run on COPIOUS, but with the `approximate` criterion.
# NOTE(review): `approximate` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$22.81$ & $24.02$ & $23.40$


Unnamed: 0,precision,recall,f1-score
Taxon,0.228119,0.240196,0.234002


In [20]:
# Unpack the three lists returned for COPIOUS under the `exact` criterion.
# Presumably false negatives, false positives and true positives — confirm in eval_utils.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)
# Display FP as the cell output for manual error inspection.
FP

[{'start': 0, 'end': 3, 'text': 'MAI'},
 {'start': 175, 'end': 183, 'text': 'sleepers'},
 {'start': 366, 'end': 369, 'text': 'Mai'},
 {'start': 640, 'end': 643, 'text': 'Mai'},
 {'start': 779, 'end': 782, 'text': 'MAI'},
 {'start': 948, 'end': 951, 'text': 'Mai'},
 {'start': 590, 'end': 596, 'text': 'Shorea'},
 {'start': 909, 'end': 913, 'text': 'ants'},
 {'start': 1450, 'end': 1459, 'text': 'elliptica'},
 {'start': 2494, 'end': 2498, 'text': 'none'},
 {'start': 2543, 'end': 2552, 'text': 'chocolate'},
 {'start': 2945, 'end': 2950, 'text': 'Mesua'},
 {'start': 590, 'end': 606, 'text': 'Shorea acuminata'},
 {'start': 0, 'end': 12, 'text': 'FORAMINIFERA'},
 {'start': 13, 'end': 19, 'text': 'PLANTS'},
 {'start': 45, 'end': 59, 'text': 'Symphoricarpos'},
 {'start': 74, 'end': 88, 'text': 'Symphoricarpos'},
 {'start': 89, 'end': 99, 'text': 'oreophilus'},
 {'start': 101, 'end': 115, 'text': 'Symphoricarpos'},
 {'start': 164, 'end': 178, 'text': 'Symphoricarpos'},
 {'start': 196, 'end': 210,

#### Eval on BB task corpus

In [26]:
# Directory passed to run_merpy as input_dir and to the eval helpers as the reference side.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_ascii/test'
# Directory passed to run_merpy as output_dir; run_merpy deletes and recreates it on every call.
PATH_TO_BB_PRED = "./output/MERPY/BB_pred"

In [27]:
# Run the merpy:latest container over the BB test directory; `response` holds the decoded container output.
response = run_merpy(PATH_TO_BB_GT, PATH_TO_BB_PRED)

In [28]:
# Score the BB predictions against the reference directory with the `exact` criterion.
# NOTE(review): `exact` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$34.39$ & $43.50$ & $38.41$


Unnamed: 0,precision,recall,f1-score
Taxon,0.343874,0.435,0.384106


In [29]:
# Same scoring as the `exact` run on BB, but with the `approximate` criterion.
# NOTE(review): `approximate` is expected to come from the eval_utils star import; its matching rule is not visible here.
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$34.39$ & $43.50$ & $38.41$


Unnamed: 0,precision,recall,f1-score
Taxon,0.343874,0.435,0.384106


In [30]:
# Unpack the three lists returned for BB under the `exact` criterion.
# Presumably false negatives, false positives and true positives — confirm in eval_utils.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)
# Display FN as the cell output for manual error inspection.
FN

[{'start': 712, 'end': 720, 'text': 'H pylori'},
 {'start': 783, 'end': 796, 'text': 'Campylobacter'},
 {'start': 1271, 'end': 1279, 'text': 'H pylori'},
 {'start': 1419, 'end': 1427, 'text': 'H pylori'},
 {'start': 1627, 'end': 1635, 'text': 'H pylori'},
 {'start': 1651, 'end': 1659, 'text': 'H pylori'},
 {'start': 2358, 'end': 2366, 'text': 'H pylori'},
 {'start': 2539, 'end': 2547, 'text': 'H pylori'},
 {'start': 19, 'end': 49, 'text': 'L. lactis subsp. cremoris B697'},
 {'start': 230, 'end': 268, 'text': 'L. lactis subsp. cremoris strain B1157'},
 {'start': 458, 'end': 462, 'text': 'B697'},
 {'start': 173, 'end': 195, 'text': 'Vibro parahaemolyticus'},
 {'start': 36, 'end': 66, 'text': 'Ara+ Burkholderia pseudomallei'},
 {'start': 151, 'end': 155, 'text': 'Ara-'},
 {'start': 258, 'end': 297, 'text': 'Burkholderia (Pseudomonas) pseudomallei'},
 {'start': 696, 'end': 711, 'text': 'B. pseudomallei'},
 {'start': 1180, 'end': 1184, 'text': 'Ara-'},
 {'start': 1198, 'end': 1202, 'text': 