In [1]:
import os

### Build GNFINDER docker image

In [2]:
import docker

In [3]:
# Build context (directory holding the Dockerfile) for the gnfinder image,
# relative to the notebook's working directory.
DOCKERFILE_PATH = "../images/gnfinder"

# Connect to the Docker daemon described by the environment and build the image
# under the tag that run_gnfinder() looks up ("gnfinder:latest").
client = docker.from_env()
# build() returns an (image, build-log iterator) pair, which is what the cell displays.
client.images.build(path=DOCKERFILE_PATH, tag="gnfinder:latest")

(<Image: 'gnfinder:latest'>, <itertools._tee at 0x7f646636d140>)

### Run GNFINDER and parse results

In [4]:
import shutil
from glob import glob
import json

def run_gnfinder(input_dir, client=None):
    """Run ``gnfinder find`` in Docker on every ``*.txt`` file of ``input_dir``.

    ``input_dir`` is mounted read-only at ``/home/gnfinder/corpus`` and one
    throw-away container of the ``gnfinder:latest`` image is started per document.

    Args:
        input_dir: Directory containing the ``*.txt`` documents to tag.
        client: Optional Docker client to reuse. When omitted, a new one is
            created with ``docker.from_env()``.

    Returns:
        A list of ``{"document": <file name>, "tags": <decoded JSON output>}``
        dicts, ordered by file path.

    Raises:
        Whatever the Docker client raises (e.g. image missing, container exits
        with an error) and ``json.JSONDecodeError`` if the output is not JSON.
    """
    corpus_mount = "/home/gnfinder/corpus"
    volume = {os.path.abspath(input_dir): {'bind': corpus_mount, 'mode': 'ro'}}
    if client is None:
        client = docker.from_env()
    image = client.images.get("gnfinder:latest")

    results = []
    # glob() yields files in arbitrary order; sort so reruns give the same list.
    for document in sorted(glob(os.path.join(input_dir, "*.txt"))):
        name = os.path.basename(document)
        # Paths inside the container are POSIX paths whatever the host OS is.
        remote_path = "{}/{}".format(corpus_mount, name)
        # Pass the command as an argv list: a single string is tokenised
        # shell-style by the Docker SDK, which breaks on names with whitespace.
        command = ["gnfinder", "find", remote_path]
        response = client.containers.run(image, command, volumes=volume, remove=True)
        results.append({"document": name, "tags": json.loads(response.decode("utf-8"))})
    return results

In [5]:
import pandas as pd

def parse_gnfinder(input_dir, response, output_dir, apply_corrections=True):
    """Write one tab-separated ``.ann`` file per tagged document.

    ``output_dir`` is deleted (if present) and recreated, so stale predictions
    from a previous run never survive.

    Args:
        input_dir: Unused; kept so existing calls keep working.
        response: Iterable of ``{"document": ..., "tags": ...}`` dicts as
            returned by ``run_gnfinder``; ``tags["names"]`` holds the matches.
        output_dir: Directory that receives the ``.ann`` files.
        apply_corrections: When true, each row is passed through
            ``correct_annotations`` before being written.

    Each output row is ``T<i>``, the fixed type ``LIVB``, then the columns
    listed in ``new_cols``; documents without matches get an empty file.

    Raises:
        KeyError: if a non-empty match list lacks one of the expected columns.
    """
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Column order of the output file (loop-invariant).
    new_cols = ["type", "start", "end", "verbatim", "name", "cardinality", "odds", "annotationNomenType", "annotation"]

    for doc_tags in response:
        document = doc_tags["document"]
        # Tolerate a missing or null "names" entry (document without any match).
        tags = doc_tags["tags"].get("names") or []
        doc_ann = pd.DataFrame(tags)
        # Row labels become the annotation ids T0, T1, ...
        doc_ann = doc_ann.rename(index='T{}'.format)
        doc_ann.insert(0, "type", ["LIVB"]*doc_ann.shape[0])
        if not doc_ann.empty:
            doc_ann = doc_ann[new_cols]
            if apply_corrections:
                doc_ann = doc_ann.apply(correct_annotations, axis=1)
        # splitext() strips only the final extension, so "a.b.txt" maps to
        # "a.b.ann" instead of being truncated to "a.ann".
        ann_filename = os.path.splitext(document)[0] + ".ann"
        doc_ann.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)
        
def correct_annotations(row):
    """Narrow a match's offsets from the verbatim span to the name inside it.

    The name is searched case-insensitively inside ``row["verbatim"]``. When
    found, ``start`` is moved forward by the match position and ``end`` is set
    to ``start + len(name)``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``verbatim``,
            ``name``, ``start`` and ``end`` entries. It is modified in place.

    Returns:
        The same ``row``; unchanged when the name does not occur in the
        verbatim text.
    """
    sub_start = row["verbatim"].lower().find(row["name"].lower())
    if sub_start < 0:
        # find() returns -1 when the name is not a substring of the verbatim
        # text (e.g. an expanded abbreviation). Shifting by -1 would corrupt
        # both offsets, so keep the original span.
        return row
    row["start"] += sub_start
    row["end"] = row["start"] + len(row["name"])
    return row
    

### Eval GNFINDER on test corpora

In [6]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [7]:
# Test split of the LINNAEUS corpus: used both as gnfinder input and as the
# reference annotations for scoring.
PATH_TO_LINNAEUS_GT = '../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test'
# Prediction output directory; parse_gnfinder() deletes and recreates it on each run.
PATH_TO_LINNAEUS_PRED = "./output/GNFINDER/LINNAEUS_pred"

In [8]:
# Tag every *.txt document of the corpus, then write one .ann file per document.
response = run_gnfinder(PATH_TO_LINNAEUS_GT)
# apply_corrections=False: offsets are written exactly as gnfinder reported them.
parse_gnfinder(PATH_TO_LINNAEUS_GT, response, PATH_TO_LINNAEUS_PRED, apply_corrections=False)

In [9]:
# Precision / recall / F1 of the predictions against the reference, using the
# `exact` criterion (presumably supplied by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=exact)

$52.13$ & $13.58$ & $21.54$


Unnamed: 0,precision,recall,f1-score
Taxon,0.521311,0.135781,0.215447


In [10]:
# Same scores with the `approximate` criterion (presumably supplied by the
# eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)

$78.36$ & $20.41$ & $32.38$


Unnamed: 0,precision,recall,f1-score
Taxon,0.783607,0.204099,0.323848


In [11]:
# Collect the false negatives, false positives and true positives under the
# approximate criterion, then display the reference annotations that were missed.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)
FN

[{'start': 436, 'end': 441, 'text': 'human'},
 {'start': 3639, 'end': 3646, 'text': 'patient'},
 {'start': 7712, 'end': 7719, 'text': 'peoples'},
 {'start': 7868, 'end': 7874, 'text': 'people'},
 {'start': 8268, 'end': 8274, 'text': 'person'},
 {'start': 10999, 'end': 11005, 'text': 'people'},
 {'start': 12055, 'end': 12060, 'text': 'human'},
 {'start': 13935, 'end': 13940, 'text': 'human'},
 {'start': 16120, 'end': 16125, 'text': 'Human'},
 {'start': 16761, 'end': 16772, 'text': 'Participant'},
 {'start': 17033, 'end': 17045, 'text': 'participants'},
 {'start': 18069, 'end': 18081, 'text': 'participants'},
 {'start': 27828, 'end': 27836, 'text': 'patients'},
 {'start': 28273, 'end': 28280, 'text': 'persons'},
 {'start': 28679, 'end': 28691, 'text': 'participants'},
 {'start': 32117, 'end': 32122, 'text': 'human'},
 {'start': 36326, 'end': 36333, 'text': 'patient'},
 {'start': 39739, 'end': 39744, 'text': 'human'},
 {'start': 40235, 'end': 40247, 'text': 'participants'},
 {'start': 404

#### Eval on S800 GSC

In [12]:
# Test split of the S800 corpus: used both as gnfinder input and as the
# reference annotations for scoring.
PATH_TO_S800_GT = '../corpora/S800_GSC_brat/s800/test'
# Prediction output directory; parse_gnfinder() deletes and recreates it on each run.
PATH_TO_S800_PRED = "./output/GNFINDER/S800_pred"

In [13]:
# Tag every *.txt document of the corpus, then write one .ann file per document.
# NOTE(review): despite its name, `tags_filename` holds the list of per-document
# results returned by run_gnfinder(), not a file name (the LINNAEUS cell calls
# the same value `response`).
tags_filename = run_gnfinder(PATH_TO_S800_GT)
# apply_corrections=False: offsets are written exactly as gnfinder reported them.
parse_gnfinder(PATH_TO_S800_GT, tags_filename, PATH_TO_S800_PRED, apply_corrections=False)

In [14]:
# Precision / recall / F1 of the predictions against the reference, using the
# `exact` criterion (presumably supplied by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=exact)

$27.88$ & $21.77$ & $24.45$


Unnamed: 0,precision,recall,f1-score
Taxon,0.278798,0.217731,0.24451


In [15]:
# Same scores with the `approximate` criterion (presumably supplied by the
# eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)

$53.92$ & $42.11$ & $47.29$


Unnamed: 0,precision,recall,f1-score
Taxon,0.539232,0.421121,0.472914


In [16]:
# Collect the false negatives, false positives and true positives under the
# approximate criterion, then display the reference annotations that were missed.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)
FN

[{'start': 35, 'end': 64, 'text': 'porcine endogenous retrovirus'},
 {'start': 166, 'end': 197, 'text': 'Porcine endogenous retroviruses'},
 {'start': 199, 'end': 204, 'text': 'PERVs'},
 {'start': 314, 'end': 319, 'text': 'human'},
 {'start': 381, 'end': 385, 'text': 'pigs'},
 {'start': 394, 'end': 399, 'text': 'PERVs'},
 {'start': 630, 'end': 640, 'text': 'retroviral'},
 {'start': 665, 'end': 669, 'text': 'PERV'},
 {'start': 787, 'end': 793, 'text': 'PERV-B'},
 {'start': 909, 'end': 919, 'text': 'retroviral'},
 {'start': 984, 'end': 990, 'text': 'PERV-B'},
 {'start': 1135, 'end': 1141, 'text': 'PERV-A'},
 {'start': 173, 'end': 178, 'text': 'M2(T)'},
 {'start': 327, 'end': 332, 'text': 'M2(T)'},
 {'start': 550, 'end': 555, 'text': 'M2(T)'},
 {'start': 635, 'end': 666, 'text': 'Methanobacterium veterum MK4(T)'},
 {'start': 697, 'end': 707, 'text': 'DSM 863(T)'},
 {'start': 827, 'end': 832, 'text': 'M2(T)'},
 {'start': 880, 'end': 885, 'text': 'M2(T)'},
 {'start': 981, 'end': 986, 'text'

#### Eval on COPIOUS GSC

In [17]:
# Test split of the COPIOUS corpus: used both as gnfinder input and as the
# reference annotations for scoring.
PATH_TO_COPIOUS_GT = '../corpora/COPIOUS_GSC_brat/copious_clean/test'
# Prediction output directory; parse_gnfinder() deletes and recreates it on each run.
PATH_TO_COPIOUS_PRED = "./output/GNFINDER/COPIOUS_pred"

In [18]:
# Tag every *.txt document of the corpus, then write one .ann file per document.
# NOTE(review): despite its name, `tags_filename` holds the list of per-document
# results returned by run_gnfinder(), not a file name.
tags_filename = run_gnfinder(PATH_TO_COPIOUS_GT)
# apply_corrections=False: offsets are written exactly as gnfinder reported them.
parse_gnfinder(PATH_TO_COPIOUS_GT, tags_filename, PATH_TO_COPIOUS_PRED, apply_corrections=False)

In [19]:
# Precision / recall / F1 of the predictions against the reference, using the
# `exact` criterion (presumably supplied by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=exact)

$23.85$ & $15.29$ & $18.64$


Unnamed: 0,precision,recall,f1-score
Taxon,0.238532,0.152941,0.18638


In [20]:
# Same scores with the `approximate` criterion (presumably supplied by the
# eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)

$46.94$ & $30.10$ & $36.68$


Unnamed: 0,precision,recall,f1-score
Taxon,0.469419,0.30098,0.366786


In [21]:
# Collect the false negatives, false positives and true positives under the
# approximate criterion, then display the reference annotations that were missed.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)
FN

[{'start': 0, 'end': 8, 'text': 'MAI RANG'},
 {'start': 366, 'end': 374, 'text': 'Mai Teng'},
 {'start': 506, 'end': 510, 'text': 'Yang'},
 {'start': 640, 'end': 648, 'text': 'Mai Rang'},
 {'start': 779, 'end': 787, 'text': 'MAI RANG'},
 {'start': 789, 'end': 807, 'text': 'Pentacme Siamensis'},
 {'start': 826, 'end': 840, 'text': 'Dipterocarpeae'},
 {'start': 948, 'end': 956, 'text': 'Mai Teng'},
 {'start': 21, 'end': 47, 'text': 'H. polyalthoides Symington'},
 {'start': 211, 'end': 232, 'text': 'H. resinosa Symington'},
 {'start': 590, 'end': 611, 'text': 'Shorea acuminata Dyer'},
 {'start': 916, 'end': 935, 'text': 'S. bracteolata Dyer'},
 {'start': 1184, 'end': 1205, 'text': 'S. exelliptica Meijer'},
 {'start': 1445, 'end': 1465, 'text': 'S. ? elliptica Burck'},
 {'start': 1499, 'end': 1522, 'text': 'S. foxworthii Symington'},
 {'start': 2009, 'end': 2014, 'text': 'balau'},
 {'start': 2073, 'end': 2097, 'text': 'S. lepidota (Korth.) Bl.'},
 {'start': 2282, 'end': 2299, 'text': 'S. l

#### Eval on BB task corpus

In [22]:
# Test split of the BB task corpus: used both as gnfinder input and as the
# reference annotations for scoring.
PATH_TO_BB_GT = '../corpora/BB_GSC_brat/bb_clean/test'
# Prediction output directory; parse_gnfinder() deletes and recreates it on each run.
PATH_TO_BB_PRED = "./output/GNFINDER/BB_pred"

In [23]:
# Tag every *.txt document of the corpus, then write one .ann file per document.
# NOTE(review): despite its name, `tags_filename` holds the list of per-document
# results returned by run_gnfinder(), not a file name.
tags_filename = run_gnfinder(PATH_TO_BB_GT)
# apply_corrections=False: offsets are written exactly as gnfinder reported them.
parse_gnfinder(PATH_TO_BB_GT, tags_filename, PATH_TO_BB_PRED, apply_corrections=False)

In [24]:
# Precision / recall / F1 of the predictions against the reference, using the
# `exact` criterion (presumably supplied by the eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=exact)

$51.05$ & $42.50$ & $46.38$


Unnamed: 0,precision,recall,f1-score
Taxon,0.510511,0.425,0.463847


In [27]:
# Same scores with the `approximate` criterion (presumably supplied by the
# eval_utils star import).
get_precision_recall_f1_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)

$79.28$ & $66.00$ & $72.03$


Unnamed: 0,precision,recall,f1-score
Taxon,0.792793,0.66,0.720327


In [28]:
# Collect the false negatives, false positives and true positives under the
# approximate criterion.
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)
# NOTE(review): this cell displays the false positives, whereas the other corpus
# sections display FN — confirm this is intentional.
FP

[{'start': 19,
  'end': 44,
  'text': 'L. lactis subsp. cremoris L. lactis subsp. cremoris 3 37306238190145.18 NO_ANNOT'},
 {'start': 230,
  'end': 255,
  'text': 'L. lactis subsp. cremoris L. lactis subsp. cremoris 3 37306238190145.18 NO_ANNOT'},
 {'start': 41,
  'end': 66,
  'text': 'Burkholderia pseudomallei Burkholderia pseudomallei 2 1046118956947.8068 NO_ANNOT'},
 {'start': 271,
  'end': 284,
  'text': '(Pseudomonas) Pseudomonas 1 39016.22781627596 NO_ANNOT'},
 {'start': 1035,
  'end': 1062,
  'text': 'Methylobacterium extorquens Methylobacterium extorquens 2 9836376537556.105 NO_ANNOT'},
 {'start': 57,
  'end': 67,
  'text': 'Klebsiella Klebsiella 1 168.8714873349176 NO_ANNOT'},
 {'start': 321,
  'end': 331,
  'text': 'Klebsiella Klebsiella 1 168.8714873349176 NO_ANNOT'},
 {'start': 608,
  'end': 618,
  'text': 'Klebsiella Klebsiella 1 168.8714873349176 NO_ANNOT'},
 {'start': 36,
  'end': 66,
  'text': 'Bacillus thuringiensis serovar Bacillus thuringiensis serovar 3 771572793573