In [1]:
import os

from taxonerd import TaxoNERD

In [2]:
import logging
# Configure root logging at ERROR level so lower-severity messages are not shown.
logging.basicConfig(level=logging.ERROR)

### Run TaxoNERD and parse results

In [3]:
def run_taxonerd(ner, input_dir, output_dir):
    """Run a tagger over ``input_dir`` with ``output_dir`` as its output folder.

    Args:
        ner: Object exposing ``find_all_files(input_dir, output_dir)``
            (a ``TaxoNERD`` instance in this notebook).
        input_dir: Directory passed to ``find_all_files`` as the input.
        output_dir: Directory passed to ``find_all_files`` as the output.
            It is created, parents included, when it does not exist yet.

    Returns:
        None. Whatever ``find_all_files`` returns is discarded.
    """
    # exist_ok=True replaces the isdir()/makedirs() pair, which could raise if
    # the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)
    ner.find_all_files(input_dir, output_dir)

In [4]:
# Tagger used for every corpus below: the "en_ner_eco_biobert" model,
# with GPU preference, abbreviation handling and linking all switched off.
ner_biobert = TaxoNERD(
    model="en_ner_eco_biobert",
    prefer_gpu=False,
    with_abbrev=False,
    with_linking=None,
    verbose=False,
)

### Eval TaxoNERD on test corpora

In [5]:
from eval_utils import *

#### Eval on LINNAEUS GSC

In [6]:
# LINNAEUS: directory passed as the ground truth / tagger input ...
PATH_TO_LINNAEUS_GT = "../corpora/LINNAEUS_GSC_brat/linnaeus_clean/test"
# ... and directory that receives the BioBERT predictions.
PATH_TO_LINNAEUS_PRED = "./output/TAXONERD_BIOBERT/LINNAEUS_pred"

In [7]:
# Run the tagger on the LINNAEUS test directory. The saved output of this cell
# shows a tokenizer warning: one sequence is 603 tokens long, above the
# model's 512-token maximum.
run_taxonerd(
    ner=ner_biobert,
    input_dir=PATH_TO_LINNAEUS_GT,
    output_dir=PATH_TO_LINNAEUS_PRED,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# LINNAEUS precision / recall / F1 with the `exact` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_LINNAEUS_PRED,
    PATH_TO_LINNAEUS_GT,
    criterion=exact,
)

$59.06$ & $26.73$ & $36.80$


Unnamed: 0,precision,recall,f1-score
Taxon,0.590566,0.267293,0.368019


In [9]:
# LINNAEUS precision / recall / F1 with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_LINNAEUS_PRED,
    PATH_TO_LINNAEUS_GT,
    criterion=approximate,
)

$61.89$ & $28.01$ & $38.57$


Unnamed: 0,precision,recall,f1-score
Taxon,0.618868,0.280102,0.385655


In [10]:
# Unpack the three collections returned for LINNAEUS under the `approximate`
# criterion (named here as false negatives, false positives, true positives).
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_LINNAEUS_PRED, PATH_TO_LINNAEUS_GT, criterion=approximate)
# Preview only the first entries: echoing the whole list floods the notebook
# output. FN itself is left intact for further inspection (e.g. len(FN)).
FN[:20]

[{'start': 436, 'end': 441, 'text': 'human'},
 {'start': 3639, 'end': 3646, 'text': 'patient'},
 {'start': 7712, 'end': 7719, 'text': 'peoples'},
 {'start': 7868, 'end': 7874, 'text': 'people'},
 {'start': 8268, 'end': 8274, 'text': 'person'},
 {'start': 10999, 'end': 11005, 'text': 'people'},
 {'start': 12055, 'end': 12060, 'text': 'human'},
 {'start': 13935, 'end': 13940, 'text': 'human'},
 {'start': 16120, 'end': 16125, 'text': 'Human'},
 {'start': 16761, 'end': 16772, 'text': 'Participant'},
 {'start': 17033, 'end': 17045, 'text': 'participants'},
 {'start': 18069, 'end': 18081, 'text': 'participants'},
 {'start': 27828, 'end': 27836, 'text': 'patients'},
 {'start': 28273, 'end': 28280, 'text': 'persons'},
 {'start': 28679, 'end': 28691, 'text': 'participants'},
 {'start': 32117, 'end': 32122, 'text': 'human'},
 {'start': 36326, 'end': 36333, 'text': 'patient'},
 {'start': 39739, 'end': 39744, 'text': 'human'},
 {'start': 40235, 'end': 40247, 'text': 'participants'},
 {'start': 404

#### Eval on S800 GSC

In [11]:
# S800: directory passed as the ground truth / tagger input ...
PATH_TO_S800_GT = "../corpora/S800_GSC_brat/s800/test"
# ... and directory that receives the BioBERT predictions.
PATH_TO_S800_PRED = "./output/TAXONERD_BIOBERT/S800_pred"

In [12]:
# Run the tagger on the S800 test directory.
run_taxonerd(
    ner=ner_biobert,
    input_dir=PATH_TO_S800_GT,
    output_dir=PATH_TO_S800_PRED,
)

In [13]:
# S800 precision / recall / F1 with the `exact` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_S800_PRED,
    PATH_TO_S800_GT,
    criterion=exact,
)

$48.54$ & $62.97$ & $54.82$


Unnamed: 0,precision,recall,f1-score
Taxon,0.485427,0.629726,0.548241


In [14]:
# S800 precision / recall / F1 with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_S800_PRED,
    PATH_TO_S800_GT,
    criterion=approximate,
)

$53.77$ & $69.84$ & $60.76$


Unnamed: 0,precision,recall,f1-score
Taxon,0.537688,0.698433,0.607609


In [15]:
# Unpack the three collections returned for S800 under the `approximate`
# criterion (named here as false negatives, false positives, true positives).
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_S800_PRED, PATH_TO_S800_GT, criterion=approximate)
# Preview only the first entries: echoing the whole list floods the notebook
# output. FN itself is left intact for further inspection (e.g. len(FN)).
FN[:20]

[{'start': 35, 'end': 64, 'text': 'porcine endogenous retrovirus'},
 {'start': 166, 'end': 197, 'text': 'Porcine endogenous retroviruses'},
 {'start': 314, 'end': 319, 'text': 'human'},
 {'start': 381, 'end': 385, 'text': 'pigs'},
 {'start': 630, 'end': 640, 'text': 'retroviral'},
 {'start': 665, 'end': 669, 'text': 'PERV'},
 {'start': 787, 'end': 793, 'text': 'PERV-B'},
 {'start': 909, 'end': 919, 'text': 'retroviral'},
 {'start': 984, 'end': 990, 'text': 'PERV-B'},
 {'start': 1135, 'end': 1141, 'text': 'PERV-A'},
 {'start': 173, 'end': 178, 'text': 'M2(T)'},
 {'start': 327, 'end': 332, 'text': 'M2(T)'},
 {'start': 550, 'end': 555, 'text': 'M2(T)'},
 {'start': 635, 'end': 666, 'text': 'Methanobacterium veterum MK4(T)'},
 {'start': 697, 'end': 707, 'text': 'DSM 863(T)'},
 {'start': 827, 'end': 832, 'text': 'M2(T)'},
 {'start': 880, 'end': 885, 'text': 'M2(T)'},
 {'start': 981, 'end': 986, 'text': 'M2(T)'},
 {'start': 427, 'end': 433, 'text': 'fungal'},
 {'start': 845, 'end': 862, 'text

#### Eval on COPIOUS GSC

In [16]:
# COPIOUS: directory passed as the ground truth / tagger input ...
PATH_TO_COPIOUS_GT = "../corpora/COPIOUS_GSC_brat/copious_clean/test"
# ... and directory that receives the BioBERT predictions.
PATH_TO_COPIOUS_PRED = "./output/TAXONERD_BIOBERT/COPIOUS_pred"

In [17]:
# Run the tagger on the COPIOUS test directory.
run_taxonerd(
    ner=ner_biobert,
    input_dir=PATH_TO_COPIOUS_GT,
    output_dir=PATH_TO_COPIOUS_PRED,
)

In [18]:
# COPIOUS precision / recall / F1 with the `exact` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_COPIOUS_PRED,
    PATH_TO_COPIOUS_GT,
    criterion=exact,
)

$75.85$ & $74.51$ & $75.17$


Unnamed: 0,precision,recall,f1-score
Taxon,0.758483,0.745098,0.751731


In [19]:
# COPIOUS precision / recall / F1 with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_COPIOUS_PRED,
    PATH_TO_COPIOUS_GT,
    criterion=approximate,
)

$82.14$ & $81.00$ & $81.57$


Unnamed: 0,precision,recall,f1-score
Taxon,0.821357,0.810039,0.815659


In [20]:
# Unpack the three collections returned for COPIOUS under the `approximate`
# criterion (named here as false negatives, false positives, true positives).
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_COPIOUS_PRED, PATH_TO_COPIOUS_GT, criterion=approximate)
# Preview only the first entries: echoing the whole list floods the notebook
# output. FN itself is left intact for further inspection (e.g. len(FN)).
FN[:20]

[{'start': 0, 'end': 8, 'text': 'MAI RANG'},
 {'start': 366, 'end': 374, 'text': 'Mai Teng'},
 {'start': 2324, 'end': 2333, 'text': 'Porcupine'},
 {'start': 393, 'end': 412, 'text': 'S. tetonensis Nels.'},
 {'start': 1972, 'end': 1988, 'text': 'daun batik angin'},
 {'start': 2622, 'end': 2631, 'text': 'crocodile'},
 {'start': 2839, 'end': 2853, 'text': 'Gunong Tundong'},
 {'start': 46, 'end': 52, 'text': 'CERVUS'},
 {'start': 85, 'end': 89, 'text': 'bird'},
 {'start': 130, 'end': 153, 'text': 'BLACK-THROATED BARBICAN'},
 {'start': 311, 'end': 340, 'text': 'Cephalopholis argus Schneider'},
 {'start': 2130,
  'end': 2173,
  'text': 'Pempherit vanicduntis Cuvier & Valenciennes'},
 {'start': 2227, 'end': 2237, 'text': 'SERRANID-F'},
 {'start': 1748, 'end': 1753, 'text': 'Verde'},
 {'start': 2128, 'end': 2152, 'text': 'Colina costata, A. Adams'},
 {'start': 2286, 'end': 2310, 'text': 'Colina pygmsea, H. Adams'},
 {'start': 826, 'end': 844, 'text': 'thin-walled bamboo'},
 {'start': 545, 'end

#### Eval on BB task corpus

In [21]:
# BB task corpus: directory passed as the ground truth / tagger input ...
PATH_TO_BB_GT = "../corpora/BB_GSC_brat/bb_clean/test"
# ... and directory that receives the BioBERT predictions.
PATH_TO_BB_PRED = "./output/TAXONERD_BIOBERT/BB_pred"

In [22]:
# Run the tagger on the BB test directory.
run_taxonerd(
    ner=ner_biobert,
    input_dir=PATH_TO_BB_GT,
    output_dir=PATH_TO_BB_PRED,
)

In [23]:
# BB precision / recall / F1 with the `exact` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_BB_PRED,
    PATH_TO_BB_GT,
    criterion=exact,
)

$87.20$ & $90.25$ & $88.70$


Unnamed: 0,precision,recall,f1-score
Taxon,0.871981,0.9025,0.886978


In [24]:
# BB precision / recall / F1 with the `approximate` matching criterion.
get_precision_recall_f1_single_corpus(
    PATH_TO_BB_PRED,
    PATH_TO_BB_GT,
    criterion=approximate,
)

$89.13$ & $92.48$ & $90.77$


Unnamed: 0,precision,recall,f1-score
Taxon,0.891304,0.924812,0.907749


In [25]:
# Unpack the three collections returned for the BB corpus under the `approximate`
# criterion (named here as false negatives, false positives, true positives).
FN, FP, TP = get_FN_FP_TP_single_corpus(PATH_TO_BB_PRED, PATH_TO_BB_GT, criterion=approximate)
# Preview only the first entries: echoing the whole list floods the notebook
# output. FN itself is left intact for further inspection (e.g. len(FN)).
FN[:20]

[{'start': 36, 'end': 66, 'text': 'Ara+ Burkholderia pseudomallei'},
 {'start': 151, 'end': 155, 'text': 'Ara-'},
 {'start': 1180, 'end': 1184, 'text': 'Ara-'},
 {'start': 1198, 'end': 1202, 'text': 'Ara-'},
 {'start': 1212, 'end': 1216, 'text': 'Ara+'},
 {'start': 1489, 'end': 1493, 'text': 'Ara-'},
 {'start': 1498, 'end': 1502, 'text': 'Ara+'},
 {'start': 311, 'end': 327, 'text': 'γ-Proteobacteria'},
 {'start': 224, 'end': 243, 'text': 'coryneform bacteria'},
 {'start': 185,
  'end': 240,
  'text': 'Bacillus thuringiensis serovar sotto strain 96-OK-85-24'},
 {'start': 319, 'end': 330, 'text': '96-OK-85-24'},
 {'start': 674, 'end': 685, 'text': '96-OK-85-24'},
 {'start': 879,
  'end': 927,
  'text': 'R. conorii reference strains (Moroccan and no. 7'},
 {'start': 1151, 'end': 1170, 'text': 'spotted fever group'},
 {'start': 295, 'end': 298, 'text': '993'},
 {'start': 261, 'end': 285, 'text': 'L. lactis B1157 and B697'},
 {'start': 318, 'end': 323, 'text': 'B1157'},
 {'start': 357, 'end