## Init

In [1]:
# load libraries
import torch
import os
import transformers
import pandas as pd
import numpy as np
import statistics
import re
import csv
import json
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Model list

In [9]:
# Display name -> model identifier handed to `AutoTokenizer.from_pretrained`.
# Insertion order is the order in which the models are evaluated and reported.
models = {
    "ChemicalBERT": "recobo/chemical-bert-uncased",
    "BioBERT": "dmis-lab/biobert-base-cased-v1.2",
    "BERT": "bert-base-uncased",
    "BERT-large": "bert-large-cased-whole-word-masking",
    "RoBERTa": "roberta-base",
    "RoBERTa-large": "roberta-large",
    # Disabled entry ("Error with protos"; prefer running it on Google Colab):
    # "BigBird-RoBERTa-large": "google/bigbird-roberta-large",
    "Muppet-RoBERTa-large": "facebook/muppet-roberta-large",
    "PubMedBERT-full": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    "Clinical-BigBird": "yikuan8/Clinical-BigBird",
    "Clinical-Longformer": "yikuan8/Clinical-Longformer",
}


## Evaluating the proportion of chemical names that are part of the tokenizers' vocabularies

1.   Load natural products and taxa names
2.   Estimate the tokenizer and vocabulary capabilities of each model.

---

Note: we only consider taxa from MycoBank.




In [4]:
# Load the names of natural products (np), their synonyms, and the taxa names.
# NOTE: `dir` shadows the Python builtin of the same name; the variable name is kept
# unchanged so that any other cell relying on it keeps working.
dir = "../../data/np/pre-processing/"

# load natural products names
np_names = pd.read_csv(os.path.join(dir, "wikidata", "np_names_wikidata.tsv"), sep="\t", quoting=csv.QUOTE_NONE, index_col=None, dtype=object)
np_names.columns = ["CID", "label"]

# For some ids, the name may be unavailable. Exclude them.
# (Reassignment instead of inplace=True: same result, no hidden in-place mutation.)
np_names = np_names.dropna().reset_index(drop=True)

# read np synonyms table
np_names_synonyms = pd.read_csv(os.path.join(dir, "wikidata", "np_synonyms_wikidata.tsv"), sep="\t", quoting=csv.QUOTE_NONE, index_col=None, dtype=object)
np_names_synonyms = np_names_synonyms.dropna()
np_names_synonyms.columns = ["CID", "label"]

# remove abbreviations of one character
# (missing values were dropped above and dtype=object keeps labels as strings,
# so the vectorised .str.len() is equivalent to applying len() row by row)
np_names_synonyms = np_names_synonyms[np_names_synonyms["label"].str.len() > 1]

# load taxa names (not only accepted taxa but ALL of them)
taxa = pd.read_csv(os.path.join(dir, "ALL-taxons-ids-names.tsv"), sep="\t", header=0, dtype=object)

# keep only mycobank names and species.
# .copy() makes the filtered table an independent frame, so the .loc assignment
# below does not write into a slice of the original (avoids SettingWithCopyWarning).
taxa = taxa[(taxa["rank"] == "species") & (taxa["TAX_SOURCE"] == "mycobank-taxonomy")].copy()

# extract main entities ID: those without an acceptedID because they are themselves the main entity
main_e = taxa["acceptedID"].isnull()

# complete the table: a main entity is its own accepted entity
taxa.loc[main_e, "acceptedID"] = taxa.loc[main_e, "ID"]

In [5]:
def get_voc_and_tokenizer_capabilities_summary(input_names, models, lowercase=True):
  """
  Summarise, for each model, how its tokenizer handles a list of names.

  Every name is looked up as ONE token in the tokenizer vocabulary (a name is
  "known" when `convert_tokens_to_ids` does not return the unknown-token id),
  and is also encoded (without special tokens) to measure its tokenized length.

    - input_names: an iterable of strings
    - models: a dict {display name: identifier given to AutoTokenizer.from_pretrained}
    - lowercase: if True (default, historical behaviour) names are lower-cased
      before lookup and encoding; set to False to keep the original casing
      (e.g. for cased vocabularies).

  Returns a pandas DataFrame with one row per model, indexed 0..len(models)-1,
  with the columns: model, n.K (names known as a single token), n.UNK,
  prop.unk (percentage of unknown names), avg.tokenized.len,
  median.tokenized.len, min, max, list.known ("; "-joined known names).
  When input_names is empty, the counts are 0 and the statistics are NaN.
  """

  columns = ["model", "n.K", "n.UNK", "prop.unk", "avg.tokenized.len",
             "median.tokenized.len", "min", "max", "list.known"]

  # Normalise the names once: the result is the same for every model.
  names = [name.lower() if lowercase else name for name in input_names]
  n = len(names)
  nan = float("nan")

  # One record per model; the frame is built once at the end instead of being
  # grown with pd.concat (which also left every row with the index label 0).
  records = []

  for model_name, model_code in models.items():

    print("Treating model: " + model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_code)
    unk_id = tokenizer.unk_token_id

    # names that exist as a single entry of the vocabulary
    chem_in_voc = [name for name in names if tokenizer.convert_tokens_to_ids(name) != unk_id]
    nb_UNK = n - len(chem_in_voc)

    # no need of special tokens for estimating the length
    tokens_len = [len(tokenizer.encode(name, add_special_tokens=False)) for name in names]

    # Guard against empty input: mean/median/min/max are undefined in that case.
    if tokens_len:
      prop_unk = (nb_UNK / n) * 100
      mean = sum(tokens_len) / n
      median = statistics.median(tokens_len)
      l_max = max(tokens_len)
      l_min = min(tokens_len)
    else:
      prop_unk = mean = median = l_max = l_min = nan

    records.append({
      "model": model_name,
      "n.K": n - nb_UNK,
      "n.UNK": nb_UNK,
      "prop.unk": prop_unk,
      "avg.tokenized.len": mean,
      "median.tokenized.len": median,
      "min": l_min,
      "max": l_max,
      "list.known": "; ".join(chem_in_voc),
    })

  return pd.DataFrame(records, columns=columns)


##### For natural products

(Synonyms are not considered, as they contain many abbreviations that are not specific to chemicals but can easily be part of a word-piece vocabulary, e.g. Leu, Arg, etc.)


In [10]:
# Vocabulary / tokenizer summary for the natural product labels (synonyms excluded).
get_voc_and_tokenizer_capabilities_summary(
    input_names=np_names["label"].tolist(),
    models=models,
)

Treating model: ChemicalBERT
Treating model: BioBERT
Treating model: BERT
Treating model: BERT-large
Treating model: RoBERTa
Treating model: RoBERTa-large
Treating model: Muppet-RoBERTa-large


Downloading (…)lve/main/config.json: 100%|██████████| 482/482 [00:00<00:00, 45.1kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.20MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.06MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.80MB/s]


Treating model: PubMedBERT-full
Treating model: PubMedBERT
Treating model: Clinical-BigBird


Downloading (…)okenizer_config.json: 100%|██████████| 1.28k/1.28k [00:00<00:00, 518kB/s]
Downloading (…)"spiece.model";: 100%|██████████| 846k/846k [00:00<00:00, 1.84MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 846k/846k [00:00<00:00, 1.59MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 775/775 [00:00<00:00, 352kB/s]


Treating model: Clinical-Longformer


Downloading (…)okenizer_config.json: 100%|██████████| 347/347 [00:00<00:00, 41.0kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.50MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.07MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.11MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 100kB/s]


Unnamed: 0,model,n.K,n.UNK,prop.unk,avg.tokenized.len,median.tokenized.len,min,max,list.known
0,ChemicalBERT,57,31934,99.821825,25.342065,7,1,162,paclitaxel; cholesterol; biotin; tryptophan; h...
0,BioBERT,7,31984,99.978119,29.618299,8,1,193,ethanol; aluminium; iron; calcium; zinc; potas...
0,BERT,11,31980,99.965615,28.212841,8,1,182,testosterone; methane; ethanol; magnesium; alu...
0,BERT-large,7,31984,99.978119,29.618299,8,1,193,ethanol; aluminium; iron; calcium; zinc; potas...
0,RoBERTa,3,31988,99.990622,26.589197,8,1,199,quin; iron; herical
0,RoBERTa-large,3,31988,99.990622,26.589197,8,1,199,quin; iron; herical
0,Muppet-RoBERTa-large,3,31988,99.990622,26.589197,8,1,199,quin; iron; herical
0,PubMedBERT-full,105,31886,99.671783,24.289581,7,1,169,paclitaxel; lycopene; colistin; cholesterol; b...
0,PubMedBERT,125,31866,99.609265,23.970523,7,1,170,paclitaxel; lycopene; colistin; cholesterol; b...
0,Clinical-BigBird,3,31988,99.990622,25.946673,7,1,174,quin; iron; herical


##### For taxa

In [11]:
# Vocabulary / tokenizer summary for the taxa names kept in `taxa`.
get_voc_and_tokenizer_capabilities_summary(
    input_names=taxa["name"].tolist(),
    models=models,
)

Treating model: ChemicalBERT
Treating model: BioBERT
Treating model: BERT
Treating model: BERT-large
Treating model: RoBERTa
Treating model: RoBERTa-large
Treating model: Muppet-RoBERTa-large
Treating model: PubMedBERT-full
Treating model: PubMedBERT
Treating model: Clinical-BigBird
Treating model: Clinical-Longformer


Unnamed: 0,model,n.K,n.UNK,prop.unk,avg.tokenized.len,median.tokenized.len,min,max,list.known
0,ChemicalBERT,0,405341,100.0,6.492679,6,2,37,
0,BioBERT,0,405341,100.0,7.831098,8,3,44,
0,BERT,0,405341,100.0,7.180423,7,2,41,
0,BERT-large,0,405341,100.0,7.833923,8,3,44,
0,RoBERTa,0,405341,100.0,7.263736,7,3,39,
0,RoBERTa-large,0,405341,100.0,7.263736,7,3,39,
0,Muppet-RoBERTa-large,0,405341,100.0,7.263736,7,3,39,
0,PubMedBERT-full,0,405341,100.0,6.536842,6,2,38,
0,PubMedBERT,0,405341,100.0,6.544083,6,2,42,
0,Clinical-BigBird,0,405341,100.0,6.99362,7,2,39,


### Results

Most of the models have neither a vocabulary nor a tokenizer suited for chemicals.
- Only a few chemical names are actually represented as single-token entities.
- The tokenizers split chemical names into long sequences of tokens (mean ≈ 24–30, median ≈ 7–8).

The dataset should only keep chemicals with a reasonable tokenized length <= 10.

- For taxa, because a species name already consists of two words, none of them can be a single entry of the vocabulary: they are necessarily multi-token.