## Init

In [1]:
# install libraries
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
# Google Colab: set current dir
# Mount Google Drive inside the Colab VM so the data files become reachable.
from google.colab import drive
drive.mount('/content/drive')

# Make the Drive data folder the working directory: the relative paths used
# later in the notebook (e.g. "prompting-data") are resolved against it.
%cd /content/drive/MyDrive/data

Mounted at /content/drive
/content/drive/MyDrive/data


In [13]:
# load libraries
import os
import gc
import transformers
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
import csv
import statistics
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers.file_utils import is_torch_available
from scipy.spatial import distance
from scipy.stats import pearsonr

In [4]:
# Choose the torch device once: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [5]:
def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook for reproducible behavior.

    Seeds Python's ``random``, ``numpy`` and ``torch`` (CPU generator and all CUDA
    devices). TensorFlow is not used here and is not seeded.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported unconditionally by the notebook's import cell, so no
    # availability guard is needed around the torch calls.
    torch.manual_seed(seed)
    # Safe on CPU-only machines: the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)


set_seed(1024)

### Model list

In [6]:
# Display name -> checkpoint identifier handed to `from_pretrained`.
# Insertion order is the order in which the models are processed.
models = {
    "ChemicalBERT": "recobo/chemical-bert-uncased",
    "BioBERT": "dmis-lab/biobert-base-cased-v1.2",
    "BERT": "bert-base-uncased",
    "BERT-large": "bert-large-cased-whole-word-masking",
    "RoBERTa": "roberta-base",  # needs <mask>
    "RoBERTa-large": "roberta-large",
    "BigBird-RoBERTa-large": "google/bigbird-roberta-large",
    "Muppet-RoBERTa-large": "facebook/muppet-roberta-large",
    "PubMedBERT-full": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    "Clinical-BigBird": "yikuan8/Clinical-BigBird",
    "Clinical-Longformer": "yikuan8/Clinical-Longformer",
}

# Single-entry mapping with the same shape as `models` (presumably for quick trial runs).
test = {"ChemicalBERT": "recobo/chemical-bert-uncased"}

## Evaluating the proportion of chemical names that are part of the tokenizers' vocabularies

In [128]:
# Folder holding the input tables, relative to the current working directory.
dir = "prompting-data"


def _only_mycobank_species(frame):
    """Return the rows of ``frame`` whose rank is "species" and whose source is the MycoBank taxonomy."""
    return frame[(frame["rank"] == "species") & (frame["TAX_SOURCE"] == "mycobank-taxonomy")]


## Natural product names: header-less TSV read as two columns (id, name).
np_names = pd.read_csv(
    os.path.join(dir, "all_natural_products_with_names.tsv"),
    sep="\t",
    quoting=csv.QUOTE_NONE,
    header=None,
    index_col=None,
    dtype=object,
)
np_names.columns = ["CID", "chem_names"]
### Some ids have no name available: drop those rows and renumber the index.
np_names = np_names.dropna().reset_index(drop=True)


## Taxa names, restricted to MycoBank species.
taxa = _only_mycobank_species(
    pd.read_csv(os.path.join(dir, "ALL-taxons-ids-names.tsv"), sep="\t", header=0, dtype=object)
)


## Taxon - natural product pairs.
### Keep only relations with species from the MycoBank taxonomy (a filter on the
### tokenized length of the chemical name comes later).
### Warning: the filter applies to the rank of the accepted taxon, so
### cpd_related_taxa_ID may still point to a variety or sub-species. Those rows
### disappear below, when merging with the species-only names and dropping NaN.
pairs = _only_mycobank_species(
    pd.read_csv(os.path.join(dir, "taxon-np-list.csv"), sep=",", header=0, dtype=object)
)

df_pairs = (
    pairs
    # number of references per (taxon, compound) pair, most referenced first
    .groupby(["cpd_related_taxa_ID", "pubchemId"])["ref"]
    .count()
    .reset_index(name="counts")
    .sort_values(by="counts", ascending=False)
    # attach the taxon name, then the compound name
    .merge(taxa[["ID", "name"]], how="left", left_on="cpd_related_taxa_ID", right_on="ID")
    .merge(np_names, how="left", left_on="pubchemId", right_on="CID")
    # a pair is unusable when either name is missing
    .dropna()
    .loc[:, ["cpd_related_taxa_ID", "pubchemId", "counts", "name", "chem_names"]]
    .rename(
        columns={
            "cpd_related_taxa_ID": "taxon_id",
            "pubchemId": "pubchem_id",
            "name": "taxon_name",
            "chem_names": "np_name",
        }
    )
)

# Persist the cleaned pair table next to the inputs.
df_pairs.to_csv(os.path.join(dir, "all_available_pairs.tsv"), sep="\t", index=False)

In [129]:
# Preview the merged taxon / natural-product pair table.
df_pairs

Unnamed: 0,taxon_id,pubchem_id,counts,taxon_name,np_name
0,148413,471002,23,Ganoderma lucidum,Ganoderic acid A
1,148413,471003,22,Ganoderma lucidum,Ganoderic acid B
2,107372,5484385,19,Wolfiporia cocos,Pachymic acid
3,317065,5484385,19,Macrohyporia cocos,Pachymic acid
4,148413,14109375,13,Ganoderma lucidum,lucidenic acid A
...,...,...,...,...,...
35548,213059,102030,1,Valsa sordida,16alpha-Hydroxydehydroepiandrosterone
35549,213059,10140,1,Valsa sordida,Glycocholic acid
35550,213059,1014,1,Valsa sordida,Phosphocholine
35551,213059,10114,1,Valsa sordida,Enoxolone


In [122]:
# For every tokenizer, measure how well its vocabulary covers the natural-product names:
#   - how many names exist as a single vocabulary entry,
#   - how many tokens the names are split into.
n = len(df_pairs)
if n == 0:
    raise ValueError("df_pairs is empty: there are no chemical names to tokenize.")

# The compound names live in the "np_name" column of df_pairs (renamed from
# "chem_names" when the pair table was built). Lower-case them once, outside
# the model loop.
chem_names_lower = [chem_name.lower() for chem_name in df_pairs["np_name"]]

# One record per model; the summary frame is built once at the end instead of
# being grown with pd.concat on every iteration.
summary_records = []

for model_name, model_code in models.items():

    tokenizer = AutoTokenizer.from_pretrained(model_code)

    # A name is "known" when looking it up as a single token does not fall
    # back to the tokenizer's unknown-token id.
    chem_in_voc = [
        chem_name
        for chem_name in chem_names_lower
        if tokenizer.convert_tokens_to_ids(chem_name) != tokenizer.unk_token_id
    ]
    nb_UNK = n - len(chem_in_voc)

    # NOTE: encode() adds the model's special tokens by default, so these
    # lengths include them.
    tokens_len = [len(tokenizer.encode(chem_name)) for chem_name in chem_names_lower]

    mean = sum(tokens_len) / n
    median = statistics.median(tokens_len)

    summary_records.append(
        {
            "model": model_name,
            "n.K": n - nb_UNK,
            "n.UNK": nb_UNK,
            "prop.unk": (nb_UNK / n) * 100,
            "avg.tokenized.len": mean,
            "median.tokenized.len": median,
            "list.known": "; ".join(chem_in_voc),
        }
    )

summary_tokenizers = pd.DataFrame(summary_records)

summary_tokenizers


Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,model,n.K,n.UNK,prop.unk,avg.tokenized.len,median.tokenized.len,list.known
0,ChemicalBERT,321,34914,99.088974,30.248304,9,paclitaxel; austin; adenosine; adenosine; tryp...
0,BioBERT,9,35226,99.974457,34.9553,10,ethanol; ethanol; zinc; iron; aluminum; calciu...
0,BERT,16,35219,99.954591,33.409962,10,austin; ethanol; ethanol; austin; testosterone...
0,BERT-large,9,35226,99.974457,34.9553,10,ethanol; ethanol; zinc; iron; aluminum; calciu...
0,RoBERTa,2,35233,99.994324,31.4894,10,herical; iron
0,RoBERTa-large,2,35233,99.994324,31.4894,10,herical; iron
0,BigBird-RoBERTa-large,2,35233,99.994324,30.815808,9,herical; iron
0,Muppet-RoBERTa-large,2,35233,99.994324,31.4894,10,herical; iron
0,PubMedBERT-full,533,34702,98.4873,29.021456,9,paclitaxel; austin; adenosine; adenosine; wort...
0,PubMedBERT,570,34665,98.38229,28.636044,9,paclitaxel; muscimol; adenosine; adenosine; mu...


### Results

Most of the models do not have a vocabulary suited to chemical names.
- Only a few chemical names are actually represented as single-token entities.
- The tokenizers split chemical names into long sequences of tokens (mean ≈ 29–35 tokens, median ≈ 9–10).

Objective: provide a dataset restricted to chemicals of reasonable size, i.e. with a tokenized length <= 10.

The models selected for testing were chosen based on their pre-training and on which ones appear to know the most chemical names:
  - ChemicalBERT
  - BioBERT
  - PubMedBERT