# Computing ChemBERTa features

## install transformers and datasets library

In [None]:
!pip install transformers
!pip install datasets



## Load ChemBERTa model

In [None]:
from transformers import AutoTokenizer, AutoModel

model_checkpoint = "seyonec/PubChem10M_SMILES_BPE_450k"
#model_checkpoint = "DeepChem/ChemBERTa-77M-MLM"
#model_checkpoint = "DeepChem/ChemBERTa-77M-MTR"

model = AutoModel.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
sum(p.numel() for p in model.parameters())

NameError: ignored

## Load data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!cp '/content/gdrive/My Drive/all_carboxylics.csv' all_carboxylics.csv

import pandas as pd

carboxylics_frame = pd.read_csv('all_carboxylics.csv', index_col='Unnamed: 0')
tasks = list(carboxylics_frame.columns[2:])
carboxylics_smiles = pd.DataFrame(carboxylics_frame['smiles'])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Turn into transformers Dataset

In [None]:
from datasets import Dataset  # transformers datasets

smiles_dataset = Dataset.from_pandas(carboxylics_smiles, preserve_index=False)

## Tokenize

In [None]:
def tokenize(batch):
  return tokenizer(batch["smiles"], padding=True, truncation=True)

In [None]:
smiles_tokenized = smiles_dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## compute ChemBERTa features

In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(52000, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Drop

In [None]:
smiles_tokenized.set_format('torch', columns=['input_ids', 'attention_mask'])

In [None]:
feature_types = ['cls_embedding', 'pooler_output', 'token_mean', 'cls_embedding_concat']

def extract_hidden_states(batch):
  """
  Compute 4 common features:
    1. 'cls_embedding': last [CLS] token embedding
    2. 'pooler_output': the cls_embedding further processed by a fully connected layer and tanh activation
    3. 'token_mean': mean of last embeddings of all tokens
    4. 'cls_embedding_concat': concatenation of the last 4 [CLS] embeddings

  Return them in the form of a dict with keys as above.
  """
  inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}

  with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

    features = {
    'cls_embedding': outputs['last_hidden_state'][:, 0, :].cpu().numpy(),
    'pooler_output': outputs['pooler_output'].cpu().numpy(),
    'token_mean': get_token_mean(outputs, inputs).cpu().numpy(),
    'cls_embedding_concat': get_concat(outputs).cpu().numpy()
    }
  return features

def get_token_mean(outputs, encoded):
  """
  Compute the mean over all tokens of the last embeddings, taking care to ignore padding through the attention mask.

  The input to this function needs to contain both `'last_hidden_state'` and `'attention_mask'` keys.
  """
  return torch.mean(outputs['last_hidden_state'] * encoded['attention_mask'].unsqueeze(-1), axis=1)

def get_concat(outputs, last_n=4):
  """"
  Compute concatenation of the last 4 (`last_n`) hidden state of the [CLS] token.
  """
  return torch.concat([hidden[:, 0, :] for hidden in outputs['hidden_states'][-last_n:]], axis=1)

In [None]:
smiles_features = smiles_tokenized.map(extract_hidden_states, batched=True, batch_size=32)

  0%|          | 0/346 [00:00<?, ?ba/s]

## Saving dataset

In [None]:
import h5py
import os

data_dir = '/content/gdrive/My Drive/quantumDots/data/'
data_name = 'carboxylic_features.hdf5'

if not os.path.isfile(data_dir + data_name):
  with h5py.File(data_dir + data_name, 'w') as f:
    f['smiles'] = smiles_features['smiles']

with h5py.File(data_dir + data_name, 'a') as f:
  for feature in feature_types:
    f[model_checkpoint + f'/{feature}'] = smiles_features[feature]