In [None]:
import os
import random
import numpy as np
from tqdm import tqdm
import torch
import logging
from sklearn.metrics import accuracy_score

from deepchem.feat.smiles_tokenizer import SmilesTokenizer
from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers.trainer_pt_utils import _get_learning_rate
from transformers import AutoConfig, BertConfig, BertForMaskedLM

# os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [2]:
# !pip install deepchem

## Seed Fix

In [3]:
def seed_everything(seed:int = 1004):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    CUDA), exports PYTHONHASHSEED, and switches cuDNN to deterministic mode.

    Args:
        seed: Value applied to every generator. Defaults to 1004.
    """
    # Python-level sources of randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, then the current GPU, then all GPUs.
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    # Deterministic cuDNN kernels; with benchmark=True cuDNN would choose the
    # GPU algorithm automatically.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all generators before anything stochastic runs.
seed_everything(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


### Load Pre-Trained Tokenizer

In [4]:
# Path to the vocabulary file; replace it with the location of your own vocab.txt.
vocab_path = '/home/egg2018037024/Jupyter_Home/Brand-New-ChemBERT/vocab.txt'
# Build the SmilesTokenizer from that vocabulary file.
tokenizer = SmilesTokenizer(vocab_path)

In [5]:
# Display the tokenizer's repr (vocabulary size, special tokens, padding side, ...).
tokenizer

SmilesTokenizer(name_or_path='', vocab_size=591, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# Sanity check: split one example SMILES string into tokens and display the resulting list.
smiles_tokens = tokenizer.tokenize("CN1CCC2=C[C@@H]([C@@H]3[C@H]([C@@H]21)C4=CC5=C(C=C4C(=O)O3)OCO5)OC(=O)C6=CC=C(C=C6)C=C")
smiles_tokens

In [7]:
# Show the tokenizer's special tokens alongside their vocabulary ids.
tokenizer.all_special_tokens, tokenizer.all_special_ids

(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'], [11, 13, 0, 12, 14])

## Config

In [8]:
# The position-embedding scheme is selected with <position_embedding_type>.
# See the Hugging Face reference: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertConfig

config = BertConfig(
    vocab_size=tokenizer.vocab_size + 1,
    max_position_embeddings=512,          # BERT default setting
    position_embedding_type='absolute',   # BERT default setting
    type_vocab_size=1,                    # NSP is not used, so token type ids are not needed
)

# config.pad_token_id= -100

print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 592
}



In [9]:
# Build a BERT masked-LM model from the config alone (no checkpoint is loaded in this cell).
model = BertForMaskedLM(config = config)
# The config requested tokenizer.vocab_size + 1 embedding rows; resize the token
# embeddings to len(tokenizer) so the model matches the tokenizer
# (592 in the printed config vs. Embedding(591, 768) in the recorded model output).
model.resize_token_embeddings(len(tokenizer))
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(591, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
             

In [12]:
# Report the model's parameter count.
print(model.num_parameters())

86496591


## Preparing Data

In [None]:
import pandas as pd

# Please change your own path
pt_data = pd.read_csv(
    "/home/egg2018037024/Jupyter_Home/Brand-New-ChemBERT/Ant_Lab_anti_malaria_data_2023 (1).csv"
)
pt_data

In [15]:
# Character length of every SMILES string (string length, not token count), stored as a new column.
pt_data['SMILES_len'] = pt_data['Canonical_Isomeric_SMILES(sources: PubChem_ChEMBL_and_EMBL-EBI)'].str.len()
# Summary statistics of those lengths.
pt_data['SMILES_len'].describe()

count    4794.000000
mean       56.505423
std        17.493073
min         8.000000
25%        45.000000
50%        54.000000
75%        66.000000
max       190.000000
Name: SMILES_len, dtype: float64

In [16]:
class Bert_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item, at access time.

    Items are whatever the tokenizer returns for a single text; no tensors are
    requested from the tokenizer here.
    """

    def __init__(self, data:list, tokenizer, max_length:int = 512, padding = "max_length"):
        """Store the texts and the tokenization settings.

        Args:
            data: Indexable collection of texts. Assumed to be well pre-processed.
            tokenizer: Callable invoked as
                `tokenizer(text, max_length=..., padding=..., truncation=True)`.
            max_length: Length limit forwarded to the tokenizer; longer inputs
                are truncated. Defaults to 512, the previously hard-coded value.
            padding: Padding strategy forwarded to the tokenizer. Defaults to
                "max_length", the previously hard-coded value.
        """
        # Assume that data is well pre-processed.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __getitem__(self, idx):
        """Tokenize and return the text stored at position `idx`."""
        text = self.data[idx]
        # return_tensors is deliberately not passed, so the tokenizer's default
        # output type is returned.
        tokens = self.tokenizer(text,
                                max_length=self.max_length,
                                padding=self.padding,
                                truncation=True,  # truncate inputs that exceed max_length
                               )

        return tokens

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [17]:
# Wrap the whole SMILES column in the tokenizing dataset.
tokenized_whole_dataset = Bert_Dataset(
    data=pt_data['Canonical_Isomeric_SMILES(sources: PubChem_ChEMBL_and_EMBL-EBI)'].tolist(),
    tokenizer=tokenizer,
)

In [18]:
# Sanity check of the tokenizer and Bert_Dataset: dataset size, one tokenized
# sample, its decoded form, and the raw SMILES string it came from.

print(len(tokenized_whole_dataset))
print(tokenized_whole_dataset[970])
print(tokenizer.decode(tokenized_whole_dataset[970]['input_ids']))
print(pt_data['Canonical_Isomeric_SMILES(sources: PubChem_ChEMBL_and_EMBL-EBI)'].iloc[970])

4794
{'input_ids': [12, 16, 33, 20, 16, 19, 16, 21, 22, 16, 20, 16, 17, 22, 19, 18, 16, 17, 22, 19, 18, 16, 26, 22, 16, 21, 16, 22, 16, 16, 32, 22, 16, 26, 35, 17, 16, 16, 16, 32, 17, 16, 18, 16, 18, 19, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Data collator

In [19]:
from transformers import DataCollatorForWholeWordMask, DataCollatorForLanguageModeling

# The collator performs the masking on the data (mlm_probability=0.15).
# More details : https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/data_collator#data-collator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
    return_tensors='pt',
)

In [20]:
# Compare one sample before and after the collator's masking, then decode the masked ids.
print("Before Masking : ", tokenized_whole_dataset[970])
print("====================================================================")

col_test = data_collator.torch_call([tokenized_whole_dataset[970]])
print("After Masking : ", col_test)
print("====================================================================")
print("Decoded Version : ", tokenizer.decode(col_test['input_ids'][0].tolist()))

Before Masking :  {'input_ids': [12, 16, 33, 20, 16, 19, 16, 21, 22, 16, 20, 16, 17, 22, 19, 18, 16, 17, 22, 19, 18, 16, 26, 22, 16, 21, 16, 22, 16, 16, 32, 22, 16, 26, 35, 17, 16, 16, 16, 32, 17, 16, 18, 16, 18, 19, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
class CustomTrainer(Trainer):
    """Trainer that also tracks running prediction accuracy on labelled positions.

    Correct and total counts are accumulated over every `compute_loss` call and
    printed (then reset) whenever the global step is a multiple of
    `acc_log_interval` or equals the final step.
    """

    # Number of global steps between two accuracy reports.
    acc_log_interval = 1000

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Running counters, reset after every accuracy report.
        self.total_element_rfp = 0  # label positions seen (label != -100)
        self.total_correct_rfp = 0  # of those, positions predicted correctly
        self.count = 0
        self.others = 0

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.

        Args:
            model: Model to run the forward pass with.
            inputs: Mapping of keyword arguments for the model; may contain "labels".
            return_outputs: When True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Accepted so that Transformers releases which pass
                this argument can call the override; it is not used here.

        Returns:
            The loss, or `(loss, outputs)` when `return_outputs` is True.

        Raises:
            ValueError: If the model returned a dict without a "loss" entry and
                no label smoother is in use.
        """
        # Keep a handle on the labels for the accuracy bookkeeping: with label
        # smoothing enabled they are popped from `inputs` just below.
        accuracy_labels = inputs.get("labels")

        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None

        outputs = model(**inputs)

        if accuracy_labels is not None:
            self._update_accuracy(outputs, accuracy_labels)

        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self._label_smoothed_loss(model, outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def _update_accuracy(self, outputs, labels):
        """Accumulate accuracy counters for one batch and print a periodic report."""
        if isinstance(outputs, dict):
            logits = outputs.get("logits")
        else:
            logits = getattr(outputs, "logits", None)
        if logits is None:
            # Nothing to score; leave the counters untouched.
            return

        # Reduce on the logits' device and move only two scalars to the host,
        # instead of copying the full logits tensor to the CPU every step.
        logits = logits.detach()
        labels = labels.detach().to(logits.device)
        # -100 is the label value excluded from scoring; only other positions count.
        scored = labels.ne(-100)
        hits = logits.argmax(dim=-1).eq(labels) & scored
        self.total_correct_rfp += hits.sum().item()
        self.total_element_rfp += scored.sum().item()

        # Logging every N steps; N == acc_log_interval
        step = self.state.global_step
        if (step % self.acc_log_interval) == 0 or step == self.state.max_steps:
            # Guard the division: the counter is 0 when no position was scored
            # since the last reset.
            if self.total_element_rfp:
                acc = self.total_correct_rfp / self.total_element_rfp * 100
            else:
                acc = float("nan")
            print("Global Step: ", self.state.global_step)
            print("Max Steps: ", self.state.max_steps)
            print("Num Train Epochs: ", self.state.num_train_epochs)
            print("acc: ", acc)
            print("LR: ", self._get_learning_rate())
            print("================================")
            print("# total correct:", self.total_correct_rfp)
            print("# total element:", self.total_element_rfp)
            self.count = 0
            self.total_element_rfp = 0
            self.total_correct_rfp = 0

    def _label_smoothed_loss(self, model, outputs, labels):
        """Apply the label smoother, shifting labels for causal-LM architectures."""
        # Imported here because the notebook's import cell does not provide these names.
        from transformers.modeling_utils import unwrap_model
        from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

        try:
            from peft import PeftModel
        except ImportError:
            # peft is optional; without it no model can be a PeftModel.
            PeftModel = None

        unwrapped_model = unwrap_model(model)
        if PeftModel is not None and isinstance(unwrapped_model, PeftModel):
            model_name = unwrapped_model.base_model.model._get_name()
        else:
            model_name = unwrapped_model._get_name()

        if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
            return self.label_smoother(outputs, labels, shift_labels=True)
        return self.label_smoother(outputs, labels)

In [22]:
# Training configuration: checkpoints and logs are written every 1000 steps.
training_args = TrainingArguments(
    output_dir="/home/user/10TB/NEWBERT/SmileBERT_MLMmodel",
    logging_dir= "/home/user/10TB/NEWBERT/SmileBERT_MLMlog",
    num_train_epochs=20,
    learning_rate = 1e-4,
   # max_steps=1000,
    per_device_train_batch_size=32,
#    gradient_accumulation_steps = 16,
#    per_device_eval_batch_size = 16,
#    eval_accumulation_steps = 32,
    logging_strategy = "steps",
    save_strategy = "steps",
    lr_scheduler_type = "linear",
    dataloader_num_workers = 16,
    # Warm up over the first 10% of the optimizer steps. The previous fixed
    # warmup_steps = 10000 was longer than the whole run: 4794 samples at 32 per
    # batch is about 150 steps per epoch, i.e. about 3000 steps over 20 epochs
    # on one device. The linear schedule therefore stayed in its warm-up ramp
    # until training ended, never reaching learning_rate and never decaying.
    warmup_ratio = 0.1,
    weight_decay=0.01,
#    warmup_steps = 1643
#    evaluation_strategy = "steps", # need a eval_dataset
#    eval_steps = 10,
    save_steps=1000,
    logging_steps=1000,
#    save_total_limit=10,
)

# No eval_dataset is passed, so only training is configured here.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_whole_dataset,
)

In [None]:
# Optional callback registration, left disabled (CustomCallback is not defined in the cells shown here).
#trainer.add_callback(CustomCallback(trainer))
# Run the training loop.
trainer.train()

In [None]:
# Save the final model to its own directory, separate from the output_dir used for checkpoints.
trainer.save_model("/home/user/10TB/NEWBERT/SmileBERT_MLMmodel_final20")