#### https://docs.nvidia.com/bionemo-framework/latest/notebooks/encoder-finetuning-notebook-fw.html

In [1]:
!pip install openpyxl -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


## Dataset Preparation

In [2]:
import pandas as pd
import numpy as np
df = pd.read_excel('TestInputData_June17.xlsx')

In [3]:
df.head()

Unnamed: 0,Compound as referred to in the document,Canonical Smiles,Size,ExptVariable
0,Compound 1,CCCCCCCCCCCCCCCCCCN(CCO)CCCCCCCC(=O)OC(CCCCCCC...,72.7,402000000
1,Compound 2,CCCCCCCCCCCCCCN(CCO)CCCCCCCC(=O)OC(CCCCCCCC)CC...,83.9,4870000000
2,Compound 3,CCCCCCCCCN(CCO)CCCCCCCC(=O)OC(CCCCCCCC)CCCCCCCC,97.5,13900000000
3,Compound 4,CCCCCCCCC(CCCCCCCC)OC(=O)CCCCCCCN(CCO)CCCCCCCC,120.5,5260000000
4,Compound 5,CCCCCCCCC(CCCCCCCC)OC(=O)CCCCCCCN(CCO)CCCCCC,196.4,58400000


In [4]:
x = df['Canonical Smiles'].tolist()
y = np.log10(df['ExptVariable'])

In [5]:
df_prepared = pd.DataFrame({
    'SMILES':x,
    'y':y,
})

In [6]:
import os
BIO_HOME = os.environ['BIONEMO_HOME']

In [7]:
train_path = os.path.join(BIO_HOME,'data/reg_test_input_data','train')
test_path = os.path.join(BIO_HOME,'data/reg_test_input_data','test')
val_path = os.path.join(BIO_HOME,'data/reg_test_input_data','val')
os.makedirs(train_path,exist_ok=True)
os.makedirs(test_path,exist_ok=True)
os.makedirs(val_path,exist_ok=True)

In [8]:
from sklearn.model_selection import train_test_split
train_df,test_df = train_test_split(df_prepared,test_size =0.1,shuffle  = True,random_state= 42)
train_df,val_df = train_test_split(df_prepared,test_size =0.1,shuffle  = True,random_state= 42)
train_df.to_csv(os.path.join(train_path,'x000.csv'),index = False)
test_df.to_csv(os.path.join(test_path,'x000.csv'),index = False)
val_df.to_csv(os.path.join(val_path,'x000.csv'),index = False)

In [9]:
len(train_df),len(val_df),len(test_df)

(99, 12, 12)

In [10]:
%%writefile /workspace/bionemo/examples/molecule/megamolbart/conf/finetune_config2.yaml
name: mmb_physchem
defaults: 
  - pretrain_small_span_aug
do_preprocessing: False
do_training: True # set to false if data preprocessing steps must be completed
do_testing: True # set to true to run evaluation on test data after training, requires test_dataset section
restore_from_path: null # path to nemo checkpoint of the fine-tuned model (encoder + task head) to be used for further training, testing or inference
target: bionemo.model.molecule.megamolbart.MegaMolBARTModel
infer_target: bionemo.model.molecule.megamolbart.infer.MegaMolBARTInference

trainer:
  devices: 1 # number of GPUs or CPUs
  num_nodes: 1
  max_epochs: 5 # use max_steps instead with NeMo Megatron models
  max_steps: 10 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  val_check_interval: 1 # set to integer when using steps to determine frequency of validation, use fraction with epochs
  limit_val_batches: 1 # number of batches in validation step, use fraction for fraction of data, 0 to disable
  limit_test_batches: 1 # number of batches in test step, use fraction for fraction of data, 0 to disable
  precision: 16-mixed
exp_manager:
  wandb_logger_kwargs:
    project: ${name}_finetuning
    name: ${name}_finetuning_encoder_frozen_${model.encoder_frozen}
  checkpoint_callback_params:
    monitor: val_loss # use molecular accuracy to select best checkpoints
    mode: min # use min or max of monitored metric to select best checkpoints
    filename: '${name}-${model.name}--{val_loss:.2f}-{step}-{consumed_samples}'
  resume_if_exists: False

model:
  restore_encoder_path:  '/workspace/bionemo/MegaMolBART_0_2_0.nemo'  #${oc.env:BIONEMO_HOME}/models/molecule/megamolbart/megamolbart.nemo # path to nemo checkpoint of the MegaMolBART model
  seq_length: 512 # TODO make a checkpoint with this set to 128. Maximum sequence length allowed. Set to 512 for backwards compatibililty with the checkpoint.
  max_position_embeddings: ${.seq_length}
  encoder_frozen: True
  post_process: False
  micro_batch_size: 12 # NOTE: adjust to occupy ~ 90% of GPU memory
  global_batch_size: null
  tensor_model_parallel_size: 1  # model parallelism
  pipeline_model_parallel_size: 1
  downstream_task:
    n_outputs: 1
    hidden_layer_size: 128
    loss_func: MSELoss
    restore_from_path: '/workspace/bionemo/MegaMolBART_0_2_0.nemo' # path of pretrained model to be used in inference
    outputs: [embeddings, hiddens] # Which outputs to extract per sample (a value or list). Possible values: hiddens, embeddings.
  data:
    # Preprocessing data params
    links_file: ${oc.env:BIONEMO_HOME}/examples/molecule/megamolbart/dataset/PhysChem-downloader.txt
    preprocessed_data_path: "/workspace/bionemo/data/" #sets the location physchem dataset will be downloaded
    dataset_path: ${model.data.preprocessed_data_path}/${model.data.task_name} 
    split_data: False
    val_frac: 0.15 # proportion of samples used for validation set
    test_frac: 0.15 # proportion of samples used for test set

    # Finetuning data params
    task_name: reg_test_input_data #specifies which MoleculeNet physchem dataset to use for training, expected values: SAMPL, Lipophilicity, or delaney-processed
    task_type: 'regression'
    sequence_column: 'SMILES'
    target_column: 'y'
    emb_batch_size: ${model.micro_batch_size}
    dataset:
      train: x000
      val: x000
      test: x000
    num_workers: 8
    shuffle: False
  
  finetuning_optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
    sched:
      name: WarmupAnnealing
      min_lr: 0.00001
      last_epoch: -1

Overwriting /workspace/bionemo/examples/molecule/megamolbart/conf/finetune_config2.yaml


In [11]:
%%writefile train_script.py
from nemo.core.config import hydra_runner
from nemo.utils import logging
from omegaconf.omegaconf import OmegaConf
from bionemo.model.utils import (setup_trainer,)

import torch
import torch.nn as nn
import bionemo.utils
from functools import lru_cache
from nemo.utils.model_utils import import_class_by_path
from bionemo.model.core import MLPModel
from bionemo.model.core.encoder_finetuning import EncoderFineTuning

#import a BioNeMo data module or your custom data module
from bionemo.data.datasets.single_value_dataset import SingleValueDataModule

class DownstreamTaskModel(EncoderFineTuning):

    def __init__(self, cfg, trainer):
        #store config parameters within object so they can be access easily
        self.full_cfg = cfg
        # we want our downstream model to behave differently based on whether the
        # encoder_frozen config parameter is set to True or False so we store it for 
        # convenient access within the object
        self.encoder_frozen = self.full_cfg.model.encoder_frozen
        super().__init__(cfg.model, trainer=trainer) 
        self.batch_target_name = self.cfg.data.target_column

    def configure_optimizers(self):
        super().setup_optimization(optim_config=self.cfg.finetuning_optim)

        if self._scheduler is None:
            return self._optimizer
        else:
            return [self._optimizer], [self._scheduler]

    # use this function to define what the loss func of the task head should be
    def build_loss_fn(self):
        return bionemo.utils.lookup_or_use(torch.nn, self.cfg.downstream_task.loss_func)

    # define the architecture of our prediction task head for the downstream task
    def build_task_head(self):

        # we create an instance of MLPModel using parameters defined in the config file
        # choose the right task head architecture based on your downstream task (for example,. regression vs classification)
        regressor = MLPModel(layer_sizes=[self.encoder_model.cfg.model.hidden_size, self.cfg.downstream_task.hidden_layer_size, self.cfg.downstream_task.n_outputs],
            dropout=0.1,
        )

        # we can use pytorch libraries to further define our architecture and tensor operations
        task_head = nn.Sequential(regressor, nn.Flatten(start_dim=0))
        return task_head

    # returns the model from which we will use the pretrained encoder
    def setup_encoder_model(self, cfg, trainer):
        infer_class = import_class_by_path(self.full_cfg.infer_target)
        pretrained_model = infer_class(
            self.full_cfg, 
            freeze=self.encoder_frozen, #determines whether encoders weights are trainable
            restore_path=self.full_cfg.restore_from_path,
            training=not self.cfg.encoder_frozen)
        return pretrained_model

    # use this function to define all your data operations
    # in this example, we use the config parameter to determine the value of our model variable
    # then we pass it into an instance of SingleValueDataModule()
    @lru_cache
    def data_setup(self):
        if self.encoder_frozen:
            model = self.encoder_model
        else:
            model = None
        self.data_module = SingleValueDataModule(
            self.cfg, self.trainer, model=model
        )

    # ensures that we create our necessary datasets 
    def on_fit_start(self):
        self.build_train_valid_test_datasets()
        return super().on_fit_start()

    # function that simply instatiates our datasets and stores them within our object 
    def build_train_valid_test_datasets(self):
        self._train_ds = self.data_module.get_sampled_train_dataset()
        self._validation_ds = self.data_module.get_sampled_val_dataset()
        self._test_ds = self.data_module.get_sampled_test_dataset()

    # define the behavior for retrieving embeddings from encoder
    def encoder_forward(self, bart_model, batch: dict):
        if self.encoder_frozen:
            enc_output = batch["embeddings"]
        else:
            enc_output = bart_model.seq_to_embeddings(batch["embeddings"])
        return enc_output

    # define additional operations on the encoder output
    # in this example we simply convert the values of the tensor to float
    # see forward() in encoder_finetuning.py for additional information
    def extract_for_task_head(self, input_tensor):
        return input_tensor.float()
    
    def get_target_from_batch(self, batch):
        ret = batch['target']

        return ret.float()

@hydra_runner(config_path="/workspace/bionemo/examples/molecule/megamolbart/conf/", config_name="finetune_config2") 
def main(cfg) -> None:

    logging.info("\n\n************* Finetune config ****************")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
    cfg['exp_manager']['create_wandb_logger'] = False
    cfg['model']['downstream_task']['restore_from_path'] = '/workspace/bionemo/MegaMolBART_0_2_0.nemo'
    trainer = setup_trainer(
        cfg, builder=None,)

    # we instantiate our model 
    model = DownstreamTaskModel(cfg, trainer)

    if cfg.do_training:
        logging.info("************** Starting Training ***********")
        trainer.fit(model) # train our downstream task model using the dataset defined in config
        logging.info("************** Finished Training ***********")

    if cfg.do_testing:
        if "test" in cfg.model.data.dataset:
            trainer.test(model)
        else:
            raise UserWarning("Skipping testing, test dataset file was not provided. Specify 'test_ds.data_file' in yaml config")

if __name__ == '__main__':
    main()

Overwriting train_script.py


In [12]:
!python train_script.py

[NeMo I 2024-06-20 15:48:23 megatron_hiddens:110] Registered hidden transform sampled_var_cond_gaussian at bionemo.model.core.hiddens_support.SampledVarGaussianHiddenTransform
[NeMo I 2024-06-20 15:48:23 megatron_hiddens:110] Registered hidden transform interp_var_cond_gaussian at bionemo.model.core.hiddens_support.InterpVarGaussianHiddenTransform
    
    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    
[NeMo I 2024-06-20 15:48:24 train_script:110] 
    
    ************* Finetune config ****************
[NeMo I 2024-06-20 15:48:24 train_script:111] 
    name: mmb_physchem
    do_training: true
    do_testing: true
    seed: 42
    restore_from_path: null
    trainer:
      devices: 1
      num_nodes: 1
      precision: 16-mixed
      accelerator: gpu
      max_epochs: 100
      max_steps: 10000
      log_every_n_steps: 100
      val_check_interval: 1
      num_sanity_val_steps: 2
      limit_train_batches: 1