## Fine-tuning
- This notebook fine-tunes the target-task regressor, starting from the pre-trained weights of the general-domain language model
- The code is adapted from https://github.com/XinhaoLi74/MolPMoFiT/blob/master/notebooks/04_QSAR_Regression.ipynb

In [None]:
!git clone https://github.com/fastai/fastai1.git

Cloning into 'fastai1'...
remote: Enumerating objects: 32981, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 32981 (delta 96), reused 198 (delta 92), pack-reused 32769[K
Receiving objects: 100% (32981/32981), 471.68 MiB | 22.84 MiB/s, done.
Resolving deltas: 100% (23993/23993), done.


In [None]:
pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 92.7 MB/s 
Installing collected packages: rdkit
Successfully installed rdkit-2022.3.5


#### Install RDKit on Google colaboratory

In [None]:
import os
import shutil
import subprocess
import sys
import warnings
from logging import getLogger, StreamHandler, INFO

import requests

Import the important libraries

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') # switch off RDKit warning messages

from sklearn.model_selection import train_test_split

from fastai1.fastai import *
from fastai1.fastai.text import *
from fastai1.fastai.vision import *

import numpy as np
import threading
import random
from sklearn.utils import shuffle 

Set the seed value

In [None]:
def random_seed(seed_value, use_cuda):
    """Seed every random number generator this notebook relies on.

    Args:
        seed_value: Seed applied to Python's `random`, NumPy and PyTorch.
        use_cuda: When truthy, also seed the CUDA generators and put cuDNN
            into deterministic mode with benchmark auto-tuning switched off.
    """
    # CPU-side generators
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if not use_cuda:
        return

    # GPU-side generators: current device first, then every device
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic cuDNN kernels; benchmark mode is disabled alongside it
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Data
Mount Google Drive to Google Colab to access the google drive files 

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive so the dataset and
# saved models referenced below are reachable from the notebook
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Results directory for this run; `path` is handed to every fastai DataBunch
# built below and is created on first use.
name = 'regressor'
data_path = Path('/content/gdrive/My Drive/results')
path = data_path.joinpath(name)
path.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the fine-tuning dataset; later cells read its 'smiles' and 'ee' columns
data = pd.read_csv('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/Data/Fine-Tuning/reaction-3.csv')
print('Dataset:', data.shape)

Dataset: (368, 2)


### Target task regressor fine-tuning on target task LM

Train-validation-test splits

- Split the data into train-validation-test sets
- Validation set is used for hyperparameter tuning
- Test set is used for the final performance evaluation

In [None]:
random_seed(1234, True)

# Hold out 20% of the data as the test set, then take 12.5% of the remainder
# as the validation set (fixed random_state values keep the split repeatable).
train_, test = train_test_split(data, test_size=0.20, random_state=100)
train, valid = train_test_split(train_, test_size=0.125, random_state=0)
for subset in (train, test, valid):
    print(subset.shape)

(257, 2)
(74, 2)
(37, 2)


### SMILES augmentation for regression task

- For the regression task, Gaussian noise (with mean zero and standard deviation σg_noise) is added to the labels of the augmented SMILES during training
- The number of augmented SMILES and σg_noise is tuned on the validation set

In [None]:
def randomize_smiles(smiles):
    """Return a randomized, non-canonical SMILES string for the given SMILES.

    The parsed molecule's atoms are renumbered in a random order drawn from
    NumPy's global RNG, then written back out without canonicalization.
    """
    mol = Chem.MolFromSmiles(smiles)
    atom_order = list(range(mol.GetNumAtoms()))
    np.random.shuffle(atom_order)  # in-place shuffle of the atom indices
    renumbered = Chem.RenumberAtoms(mol, atom_order)
    return Chem.MolToSmiles(
        renumbered,
        canonical=False,
        isomericSmiles=True,
        kekuleSmiles=False,
    )

In [None]:
def ee_smiles_augmentation(df, N_rounds, noise):
    '''
    Augment a training set with randomized SMILES and noisy labels.

    For every row of `df`, `N_rounds` randomized SMILES are generated; each
    gets the row's 'ee' label plus Gaussian noise. The original rows are
    appended after the augmented ones and rows with a duplicated 'smiles'
    string are dropped (first occurrence kept).

    df: DataFrame with a 'smiles' column and an 'ee' column
    N_rounds: number of randomized SMILES generated per row
    noise: standard deviation of the zero-mean Gaussian noise added to the label

    Returns a new DataFrame with the same column order as `df`.
    '''
    aug_smiles, aug_ee = [], []
    for smiles, ee in zip(df['smiles'], df['ee']):
        for _ in range(N_rounds):
            # Same RNG call order as before (shuffle, then noise) so that
            # seeded runs stay reproducible.
            aug_smiles.append(randomize_smiles(smiles))
            aug_ee.append(ee + np.random.normal(0, noise))

    # Only the two generated columns are built, so extra columns in `df` no
    # longer produce ragged lists.
    augmented = pd.DataFrame({'smiles': aug_smiles, 'ee': aug_ee})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    combined = pd.concat([augmented, df], ignore_index=True)
    combined = combined[list(df.columns)]  # keep the caller's column order
    return combined.drop_duplicates('smiles')

In [None]:
%%time
random_seed(1234, True)
# 100 randomized SMILES per training molecule; label noise has standard deviation 0.5
train_aug = ee_smiles_augmentation(train, 100, noise=0.5)
print("Train_aug: ", train_aug.shape)

Train_aug:  (25950, 2)
CPU times: user 23.9 s, sys: 0 ns, total: 23.9 s
Wall time: 23.9 s


### Data pre-processing

Define a custom tokenizer

In [None]:
# fastai's default special-token strings, spelled out one per name
BOS = 'xxbos'
EOS = 'xxeos'
FLD = 'xxfld'
UNK = 'xxunk'
PAD = 'xxpad'
TK_MAJ = 'xxmaj'
TK_UP = 'xxup'
TK_REP = 'xxrep'
TK_WREP = 'xxwrep'

# Drop fastai's default special tokens; only the padding token is kept
defaults.text_spec_tok = [PAD]

# Bracketed SMILES atoms that keep their own token; MolTokenizer maps every
# other bracketed atom to '[UNK]'.
# Fix: a comma was missing between '[n-]' and '[NH+]', so implicit string
# concatenation produced the single bogus entry '[n-][NH+]' and neither real
# token was recognized.
# NOTE(review): weights pre-trained with the old list never saw these two
# tokens as separate entries - confirm this is acceptable for the pre-trained vocab.
special_tokens = ['[BOS]', '[C@H]', '[C@@H]','[C@]', '[C@@]','[C-]','[C+]', '[c-]', '[c+]','[cH-]',
                   '[nH]', '[N+]', '[N-]', '[n+]', '[n-]', '[NH+]', '[NH2+]',
                   '[O-]', '[S+]', '[s+]', '[S-]', '[O+]', '[SH]', '[B-]','[BH2-]', '[BH3-]','[b-]',
                   '[PH]','[P+]', '[I+]', 
                  '[Si]','[SiH2]', '[Se]','[SeH]', '[se]', '[Se+]', '[se+]','[te]','[te+]', '[Te]']

class MolTokenizer(BaseTokenizer):
    """SMILES tokenizer that plugs into fastai's `Tokenizer` wrapper.

    A SMILES string is split into bracketed atoms (e.g. '[C@H]') and single
    characters, with 'Br' and 'Cl' re-joined into one token each.

    Args:
        lang: Stored on the instance; not otherwise used by this class.
        special_tokens: Bracketed atoms that keep their own token. When this
            is non-empty, any other bracketed atom becomes '[UNK]'; when it
            is empty/None, every bracketed atom is kept as-is.
    """
    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        """Return the list of tokens for one SMILES string, starting with '[BOS]'."""
        # '[BOS]' marks the start of every SMILES string
        smiles = '[BOS]' + smiles
        # The capturing group makes re.split keep each bracketed atom
        # (1-10 characters between '[' and ']') as its own list item.
        pieces = re.split(r'(\[[^\[\]]{1,10}\])', smiles)

        tokens = []
        for piece in pieces:
            if not piece.startswith('['):
                # Plain text between bracketed atoms: one token per character
                tokens.extend(piece)
            elif self.special_tokens and piece not in self.special_tokens:
                # Fix: test against the instance's token list, not the
                # module-level `special_tokens`, so a custom list is honoured.
                tokens.append('[UNK]')
            else:
                tokens.append(piece)

        # Character splitting breaks 'Br' into 'B','r' and 'Cl' into 'C','l';
        # glue those pairs back together in a single pass over a fresh list
        # (the token list is no longer mutated while being iterated).
        merged = []
        for tok in tokens:
            if merged and (merged[-1], tok) in (('B', 'r'), ('C', 'l')):
                merged[-1] += tok
            else:
                merged.append(tok)
        return merged

    def add_special_cases(self, toks):
        """Intentionally a no-op."""
        pass

In [None]:
# Batch size shared by every DataBunch built below
bs = 128
# fastai Tokenizer wrapper around MolTokenizer; the pre- and post-processing rule lists are left empty
tok = Tokenizer(partial(MolTokenizer, special_tokens = special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

Adapt the encoder of the general-domain LM to the target dataset



In [None]:
# `np.warnings` was an accidental alias that newer NumPy releases removed, so
# the standard-library `warnings` module is used instead. The warning class
# itself lives under `np.exceptions` on NumPy 2.x, hence the fallback.
_visible_deprecation = getattr(np, 'VisibleDeprecationWarning', None) or np.exceptions.VisibleDeprecationWarning
warnings.filterwarnings('ignore', category=_visible_deprecation)
random_seed(1234, True)

# Language-model DataBunch over the augmented training set and the validation
# set; its vocab is reused by the cells below.
lm_vocab = TextLMDataBunch.from_df(path, train_aug, valid, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False, min_freq=1, num_workers=0)
print(f'Vocab Size: {len(lm_vocab.vocab.itos)}')

Vocab Size: 40


In [None]:
# Weight (.pth) and vocabulary (.pkl) files of the pre-trained language model
pretrained_model_path = Path('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/results/pre-trained_new/models')
pretrained_fnames = ['pre-trained_new_wt', 'pre-trained_new_vocab']
fnames = [
    pretrained_model_path / (stem + '.' + suffix)
    for stem, suffix in zip(pretrained_fnames, ('pth', 'pkl'))
]

In [None]:
# Display the resolved weight/vocab file paths as a sanity check
fnames

[PosixPath('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/results/pre-trained_new/models/pre-trained_new_wt.pth'),
 PosixPath('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/results/pre-trained_new/models/pre-trained_new_vocab.pkl')]

In [None]:
random_seed(1234, True)

# Build an AWD-LSTM language model on the target vocab, load the pre-trained
# weight/vocab files into it, and save only its encoder for the regressor.
# NOTE(review): presumably load_pretrained remaps the pre-trained embeddings
# onto the target vocab - confirm against the fastai v1 docs.
lm_learner = language_model_learner(lm_vocab, AWD_LSTM, drop_mult=0.2, pretrained=False)
lm_learner = lm_learner.load_pretrained(*fnames)
lm_learner.freeze()
# The encoder name is loaded again by name when the regressor is created
lm_learner.save_encoder(f'lm_encoder_31')

Create a text databunch for regression:

- It takes as input the train and validation data
- Pass the vocab of the pre-trained LM as defined in the previous step
- Specify the column containing text data and output
- Define the batch size according to the GPU memory available


In [None]:
random_seed(1234, True)

# DataBunch for the regressor: augmented training set plus the untouched
# validation set, tokenized with the same tokenizer and reusing lm_vocab's vocab.
# NOTE(review): presumably fastai treats the float 'ee' labels as a regression
# target - confirm.
data_clas = TextClasDataBunch.from_df(path, train_aug, valid, bs=bs, tokenizer=tok, 
                                          chunksize=50000, text_cols='smiles',label_cols='ee', 
                                          vocab=lm_vocab.vocab, max_vocab=60000, include_bos=False, min_freq=1, num_workers=0)

print(f'Vocab Size: {len(data_clas.vocab.itos)}')

Vocab Size: 40


### Training the regression model

Create a learner for regression:

- Pass the databunch
- Load the encoder of the pre-trained LM
- The drop_mult hyperparameter can be tuned
- The model is evaluated using RMSE and R-squared value as error metric

In [None]:
random_seed(1234, True)

# Regressor learner: AWD-LSTM created without fastai's own pre-trained weights,
# then initialised from the encoder saved above and frozen before training.
reg_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2, metrics = [r2_score, rmse])
reg_learner.load_encoder(f'lm_encoder_31')
reg_learner.freeze()

In [None]:
# Display the model architecture (AWD-LSTM encoder followed by the pooling/linear head)
reg_learner.model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(40, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(40, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1152, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Dropout(p=0.08000000000000002, inplace=False)
      (2): Linear(in_features=1200, out_features=50, bias=True)
      (3): ReLU(inplace=True)
      (4): BatchN

The regressor is fine-tuned using gradual unfreezing method in four steps:

- the regressor
- the regressor and the final LSTM layer
- the regressor and the last two LSTM layers, and
- the full model

Number of epochs and learning rate in each of these steps are tuned

In [None]:
random_seed(1234, True)

# Step 1 of gradual unfreezing: the learner is still frozen from the cell that
# created it. `lr` is the learning rate used for this step.
lr = 1e-3
reg_learner.fit_one_cycle(5, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,r2_score,root_mean_squared_error,time
0,265.567627,167.136642,-0.452972,12.928134,00:46
1,308.623779,157.626633,-0.370298,12.554945,00:44
2,272.628998,181.502716,-0.577861,13.472295,00:42
3,323.703613,174.620346,-0.51803,13.214399,00:44
4,308.649719,167.15741,-0.453152,12.928937,00:44


In [None]:
 random_seed(1234, True)

reg_learner.freeze_to(-2)
reg_learner.fit_one_cycle(6, 1e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,r2_score,root_mean_squared_error,time
0,271.329407,127.850441,-0.111444,11.307096,00:57
1,314.899292,149.679352,-0.30121,12.234351,00:55
2,278.01181,170.596664,-0.483051,13.061266,00:53
3,328.85907,174.91629,-0.520603,13.225593,00:55
4,311.540863,148.10051,-0.287485,12.169655,00:55
5,268.188232,136.027237,-0.182528,11.663072,00:54


In [None]:
random_seed(1234, True)

# Step 3 of gradual unfreezing: freeze_to(-3) before this training round
reg_learner.freeze_to(-3)
reg_learner.fit_one_cycle(6, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,r2_score,root_mean_squared_error,time
0,264.878296,127.455475,-0.108011,11.289618,01:38
1,304.995178,136.588058,-0.187403,11.687089,01:37
2,270.969513,143.841522,-0.25046,11.993395,01:31
3,318.143707,159.019226,-0.382405,12.610283,01:36
4,304.021393,138.337173,-0.202609,11.761683,01:35
5,261.185577,145.109222,-0.26148,12.046129,01:34


In [None]:
random_seed(1234, True)

# Step 4 of gradual unfreezing: train the full model with nothing frozen
reg_learner.unfreeze()
reg_learner.fit_one_cycle(6, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,r2_score,root_mean_squared_error,time
0,257.354553,153.812683,-0.337142,12.402124,02:09
1,292.2854,149.982193,-0.303843,12.246721,02:07
2,255.454926,209.600937,-0.822127,14.477602,02:01
3,307.490112,194.993912,-0.695144,13.964023,02:06
4,294.639099,163.189056,-0.418654,12.774548,02:05
5,251.174698,181.370468,-0.576711,13.467385,02:03


The regressor can also be fine-tuned all at once without any frozen weights (i.e., no gradual unfreezing)

In [None]:
random_seed(1234, True)

# NOTE(review): reg_learner is not re-created before this cell, so when the
# notebook is run top to bottom these 10 epochs continue from the gradually
# unfrozen model rather than starting from the loaded encoder. Re-create the
# learner first if a true "no gradual unfreezing" baseline is intended.
reg_learner.unfreeze()
reg_learner.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,r2_score,root_mean_squared_error,time
0,246.956985,175.830109,-0.528547,13.260095,02:09
1,286.452881,148.796631,-0.293536,12.198222,02:07
2,252.796509,153.944321,-0.338287,12.40743,02:01
3,307.349548,215.212311,-0.870909,14.670116,02:06
4,292.806091,141.571564,-0.230726,11.898385,02:05
5,247.096344,144.315765,-0.254583,12.013149,02:03
6,248.780045,152.388748,-0.324764,12.344584,02:02
7,231.387833,133.896774,-0.164007,11.571378,02:08
8,279.721252,145.36731,-0.263724,12.056837,02:04
9,292.067841,180.401749,-0.568289,13.431372,01:56


Save the trained learner. It is then later used for prediction on test set

In [None]:
# Identifier embedded in the saved model's name (reaction 3 model with TL_m1).
# NOTE(review): the encoder name 'lm_encoder_31' used earlier hard-codes the
# same number; keep the two in sync when changing it.
split_id = 31
reg_learner.save(f'{split_id}_reg')

#### Evaluation on the Test Set

In [None]:
def test_smiles_augmentation(df, N_rounds):
    dist_aug = {col_name: [] for col_name in df}

    for i in range(df.shape[0]):
        for j in range(N_rounds):
            dist_aug['smiles'].append(randomize_smiles(df.iloc[i].smiles))
            dist_aug['ee'].append(df.iloc[i]['ee'])
    df_aug = pd.DataFrame.from_dict(dist_aug)
    
    return pd.DataFrame.from_dict(dist_aug) 

The test set performance is evaluated using the predictions based on the canonical SMILES as well as that employing test-time augmentation

In [None]:
# One prediction tensor per augmentation round
preds = []

# Randomized SMILES predictions: four rounds, each seeded differently so every
# round sees a different random SMILES for each test molecule
for i in range(4):
    np.random.seed(12*i)
    test_aug = test_smiles_augmentation(test,1)
    
    # The augmented test set is passed as the validation split so that
    # get_preds(DatasetType.Valid) scores it
    test_db = TextClasDataBunch.from_df(path, train, test_aug, tokenizer=tok, vocab=lm_vocab.vocab,
                                            text_cols='smiles', label_cols='ee', bs=bs, include_bos=False)
    
    learner = text_classifier_learner(test_db, AWD_LSTM, pretrained=False, drop_mult=0.2, metrics = [r2_score, root_mean_squared_error])
    # Restore the fine-tuned regressor saved under '{split_id}_reg'
    learner.load(f'{split_id}_reg'); 
  
    # Predictions and labels for this round; `lbl` from the last round is
    # reused when the averaged predictions are scored
    pred,lbl = learner.get_preds(ds_type=DatasetType.Valid)
    
    preds.append(pred)

# Canonical SMILES predictions: the unmodified test set as the validation split
test_db = TextClasDataBunch.from_df(path, train, test, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='smiles',label_cols='ee', vocab=lm_vocab.vocab, max_vocab=60000,
                                              include_bos=False)

learner = text_classifier_learner(test_db, AWD_LSTM, pretrained=False, drop_mult=0.2, metrics = [r2_score, root_mean_squared_error])

learner.load(f'{split_id}_reg');


# Kept separate from `preds`, so the average below covers only the randomized rounds
pred_canonical,lbl2 = learner.get_preds(ds_type=DatasetType.Valid)

In [None]:
# (label, metric function) pairs, in the order they are printed
report_metrics = (
    ('RMSE:', root_mean_squared_error),
    ('MAE:', mean_absolute_error),
    ('R2:', r2_score),
)

# Test-time augmentation: element-wise mean over the randomized-SMILES rounds
avg_preds = sum(preds)/len(preds)

for heading, outputs, targets in (
    ('Test Set (Canonical)', pred_canonical, lbl2),
    ('Test Set (Average)', avg_preds, lbl),
):
    print(heading)
    for label, metric in report_metrics:
        print(label, metric(outputs, targets))

Test Set (Canonical)
RMSE: tensor(20.5161)
MAE: tensor(13.6415)
R2: tensor(-0.0820)
Test Set (Average)
RMSE: tensor(20.2574)
MAE: tensor(13.5375)
R2: tensor(-0.0548)
