In [None]:
!git clone https://github.com/fastai/fastai1.git

Cloning into 'fastai1'...
remote: Enumerating objects: 32976, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 32976 (delta 95), reused 197 (delta 91), pack-reused 32769[K
Receiving objects: 100% (32976/32976), 471.68 MiB | 31.15 MiB/s, done.
Resolving deltas: 100% (23992/23992), done.
Checking out files: 100% (815/815), done.


In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO

logger = getLogger(__name__)
# getLogger returns the same logger object on every call, so re-running this
# cell would stack one more StreamHandler each time and multiply every log line.
# Attach the handler only when the logger does not have one yet.
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False,
        timeout=60):
    """Install RDKit by bootstrapping a Miniconda distribution.

    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Args:
        chunk_size: number of bytes read per chunk while streaming the
            installer to disk.
        file_name: installer file name; appended to ``url_base`` to form the
            download URL and also used as the local path of the download.
        url_base: base URL the installer is fetched from.
        conda_path: prefix Miniconda is installed into. Anything already at
            this path (directory or file) is deleted first.
        rdkit_version: exact RDKit version to request, or ``None`` to let
            conda choose.
        add_python_path: when true, append the Miniconda ``site-packages``
            directory to ``sys.path`` so ``import rdkit`` works in the
            current interpreter.
        force: re-install even when an ``rdkit`` package directory already
            exists under ``conda_path``.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled download raises instead of blocking forever.

    Raises:
        requests.HTTPError: the installer download returned an error status.
        subprocess.CalledProcessError: the Miniconda installer or
            ``conda install`` exited with a non-zero status.
    """

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    # Pin conda's python to the running interpreter's exact version so the
    # installed packages are importable from this process.
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # The Miniconda installer is run non-interactively below, so clear any
    # previous installation (or stray file) at the target prefix first.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    # The context manager guarantees the streamed connection is released even
    # if writing fails part-way; the timeout bounds a stalled server.
    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    # -b: batch (no prompts), -p: installation prefix
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    # Imported here because the package only exists after the steps above.
    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# Run the installer when this code is executed directly; inside the notebook
# kernel __name__ is "__main__", so the installation starts as soon as the cell runs.
if __name__ == "__main__":
    install()

add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
INFO:__main__:add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
python version: 3.7.13
INFO:__main__:python version: 3.7.13
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
INFO:__main__:fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
INFO:__main__:done
installing miniconda to /root/miniconda
INFO:__main__:installing miniconda to /root/miniconda
done
INFO:__main__:done
installing rdkit
INFO:__main__:installing rdkit
done
INFO:__main__:done
rdkit-2020.09.1 installation finished!
INFO:__main__:rdkit-2020.09.1 installation finished!


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') # switch off RDKit warning messages

from sklearn.model_selection import train_test_split

from fastai1.fastai import *
from fastai1.fastai.text import *
from fastai1.fastai.vision import *

import numpy as np
import threading
import random
from sklearn.utils import shuffle 

In [None]:
def random_seed(seed_value, use_cuda):
    """Seed NumPy, PyTorch and Python's `random` with the same value.

    When `use_cuda` is true, the CUDA generators are seeded as well and cuDNN
    is switched to deterministic mode with benchmarking disabled.
    """
    # CPU-side generators: NumPy, PyTorch, then the Python stdlib
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU-side generators (current device, then all devices)
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # needed
    cudnn.benchmark = False

In [None]:
# Mount Google Drive at /content/gdrive; the dataset and results paths used by
# the following cells are located under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Build the results directory for this run and make sure it exists
from pathlib import Path

name = 'regressor_20'
data_path = Path('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/results')
path = data_path.joinpath(name)
# parents=True creates missing intermediate folders; exist_ok=True keeps re-runs from failing
path.mkdir(parents=True, exist_ok=True)

In [None]:
import pandas as pd

# Load the fine-tuning dataset from the mounted Drive. Later cells read its
# 'smiles' (text) and 'ee' (label) columns.
data = pd.read_excel('/content/gdrive/MyDrive/AI /Drug Discovery/TL_Catalysis_Code/Data/Fine-Tuning/reaction-2.xlsx')
# Print (rows, columns) as a quick sanity check on what was loaded
print('Dataset:', data.shape)

Dataset: (1075, 2)


In [None]:

random_seed(1234, True)

# Hold out 20% as the test set, then take 12.5% of the remainder for validation
# (roughly 70/10/20 train/valid/test overall). Both splits use fixed random states.
train_, test = train_test_split(data, test_size=0.20, random_state=100)
train, valid = train_test_split(train_, test_size=0.125, random_state=0)
# Shapes are printed in the order train, test, valid
print(train.shape, test.shape, valid.shape, sep='\n')

(752, 2)
(215, 2)
(108, 2)


In [None]:
def randomize_smiles(smiles):
    """Return a randomized (non-canonical) SMILES string for the same molecule.

    The atoms are renumbered with a random permutation drawn from NumPy's
    global RNG (seed it with `np.random.seed` for reproducible output), and the
    molecule is written back without canonicalization.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        A SMILES string written from the renumbered molecule.

    Raises:
        ValueError: if RDKit cannot parse `smiles`.
    """
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # message naming the offending input instead of an AttributeError below.
        raise ValueError("RDKit could not parse SMILES: {!r}".format(smiles))
    ans = list(range(m.GetNumAtoms()))
    np.random.shuffle(ans)
    nm = Chem.RenumberAtoms(m, ans)
    # canonical=False keeps the shuffled atom order in the output string
    return Chem.MolToSmiles(nm, canonical=False, isomericSmiles=True, kekuleSmiles=False)

In [None]:
# For Reaction-3 'yield' is replaced by 'ee'.
def ee_smiles_augmentation(df, N_rounds, noise, smiles_col='smiles', label_col='ee'):
    '''
    Augment a dataset with randomized SMILES and noisy labels.

    For every row of `df`, `N_rounds` randomized SMILES are generated with
    `randomize_smiles`, each paired with the row's label plus Gaussian noise.
    The original rows are appended after the augmented ones, and rows with a
    duplicated SMILES string are dropped (the first occurrence is kept).

    df: DataFrame holding the `smiles_col` and `label_col` columns. Any other
        column is left as NaN in the augmented rows.
    N_rounds: number of randomized SMILES generated per input row
    noise: standard deviation of the Gaussian noise added to the label
    smiles_col: name of the SMILES column (default 'smiles')
    label_col: name of the label column (default 'ee')

    Returns a new DataFrame with the same column order as `df`.
    '''
    aug_smiles, aug_labels = [], []
    # Iterate over the two columns directly instead of calling df.iloc[i] twice
    # per generated sample.
    for smi, label in zip(df[smiles_col], df[label_col]):
        for _ in range(N_rounds):
            # Keep this call order (shuffle first, then the noise draw): both
            # consume NumPy's global RNG, so reordering would change seeded output.
            aug_smiles.append(randomize_smiles(smi))
            aug_labels.append(label + np.random.normal(0, noise))

    df_aug = pd.DataFrame({smiles_col: aug_smiles, label_col: aug_labels})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_aug = pd.concat([df_aug, df], ignore_index=True)
    # Restore the caller's column order regardless of how df orders its columns.
    df_aug = df_aug[list(df.columns)]
    return df_aug.drop_duplicates(smiles_col)

In [None]:
%%time 
# Re-seed so the randomized SMILES and label noise are reproducible across runs
random_seed(1234, True)

# N_rounds changed to 50 from 100
# 50 randomized SMILES per training row, label noise with standard deviation 0.5
train_aug = ee_smiles_augmentation(train, 50, noise=0.5)
print("Train_aug: ", train_aug.shape)

Train_aug:  (38351, 2)
CPU times: user 49 s, sys: 233 ms, total: 49.2 s
Wall time: 49.1 s


In [None]:
# Don't include fastai's default special tokens; only keep the padding token
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
# Restrict fastai's special-token list to PAD only
defaults.text_spec_tok = [PAD]

# Bracket atoms that MolTokenizer keeps as dedicated tokens; any other bracket
# atom is mapped to '[UNK]'.
# NOTE: a comma was missing between '[n-]' and '[NH+]', which silently merged
# them into the single bogus entry '[n-][NH+]', so both real tokens fell back
# to '[UNK]'. Restoring them changes the vocabulary, so models saved with the
# old list should be retrained.
special_tokens = ['[BOS]', '[C@H]', '[C@@H]','[C@]', '[C@@]','[C-]','[C+]', '[c-]', '[c+]','[cH-]',
                   '[nH]', '[N+]', '[N-]', '[n+]', '[n-]', '[NH+]', '[NH2+]',
                   '[O-]', '[S+]', '[s+]', '[S-]', '[O+]', '[SH]', '[B-]','[BH2-]', '[BH3-]','[b-]',
                   '[PH]','[P+]', '[I+]',
                  '[Si]','[SiH2]', '[Se]','[SeH]', '[se]', '[Se+]', '[se+]','[te]','[te+]', '[Te]']

class MolTokenizer(BaseTokenizer):
    """Character-level SMILES tokenizer for fastai's `Tokenizer`.

    Bracket atoms (e.g. '[C@@H]') are kept as single tokens, the two-letter
    halogens 'Br' and 'Cl' are kept together, and every other character
    becomes its own token. A '[BOS]' token is prepended to every sequence.

    Args:
        lang: stored on the instance; not used by the tokenization itself.
        special_tokens: bracket tokens to keep verbatim. When the list is
            non-empty, any bracket token not in it is replaced by '[UNK]'.
            When it is empty or None, every bracket token is kept as is.
    """

    # One bracket atom with 1-10 characters between the brackets. The capture
    # group makes re.split keep the matched bracket atoms in its result.
    _BRACKET_PATTERN = r'(\[[^\[\]]{1,10}\])'
    # Outside brackets: 'Br' and 'Cl' as units, otherwise a single character.
    _ATOM_PATTERN = r'Br|Cl|.'

    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        """Split a SMILES string into a list of tokens, starting with '[BOS]'."""
        # add the specific token '[BOS]' to represent the start of the SMILES
        smiles = '[BOS]' + smiles
        tokens = []

        for segment in re.split(self._BRACKET_PATTERN, smiles):
            if segment.startswith('['):
                # Check the instance's own list (the previous code consulted the
                # module-level list, so a custom special_tokens was ignored).
                if not self.special_tokens or segment in self.special_tokens:
                    tokens.append(str(segment))
                else:
                    tokens.append('[UNK]')
            else:
                # Single pass that keeps 'Br'/'Cl' whole, replacing the earlier
                # split-into-characters-then-merge loops. DOTALL so that any
                # character, including a newline, still yields a token.
                tokens.extend(re.findall(self._ATOM_PATTERN, segment, flags=re.DOTALL))
        return tokens

    def add_special_cases(self, toks):
        """No-op: this tokenizer has no special cases to register."""
        pass

In [None]:
# Batch size used by the DataBunch cells below
bs = 128
# fastai Tokenizer that builds MolTokenizer instances; empty pre_rules/post_rules
# lists are passed so no text rules are run around the SMILES tokenization.
tok = Tokenizer(partial(MolTokenizer, special_tokens = special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

In [None]:
import warnings
# Silence NumPy's VisibleDeprecationWarning for the cells below
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

# Here also 'yield' is replaced with 'ee'. 

random_seed(1234, True)
# Training set: the augmented frame; validation set: the un-augmented `valid` split.
# The SMILES column is the text input and 'ee' is the label column.
data_clas = TextClasDataBunch.from_df(path, train_aug, valid, bs=bs, tokenizer=tok, 
                                          chunksize=50000, text_cols='smiles',label_cols='ee', 
                                          max_vocab=60000, include_bos=False, min_freq=1, num_workers=0)

print(f'Vocab Size: {len(data_clas.vocab.itos)}')

Vocab Size: 32


In [None]:
# Re-seed before model construction so the initial weights are reproducible
random_seed(1234, True)

# AWD_LSTM learner trained from scratch (pretrained=False) with drop_mult=0.0,
# reporting RMSE and R2 on the validation set.
reg_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.0, metrics = [rmse, r2_score])
# Make all layer groups trainable
reg_learner.unfreeze()

In [None]:
# Model architecture: display the learner's module tree (encoder + pooling head)
reg_learner.model

SequentialRNN(
  (0): MultiBatchEncoder(
    (module): AWD_LSTM(
      (encoder): Embedding(32, 400, padding_idx=1)
      (encoder_dp): EmbeddingDropout(
        (emb): Embedding(32, 400, padding_idx=1)
      )
      (rnns): ModuleList(
        (0): WeightDropout(
          (module): LSTM(400, 1152, batch_first=True)
        )
        (1): WeightDropout(
          (module): LSTM(1152, 1152, batch_first=True)
        )
        (2): WeightDropout(
          (module): LSTM(1152, 400, batch_first=True)
        )
      )
      (input_dp): RNNDropout()
      (hidden_dps): ModuleList(
        (0): RNNDropout()
        (1): RNNDropout()
        (2): RNNDropout()
      )
    )
  )
  (1): PoolingLinearClassifier(
    (layers): Sequential(
      (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=1200, out_features=50, bias=True)
      (2): ReLU(inplace=True)
      (3): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_run

In [None]:
%%time
# Turn off NumPy's VisibleDeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

random_seed(1234, True)

# Maximum learning rate for the one-cycle schedule
lr = 1e-3

# Trains for 8 epochs with the one-cycle policy.
# NOTE(review): an earlier comment here mentioned "one epoch instead of 15";
# the call below runs 8 — confirm the intended epoch count.
reg_learner.fit_one_cycle(8, lr, moms=(0.8,0.7)) 

epoch,train_loss,valid_loss,root_mean_squared_error,r2_score,time
0,1.0865,4.933871,2.221232,-7.378555,02:06
1,0.554738,0.329801,0.574283,0.439941,02:05
2,0.472795,0.26411,0.513916,0.551496,02:10
3,0.479009,0.707385,0.841062,-0.201261,02:01
4,0.433733,0.192872,0.439172,0.672471,02:05
5,0.411045,0.134274,0.366435,0.771979,02:08
6,0.395672,0.153948,0.392362,0.73857,02:10
7,0.4396,0.121592,0.3487,0.793517,02:00


CPU times: user 16min 38s, sys: 22.4 s, total: 17min
Wall time: 16min 50s


In [None]:
split_id = 20 # Unique ID for TL_m0_reaction_2
# Save the trained weights under the name '<split_id>_reg'; the prediction cell
# loads them back with the same name.
reg_learner.save(f'{split_id}_reg')

In [None]:
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

preds = []

# Randomized SMILES Predictions (test-time augmentation)
for i in range(4):
    np.random.seed(12*i)
    # One randomized SMILES per test molecule, row-aligned with `test` and
    # keeping the TRUE labels. The training augmenter must not be used here:
    # it adds Gaussian noise to the labels, appends the canonical rows and
    # drops duplicates, so the row count/order could differ between rounds and
    # the averaged predictions would be scored against noisy labels.
    test_aug = test.assign(smiles=[randomize_smiles(s) for s in test['smiles']])

    # The test frame is passed as the validation set so get_preds() scores it;
    # the vocabulary is reused from training so token ids match the saved model.
    test_db = TextClasDataBunch.from_df(path, train, test_aug, tokenizer=tok, vocab=data_clas.vocab,
                                            text_cols='smiles', label_cols='ee', bs=bs, include_bos=False)
    learner = text_classifier_learner(test_db, AWD_LSTM, pretrained=False, drop_mult=0.0, metrics = [r2_score, rmse])
    
    learner.load(f'{split_id}_reg'); 
  
    # get predictions in the original row order; `lbl` holds the true test labels
    pred,lbl = learner.get_preds(ordered=True)
    
    preds.append(pred)
    
# Canonical SMILES Predictions

# Here 'yield' is replaced by 'ee'.

test_db = TextClasDataBunch.from_df(path, train, test, bs=bs, tokenizer=tok, 
                              chunksize=50000, text_cols='smiles',label_cols='ee', vocab=data_clas.vocab, max_vocab=60000,
                                              include_bos=False)

learner = text_classifier_learner(test_db, AWD_LSTM, pretrained=False, drop_mult=0.0, metrics = [r2_score, rmse])

learner.load(f'{split_id}_reg');

# get predictions
pred_canonical,lbl2 = learner.get_preds(ordered=True)

# The canonical prediction is reported separately and is intentionally not
# added to `preds`, so the average covers the randomized SMILES rounds only.

In [None]:
# (label, metric function) pairs, printed in this order for each prediction set
_metric_fns = (
    ('RMSE:', root_mean_squared_error),
    ('R2:', r2_score),
    ('MAE:', mean_absolute_error),
)

# Scores for the single prediction made on the canonical SMILES
print('Test Set (Canonical)')
for _tag, _fn in _metric_fns:
    print(_tag, _fn(pred_canonical, lbl2))

# Scores for the element-wise mean of the randomized-SMILES predictions
avg_preds = sum(preds)/len(preds)
print('Test Set (Average)')
for _tag, _fn in _metric_fns:
    print(_tag, _fn(avg_preds, lbl))

Test Set (Canonical)
RMSE: tensor(0.3183)
R2: tensor(0.7972)
MAE: tensor(0.2260)
Test Set (Average)
RMSE: tensor(0.3499)
R2: tensor(0.7869)
MAE: tensor(0.2601)
