In [1]:
%reload_ext autoreload
%autoreload 2
%load_ext watermark
%watermark -a 'Marcos Santana' -d -p fastai,descriptastorus,numpy,pandas,torch,joblib,guacamol,fastinference,rdkit,tqdm -v



Author: Marcos Santana

Python implementation: CPython
Python version       : 3.9.12
IPython version      : 8.4.0

fastai         : 2.7.9
descriptastorus: 2.3.0.6
numpy          : 1.22.3
pandas         : 1.4.2
torch          : 1.12.0
joblib         : 1.1.0
guacamol       : 0.5.4
fastinference  : 0.0.36
rdkit          : 2022.03.4
tqdm           : 4.62.3



# Dependencies and module

In [2]:
import pandas as pd
import numpy as np

from fastai.text.all import *
from denovo_design.tokenizer import MolTokenizer
from denovo_design.generative_callback import *

from tqdm.notebook import trange, tqdm
from rdkit import rdBase

# Disable RDKit's 'rdApp.error' and 'rdApp.info' log channels.
rdBase.DisableLog('rdApp.error')
rdBase.DisableLog('rdApp.info')
from rdkit.Chem import MolFromSmiles, MolToSmiles

2022-08-18 12:03:33.950289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-08-18 12:03:33.950343: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Load data

In [3]:
# Read the processed dataset (fxa_processed.csv) into a DataFrame.
lm_data = pd.read_csv('../datasets/processed_data/fxa_processed.csv')

In [4]:
# Keep only the rows whose 'Label' column equals 1.
is_labelled_one = lm_data['Label'] == 1
act_data = lm_data[is_labelled_one]

In [5]:
# Display (rows, columns) of the Label == 1 subset.
act_data.shape

(2423, 21)

In [6]:
# Output locations: FT is later passed as the learner's `path`;
# ENCODER is the base folder of the vocab / weights paths defined further down.
MODELS = Path('models')
FT = MODELS / 'testingcallback'
ENCODER = MODELS / 'encoder'

# Create both folders (and any missing parents); an existing folder is not an error.
for folder in (FT, ENCODER):
    folder.mkdir(parents=True, exist_ok=True)

FT, ENCODER

(Path('models/testingcallback'), Path('models/encoder'))

# Dataloader

In [7]:
# Work on a copy of `awd_lstm_lm_config` (presumably provided by the fastai star import).
# NOTE(review): `config` is not passed to any call in the visible notebook —
# confirm whether it was meant to be handed to language_model_learner.
config=awd_lstm_lm_config.copy()

In [8]:
# Fixed seed so the train/validation split is identical on every
# "Restart Kernel -> Run All" (the unseeded splitter reshuffled each run).
SPLIT_SEED = 42

# Hold out a random 10% of the rows for validation.
splitter = RandomSplitter(0.10, seed=SPLIT_SEED)
# Special tokens handed to Numericalize.
spec_toks = [BOS, PAD]
# Column that holds the SMILES strings to tokenize.
text_col = 'processed_smiles'
# Pipeline: read the column -> tokenize with MolTokenizer (no text rules) -> map tokens to ids.
tfms = [attrgetter(text_col),
        Tokenizer.from_df(text_cols=text_col,tok=MolTokenizer(),rules=[],tok_text_col=text_col),
        Numericalize(min_freq = 1, special_toks=spec_toks)]
# Datasets over the Label == 1 subset, split by `splitter`, served through LMDataLoader.
dst = Datasets(act_data, tfms=[tfms], splits = splitter(act_data), dl_type=LMDataLoader)

In [9]:
# Build the DataLoaders from the Datasets: batch size 64, sequence length 30.
dls = dst.dataloaders(bs=64,seq_len=30)

In [10]:
# Rebuild the vocab without any 'xxfake' entries.
# NOTE(review): presumably these are filler tokens appended by Numericalize —
# confirm that the indices of the remaining tokens are unchanged.
dls.vocab = [x for x in dls.vocab if x != 'xxfake']

In [11]:
# Display the lengths of the training and validation loaders.
len(dls.train), len(dls.valid)

(71, 8)

In [12]:
# Preview a few (text, text_) pairs from one batch.
dls.show_batch()

Unnamed: 0,text,text_
0,xxbos C [C@@H] ( C ( = O ) N ( C ) C ) N 1 C C [C@H] ( N S ( = O ) ( = O,C [C@@H] ( C ( = O ) N ( C ) C ) N 1 C C [C@H] ( N S ( = O ) ( = O )
1,2 ) C 1 = O ) S ( = O ) ( = O ) c 1 c c c 2 c c ( Cl ) c c c,) C 1 = O ) S ( = O ) ( = O ) c 1 c c c 2 c c ( Cl ) c c c 2
2,C ( = O ) c 1 s c c ( C N 2 C C S C C 2 ) c 1 Cl xxbos N = C ( N,( = O ) c 1 s c c ( C N 2 C C S C C 2 ) c 1 Cl xxbos N = C ( N )
3,) c 2 c 1 xxbos C O C ( = O ) C C C N ( c 1 c c c 2 n c ( C ) n,c 2 c 1 xxbos C O C ( = O ) C C C N ( c 1 c c c 2 n c ( C ) n (
4,O C 1 C C C C C 1 xxbos N = C ( N ) c 1 c c c ( N C ( = O ) C 2,C 1 C C C C C 1 xxbos N = C ( N ) c 1 c c c ( N C ( = O ) C 2 C
5,xxbos O = C ( C N C ( = O ) C ( C C c 1 c c c c [n+] 1 [O-] ) N S ( =,O = C ( C N C ( = O ) C ( C C c 1 c c c c [n+] 1 [O-] ) N S ( = O
6,Cl ) s 1 xxbos C / C ( = C ( \ F ) C ( = O ) N c 1 c c c ( - c 2,) s 1 xxbos C / C ( = C ( \ F ) C ( = O ) N c 1 c c c ( - c 2 c
7,C ) C ) c 1 xxbos N = C ( N ) c 1 c c c ( O ) c ( / C = C / C N,) C ) c 1 xxbos N = C ( N ) c 1 c c c ( O ) c ( / C = C / C N C
8,N 1 C C C ( N C ( = O ) c 2 c c 3 c c c c c 3 n 2 C C ( = O,1 C C C ( N C ( = O ) c 2 c c 3 c c c c c 3 n 2 C C ( = O )


In [13]:
#torch.save(dls, ENCODER/f'encoder_dataloader.pkl')

# Train generative model

In [14]:
#dls = torch.load(ENCODER/f'encoder_dataloader.pkl')

In [15]:
# Locations of the vocabulary and the weights file under the ENCODER folder.
# NOTE(review): the learner cell further down passes 'models/finetuning/...' literals
# rather than these variables — confirm which location is intended.
vocab_path = ENCODER.joinpath('vocab.pkl')
pretrained_path = ENCODER.joinpath('models', 'fit6.pth')
pretrained_path, vocab_path

(Path('models/encoder/models/fit6.pth'), Path('models/encoder/vocab.pkl'))

In [16]:
# Reference set for the callback: every distinct processed SMILES in the full dataset.
reference_smiles = lm_data.processed_smiles.unique().tolist()

# Sampling callback; its output file is written under the FT folder.
sampler_cb = GenerativeCallback(
    reference_mols=reference_smiles,
    temperature=1.0,
    max_mols=20,
    max_size=100,
    output_file=FT/'generated_mols.smi',
)

# Show the total number of rows in the dataset.
lm_data.shape[0]

3503

In [17]:
# Wrap the callback's three score functions as named metrics
# ('Validity', 'Novelty', 'Uniqueness') for the training table.
validity_score, novelty_score, uniqueness_score = (
    ValueMetric(score_fn, label)
    for score_fn, label in (
        (sampler_cb._validity_score, 'Validity'),
        (sampler_cb._novelty_score, 'Novelty'),
        (sampler_cb._uniqueness_score, 'Uniqueness'),
    )
)

In [18]:
# Callbacks passed to each fit_one_cycle call: the sampler plus a CSVLogger created with append=True.
cbs = [sampler_cb, CSVLogger(append=True)]
# Metrics handed to the learner.
metrics = [validity_score, novelty_score, uniqueness_score]

In [19]:
# Build an AWD_LSTM language-model learner with pretrained=False, rooted at the FT folder.
learn = language_model_learner(
    dls,
    AWD_LSTM,
    pretrained=False,
    path=FT,
    metrics=metrics,
    drop_mult=0.25,
)
# Load weights and vocabulary from 'models/finetuning'; `learn` is rebound to the call's return value.
# NOTE(review): these literals differ from pretrained_path / vocab_path defined earlier
# (models/encoder/...) — confirm which location is intended.
learn = learn.load_pretrained('models/finetuning/fit6.pth', 'models/finetuning/vocab.pkl')

In [20]:
#learn.lr_find(num_it=600)

In [21]:
#learn.recorder.plot_lr_find(skip_end=80)

In [22]:
# Upper learning rate, and the slice (lower bound = lrs / 2.6**4) passed to the fit_one_cycle calls.
lrs = 1e-2
lr = slice(lrs / (2.6 ** 4), lrs)

In [23]:
# First training pass: 2 epochs with the lr slice and the callbacks defined above.
learn.fit_one_cycle(2, lr, cbs=cbs)

epoch,train_loss,valid_loss,Validity,Novelty,Uniqueness,time
0,0.640984,0.651613,0.95,0.95,0.95,00:18
1,0.597784,0.633443,0.8,0.8,0.8,00:19




In [None]:
# Save the learner under the name 'fit1'; the trailing ';' suppresses the cell output.
learn.save('fit1');

In [None]:
# Gradual unfreezing step — per fastai's freeze_to this should leave the last two
# parameter groups trainable (confirm against the fastai docs).
learn.freeze_to(-2)

In [None]:
# Train 5 epochs after freeze_to(-2), with the same lr slice and callbacks.
learn.fit_one_cycle(5, lr, cbs=cbs)

In [None]:
# Next unfreezing step — per fastai's freeze_to this should leave the last three
# parameter groups trainable (confirm against the fastai docs).
learn.freeze_to(-3)

In [None]:
# Train 5 epochs after freeze_to(-3), with the same lr slice and callbacks.
learn.fit_one_cycle(5, lr, cbs=cbs)

In [None]:
# Unfreeze the learner before the last training pass.
learn.unfreeze()

In [None]:
# Final pass: 5 epochs after unfreeze(), with the same lr slice and callbacks.
learn.fit_one_cycle(5, lr, cbs=cbs)

In [None]:
# Save the encoder under the name 'finetuned_encoder'.
learn.save_encoder('finetuned_encoder')

In [None]:
# Export the learner; the generation section below loads FT/'export.pkl'.
learn.export()

# Generate molecules

In [None]:
from denovo_design.moleculegenerator import MolGenerator

In [None]:
# Create the project's MolGenerator from the exported learner file.
# NOTE(review): cpu=False presumably requires a CUDA device — confirm before running on a CPU-only machine.
mol_generator = MolGenerator(model_fname = FT/'export.pkl', cpu=False)

In [None]:
# Call the generator (max_mols=5000, temperature=1.0, max_size=70) and
# collect its output in a one-column ('smiles') DataFrame.
sampled_smiles = mol_generator.generate_mols(max_mols=5000, temperature=1.0, max_size=70)
generated_mols = pd.DataFrame({'smiles': sampled_smiles})

In [None]:
# Sanity check: (rows, columns) of the generated-molecule table.
# This cell previously held the incomplete expression `np.`, a SyntaxError that stops "Run All".
generated_mols.shape

In [None]:
# Add a placeholder 'act' column (all None) and a sequential 'ID' ('mol_0', 'mol_1', ...) per row.
n_generated = len(generated_mols)
generated_mols['act'] = None
generated_mols['ID'] = [f'mol_{idx}' for idx in range(n_generated)]

In [None]:
import joblib

# Path of the serialized model used to score the generated molecules.
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source.
regressor_file = '../FXA/regression/regressor_rf3000.pkl'
ml_model = joblib.load(regressor_file)

In [None]:
from sophosdata.sophos.featurizer import Fingerprinter
from sophosdata.sophos.standardizer import SophosSanitizer, normalize_mol

In [None]:
# Instantiate the project's Fingerprinter with the 'ecfp' setting.
fingerprinter = Fingerprinter('ecfp')

In [None]:
def _to_canonical_smiles(smiles):
    """Return the canonical SMILES of the normalized molecule, or None.

    MolToSmiles raises when handed None, so a molecule that normalize_mol
    could not produce is mapped to None here. The dropna cell that follows
    then removes those rows instead of the whole column assignment failing.
    NOTE(review): assumes normalize_mol signals failure by returning None —
    confirm against its implementation.
    """
    mol = normalize_mol(smiles)
    if mol is None:
        return None
    return MolToSmiles(mol)


# One canonical SMILES (or None) per generated SMILES string.
generated_mols['canonical_smiles'] = generated_mols['smiles'].apply(_to_canonical_smiles)

In [None]:
# Remove, in place, the rows that have no canonical SMILES.
generated_mols.dropna(subset=['canonical_smiles'], inplace=True)

In [None]:
# Featurize the canonical SMILES; the result is the input to ml_model.predict below.
xtest = fingerprinter.generate_features(generated_mols['canonical_smiles'].values)

In [None]:
# Disabled alternative left by the author: second-column predict_proba output.
#generated_mols['probas'] = ml_model.predict_proba(xtest)[:, 1]
# Store the model's prediction for each remaining molecule.
generated_mols['preds_reg'] = ml_model.predict(xtest)

In [None]:
# Preview the first rows of the scored table.
generated_mols.head()

In [None]:
# Write the scored molecules to CSV without the index column.
generated_mols.to_csv('../FXA/generated_molecules.csv',index=False)

In [None]:
# Parse one SMILES string with RDKit; as the last expression, the result is shown as the cell output.
MolFromSmiles('O=C(NC1CCN(c2ccccc2CN2CCC[C@H]2O)CC1)c1cc(F)n(-c2ccc(F)cc2)c1')