In [17]:
%load_ext autoreload
%autoreload 2
# Reserve a GPU for the model; order devices by PCI bus ID so the IDs match those shown by nvidia-smi
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# GPU ID to use; if it is occupied, change it to an available GPU ID listed by !nvidia-smi
%env CUDA_VISIBLE_DEVICES=0

import numpy as np
import rdkit
from rdkit import Chem
import h5py, ast, pickle

from ddc_pub import ddc_v3 as ddc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [18]:
def get_descriptors(binmols_list, qsar_model=None):
    """Calculate molecular descriptors for a list of binary molecules.

    For every molecule that can be processed, one row is produced holding, in
    this order: logp, tpsa, mw, qed, hba, hbd and the probability of being
    active towards DRD2 as predicted by `qsar_model`.

    Molecules that cannot be converted to an RDKit molecule, or for which any
    descriptor calculation fails, are reported on stdout and skipped, so the
    result may contain fewer rows than `binmols_list` has entries.

    Args:
        binmols_list: Iterable of objects accepted by `rdkit.Chem.Mol(...)`
            (this notebook passes the binary molecule records read from the
            HDF5 dataset).
        qsar_model: Fitted classifier exposing `predict_proba`; the second
            column of its output is used as the activity probability. The
            `None` default is kept for signature compatibility only, a model
            is required.

    Returns:
        A np.ndarray of shape (n_processed, 7); (0, 7) when nothing could be
        processed.

    Raises:
        ValueError: If `qsar_model` is None.
    """
    # Fail fast: without a model every molecule would raise inside the loop,
    # be swallowed by the generic handler, and an empty array would be returned.
    if qsar_model is None:
        raise ValueError(
            "qsar_model is required to compute the DRD2 activity probability."
        )

    from rdkit import Chem, DataStructs
    from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, QED

    n_bits = 2048       # length of the Morgan fingerprint fed to the QSAR model
    n_descriptors = 7   # logp, tpsa, mw, qed, hba, hbd, active

    descriptors = []

    for binmol in binmols_list:
        # Chem.Mol(...) raises on malformed input instead of returning None,
        # so guard the conversion to keep one bad record from aborting the run.
        try:
            mol = Chem.Mol(binmol)
        except Exception:
            mol = None

        if not mol:
            print("Invalid generation.")
            continue

        try:
            logp  = Descriptors.MolLogP(mol)
            tpsa  = Descriptors.TPSA(mol)
            molwt = Descriptors.ExactMolWt(mol)
            hba   = rdMolDescriptors.CalcNumHBA(mol)
            hbd   = rdMolDescriptors.CalcNumHBD(mol)
            qed   = QED.qed(mol)

            # Morgan fingerprint (radius 2) converted to a numpy vector
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
            ecfp4 = np.zeros((n_bits,))
            DataStructs.ConvertToNumpyArray(fp, ecfp4)

            # Predict activity and pick only the second component
            active = qsar_model.predict_proba([ecfp4])[0][1]
            descriptors.append([logp, tpsa, molwt, qed, hba, hbd, active])

        except Exception as e:
            # Best effort: report the failure and skip this molecule.
            print(e)

    # reshape keeps a consistent 2-D shape even when no molecule was processed
    return np.asarray(descriptors, dtype=float).reshape(-1, n_descriptors)

In [19]:
# Load the pickled QSAR bundle and keep only its "classifier_sv" entry.
# NOTE: pickle can execute arbitrary code on load; only use trusted model files.
qsar_model_name = "models/qsar_model.pickle"
with open(qsar_model_name, "rb") as file:
    qsar_bundle = pickle.load(file)
qsar_model = qsar_bundle["classifier_sv"]

In [20]:
# Read the complete "mols" dataset from the HDF5 training file into memory
dataset_filename = "datasets/CHEMBL25_TRAIN_MOLS.h5"
with h5py.File(dataset_filename, mode="r") as f:
    mols_dataset = f["mols"]
    binmols = mols_dataset[:]

In [21]:
# Compute the descriptor matrix for the first 2000 molecules of the dataset
binmols_subset = binmols[:2000]
descr = get_descriptors(binmols_subset, qsar_model=qsar_model)

In [22]:
# Dataset metadata handed to the model: all three values are known a priori
# for this dataset rather than derived from the loaded molecules.
charset = "Brc1(-23[nH])45C=NOso#FlS67+89%0"  # characters occurring in the SMILES
maxlen = 128                                   # maximum SMILES length
name = "ChEMBL25_TRAIN"                        # dataset name

dataset_info = dict(charset=charset, maxlen=maxlen, name=name)

In [23]:
# Initialize a DDC model on the descriptor matrix (x) and the matching
# first 2000 binary molecules (y).
model = ddc.DDC(
    x=descr,
    y=binmols[:2000],
    dataset_info=dataset_info,
    scaling=True,
    noise_std=0.1,
    lstm_dim=512,
    dec_layers=3,
    td_dense_dim=0,
    batch_size=128,
    # Per the model's own log output, this value is ignored for descriptor
    # input and inferred from the number of descriptors instead.
    codelayer_dim=-1,
)



Initializing model in train mode.
Input type is 'molecular descriptors'.
Applying scaling on input.
Ignoring requested codelayer_dim because it is inferred from the cardinality of the descriptors.
Model received 1800 train samples and 200 validation samples.
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Latent_Input (InputLayer)       [(None, 7)]          0                                            
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     [(None, 142, 35)]    0                                            
__________________________________________________________________________________________________
latent_to_states_model (Model)  [(None, 512), (None, 36864       Latent_Input[0][0]               
_______________________________