In [2]:
import os
import tensorflow as tf
from keras import backend as K
from utils import *
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from imblearn.over_sampling import SMOTE
from tokens import tokens_table
from tensorflow.keras.models import model_from_json

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
session = tf.compat.v1.Session()
K.set_session(session)

config_file = '/home/data-house-01/zhangxiang/BBBmodel/configPredictor.json'  # Name of the configuration file
property_identifier = 'bbb'
model_type = 'dnn'  # 'dnn', 'SVR', 'RF', or 'KNN'
descriptor = 'ECFP'  # The type of model's descriptor can be 'SMILES' or 'ECFP'. If we want to use
# rnn architecture we use SMILES. Conversely, if we want to use a fully connected architecture, we use ECFP descriptors.
searchParameters = False  # True (gridSearch) or False (train with the optimal parameters)

config = load_config(config_file, property_identifier)
directories([config.checkpoint_dir])

# Load the table of possible tokens
token_table = tokens_table().table

# Read and extract smiles and labels from the csv file
smiles_raw, labels_raw = reading_csv(config, property_identifier)
# print(len(smiles_raw))
mols = [Chem.MolFromSmiles(x) for x in smiles_raw]

morgan_fp = [AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits = 2048) for x in mols]


# convert the RDKit explicit vectors into numpy arrays
morg_fp_np = []
for fp in morgan_fp:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    morg_fp_np.append(arr)


x_morg = morg_fp_np

x_morg_rsmp, y_morg_rsmp = SMOTE().fit_resample(x_morg, labels_raw)

# Padd each SMILES string with spaces until reaching the size of the largest molecule
smiles_padded, padd = pad_seq(smiles_raw, token_table, 0)
config.paddSize = padd

# Compute the dictionary that makes the correspondence between each token and unique integers
tokenDict = smilesDict(token_table)

# Tokenize - transform the SMILES strings into lists of tokens
[tokens, problem_idx] = tokenize(smiles_padded, token_table)
labels_raw = np.delete(labels_raw, problem_idx).tolist()

# Transforms each token to the respective integer, according to the previously computed dictionary
smiles_int = smiles2idx(tokens, tokenDict)

data_rnn_ecfp = data_division(config, x_morg_rsmp, y_morg_rsmp, True, model_type, descriptor)
x_test = data_rnn_ecfp[2]
y_test = data_rnn_ecfp[3]
data_cv = cv_split(data_rnn_ecfp, config)

data_utils = [[1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.]]


def SMILES2ECFP(smiles, radius=3, bit_len=2048, index=None):
    """
    This function transforms a list of SMILES strings into a list of ECFP with
    radius 3.
    ----------
    smiles: List of SMILES strings to transform
    Returns
    -------
    This function return the SMILES strings transformed into a vector of 4096 elements
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1,))
        try:

            mol = MurckoScaffold.GetScaffoldForMol(mol)

            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except:
            print(smile)
            fps[i, :] = [0] * bit_len
    return pd.DataFrame(fps, index=(smiles if index is None else index))


class Predictor(object):
    def __init__(self, config, tokens, model_type, descriptor_type):
        super(Predictor, self).__init__()
        self.tokens = tokens
        self.config = config
        self.model_type = model_type
        self.descriptor_type = descriptor_type
        loaded_models = []
        for i in range(5):
            json_file = open(
                "/home/data-house-01/zhangxiang/BBBmodel/experiments/bbb-final/Model/" + "model" + str(i) + ".json",
                'r')
            loaded_model_json = json_file.read()
            json_file.close()
            loaded_model = model_from_json(loaded_model_json)
            # load weights into new model
            loaded_model.load_weights(
                "/home/data-house-01/zhangxiang/BBBmodel/experiments/bbb-final/Model/" + "model" + str(i) + ".h5")
            print("Model " + str(i) + " loaded from disk!")
            loaded_models.append(loaded_model)

        self.loaded_models = loaded_models

    def predict(self, smiles, data):
        data_2_predict = SMILES2ECFP(smiles)
        prediction = []
        for m in range(len(self.loaded_models)):
            prediction.append(self.loaded_models[m].predict(data_2_predict))
        prediction = np.array(prediction).reshape(len(self.loaded_models), -1)
        prediction = denormalization(prediction, data)
        prediction = np.mean(prediction, axis=0)
        return prediction

2023-03-28 11:49:30.887212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30164 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:62:00.0, compute capability: 7.0


Loading configuration file...
Configuration file loaded successfully!


In [3]:
list_ss = [["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CCC(NC(C)=O)C4=CC=CS4"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4=CC(NC(C)=O)=CC=C4)=O)C5=CC=CS5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4=C(OC)C=CC(S(=O)(N)=O)=C4)=O)C5=CC=CS5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4=CC(S(NC)(=O)=O)=CC=C4)=O)C5=CC=CS5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCC(C(OC)=O)CC4)=O)C5=CSC=C5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(CC#N)=O)C4=CC=C(C)S4"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(CN4C=NN=N4)=O)C5=CC=C(C)S5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCN(S(C)(=O)=O)CC4)=O)C5=CC=C(C)S5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCN(S(=O)(C)=O)CC4)=O)C5=CC(C)=CS5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCC4)=O)C5=CSC=C5C"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCC4)=O)C5=CC=C(C#N)S5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCCC4)=O)C5=CC=C(C#N)S"],
           ["O=C(C1CCC(F)(F)CC1)NC(C2=CC=CS2)CCN3[C@@H]4C[C@H](N5C(C6CC6)=NN=C5C(F)(F)F)C[C@H]3CC4"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCCC4)=O)C5=CC=C(F)S5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCC4)=O)C5=C(Cl)SC=C5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCC(F)(F)CC4)=O)C5=C(Cl)SC=C5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCC(F)(F)CC4)=O)C5=CSC=N5"],
           ["CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C(F)(F)F)=O)C4=CSC5=C4C=CC=C5"],
           ["CC(C)C1=NN=C(COC)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCC(F)(F)CC4)=O)C5=CSC=C5"],
           ["CC1=NC2=C(CCN(C(C(C)C)=O)C2)N1[C@H]3C[C@H]4N([C@H](CC4)C3)CC[C@H](NC(C5CCC(F)(F)CC5)=O)C6=CSC=C6"],
           ["CC1=NC2=C(CCN(C(C)=O)C2)N1[C@H]3C[C@H]4N([C@H](CC4)C3)CC[C@H](NC(C5CCC(F)(F)CC5)=O)C6=CSC=C6"]]

In [4]:
predictor = Predictor(config, token_table, model_type, descriptor)
prediction0 = predictor.predict(list_ss[0],data_utils)
print(prediction0[0])

2023-03-28 11:49:37.123369: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30164 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:62:00.0, compute capability: 7.0


Model 0 loaded from disk!
Model 1 loaded from disk!
Model 2 loaded from disk!
Model 3 loaded from disk!
Model 4 loaded from disk!
0.9778967


In [5]:
prediction1 = predictor.predict(list_ss[1],data_utils)
print(prediction1)

[0.9615682]


In [26]:
prediction2 = predictor.predict(list_ss[2],data_utils)
print(prediction2)

[0.9615682]


In [25]:
prediction3 = predictor.predict(list_ss[3],data_utils)
print(prediction3)

[0.9615682]


In [8]:
prediction4 = predictor.predict(list_ss[4],data_utils)
print(prediction4)

[0.75009555]


In [9]:
prediction5 = predictor.predict(list_ss[5],data_utils)
print(prediction5)

[0.9778967]


In [10]:
prediction6 = predictor.predict(list_ss[6],data_utils)
print(prediction6)

[0.7612922]


In [11]:
prediction7 = predictor.predict(list_ss[7],data_utils)
print(prediction7)

[0.94922817]


In [12]:
prediction8 = predictor.predict(list_ss[8],data_utils)
print(prediction8)

[0.94922817]


In [13]:
prediction9 = predictor.predict(list_ss[9],data_utils)
print(prediction9)

[0.8606041]


In [14]:
prediction10 = predictor.predict(list_ss[10],data_utils)
print(prediction10)

[0.8852037]


In [15]:
prediction11 = predictor.predict(list_ss[11],data_utils)
print(prediction11)

CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCCC4)=O)C5=CC=C(C#N)S


RDKit ERROR: [11:49:48] SMILES Parse Error: unclosed ring for input: 'CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCCC4)=O)C5=CC=C(C#N)S'
[11:49:48] SMILES Parse Error: unclosed ring for input: 'CC1=NN=C(C(C)C)N1[C@H](C[C@H]2CC3)C[C@H]3N2CC[C@H](NC(C4CCCCC4)=O)C5=CC=C(C#N)S'


[0.5351601]


In [16]:
prediction12 = predictor.predict(list_ss[12],data_utils)
print(prediction12)

[0.89739925]


In [17]:
prediction13 = predictor.predict(list_ss[13],data_utils)
print(prediction13)

[0.8271716]


In [18]:
prediction14 = predictor.predict(list_ss[14],data_utils)
print(prediction14)

[0.8606041]


In [19]:
prediction15 = predictor.predict(list_ss[15],data_utils)
print(prediction15)

[0.75009555]


In [27]:
prediction16 = predictor.predict(list_ss[16],data_utils)
print(prediction16)

[0.07591142]


In [21]:
prediction17 = predictor.predict(list_ss[17],data_utils)
print(prediction17)

[0.72282684]


In [22]:
prediction18 = predictor.predict(list_ss[18],data_utils)
print(prediction18)

[0.75009555]


In [23]:
prediction19 = predictor.predict(list_ss[19],data_utils)
print(prediction19)

[0.918887]


In [24]:
prediction20 = predictor.predict(list_ss[20],data_utils)
print(prediction20)

[0.918887]
