In [1]:
import pandas as pd 
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import rdChemReactions as Reactions

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import keras
from keras import backend as K
from keras.models import load_model
import argparse
import h5py
import pdb

2022-11-29 21:23:45.873433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-29 21:23:46.083324: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-29 21:24:00.309560: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-29 21:24:00.309733: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Residue alphabet used to encode protein sequences; list position fixes the
# integer code of each one-letter symbol.
seq_rdic = [
    'A', 'I', 'L', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'M',
    'S', 'T', 'D', 'E', 'R', 'H', 'K', 'G', 'P', 'O', 'U', 'X', 'B', 'Z',
]
# Symbol -> 1-based code. Code 0 is never assigned here (encodeSeq returns
# [0] for a missing sequence).
seq_dic = {residue: code for code, residue in enumerate(seq_rdic, start=1)}


def encodeSeq(seq, seq_dic):
    """Translate a sequence into a list of integer codes.

    Args:
        seq: Iterable of symbols (typically a string), or a null value as
            recognised by ``pd.isnull``.
        seq_dic: Mapping from symbol to integer code.

    Returns:
        ``[0]`` when ``seq`` is null, otherwise one code per symbol in order.

    Raises:
        KeyError: If a symbol of ``seq`` is not a key of ``seq_dic``.
    """
    if not pd.isnull(seq):
        return [seq_dic[symbol] for symbol in seq]
    return [0]


def load_modelfile(model_string):
    """Load a saved Keras model.

    Args:
        model_string: Path handed unchanged to ``tf.keras.models.load_model``.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    return tf.keras.models.load_model(model_string)


def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Build the padded integer encoding of one protein given as ``"ID:SEQUENCE"``.

    Args:
        prot_input_str: Protein ID and amino-acid sequence joined by ``':'``.
        prot_len: Target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Tuple ``(prot_feature, Prot_ID)``: the array produced by
        ``pad_sequences`` for the single encoded sequence, and the protein ID.

    Raises:
        ValueError: If ``prot_input_str`` contains no ``':'`` separator.
        KeyError: If the sequence contains a symbol missing from ``seq_dic``.
    """
    # Split once, and only on the first ':'. Stray text after a second ':'
    # now reaches the encoder and fails loudly instead of being dropped.
    prot_id, prot_seq = prot_input_str.split(':', 1)

    # The original built a one-row DataFrame and called set_index without
    # keeping the result (a no-op). Encoding the sequence directly gives the
    # same input to pad_sequences without the dead code.
    encoded_sequence = encodeSeq(prot_seq, seq_dic)
    prot_feature = pad_sequences([encoded_sequence], maxlen=prot_len)

    return prot_feature, prot_id


def mol_feature_gen_from_str_input(mol_str):
    """Build the Morgan-fingerprint record of one compound given as ``"ID:SMILES"``.

    Args:
        mol_str: Compound ID and SMILES string joined by ``':'``. Only the
            first ``':'`` separates the two, so SMILES that contain ``':'``
            themselves (aromatic bonds, atom classes) are kept intact.

    Returns:
        Tuple ``(mol_info_df, mol_ID)``. ``mol_info_df`` is a one-row
        DataFrame indexed by ``Compound_ID`` with columns ``Smiles`` and
        ``morgan_fp_r2``; the latter holds the 2048-bit radius-2 chiral
        fingerprint as tab-separated float strings.

    Raises:
        ValueError: If the input cannot be split, the SMILES cannot be
            parsed, or fingerprinting fails. The original error is chained.
            The previous version printed a message and then crashed with
            ``UnboundLocalError`` on the return statement.
    """
    try:
        mol_ID, mol_smiles = mol_str.split(':', 1)

        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            raise ValueError('RDKit could not parse SMILES ' + repr(mol_smiles))

        fp1 = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=True, radius=2, nBits=2048)
        # Serialise the bits as float strings; compound_feature_gen_df_input
        # splits this string on tabs and casts back to float.
        mol_fp = '\t'.join(map(str, np.array(fp1).astype(float)))

        mol_info_df = pd.DataFrame(
            {
                'Compound_ID': mol_ID,
                'Smiles': mol_smiles,
                'morgan_fp_r2': mol_fp,
            },
            index=[0],
        ).set_index('Compound_ID')
    except Exception as error:
        raise ValueError(
            'Something wrong with molecule input string...' + repr(error)
        ) from error

    return mol_info_df, mol_ID


def act_df_gen_mol_feature(mol_id, prot_id):
    """Create the one-row activity table pairing a protein with a compound.

    Args:
        mol_id: Value stored in the ``Compound_ID`` column.
        prot_id: Value stored in the ``Protein_ID`` column.

    Returns:
        DataFrame with index ``[0]`` and columns ``Protein_ID``, ``Compound_ID``.
    """
    pair_columns = {'Protein_ID': prot_id, 'Compound_ID': mol_id}
    return pd.DataFrame(pair_columns, index=[0])


def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Look up each activity row's compound fingerprint and return it as floats.

    Args:
        act_df: DataFrame with a ``Compound_ID`` column.
        comp_df: DataFrame indexed by compound ID whose ``comp_vec`` column
            holds tab-separated numeric strings.
        comp_len: Accepted for interface compatibility; not used in the body.
        comp_vec: Name of the fingerprint column in ``comp_df``.

    Returns:
        Float array with one row per merged activity row.
    """
    merged = pd.merge(act_df, comp_df, left_on='Compound_ID', right_index=True)
    fingerprint_rows = [fp_text.split("\t") for fp_text in merged[comp_vec]]
    return np.stack(fingerprint_rows).astype('float')


def model_prediction(compound_feature, enz_feature, model):
    """Run the model on one compound/enzyme input pair and return its first output.

    Args:
        compound_feature: Compound input, passed first to ``model.predict``.
        enz_feature: Enzyme input, passed second to ``model.predict``.
        model: Object exposing ``predict``.

    Returns:
        Element ``[0][0]`` of the value returned by ``model.predict``.
    """
    model_inputs = [compound_feature, enz_feature]
    first_row = model.predict(model_inputs)[0]
    return first_row[0]



In [3]:
# Load the CYP450 test set: activity pairs, compounds, and proteins, in that
# order. Only the listed column positions of each CSV are kept.
act_file, comp_file, prot_file = (
    pd.read_csv(csv_path, usecols=kept_columns)
    for csv_path, kept_columns in (
        ('../CNN_data/test_CYP450/test_cyp450_act.csv', [1,2,3]),
        ('../CNN_data/test_CYP450/test_compound_CYP.csv', [1,2]),
        ('../CNN_data/test_CYP450/test_prot_cyp_new.csv', [1,2]),
    )
)

In [4]:
# Map each protein ID to the "ID:SEQUENCE" string consumed by
# prot_feature_gen_from_str_input.
prot_file_pid = prot_file['Protein_ID'].tolist()
prot_file_seq = prot_file['Sequence'].tolist()

prot_file_dict = {
    protein_id: protein_id + ':' + protein_seq
    for protein_id, protein_seq in zip(prot_file_pid, prot_file_seq)
}

In [5]:
# Map each compound ID to the "ID:SMILES" string consumed by
# mol_feature_gen_from_str_input.
comp_file_cid = comp_file['Compound_ID'].tolist()
comp_file_smiles = comp_file['smiles'].tolist()

comp_file_dict = {
    compound_id: compound_id + ':' + compound_smiles
    for compound_id, compound_smiles in zip(comp_file_cid, comp_file_smiles)
}

In [6]:
# Pull the activity table apart into parallel lists: one protein ID, one
# compound ID, and one integer label per row.
act_file_pid = act_file['Protein_ID'].tolist()
act_file_cid = act_file['Compound_ID'].tolist()
act_file_label = list(map(int, act_file['Label'].tolist()))


## prediction for CYP450 test dataset

In [7]:
# `graph` is not used by any cell visible here; it is kept so that the names
# defined by this cell stay the same.
graph = tf.compat.v1.get_default_graph()
# Load the trained model through the helper defined with the other utilities.
ld_model = load_modelfile('../CNN_results_split_final/Final_model.model')

2022-11-29 21:24:24.219062: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-11-29 21:24:24.219091: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-29 21:24:24.219501: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [40]:
# Per-pair outputs of the evaluation run.
score_ls = []        # raw return value of ld_model.predict for each pair
pred_label_ls = []   # thresholded 0/1 prediction for each pair
comp_label_ls = []   # [true label, predicted label]
see_ls = []          # [protein ID, compound ID, score, true label, predicted label]

# Confusion-matrix buckets; each entry is [protein ID, compound ID, true, predicted].
TP_ls, TN_ls, FP_ls, FN_ls = [], [], [], []

# (true label, predicted label) -> bucket. Pairs whose true label is neither
# 0 nor 1 match no bucket and are not recorded in any of them.
outcome_bins = {
    (1, 1): TP_ls,
    (1, 0): FN_ls,
    (0, 1): FP_ls,
    (0, 0): TN_ls,
}

for act_pid, act_cid, act_label in zip(act_file_pid, act_file_cid, act_file_label):
    cstr = comp_file_dict[act_cid]
    pstr = prot_file_dict[act_pid]

    prot_feature, prot_id = prot_feature_gen_from_str_input(pstr)
    comp_feature, comp_id = mol_feature_gen_from_str_input(cstr)

    act_dataframe = act_df_gen_mol_feature(comp_id, prot_id)
    compound_feature = compound_feature_gen_df_input(act_dataframe, comp_feature)

    y = ld_model.predict([compound_feature, prot_feature])
    score_ls.append(y)

    # Scores below 0.5 are called negative, everything else positive.
    pred_label = 0 if y < 0.5 else 1

    bucket = outcome_bins.get((act_label, pred_label))
    if bucket is not None:
        bucket.append([act_pid, act_cid, act_label, pred_label])

    pred_label_ls.append(pred_label)
    comp_label_ls.append([act_label, pred_label])
    see_ls.append([act_pid, act_cid, y[0][0], act_label, pred_label])
        



In [41]:
# True positives: pairs labelled 1 and predicted 1, as
# [protein ID, compound ID, true label, predicted label].
TP_ls

[['Q9NYL5', 'C13550', 1, 1],
 ['Q64441', 'C01561', 1, 1],
 ['Q64441', 'C05443', 1, 1],
 ['P10632', 'C00219', 1, 1],
 ['P10632', 'C06429', 1, 1],
 ['P10632', 'C00951', 1, 1],
 ['P51581', 'C06429', 1, 1],
 ['P28649', 'C00535', 1, 1],
 ['P28649', 'C05290', 1, 1],
 ['P28649', 'C00468', 1, 1],
 ['P00185', 'C00468', 1, 1],
 ['P00185', 'C00951', 1, 1]]

In [42]:
# True negatives: pairs labelled 0 and predicted 0, as
# [protein ID, compound ID, true label, predicted label].
TN_ls

[['P10632', 'C01438', 0, 0],
 ['P10632', 'C00109', 0, 0],
 ['P10632', 'C00022', 0, 0],
 ['P10632', 'C06423', 0, 0],
 ['P10632', 'C00163', 0, 0],
 ['P10632', 'C00246', 0, 0],
 ['P51581', 'C01438', 0, 0],
 ['P51581', 'C00109', 0, 0],
 ['P51581', 'C06423', 0, 0],
 ['P51581', 'C00163', 0, 0],
 ['P51581', 'C00246', 0, 0],
 ['P00185', 'C01438', 0, 0],
 ['P00185', 'C00109', 0, 0],
 ['P00185', 'C00022', 0, 0],
 ['P00185', 'C06423', 0, 0],
 ['P00185', 'C00163', 0, 0],
 ['P00185', 'C00246', 0, 0],
 ['P00185', 'C00803', 0, 0],
 ['P00185', 'C01585', 0, 0]]

In [43]:
# False positives: pairs labelled 0 but predicted 1, as
# [protein ID, compound ID, true label, predicted label].
FP_ls

[['P51581', 'C00022', 0, 1]]

In [44]:
# False negatives: pairs labelled 1 but predicted 0, as
# [protein ID, compound ID, true label, predicted label].
FN_ls 

[['Q64441', 'C01673', 1, 0],
 ['P10632', 'C00777', 1, 0],
 ['P28649', 'C00280', 1, 0],
 ['P28649', 'C05297', 1, 0]]

In [46]:
# Fraction of labelled positives predicted positive, and fraction of
# labelled negatives predicted negative.
n_tp, n_fn = len(TP_ls), len(FN_ls)
n_tn, n_fp = len(TN_ls), len(FP_ls)

positive_recovery = n_tp / (n_tp + n_fn)
negative_recovery = n_tn / (n_fp + n_tn)

print('Positive recovery: ', positive_recovery)
print('negative recovery: ', negative_recovery)

Positive recovery:  0.75
negative recovery:  0.95


In [47]:
# Number of evaluated pairs whose true label is 1 (the denominator of
# positive_recovery).
len(TP_ls) + len(FN_ls)

16

In [48]:
# Full per-pair record:
# [protein ID, compound ID, model score, true label, predicted label].
see_ls

[['Q9NYL5', 'C13550', 0.5132372, 1, 1],
 ['Q64441', 'C01673', 0.38006923, 1, 0],
 ['Q64441', 'C01561', 0.65167063, 1, 1],
 ['Q64441', 'C05443', 0.587643, 1, 1],
 ['P10632', 'C00219', 0.9721749, 1, 1],
 ['P10632', 'C06429', 0.8000854, 1, 1],
 ['P10632', 'C00777', 0.22985458, 1, 0],
 ['P10632', 'C00951', 0.9876478, 1, 1],
 ['P10632', 'C01438', 0.34986204, 0, 0],
 ['P10632', 'C00109', 0.0030233562, 0, 0],
 ['P10632', 'C00022', 0.17397922, 0, 0],
 ['P10632', 'C06423', 0.09950569, 0, 0],
 ['P10632', 'C00163', 0.0067325532, 0, 0],
 ['P10632', 'C00246', 0.011217833, 0, 0],
 ['P51581', 'C06429', 0.5959733, 1, 1],
 ['P51581', 'C01438', 0.3993878, 0, 0],
 ['P51581', 'C00109', 0.107837886, 0, 0],
 ['P51581', 'C00022', 0.64545226, 0, 1],
 ['P51581', 'C06423', 0.36598843, 0, 0],
 ['P51581', 'C00163', 0.13346031, 0, 0],
 ['P51581', 'C00246', 0.1963638, 0, 0],
 ['P28649', 'C00535', 0.72955155, 1, 1],
 ['P28649', 'C05290', 0.6119687, 1, 1],
 ['P28649', 'C00468', 0.9385452, 1, 1],
 ['P28649', 'C00280',