In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import AllChem
# from rdkit.Chem import Draw
from rdkit.Chem import rdChemReactions as Reactions

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import keras
from keras import backend as K
from keras.models import load_model
import argparse
import h5py
import pdb


2023-05-18 17:44:01.989769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-18 17:44:02.220999: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-05-18 17:44:24.759060: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-18 17:44:24.759216: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:

# Residue alphabet accepted by encodeSeq. A letter's token is its position in
# this list plus one, which leaves 0 free (encodeSeq returns [0] for a missing
# sequence).
seq_rdic = list('AILVFWYNCQMSTDERHKGPOUXBZ')
seq_dic = dict(zip(seq_rdic, range(1, len(seq_rdic) + 1)))


def encodeSeq(seq, seq_dic):
    """Translate a residue string into a list of integer tokens.

    A missing sequence (anything ``pd.isnull`` reports as null) yields the
    one-element list ``[0]``. Otherwise every character of ``seq`` is looked
    up in ``seq_dic``; a character absent from the mapping raises ``KeyError``.
    """
    if not pd.isnull(seq):
        return list(map(seq_dic.__getitem__, seq))
    return [0]


def load_modelfile(model_string):
    """Load the saved Keras model found at ``model_string`` and return it."""
    return tf.keras.models.load_model(model_string)


In [45]:

def prot_feature_gen_from_str_input(prot_input_str, prot_len=2500):
    """Encode one ``"<protein id>:<sequence>"`` string for the model.

    Args:
        prot_input_str: Protein ID and amino-acid sequence joined by ``':'``.
            Whitespace around the sequence is ignored.
        prot_len: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        ``(prot_feature, Prot_ID)``: the array ``pad_sequences`` produces for
        the single encoded sequence, and the text before the first ``':'``.

    Raises:
        ValueError: ``prot_input_str`` contains no ``':'`` separator
            (previously surfaced as a bare ``IndexError``).
        KeyError: the sequence contains a character missing from ``seq_dic``.
    """
    fields = prot_input_str.split(':')
    if len(fields) < 2:
        raise ValueError(
            "Protein input must look like '<protein id>:<sequence>', got "
            + repr(prot_input_str))
    Prot_ID = fields[0]
    Prot_seq = fields[1].strip()

    # A single sequence needs no intermediate DataFrame: pad_sequences only
    # needs an iterable of token lists. `maxlen` is passed by keyword so the
    # call does not depend on positional order.
    prot_feature = pad_sequences(
        [encodeSeq(Prot_seq, seq_dic)], maxlen=prot_len)

    return prot_feature, Prot_ID


def mol_feature_gen_from_str_input(mol_str, kegg_id_flag, kegg_df):
    """Return a one-row compound table (indexed by ``Compound_ID``) and its ID.

    Args:
        mol_str: Either a bare compound ID (when ``kegg_id_flag == 1``) or a
            ``"<compound id>:<SMILES>"`` string (any other flag value).
        kegg_id_flag: ``1`` to look ``mol_str`` up in ``kegg_df``; otherwise
            the fingerprint is computed from the SMILES with RDKit.
        kegg_df: DataFrame with a ``Compound_ID`` column; only consulted when
            ``kegg_id_flag == 1``.

    Returns:
        ``(info_df, compound_id)``. For SMILES input ``info_df`` has the
        columns ``Smiles`` and ``morgan_fp_r2`` (tab-separated bit string).

    Raises:
        IndexError: ``kegg_id_flag == 1`` and the ID is not in ``kegg_df``.
        ValueError: the SMILES input has no ``':'`` separator, cannot be
            parsed, or fingerprinting fails. Previously this path printed a
            message and then died with ``UnboundLocalError``.
    """
    if kegg_id_flag == 1:
        KEGG_ID = mol_str
        matches = kegg_df.index[kegg_df.Compound_ID == KEGG_ID]
        if len(matches) == 0:
            raise IndexError(
                'Compound_ID ' + repr(KEGG_ID) + ' not found in kegg_df')
        KEGG_ID_info = kegg_df.loc[matches[0]]
        KEGG_ID_info_df = KEGG_ID_info.to_frame().T.set_index('Compound_ID')
        return KEGG_ID_info_df, KEGG_ID

    # Split on the FIRST ':' only: SMILES may itself contain ':' (aromatic
    # bonds, atom-map numbers), which split(':')[1] would silently truncate.
    mol_ID, separator, mol_smiles = mol_str.partition(':')
    if not separator:
        raise ValueError(
            "Molecule input must look like '<compound id>:<SMILES>', got "
            + repr(mol_str))
    mol_smiles = mol_smiles.strip()

    try:
        mol = Chem.MolFromSmiles(mol_smiles)
        if mol is None:
            raise ValueError('RDKit could not parse SMILES ' + repr(mol_smiles))
        fp1 = AllChem.GetMorganFingerprintAsBitVect(
            mol, useChirality=True, radius=2, nBits=2048)
        # Same text layout as before: '0.0'/'1.0' values joined by tabs.
        mol_fp = '\t'.join(str(bit) for bit in np.array(fp1).astype(float))
    except Exception as error:
        # Raise instead of printing and falling through to an unbound return.
        raise ValueError(
            'Something wrong with molecule input string...' + repr(error)
        ) from error

    mol_info_df = pd.DataFrame(
        {'Compound_ID': mol_ID, 'Smiles': mol_smiles, 'morgan_fp_r2': mol_fp},
        index=[0],
    ).set_index('Compound_ID')

    return mol_info_df, mol_ID


def act_df_gen_mol_feature(mol_id, prot_id):
    """Build the one-row (index 0) pairing table for a protein and a compound.

    The returned DataFrame has the columns ``Protein_ID`` and ``Compound_ID``,
    in that order.
    """
    pairing = dict(Protein_ID=prot_id, Compound_ID=mol_id)
    return pd.DataFrame(data=pairing, index=[0])


def compound_feature_gen_df_input(act_df, comp_df, comp_len=2048, comp_vec='morgan_fp_r2'):
    """Look up each pair's compound fingerprint and return it as a float array.

    ``act_df`` is joined to ``comp_df`` (``Compound_ID`` column against the
    ``comp_df`` index). The tab-separated string in column ``comp_vec`` of every
    joined row is split and the rows are stacked into one float array.
    ``comp_len`` is accepted but not used by this function.
    """
    joined = act_df.merge(comp_df, left_on='Compound_ID', right_index=True)
    bit_rows = [fingerprint.split("\t") for fingerprint in joined[comp_vec]]
    return np.stack(bit_rows).astype('float')


def model_prediction(compound_feature, enz_feature, model):
    """Run ``model.predict`` on the pair and return element ``[0][0]`` of its output.

    The inputs are handed to the model as the list
    ``[compound_feature, enz_feature]`` (compound first).
    """
    model_output = model.predict([compound_feature, enz_feature])
    first_row = model_output[0]
    return first_row[0]



# Locations of the trained model and the KEGG compound table, relative to the
# notebook's working directory.
MODEL_PATH = './CNN_results_split_final/Final_model.model'
KEGG_COMPOUND_CSV = './CNN_data/Final_test/kegg_compound.csv'

loaded_model = load_modelfile(MODEL_PATH)

# The CSV is read with Compound_ID as its index and then reset, so kegg_df
# exposes Compound_ID as an ordinary column for the lookup helpers.
KEGG_compound_read = pd.read_csv(KEGG_COMPOUND_CSV, index_col='Compound_ID')
kegg_df = KEGG_compound_read.reset_index()



In [83]:

# Example inputs for the prediction helpers defined below.
# Single enzyme in "<protein id>:<sequence>" form.
enz_str ="A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN"

# Single substrate in "<compound id>:<SMILES>" form, and a bare compound ID
# (no ':'), which the prediction functions treat as a KEGG ID lookup.
comp_str = 'X00001: NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1'
comp_ids_input = 'C00149'

# Lists mixing both substrate forms, and several enzymes, for the multi-input functions.
comp_ls = ['C00149', 'C00022', 'X00001: NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1', 'C00497']
enz_ls = ["A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN", "A0A1D8PAW7:MKRILVAGGAGFIGSHLCERLVNEGHYVVCLDNFFTGNKKKVEQLLNNPRFEIAKHDVIEPYFNEVDEIYNLACPASPIHYQVDPIKTIKTSVLGAMNMLGLAKKTNAKILQASTSEVYGEPEVHPQYEEYWGNVNPIGKRSCYNEGKRCAESLFINYHSQHQTKIKIIRIFNTYGPKMDINDGRVISNFVIQALKGKDITIYGDGKQTRSFQYVDDLVEGMIRMMNTDDSFTGPVNIGNPEEYTMLELVSFIIEMTQSKSKLIFLPLPEDDPKRRRPNIELAKKELNNWEPKIKLREGLIKTINYFEKII", "A0A7T5EPK7:MKYFSAAVIPGDGIGPEVMEVGMSLLQAIGDIHGGLSFEAESFPWNCRYYLQHGRMMPEDGLERLRPFDVILLGAIGAPGVPDHISVWELILPIRRSFQQYVNLRPIKLLRGLESPLRGKGHEHLDFVVVRENTEGEYSNMGGRLHVGTPYEMAMQNNVFTRYGTERIIRYAFELAQATGKTRLTAATKSNGINHSMPFWDEIVKEISLHYPNIQTSLIHIDALAAFFVSRPEAFDVVVASNLFGDILTDLGAAVVGGLGLAPSGNINPEKTYPSMFEPIHGSAPDIAGRGIANPIATIWSISMMLDHLGERELGRLVLDCIEEVLVEGKVRTPDIGGKATTQEMGKAILAQLYRRGG"]


## Predicting multiple protein sequences against a single substrate: the MPSS function

In [64]:
# Inputs for the MPSS run: one substrate (the bare compound ID) and the list of enzymes.
SUB_Single_input = comp_ids_input
ENZ_mul_input = enz_ls


In [104]:
def MPSS(ENZ_INPUT, SUB_INPUT):
    """Score several protein sequences against one substrate.

    Args:
        ENZ_INPUT: Iterable of ``"<protein id>:<sequence>"`` strings.
        SUB_INPUT: One substrate string. Without a ``':'`` it is looked up as
            a compound ID in the notebook-level ``kegg_df``; otherwise it is
            handled as ``"<compound id>:<SMILES>"``.

    Returns:
        DataFrame with the columns ``Compound_ID``, ``Protein_ID`` and
        ``score``, one row per enzyme that was scored. Inputs that fail are
        reported with ``print`` and left out; if the substrate itself cannot
        be featurised the frame is empty.
    """
    kegg_id_flag = 1 if len(SUB_INPUT.split(':')) < 2 else 0

    Final_sub_ls = []
    Final_enz_ls = []
    Final_score_ls = []

    # The substrate features are identical for every enzyme (the join in
    # compound_feature_gen_df_input only keys on Compound_ID), so compute them
    # once instead of once per enzyme. The protein ID in the pairing frame is
    # irrelevant to that join, hence the None placeholder.
    try:
        sub_info_df, c_id = mol_feature_gen_from_str_input(SUB_INPUT, kegg_id_flag, kegg_df)
        c_feature = compound_feature_gen_df_input(
            act_df_gen_mol_feature(c_id, None), sub_info_df)
        substrate_ready = True
    except Exception as e:
        print('Error somewhere...' + repr(e))
        substrate_ready = False

    if substrate_ready:
        for seq_inp in ENZ_INPUT:
            try:
                p_feature, p_id = prot_feature_gen_from_str_input(seq_inp)
                EnzRankScore = model_prediction(c_feature, p_feature, loaded_model)
            except Exception as e:
                # Best effort: report the failing enzyme and keep going.
                print('Error somewhere...' + repr(e))
                continue

            Final_enz_ls.append(p_id)
            Final_sub_ls.append(c_id)
            Final_score_ls.append(EnzRankScore)

    Final_dict = {'Compound_ID': Final_sub_ls, 'Protein_ID': Final_enz_ls, 'score': Final_score_ls}
    Final_df = pd.DataFrame(Final_dict)

    return Final_df
    

In [105]:
# Score every enzyme in ENZ_mul_input against SUB_Single_input; the returned frame is the cell output.
MPSS(ENZ_mul_input, SUB_Single_input)



Unnamed: 0,Compound_ID,Protein_ID,score
0,C00149,A0A4P8WFA8,0.931402
1,C00149,A0A1D8PAW7,0.996195
2,C00149,A0A7T5EPK7,0.997919


## Predicting a single protein sequence against multiple substrates: the SPMS function

In [102]:
def SPMS(ENZ_INPUT, SUB_INPUT):
    """Score one protein sequence against several substrates.

    ``ENZ_INPUT`` is a single ``"<protein id>:<sequence>"`` string and
    ``SUB_INPUT`` an iterable of substrate strings. A substrate without ``':'``
    is passed on as a compound ID to look up in ``kegg_df``; any other is
    passed on as ``"<compound id>:<SMILES>"``. Substrates that fail are reported
    with ``print`` and skipped. The result is a DataFrame with the columns
    ``Compound_ID``, ``Protein_ID`` and ``score``.
    """
    # The enzyme is featurised once, outside the per-substrate error handling,
    # so a malformed ENZ_INPUT raises instead of being printed.
    enzyme_feature, enzyme_id = prot_feature_gen_from_str_input(ENZ_INPUT)

    compound_ids, protein_ids, scores = [], [], []

    for substrate in SUB_INPUT:
        # 1 -> bare compound ID (no ':'), 0 -> "<compound id>:<SMILES>".
        kegg_id_flag = int(len(substrate.split(':')) < 2)

        try:
            substrate_info, substrate_id = mol_feature_gen_from_str_input(
                substrate, kegg_id_flag, kegg_df)
            pair_frame = act_df_gen_mol_feature(substrate_id, enzyme_id)
            substrate_feature = compound_feature_gen_df_input(pair_frame, substrate_info)
            pair_score = model_prediction(substrate_feature, enzyme_feature, loaded_model)
        except Exception as e:
            print('Error somewhere...' + repr(e))
        else:
            protein_ids.append(enzyme_id)
            compound_ids.append(substrate_id)
            scores.append(pair_score)

    return pd.DataFrame(
        {'Compound_ID': compound_ids, 'Protein_ID': protein_ids, 'score': scores})
    

In [103]:
# Score the single enzyme enz_str against every substrate in comp_ls; the returned frame is the cell output.
SPMS(enz_str, comp_ls)



Unnamed: 0,Compound_ID,Protein_ID,score
0,C00149,A0A4P8WFA8,0.931402
1,C00022,A0A4P8WFA8,0.985112
2,X00001,A0A4P8WFA8,0.067942
3,C00497,A0A4P8WFA8,0.931402


## Predicting paired lists of multiple enzymes and multiple substrates: the MPMS function

In [97]:
def MPMS(ENZ_INPUT, SUB_INPUT):
    """Score enzyme/substrate pairs position by position.

    ``ENZ_INPUT[i]`` is paired with ``SUB_INPUT[i]``.

    Args:
        ENZ_INPUT: Sequence of ``"<protein id>:<sequence>"`` strings.
        SUB_INPUT: Sequence of substrate strings of the same length. An entry
            without ``':'`` is looked up as a compound ID in the notebook-level
            ``kegg_df``; otherwise it is handled as
            ``"<compound id>:<SMILES>"``.

    Returns:
        DataFrame with the columns ``Compound_ID``, ``Protein_ID`` and
        ``score``, one row per pair that was scored. A pair whose enzyme or
        substrate fails is reported with ``print`` and skipped without
        disturbing the pairing of the remaining entries.

    Raises:
        ValueError: the two inputs differ in length. The earlier check was a
            discarded comparison inside ``try`` and could never fire.
    """
    if len(ENZ_INPUT) != len(SUB_INPUT):
        raise ValueError(
            'Enzyme and substrate input list have different length...'
            + ' ({} vs {})'.format(len(ENZ_INPUT), len(SUB_INPUT)))

    Final_sub_ls = []
    Final_enz_ls = []
    Final_score_ls = []

    # Handle each pair as a unit. Featurising the two lists separately and
    # re-joining them by index shifted every later pair when one substrate failed.
    for enz_inp, sub_inp in zip(ENZ_INPUT, SUB_INPUT):
        try:
            kegg_id_flag = 1 if len(sub_inp.split(':')) < 2 else 0

            p_feature, p_id = prot_feature_gen_from_str_input(enz_inp)
            sub_info_df, c_id = mol_feature_gen_from_str_input(sub_inp, kegg_id_flag, kegg_df)
            # Use this pair's own protein ID rather than one leaked from
            # another loop.
            act_df = act_df_gen_mol_feature(c_id, p_id)
            c_feature = compound_feature_gen_df_input(act_df, sub_info_df)

            EnzRankScore = model_prediction(c_feature, p_feature, loaded_model)
        except Exception as e:
            print('Error somewhere...' + repr(e))
            continue

        Final_enz_ls.append(p_id)
        Final_sub_ls.append(c_id)
        Final_score_ls.append(EnzRankScore)

    Final_dict = {'Compound_ID': Final_sub_ls, 'Protein_ID': Final_enz_ls, 'score': Final_score_ls}
    Final_df = pd.DataFrame(Final_dict)

    return Final_df
    

In [98]:
# Paired example inputs for MPMS: three substrates and three enzymes, matched by list position.
comp_ls_MPMS = ['C00149', 'X00001: NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)([O-])OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1', 'C00497']
enz_ls_MPMS = ["A0A4P8WFA8:MTKRVLVTGGAGFLGSHLCERLLSEGHEVICLDNFGSGRRKNIKEFEDHPSFKVNDRDVRISESLPSVDRIYHLASRASPADFTQFPVNIALANTQGTRRLLDQARACDARMVFASTSEVYGDPKVHPQPETYTGNVNIRGARGCYDESKRFGETLTVAYQRKYDVDARTVRIFNTYGPRMRPDDGRVVPTFVTQALRGDDLTIYGDGEQTRSFCYVDDLIEGLISLMRVDNPEHNVYNIGKENERTIKELAYEVLGLTDTESDIVYEPLPEDDPGQRRPDITRAKTELDWEPKISLREGLEDTITYFDN", "A0A1D8PAW7:MKRILVAGGAGFIGSHLCERLVNEGHYVVCLDNFFTGNKKKVEQLLNNPRFEIAKHDVIEPYFNEVDEIYNLACPASPIHYQVDPIKTIKTSVLGAMNMLGLAKKTNAKILQASTSEVYGEPEVHPQYEEYWGNVNPIGKRSCYNEGKRCAESLFINYHSQHQTKIKIIRIFNTYGPKMDINDGRVISNFVIQALKGKDITIYGDGKQTRSFQYVDDLVEGMIRMMNTDDSFTGPVNIGNPEEYTMLELVSFIIEMTQSKSKLIFLPLPEDDPKRRRPNIELAKKELNNWEPKIKLREGLIKTINYFEKII", "A0A7T5EPK7:MKYFSAAVIPGDGIGPEVMEVGMSLLQAIGDIHGGLSFEAESFPWNCRYYLQHGRMMPEDGLERLRPFDVILLGAIGAPGVPDHISVWELILPIRRSFQQYVNLRPIKLLRGLESPLRGKGHEHLDFVVVRENTEGEYSNMGGRLHVGTPYEMAMQNNVFTRYGTERIIRYAFELAQATGKTRLTAATKSNGINHSMPFWDEIVKEISLHYPNIQTSLIHIDALAAFFVSRPEAFDVVVASNLFGDILTDLGAAVVGGLGLAPSGNINPEKTYPSMFEPIHGSAPDIAGRGIANPIATIWSISMMLDHLGERELGRLVLDCIEEVLVEGKVRTPDIGGKATTQEMGKAILAQLYRRGG"]


In [99]:
# Score each enzyme in enz_ls_MPMS against the substrate at the same position in comp_ls_MPMS.
MPMS(enz_ls_MPMS, comp_ls_MPMS)



Unnamed: 0,Compound_ID,Protein_ID,score
0,C00149,A0A4P8WFA8,0.931402
1,X00001,A0A1D8PAW7,0.00197
2,C00497,A0A7T5EPK7,0.997919
