In [1]:
import numpy as np
import pandas as pd
import pickle
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

In [2]:
def Hierarchical_rmse(similarity_list,d_list):
    """Print one RMSE per similarity bucket.

    Parameters
    ----------
    similarity_list : sequence of float
        One similarity value per sample.
    d_list : sequence
        One error term per sample, aligned with ``similarity_list``. The
        square root of each bucket mean is printed, so the callers in this
        notebook pass squared errors.

    Output
    ------
    Five lines are printed, in this order: the square root of the mean of
    ``d`` for similarities in [0.8, 0.9), [0.7, 0.8), [0.6, 0.7), [0.5, 0.6)
    and finally for similarities below 0.5. Samples whose similarity is
    >= 0.9 match none of the buckets and are therefore not reported.
    Nothing is returned.
    """
    frame = pd.DataFrame({'similarity':similarity_list,'d':d_list})
    sim = frame['similarity']

    # Half-open buckets [low, high), reported from most to least similar.
    for low, high in ((0.8, 0.9), (0.7, 0.8), (0.6, 0.7), (0.5, 0.6)):
        in_bucket = frame[(sim>=low)&(sim<high)]
        print(np.mean(in_bucket['d'])**0.5)

    # Everything below the lowest bucket is pooled together.
    print(np.mean(frame[(sim<0.5)]['d'])**0.5)

In [3]:
# Build Morgan bit-vector fingerprints (radius 2, 1024 bits) for the acidic
# train and test molecules.
#
# Each line of the SMILES files is split on a tab and only the first field
# (the SMILES) is kept. The previous delimiter './t' was a typo for '\t': it
# never matched, so the complete line (SMILES + tab + label) was handed to
# the SMILES parser.
# NOTE(review): the basic-pKa section of this notebook uses 2048 bits where
# this cell uses 1024 - confirm the difference is intentional.
with open('./Dataset/acidic_train_0.70_smiles.txt') as f:
    train_smiles_list = [line.replace('\n','').split('\t')[0] for line in f]

train_ms = [Chem.MolFromSmiles(smi) for smi in train_smiles_list]
train_fps_list = [AllChem.GetMorganFingerprintAsBitVect(mol,2,1024) for mol in train_ms]

with open('./Dataset/acidic_test_0.15_smiles.txt') as f:
    test_smiles_list = [line.replace('\n','').split('\t')[0] for line in f]

# Molecules are kept in file order so that index k of the fingerprint list
# corresponds to line k of the test file.
test_ms = [Chem.MolFromSmiles(smi) for smi in test_smiles_list]
test_fps_list = [AllChem.GetMorganFingerprintAsBitVect(mol,2,1024) for mol in test_ms]

In [4]:
# For every test fingerprint keep only its highest Tanimoto similarity to any
# fingerprint in train_fps_list (one value per test molecule, in test order).
similarity_list = [
    max(DataStructs.BulkTanimotoSimilarity(query_fp,train_fps_list))
    for query_fp in test_fps_list
]

In [5]:
from My_Pka_Model import Pka_basic_view,Pka_acidic_view
import torch
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

Using backend: pytorch


In [6]:
def predict(smiles,model_view):
    """Run ``model_view`` on the graph built from one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to featurize.
    model_view : callable model
        Called as ``model_view(graph, node_feats, edge_feats)``; it is put in
        eval mode and must return a ``(molecule_pka, atom_pka)`` pair.

    Returns
    -------
    tuple
        ``(molecule_pka, atom_pka)`` exactly as returned by ``model_view``.
    """
    # Node and edge features are both stored under the key 'h'.
    graph = smiles_to_bigraph(
        smiles=smiles,
        node_featurizer=CanonicalAtomFeaturizer(atom_data_field='h'),
        edge_featurizer=CanonicalBondFeaturizer(bond_data_field='h'),
        canonical_atom_order=False,
    )

    model_view.eval()
    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        outputs = model_view(graph, graph.ndata['h'], graph.edata['h'])

    molecule_pka, atom_pka = outputs
    return molecule_pka, atom_pka

In [7]:
# Evaluate the three acidic GNN checkpoints on the acidic test split. For each
# checkpoint the overall RMSE is printed, followed by the RMSE per similarity
# bucket (see Hierarchical_rmse).
acid_pred = Pka_acidic_view(node_feat_size = 74,
                            edge_feat_size = 12,
                            output_size = 1,
                            num_layers= 6,
                            graph_feat_size=200,
                            dropout=0.2)

# The test file is the same for every checkpoint, so parse it once.
# Field 0 of each tab-separated line is the SMILES, field 1 the label.
gnn_test_smiles = []
label = []
with open('./Dataset/acidic_test_0.15_smiles.txt') as f: #acidic_test_0.15_smiles.txt,SAMPL7_acidic_smiles.txt
    for line in f:
        fields = line.replace('\n','').split('\t')
        gnn_test_smiles.append(fields[0])
        label.append(float(fields[1]))

for split_idx in range(1,4):
    # The model is never moved off the CPU in this cell, so the checkpoint is
    # mapped to the CPU as well; 'cuda:0' made the cell fail on machines
    # without a GPU. load_state_dict copies the values into the existing
    # parameters either way.
    # NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
    acid_pred.load_state_dict(torch.load('./Trained_model/acidic_ramdom_split_{}.pkl'.format(split_idx),map_location='cpu'))

    # Keep only the molecule-level prediction; the per-atom output is unused here.
    pred = [predict(smiles,acid_pred)[0] for smiles in gnn_test_smiles]

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(pred,label)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(pred,label)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

0.9482059244366742

0.6973620327083837
0.5861789664571425
0.7039104142128333
1.2354260070470853
1.3214167020944867

0.9205022739301771

0.6572524720976982
0.6256730667402162
0.6915970553229078
1.1901515988106832
1.268563653326904

0.9351637811552465

0.748608547127342
0.6526276208969111
0.7161501935845964
1.1497881364711753
1.325546838631536



In [8]:
# Evaluate the three pickled acidic random-forest models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/acidic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/acidic_RF_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.2747669540218887

0.7069409927892839
0.8748372062372406
0.8922256719491036
1.3026120615876162
2.162671117766957

1.301754065374355

0.7199596655047414
0.8616794126404148
0.8808438306803708
1.31671019853018
2.244084824677326

1.2927378694368055

0.7138314113516608
0.8297042623376265
0.8669411230151916
1.2967976322654264
2.24875942445501



In [9]:
# Evaluate the three pickled acidic XGBoost models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/acidic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/acidic_XGBoost_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.1071980288624135

0.526425137722146
0.8157677199284313
0.8095963840635765
1.1784785222995804
1.8165899831970242

1.1266766895782805

0.5864918097961399
0.8468345339146198
0.8933852185456405
1.1991610636662962
1.7955790840815498

1.0652443007346302

0.5646406853488363
0.8373027382693453
0.8109359669187403
1.165462550607222
1.6746147991999372



In [10]:
# Evaluate the three pickled acidic MLP models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/acidic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/acidic_MLP_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.086259869443916

0.6156063238962103
0.728267236913142
0.8835958974734168
1.256934529618565
1.644308578512263

1.0805960879807073

0.63011473384627
0.7442506801897891
0.8596139152541588
1.266753287545286
1.62427900960446

1.1020770946329737

0.5996053019437628
0.7670400702036275
0.8501896614995366
1.2894801098986248
1.6762500437999006



In [None]:
# Evaluate the pickled acidic SVR model (only model 1): overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/acidic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loop used to rebind `i`.
for model_idx in range(1,2):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/acidic_SVR_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors of THIS model. The cell previously never computed
    # d2_list and therefore reported whatever squared errors an earlier
    # cell had left in the kernel.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

In [11]:
# Build Morgan bit-vector fingerprints (radius 2, 2048 bits) for the basic
# train and test molecules. This rebinds train_fps_list / test_fps_list, so
# cells below this point work on the basic data set.
#
# Each line of the SMILES files is split on a tab and only the first field
# (the SMILES) is kept. The previous delimiter './t' was a typo for '\t': it
# never matched, so the complete line (SMILES + tab + label) was handed to
# the SMILES parser.
# NOTE(review): the acidic section of this notebook uses 1024 bits where this
# cell uses 2048 - confirm the difference is intentional.
with open('./Dataset/basic_train_0.70_smiles.txt') as f:
    train_smiles_list = [line.replace('\n','').split('\t')[0] for line in f]

train_ms = [Chem.MolFromSmiles(smi) for smi in train_smiles_list]
train_fps_list = [AllChem.GetMorganFingerprintAsBitVect(mol,2,2048) for mol in train_ms]

with open('./Dataset/basic_test_0.15_smiles.txt') as f:
    test_smiles_list = [line.replace('\n','').split('\t')[0] for line in f]

# Molecules are kept in file order so that index k of the fingerprint list
# corresponds to line k of the test file.
test_ms = [Chem.MolFromSmiles(smi) for smi in test_smiles_list]
test_fps_list = [AllChem.GetMorganFingerprintAsBitVect(mol,2,2048) for mol in test_ms]

In [12]:
# For every test fingerprint keep only its highest Tanimoto similarity to any
# fingerprint in train_fps_list (one value per test molecule, in test order).
# This rebinds similarity_list for the cells that follow.
similarity_list = [
    max(DataStructs.BulkTanimotoSimilarity(query_fp,train_fps_list))
    for query_fp in test_fps_list
]

In [14]:
# Count how many entries of similarity_list reach each threshold (cumulative
# counts, highest threshold first), then how many fall below 0.4.
similarity_arr = np.array(similarity_list)
for threshold in (0.9, 0.8, 0.7, 0.6, 0.5, 0.4):
    print(np.sum(similarity_arr >= threshold))
print(np.sum(similarity_arr <0.4))

209
301
480
734
1020
1161
105


In [13]:
from My_Pka_Model import Pka_basic_view,Pka_acidic_view
import torch
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [14]:
def predict(smiles,model_view):
    """Run ``model_view`` on the graph built from one SMILES string.

    NOTE(review): this has the same body as the ``predict`` defined earlier
    in this notebook; consider keeping a single definition.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to featurize.
    model_view : callable model
        Called as ``model_view(graph, node_feats, edge_feats)``; it is put in
        eval mode and must return a ``(molecule_pka, atom_pka)`` pair.

    Returns
    -------
    tuple
        ``(molecule_pka, atom_pka)`` exactly as returned by ``model_view``.
    """
    # Node and edge features are both stored under the key 'h'.
    graph = smiles_to_bigraph(
        smiles=smiles,
        node_featurizer=CanonicalAtomFeaturizer(atom_data_field='h'),
        edge_featurizer=CanonicalBondFeaturizer(bond_data_field='h'),
        canonical_atom_order=False,
    )

    model_view.eval()
    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        outputs = model_view(graph, graph.ndata['h'], graph.edata['h'])

    molecule_pka, atom_pka = outputs
    return molecule_pka, atom_pka

In [15]:
# Evaluate the three basic GNN checkpoints on the basic test split. For each
# checkpoint the overall RMSE is printed, followed by the RMSE per similarity
# bucket (see Hierarchical_rmse).
base_pred = Pka_basic_view(node_feat_size = 74,
                            edge_feat_size = 12,
                            output_size = 1,
                            num_layers= 6,
                            graph_feat_size=200,
                            dropout=0.2)

# The test file is the same for every checkpoint, so parse it once.
# Field 0 of each tab-separated line is the SMILES, field 1 the label.
gnn_test_smiles = []
label = []
with open('./Dataset/basic_test_0.15_smiles.txt') as f:
    for line in f:
        fields = line.replace('\n','').split('\t')
        gnn_test_smiles.append(fields[0])
        label.append(float(fields[1]))

for split_idx in range(1,4):
    # The model is never moved off the CPU in this cell, so the checkpoint is
    # mapped to the CPU as well; 'cuda:0' made the cell fail on machines
    # without a GPU. load_state_dict copies the values into the existing
    # parameters either way.
    # NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
    base_pred.load_state_dict(torch.load('./Trained_model/basic_ramdom_split_{}.pkl'.format(split_idx),map_location='cpu'))

    # Keep only the molecule-level prediction; the per-atom output is unused here.
    pred = [predict(smiles,base_pred)[0] for smiles in gnn_test_smiles]

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(pred,label)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(pred,label)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

0.9110421224780605

0.5766486753247032
0.6166209794878664
0.6043516157844735
0.8845844028211888
1.5420404639383587

0.8652425433781479

0.6469260504560032
0.5704522778006442
0.6734897048537107
0.814548833518229
1.4111270285737485

0.8816145259199463

0.5762918468577941
0.6021660024753364
0.6107592178209781
0.8912485931614685
1.4389301997137607



In [16]:
# Evaluate the three pickled basic random-forest models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/basic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/basic_RF_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.1414826210854454

0.8147278989028993
0.9293459443648816
1.0389592686360367
1.094587787584898
1.758363846717608

1.1339964228979307

0.8714480526463362
0.8573745527792144
1.0409125099388004
1.0532508100004094
1.7719773871447133

1.145275365336734

0.8136049000950909
0.9181590951885016
1.046032147118292
1.0774319450408254
1.7787072444679473



In [17]:
# Evaluate the three pickled basic XGBoost models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/basic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/basic_XGBoost_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.0331973567541826

0.780927874461526
0.7477193804208799
0.8977731176622182
0.8747202273123698
1.6842267767376453

1.0319570102448734

0.7664035918635742
0.7522432780325791
0.8660247583977718
0.9262863740717959
1.6660811664385238

1.0394610733885348

0.7756269373654755
0.7303138748800975
0.8626538401085935
0.9072085919640279
1.7086433089073527



In [18]:
# Evaluate the three pickled basic MLP models: overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/basic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,4):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/basic_MLP_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.0278493970535

0.6483897593059985
0.810218254387377
0.7963417559928243
0.9106086861434695
1.667427469944438

1.007902096868391

0.6778327613516171
0.7838108214768322
0.7850063224646078
0.9233569771672505
1.622382597657536

1.02338181659889

0.6587051270865893
0.8150745866792389
0.8186051146825745
0.9183698999608358
1.651446704351898



In [22]:
# Evaluate the pickled basic SVR model (only model 1): overall RMSE first,
# then the RMSE per similarity bucket (see Hierarchical_rmse).

# The test set does not depend on the model, so it is read once outside the loop.
df_test = pd.read_csv('./Dataset/basic_test_0.15_FP.csv',header = None)
test_data_label = np.array(df_test)
# Column 0 is used as the target, every other column as a feature.
X_test, y_test = test_data_label[:,1:],test_data_label[:,0]

# The loop index has its own name; the per-sample loops used to rebind `i`.
for model_idx in range(1,2):
    # NOTE(security): pickle.load can execute arbitrary code - only load trusted model files.
    with open("./Machine_learning_model/basic_SVR_{}.pickle".format(model_idx), 'rb') as fr:
        model = pickle.load(fr)

    y_pred = model.predict(X_test)

    # Absolute errors; only used by the (disabled) MAE reporting below.
    d_list = [abs(p-t) for p,t in zip(y_pred,y_test)]
    # Squared errors; the square root of their mean is the RMSE.
    d2_list = [abs(p-t)**2 for p,t in zip(y_pred,y_test)]

#     print(np.mean(d_list))
#     print('')
    print(np.mean(d2_list)**0.5)
    print('')

#     Hierarchical_mae(similarity_list,d_list)
#     print('')
    Hierarchical_rmse(similarity_list,d2_list)
    print('')

1.0808744926648925

0.671343184321772
0.7575794504842869
0.8009306991486147
0.9853892223980613
1.8033601282008562

