# 1. Importing modules and functions

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm

# 2.Data entry and curation work set

In [3]:
uploaded_file_ws="datasets/HDAC6_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  3083 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [4]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [5]:
len(y_tr)

3083

# 3.Standardization SDF file for work set

In [6]:
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToSmiles(all_mols_ws[i])
    records.append(record)

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ws.append(m)
    
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  3083 molecules


In [7]:
records_ws = []
for i in range(len(moldf_ws)):
    record = Chem.MolToSmiles(moldf_ws[i])
    records_ws.append(record)

In [8]:
df = pd.DataFrame(records_ws, columns=["Smiles"])
df.to_csv('datasets/molecule_ws.smi', sep=',', index=False, header=False)

# 4.Data entry and curation test set

In [9]:
uploaded_file_ts="datasets/HDAC6_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  771 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [10]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [11]:
len(y_ts)

771

# 5.Standardization SDF file for test set

In [12]:
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToSmiles(all_mols_ts[i])
    records.append(record)

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ts.append(m)
    
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  771 molecules


In [13]:
records_ts = []
for i in range(len(moldf_ts)):
    record = Chem.MolToSmiles(moldf_ts[i])
    records_ts.append(record)

In [14]:
df_ts = pd.DataFrame(records_ts, columns=["Smiles"])
df_ts.to_csv('datasets/molecule_ts.smi', sep=',', index=False, header=False)

# 6.Descriptor calculation for work set

## PubChem FPs

In [163]:
from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ws.smi', 
                d_file=fingerprint_output_file, #'Substructure.csv'
                #descriptortypes='SubstructureFingerprint.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

In [15]:
descriptors_PubChem = pd.read_csv('PubChem.csv')

In [16]:
descriptors_PubChem

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_molecule_ws_1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_ws_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_ws_3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_ws_4,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_ws_5,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,AUTOGEN_molecule_ws_3079,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3079,AUTOGEN_molecule_ws_3080,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3080,AUTOGEN_molecule_ws_3081,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3081,AUTOGEN_molecule_ws_3082,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
x_tr = descriptors_PubChem.drop('Name', axis=1)

In [18]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [35]:
savetxt('Models/Padels/x_tr_PubChem.csv', y_tr, delimiter=',')

In [36]:
x_tr.shape

(3083, 881)

# 7.Descriptor calculation for test set

In [179]:
from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = ''.join([fingerprint,'_ts.csv']) #PubChem.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ts.smi', 
                d_file=fingerprint_output_file, #'PubChem.csv'
                #descriptortypes='PubChem.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=False,
                fingerprints=True)

In [19]:
descriptors_PubChem_ts = pd.read_csv('PubChem_ts.csv')

In [20]:
x_ts = descriptors_PubChem_ts.drop('Name', axis=1)

In [21]:
x_ts.shape

(771, 881)

# load the models from disk

In [22]:
best_svm = pickle.load(open('Models/Padels/HDAC6_SVM_PubChem.pkl', 'rb'))

In [23]:
best_gbr = pickle.load(open('Models/Padels/HDAC6_GBR_PubChem.pkl', 'rb'))

In [24]:
best_MLPR = pickle.load(open('Models/Padels/HDAC6_MLPR_PubChem.pkl', 'rb'))

In [25]:
best_kNN = pickle.load(open('Models/Padels/HDAC6_kNN_PubChem.pkl', 'rb'))

# Prediction for CV

In [26]:
y_pred_ws_GBR = best_gbr.predict(x_tr)

In [27]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [28]:
y_pred_con=(y_pred_ws_GBR+y_pred_ws_svm)/2

In [29]:
R2_WS = round(r2_score(y_tr, y_pred_con), 2)
R2_WS

0.93

In [30]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)), 2)
RMSE_WS

0.46

In [31]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [76]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [77]:
y_pred_CV_gbr = cross_val_predict(best_gbr, x_tr, y_tr, cv=cv)

In [78]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [79]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

# For all models

In [80]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_gbr+y_pred_CV_MLPR+y_pred_CV_kNN)/4

In [81]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.62

In [82]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

0.7

# three models: svm+gbr+MLPR

In [83]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_gbr+y_pred_CV_MLPR)/3

In [84]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.62

In [85]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

0.71

# two models: svm+gbr

In [86]:
y_pred_con_without_MLPR=(y_pred_CV_svm+y_pred_CV_gbr)/2

In [87]:
Q2_CV = round(r2_score(y_tr, y_pred_con_without_MLPR), 2)
Q2_CV

0.63

In [88]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con_without_MLPR)),2)
RMSE_CV

0.7

# Prediction for test set's molecules

In [45]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [46]:
y_pred_svm = best_svm.predict(x_ts)

In [47]:
y_pred_gbr = best_gbr.predict(x_ts)

In [48]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [49]:
y_pred_kNN = best_kNN.predict(x_ts)

In [50]:
y_pred_con=(y_pred_svm+y_pred_gbr)/2

In [51]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.67

In [52]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.68

# Estimating applicability domain. Method - Euclidian distances, K=1

In [53]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [54]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.449490,4.795832,4.898980,3.316625,6.928203,3.464102,7.549834,2.828427,2.236068,5.291502,...,2.000000,4.000000,2.236068,4.358899,2.000000,3.000000,0.000000,3.162278,0.000000,4.690416
2,6.855655,5.196152,5.385165,4.123106,7.000000,4.000000,7.681146,3.000000,4.690416,5.656854,...,3.000000,4.000000,2.236068,4.690416,2.449490,4.000000,1.732051,3.162278,1.732051,4.795832
3,7.071068,5.477226,6.244998,4.242640,7.071068,5.385165,7.874008,3.000000,4.690416,5.656854,...,3.464102,4.795832,2.828427,5.099020,2.645751,4.690416,2.449490,3.316625,2.449490,4.898980
4,7.141428,5.477226,6.244998,4.358899,7.141428,5.477226,8.000000,3.000000,5.099020,5.916080,...,3.605551,5.385165,3.000000,5.196152,3.000000,5.000000,2.449490,3.741657,2.449490,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,13.711309,12.328828,12.845233,13.784049,15.556349,13.453624,13.266500,13.416408,14.933185,12.529964,...,12.609520,13.000000,12.609520,12.845233,12.369317,12.727922,13.564660,12.328828,13.564660,13.527749
3079,13.820275,12.409674,12.884099,13.784049,15.620500,13.527749,13.304134,13.490738,15.000000,12.884099,...,12.688578,13.038404,12.688578,12.845233,12.489996,12.727922,13.674794,12.369317,13.674794,13.564660
3080,13.820275,12.529964,12.884099,14.352700,15.874508,13.527749,13.304134,13.490738,15.264338,12.961481,...,12.845233,13.038404,12.727922,12.922848,12.569805,12.767145,13.747727,12.369317,13.747727,13.601471
3081,13.856406,12.649111,12.922848,14.352700,15.874508,13.564660,13.304134,13.564660,15.264338,12.961481,...,12.845233,13.076696,12.845233,13.000000,12.569805,12.767145,13.820275,12.369317,13.820275,13.638182


In [55]:
similarity= neighbors_k

In [56]:
Dmean=np.mean(similarity[1,:])

In [57]:
round(Dmean, 2)

2.41

In [58]:
std=np.std(similarity[1,:])

In [59]:
round(std, 2)

1.57

In [60]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.19


In [61]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [62]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,4.898980,5.656854,3.464102,2.236068,5.656854,0.000000,3.464102,2.000000,0.000000,2.449490,...,1.732051,2.645751,2.828427,3.605551,1.414214,6.244998,2.828427,2.236068,0.000000,2.449490
1,6.000000,6.000000,3.741657,5.744563,5.656854,2.236068,4.000000,4.123106,0.000000,2.449490,...,3.316625,5.477226,3.464102,4.898980,1.414214,7.141428,3.464102,2.828427,4.000000,2.449490
2,6.164414,6.000000,4.123106,6.082763,6.244998,4.358899,4.472136,4.242640,0.000000,2.645751,...,3.316625,5.744563,4.123106,5.291502,1.414214,7.141428,4.123106,4.242640,5.099020,2.449490
3,6.244998,6.000000,4.472136,6.082763,6.244998,4.358899,4.898980,4.358899,3.872983,3.741657,...,4.000000,5.830952,4.898980,5.567764,3.000000,7.280110,4.898980,4.242640,5.196152,2.449490
4,6.324555,6.000000,4.898980,6.164414,6.403124,4.795832,4.898980,4.472136,4.795832,4.000000,...,4.123106,6.082763,5.000000,5.567764,3.000000,7.348469,4.898980,4.242640,5.291502,2.828427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,12.845233,13.152946,13.190906,12.369317,13.784049,14.832397,12.845233,13.341664,13.076696,13.820275,...,12.449900,14.247807,14.035668,12.961481,12.288206,14.387495,13.038404,12.884099,12.288206,13.747727
3079,12.884099,13.304134,13.266500,12.369317,13.820275,14.966630,12.845233,13.416408,13.076696,13.856406,...,12.569805,14.282857,14.106736,13.000000,12.288206,14.387495,13.038404,12.884099,12.409674,13.747727
3080,12.922848,13.601471,13.379088,12.489996,13.856406,15.297058,12.884099,13.490738,13.114877,13.856406,...,12.688578,14.456832,14.212670,13.000000,12.369317,14.422205,13.114877,12.922848,12.449900,13.747727
3081,12.961481,13.601471,13.416408,12.727922,14.317822,15.297058,12.922848,13.601471,13.152946,13.892444,...,12.727922,14.899665,14.212670,13.038404,12.449900,14.525839,13.114877,12.922848,12.489996,13.928389


In [63]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[4.899 5.657 3.464 2.236 5.657 0.    3.464 2.    0.    2.449 2.236 1.
 4.899 1.414 0.    1.414 1.414 5.568 1.414 1.732 1.732 2.828 3.464 4.583
 6.856 3.    1.    3.873 2.    3.162 2.828 3.606 3.317 3.162 2.449 3.
 2.    2.236 2.236 3.162 3.873 1.732 1.414 2.    2.    1.414 2.646 0.
 1.    1.    2.449 1.    4.123 2.449 2.    0.    1.414 3.873 5.099 2.236
 3.873 2.    1.    1.    4.    3.317 2.236 0.    2.    2.646 1.732 0.
 2.236 2.236 4.472 2.    0.    4.359 6.245 1.732 1.    3.    6.557 3.317
 5.196 2.236 2.828 0.    2.646 0.    1.    5.099 2.449 3.606 1.414 3.742
 3.317 2.236 2.646 2.449 1.    3.742 0.    5.099 2.828 5.568 1.414 3.162
 1.732 1.414 0.    1.414 1.732 2.449 3.162 0.    2.449 0.    1.414 1.
 4.243 1.    0.    4.243 2.646 3.317 1.414 1.414 3.742 2.828 3.606 2.828
 2.236 0.    4.359 1.414 5.831 0.    0.    0.    5.385 3.464 3.606 0.
 4.123 4.359 1.414 0.    3.    3.162 4.123 2.    5.    3.464 0.    2.828
 2.    5.    4.    0.    1.414 1.    2.    2.    2.    0.    1.414 4.

In [64]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[False False False  True False  True False  True  True  True  True  True
 False  True  True  True  True False  True  True  True  True False False
 False  True  True False  True  True  True False False  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True False False  True
 False  True  True  True False False  True  True  True  True  True  True
  True  True False  True  True False False  True  True  True False False
 False  True  True  True  True  True  True False  True False  True False
 False  True  True  True  True False  True False  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True False  True False  True  True False  True False  True
  True  True False  True False  True  True  True False False False  True
 False False  True  True  True  True False  True False False  True  True
  True False False  True  True  True  True  True  T

In [65]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7263294422827496


In [66]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  3   5   7   8   9  10  11  13  14  15  16  18  19  20  21  25  26  28
  29  30  33  34  35  36  37  38  39  41  42  43  44  45  46  47  48  49
  50  51  53  54  55  56  59  61  62  63  66  67  68  69  70  71  72  73
  75  76  79  80  81  85  86  87  88  89  90  92  94  97  98  99 100 102
 104 106 107 108 109 110 111 112 113 114 115 116 117 118 119 121 122 124
 126 127 129 131 132 133 135 137 138 139 143 146 147 148 149 151 154 155
 156 159 160 161 162 163 164 165 166 169 170 172 173 174 175 176 178 180
 181 183 184 185 187 188 189 191 192 193 195 196 198 199 200 201 202 203
 204 205 206 207 208 209 210 211 212 214 215 217 218 219 220 221 223 224
 225 226 227 228 230 231 232 233 234 235 237 239 240 241 244 245 246 248
 249 250 251 252 253 254 255 256 257 258 259 261 262 263 264 265 266 267
 268 270 272 276 278 279 280 281 282 283 284 286 287 288 289 290 291 292
 293 294 295 296 297 298 299 300 302 303 304 305 306 308 309 310 311 312
 313 315 31

In [67]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [68]:
y_pred_con_ad=list(y_pred_con)

In [69]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [70]:
len(y_pred_con_ad)

560

In [71]:
y_ts_ad=list(y_ts)

In [72]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [73]:
len(y_ts_ad)

560

In [74]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.75

In [75]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.63