# 1. Importing modules and functions

In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

# 2.Data entry and curation work set

In [3]:
uploaded_file_ws="datasets/HDAC3_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1400 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [4]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [5]:
len(y_tr)

1400

# 3.Standardization SDF file for work set

In [6]:
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToSmiles(all_mols_ws[i])
    records.append(record)

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ws.append(m)
    
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  1400 molecules


# 4.Data entry and curation test set

In [7]:
uploaded_file_ts="datasets/HDAC3_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  351 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [8]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [9]:
len(y_ts)

351

# 5.Standardization SDF file for test set

In [10]:
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToSmiles(all_mols_ts[i])
    records.append(record)

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ts.append(m)
    
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  351 molecules


# 6.Calculation MorganFingerprint for work set

In [10]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [11]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [12]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [13]:
savetxt('Models/Morgan_fingerprint/x_tr_MF.csv', x_tr, delimiter=',')

In [14]:
x_tr.shape

(1400, 1024)

# 7.Calculation MorganFingerprint for test set

In [15]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [16]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [17]:
x_ts = rdkit_numpy_convert(fp_ts)

In [18]:
x_ts.shape

(351, 1024)

In [19]:
type(x_tr)

numpy.ndarray

In [20]:
x_tr

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [21]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [22]:
y_tr

array([ 4.01,  4.04,  4.05, ..., 10.03, 10.06, 10.1 ], dtype=float32)

# 8. SVM model building and validation

In [23]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [24]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [25]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [26]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [27]:
svm.best_params_
best_svm = svm.best_estimator_

In [28]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [29]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.68

In [30]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.72

# 9. Prediction for test set's molecules

In [31]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [32]:
y_pred_svm = best_svm.predict(x_ts)

In [34]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.71

In [35]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.7

save the model to disk

In [36]:
pickle.dump(best_svm, open('Models/Morgan_fingerprint/HDAC3_SVM_MF.pkl', 'wb'))

load the model from disk

In [3]:
best_svm = pickle.load(open('Models/Morgan_fingerprint/HDAC3_SVM_MF.pkl', 'rb'))

# 10. Y-randomization RF model

In [38]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_svm, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min


True score =  0.68 
Y-randomization =  -0.3 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.0min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [39]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [40]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.316625,3.464102,3.316625,3.316625,3.316625,4.242640,4.000000,5.656854,4.123106,4.795832,...,4.000000,3.605551,4.123106,2.645751,3.605551,3.316625,3.605551,3.464102,5.744563,5.000000
2,3.872983,4.000000,4.358899,3.464102,3.872983,4.242640,4.123106,6.164414,4.898980,5.477226,...,5.744563,3.741657,4.582576,4.795832,3.872983,3.605551,4.000000,3.741657,6.557438,5.099020
3,4.000000,4.123106,4.582576,4.123106,4.472136,4.898980,4.472136,6.324555,5.656854,5.477226,...,6.480741,3.872983,4.795832,4.898980,4.000000,3.741657,4.690416,3.741657,6.782330,5.656854
4,4.000000,4.123106,5.477226,4.242640,5.196152,5.000000,4.898980,6.403124,5.830952,5.567764,...,6.557438,5.000000,5.000000,5.000000,4.000000,3.741657,5.196152,3.872983,6.782330,5.656854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,9.797959,9.848858,10.583005,10.000000,10.000000,9.643651,10.392304,10.099504,10.488089,10.344080,...,10.488089,10.246951,11.090536,10.908712,10.816654,10.954452,11.045361,11.135529,10.723805,11.135529
1396,9.848858,9.848858,10.583005,10.049875,10.049875,9.643651,10.392304,10.198039,10.488089,10.392304,...,10.630146,10.246951,11.135529,10.954452,10.908712,11.000000,11.090536,11.135529,10.770329,11.180340
1397,10.099504,10.049875,10.630146,10.148891,10.099504,9.848858,10.440307,10.344080,10.488089,10.440307,...,10.723805,10.295630,11.135529,11.045361,10.908712,11.045361,11.090536,11.180340,10.862781,11.180340
1398,10.148891,10.198039,10.723805,10.198039,10.148891,10.000000,10.440307,10.392304,10.630146,10.488089,...,10.770329,10.295630,11.224972,11.045361,10.908712,11.135529,11.180340,11.269427,10.954452,11.313708


In [41]:
similarity= neighbors_k

In [42]:
Dmean=np.mean(similarity[1,:])

In [43]:
round(Dmean, 2)

3.46

In [44]:
std=np.std(similarity[1,:])

In [45]:
round(std, 2)

1.43

In [46]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.17


In [47]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [48]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,341,342,343,344,345,346,347,348,349,350
0,0.000000,3.605551,8.000000,4.242640,4.690416,4.123106,4.123106,3.605551,3.464102,4.472136,...,3.464102,3.741657,3.162278,4.242640,4.242640,4.123106,3.872983,4.472136,5.000000,2.449490
1,3.464102,4.898980,8.124039,4.898980,5.000000,4.358899,4.123106,3.872983,3.741657,5.099020,...,3.605551,3.741657,3.316625,4.358899,4.690416,4.472136,4.000000,4.582576,5.291502,3.000000
2,3.872983,6.324555,8.124039,5.000000,5.744563,4.472136,4.795832,3.872983,5.099020,5.099020,...,3.741657,3.741657,3.464102,5.196152,4.898980,4.472136,4.472136,4.582576,5.567764,3.464102
3,3.872983,6.324555,8.185352,5.385165,5.916080,5.830952,5.830952,4.898980,5.567764,5.567764,...,4.358899,3.872983,4.123106,5.291502,5.099020,4.582576,4.690416,4.582576,5.830952,5.744563
4,4.000000,6.403124,8.246211,5.567764,7.211102,5.830952,6.082763,5.099020,6.000000,5.567764,...,4.582576,4.582576,4.582576,5.291502,5.916080,4.582576,5.000000,4.690416,6.782330,5.916080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,9.797959,9.899495,11.489125,11.045361,10.344080,10.440307,10.677078,9.949874,10.295630,10.488089,...,10.246951,10.198039,10.148891,11.135529,10.770329,10.954452,11.045361,10.816654,11.135529,11.180340
1396,9.899495,9.899495,11.532562,11.045361,10.488089,10.440307,10.723805,10.000000,10.344080,10.488089,...,10.344080,10.295630,10.246951,11.135529,10.816654,11.000000,11.090536,10.862781,11.135529,11.180340
1397,10.049875,9.949874,11.532562,11.090536,10.583005,10.440307,10.723805,10.000000,10.440307,10.488089,...,10.392304,10.440307,10.295630,11.180340,10.862781,11.000000,11.090536,10.908712,11.269427,11.180340
1398,10.099504,10.049875,11.747340,11.135529,10.630146,10.488089,10.862781,10.049875,10.583005,10.535654,...,10.488089,10.488089,10.392304,11.313708,10.954452,11.090536,11.180340,11.045361,11.313708,11.224972


In [49]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[0.    3.606 8.    4.243 4.69  4.123 4.123 3.606 3.464 4.472 4.796 3.742
 3.606 5.745 5.745 1.    2.449 3.464 5.099 4.    3.162 4.    4.899 1.
 3.317 1.    3.162 2.449 3.606 3.464 5.099 4.359 5.    4.123 4.123 2.
 4.472 3.606 2.449 3.742 4.69  5.831 3.464 5.385 3.873 4.583 5.477 4.359
 1.    1.414 3.606 4.583 3.464 3.317 2.236 6.    3.    3.162 3.606 3.162
 3.606 3.464 3.606 5.385 4.123 2.    5.099 4.472 1.    5.    5.099 3.162
 3.    3.742 1.    0.    3.317 3.742 3.317 4.359 4.472 2.646 3.317 3.162
 4.472 2.828 4.    3.464 2.449 4.899 2.828 6.557 4.69  3.162 0.    3.317
 2.828 4.243 0.    3.317 7.348 3.    4.    0.    4.    2.646 4.123 0.
 3.464 1.    2.236 3.606 3.    3.    6.164 1.    3.606 0.    4.123 3.317
 3.873 3.464 4.796 4.123 4.243 4.    3.606 3.    3.    3.873 1.732 4.123
 3.464 3.317 2.828 3.    3.    4.    4.359 1.414 6.164 3.873 3.464 3.606
 3.606 3.162 2.828 3.742 4.359 6.403 3.162 3.873 3.162 3.317 0.    2.828
 3.742 4.123 4.583 3.    3.873 3.606 3.162 3.873 0.    4.   

In [50]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False False False  True  True  True  True False False  True
  True False False  True  True  True False  True  True  True False  True
  True  True  True  True  True  True False False False  True  True  True
 False  True  True  True False False  True False  True False False False
  True  True  True False  True  True  True False  True  True  True  True
  True  True  True False  True  True False False  True False False  True
  True  True  True  True  True  True  True False False  True  True  True
 False  True  True  True  True False  True False False  True  True  True
  True False  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True False  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True False  True  True  True
  True  True  True  True False False  True  True  True  True  True  True
  True  True False  True  True  True  True  True  T

In [51]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7663817663817664


In [52]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   1   5   6   7   8  11  12  15  16  17  19  20  21  23  24  25  26
  27  28  29  33  34  35  37  38  39  42  44  48  49  50  52  53  54  56
  57  58  59  60  61  62  64  65  68  71  72  73  74  75  76  77  78  81
  82  83  85  86  87  88  90  93  94  95  96  98  99 101 102 103 104 105
 106 107 108 109 110 111 112 113 115 116 117 118 119 120 121 123 125 126
 127 128 129 130 131 132 133 134 135 136 137 139 141 142 143 144 145 146
 147 150 151 152 153 154 155 156 157 159 160 161 162 163 164 165 167 168
 169 170 171 172 173 174 176 177 178 179 180 183 184 185 186 187 188 189
 190 191 192 193 195 196 197 198 199 200 201 202 203 204 205 206 208 209
 210 211 212 213 214 215 216 217 218 220 221 222 223 225 226 227 228 230
 232 233 234 235 237 239 240 242 244 246 247 248 249 250 251 252 254 255
 256 257 258 259 261 263 264 265 266 267 268 269 270 271 272 275 277 278
 279 280 281 282 283 284 286 287 291 292 294 297 298 299 300 301 303 304
 306 307 30

In [53]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [54]:
y_pred_svm_ad=list(y_pred_svm)

In [55]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [56]:
len(y_pred_svm_ad)

269

In [57]:
y_ts_ad=list(y_ts)

In [58]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [59]:
len(y_ts_ad)

269

In [60]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.75

In [61]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.66