# 1. Importing modules and functions

In [51]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# 2.Data entry and curation work set

In [2]:
uploaded_file_ws="datasets/HDAC6_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  3083 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [3]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [4]:
len(y_tr)

3083

# 3.Standardization SDF file for work set

In [5]:
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToSmiles(all_mols_ws[i])
    records.append(record)

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ws.append(m)
    
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  3083 molecules


# 4.Data entry and curation test set

In [6]:
uploaded_file_ts="datasets/HDAC6_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  771 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [7]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [8]:
len(y_ts)

771

# 5.Standardization SDF file for test set

In [9]:
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToSmiles(all_mols_ts[i])
    records.append(record)

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ts.append(m)
    
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  771 molecules


# 6.Calculation MorganFingerprint for work set

In [10]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [11]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [12]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [13]:
x_tr.shape

(3083, 1024)

# 7.Calculation MorganFingerprint for test set

In [14]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [15]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [16]:
x_ts = rdkit_numpy_convert(fp_ts)

In [17]:
x_ts.shape

(771, 1024)

In [18]:
type(x_tr)

numpy.ndarray

In [19]:
x_tr

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [22]:
y_tr

array([ 4.02,  4.03,  4.06, ...,  9.77,  9.85, 10.  ], dtype=float32)

 # GradientBoostingRegressor model building and validation

In [87]:
seed = 42

In [88]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [27]:
param_grid = {'learning_rate': [0.02,0.05],
                  'subsample'    : [0.9, 0.5, 0.1],
                  'n_estimators' : [100,500,1000],
                  'max_depth'    : [4, 10]
                 }

In [28]:
m = GridSearchCV(GradientBoostingRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [29]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [30]:
best_GBR = m.best_estimator_

In [31]:
m.best_params_

{'learning_rate': 0.02,
 'max_depth': 10,
 'n_estimators': 1000,
 'subsample': 0.5}

In [28]:
y_pred_ws_GBR = best_GBR.predict(x_tr)

In [29]:
R2_WS = round(r2_score(y_tr, y_pred_ws_GBR), 2)
R2_WS

0.96

In [30]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_GBR)), 2)
RMSE_WS

0.4

In [32]:
y_pred_CV_GBR = cross_val_predict(best_GBR, x_tr, y_tr, cv=cv)

In [33]:
y_pred_CV_GBR

array([5.89651444, 5.34990513, 5.92955438, ..., 7.74835486, 7.69122844,
       8.01240632])

In [34]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.68

In [35]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.68

# 9. Prediction for test set's molecules

In [41]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [42]:
y_pred_GBR = best_GBR.predict(x_ts)

In [43]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.7

In [44]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.67

# save the model to disk

In [40]:
pickle.dump(best_GBR, open('Models/Morgan_fingerprint/HDAC6_GBR_MF.pkl', 'wb'))

# load the model from disk

In [21]:
best_GBR = pickle.load(open('Models/Morgan_fingerprint/HDAC6_GBR_MF.pkl', 'rb'))

# 10. Y-randomization GradientBoostingRegressor model

In [41]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_GBR, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 102.5min


True score =  0.68 
Y-randomization =  -0.25 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 163.9min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [22]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [23]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.162278,5.656854,4.472136,3.872983,6.708204,3.316625,6.244998,5.000000,3.464102,6.480741,...,3.162278,3.605551,4.123106,4.242640,3.162278,3.605551,3.872983,3.741657,2.645751,4.795832
2,6.164414,6.000000,5.385165,4.358899,6.782330,3.872983,6.403124,5.099020,4.242640,6.633250,...,3.162278,5.567764,4.472136,4.358899,3.605551,5.099020,3.872983,4.000000,3.464102,5.477226
3,6.164414,6.082763,5.567764,4.795832,6.855655,4.472136,7.071068,5.099020,4.242640,6.708204,...,4.472136,5.916080,4.690416,4.472136,3.741657,5.291502,4.242640,4.242640,3.464102,5.567764
4,6.244998,6.082763,5.567764,4.795832,6.855655,5.196152,7.141428,5.099020,4.582576,6.708204,...,4.582576,6.000000,4.795832,4.690416,3.872983,5.477226,4.242640,4.242640,3.464102,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,11.000000,11.090536,11.269427,10.630146,11.313708,10.816654,11.532562,11.401754,11.401754,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.000000,11.313708,11.090536,11.180340,11.135529,10.862781
3079,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.135529,11.313708,11.180340,11.180340,11.135529,11.180340
3080,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.180340,11.445523,11.401754,11.090536,11.135529,11.313708,11.269427,11.224972,11.269427,11.180340
3081,11.180340,11.269427,11.357817,10.677078,11.401754,11.045361,11.704700,11.489125,11.532562,11.313708,...,11.180340,11.445523,11.401754,11.313708,11.135529,11.313708,11.401754,11.224972,11.313708,11.180340


In [24]:
similarity= neighbors_k

In [25]:
Dmean=np.mean(similarity[1,:])

In [26]:
round(Dmean, 2)

3.23

In [27]:
std=np.std(similarity[1,:])

In [28]:
round(std, 2)

1.37

In [29]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.91


In [30]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [31]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,3.605551,4.358899,3.316625,4.898979,5.000000,1.000000,2.449490,5.000000,2.449490,3.605551,...,3.162278,3.316625,3.872983,3.605551,2.645751,4.582576,3.605551,3.605551,1.000000,4.898979
1,4.898979,4.582576,3.605551,5.099020,5.196152,3.316625,3.464102,5.099020,2.828427,3.872983,...,4.123106,4.582576,4.000000,3.605551,3.605551,6.855655,5.099020,4.690416,3.605551,5.000000
2,5.196152,4.898979,3.605551,6.633250,5.196152,4.123106,3.872983,5.291503,3.464102,4.000000,...,4.582576,4.582576,4.795832,4.123106,3.872983,6.855655,5.196152,4.795832,5.477226,5.099020
3,5.196152,5.196152,4.000000,7.211103,5.291503,4.358899,3.872983,5.291503,4.358899,4.358899,...,4.582576,4.582576,5.567764,4.582576,4.000000,6.855655,5.385165,4.898979,5.477226,5.099020
4,5.477226,5.385165,5.567764,7.211103,5.385165,4.472136,4.123106,5.291503,6.633250,4.898979,...,4.582576,4.690416,6.082763,4.898979,4.242641,6.928203,5.385165,5.099020,5.656854,5.196152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,10.816654,11.224972,11.000000,11.313708,11.180340,11.401754,10.723805,11.489125,11.489125,10.862780,...,11.180340,10.677078,11.357817,11.489125,11.045361,11.180340,11.269428,11.090537,11.045361,11.090537
3079,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.532563,11.532563,10.862780,...,11.224972,10.677078,11.532563,11.575837,11.090537,11.224972,11.269428,11.090537,11.045361,11.135529
3080,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.575837,11.532563,10.862780,...,11.224972,10.770330,11.532563,11.789826,11.090537,11.269428,11.269428,11.090537,11.090537,11.135529
3081,10.816654,11.357817,11.401754,11.401754,11.180340,11.401754,11.000000,11.618950,11.532563,11.224972,...,11.224972,10.770330,11.532563,11.789826,11.090537,11.357817,11.269428,11.180340,11.090537,11.180340


In [32]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.606 4.359 3.317 4.899 5.    1.    2.449 5.    2.449 3.606 3.464 2.828
 6.928 3.162 0.    4.123 1.    5.    3.162 4.123 2.646 3.162 4.899 4.359
 6.    2.828 3.464 4.583 3.317 4.    3.742 7.    3.606 4.    4.243 3.606
 3.873 3.873 3.317 3.606 4.    1.    3.317 4.472 2.646 4.359 2.828 2.828
 3.162 3.464 3.873 3.464 5.916 3.162 4.359 1.732 3.    3.606 6.481 3.742
 3.606 3.742 3.464 2.828 5.099 4.123 3.162 3.464 3.    3.606 3.742 3.162
 1.    4.899 4.472 3.464 3.    4.472 6.856 1.    3.606 4.359 3.742 3.606
 5.831 3.464 4.359 2.    4.69  1.    2.    4.69  3.742 4.69  2.646 4.583
 3.742 3.162 1.414 3.464 1.414 6.481 0.    4.    2.646 4.359 2.828 3.873
 3.162 3.464 1.    3.606 4.359 3.    3.464 3.317 2.449 2.236 3.317 1.
 3.606 2.236 0.    4.243 4.243 3.742 3.464 4.243 4.583 2.449 3.873 0.
 2.828 2.449 3.873 1.732 5.099 3.464 0.    2.449 3.317 3.    3.606 0.
 3.    4.    0.    2.    3.606 1.414 3.742 1.414 4.    5.477 2.828 4.
 4.123 4.    4.899 0.    3.742 3.    3.873 3.162 3.464 0.    2.

In [33]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True False  True False False  True  True False  True  True  True  True
 False  True  True False  True False  True False  True  True False False
 False  True  True False  True False  True False  True False False  True
  True  True  True  True False  True  True False  True False  True  True
  True  True  True  True False  True False  True  True  True False  True
  True  True  True  True False False  True  True  True  True  True  True
  True False False  True  True False False  True  True False  True  True
 False  True False  True False  True  True False  True False  True False
  True  True  True  True  True False  True False  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True False False  True  True False False  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True False False  True False
 False False False  True  True  True  True  True  T

In [34]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.72


In [35]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   2   5   6   8   9  10  11  13  14  16  18  20  21  25  26  28  30
  32  35  36  37  38  39  41  42  44  46  47  48  49  50  51  53  55  56
  57  59  60  61  62  63  66  67  68  69  70  71  72  75  76  79  80  82
  83  85  87  89  90  92  94  96  97  98  99 100 102 104 106 107 108 109
 110 111 113 114 115 116 117 118 119 120 121 122 125 126 129 130 131 132
 133 134 135 137 138 139 140 141 142 143 144 146 147 148 149 150 151 154
 159 160 161 162 163 164 165 166 168 169 170 172 174 175 176 181 182 183
 184 185 187 188 189 190 191 193 195 196 197 198 200 201 202 203 204 205
 206 207 208 209 210 211 212 214 217 218 219 220 221 223 224 225 226 228
 230 231 232 233 237 238 239 240 241 244 245 246 248 249 250 251 252 253
 254 255 256 257 258 259 261 262 264 265 266 268 270 271 273 274 275 276
 278 279 280 281 282 284 285 286 288 289 290 291 292 293 294 295 296 297
 298 299 300 302 305 306 309 310 311 312 314 315 316 317 320 321 324 327
 328 329 33

In [36]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [45]:
y_pred_GBR_ad=list(y_pred_GBR)

In [46]:
y_pred_GBR_ad[:] = [x for i,x in enumerate(y_pred_GBR_ad) if i not in out_Ad]

In [47]:
len(y_pred_GBR_ad)

553

In [48]:
y_ts_ad=list(y_ts)

In [49]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [50]:
len(y_ts_ad)

553

In [51]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.72

In [52]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.64

# SVM model building and validation

In [73]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [25]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [75]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [76]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [77]:
svm.best_params_
best_svm = svm.best_estimator_

In [78]:
svm.best_params_

{'C': 10, 'gamma': 0.01}

In [33]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [35]:
R2_WS = round(r2_score(y_tr, y_pred_ws_svm), 2)
R2_WS

0.94

In [36]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_svm)), 2)
RMSE_WS

0.45

In [79]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [81]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.66

In [82]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.69

# 9. Prediction for test set's molecules

In [83]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [54]:
y_pred_svm = best_svm.predict(x_ts)

In [55]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.68

In [56]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.68

save the model to disk

In [87]:
pickle.dump(best_svm, open('Models/Morgan_fingerprint/HDAC6_SVM_MF.pkl', 'wb'))

load the model from disk

In [53]:
best_svm = pickle.load(open('Models/Morgan_fingerprint/HDAC6_SVM_MF.pkl', 'rb'))

# 10. Y-randomization SVM model

In [88]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_svm, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(3),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  7.5min


True score =  0.655 
Y-randomization =  -0.31 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 11.8min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [57]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [58]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.162278,5.656854,4.472136,3.872983,6.708204,3.316625,6.244998,5.000000,3.464102,6.480741,...,3.162278,3.605551,4.123106,4.242640,3.162278,3.605551,3.872983,3.741657,2.645751,4.795832
2,6.164414,6.000000,5.385165,4.358899,6.782330,3.872983,6.403124,5.099020,4.242640,6.633250,...,3.162278,5.567764,4.472136,4.358899,3.605551,5.099020,3.872983,4.000000,3.464102,5.477226
3,6.164414,6.082763,5.567764,4.795832,6.855655,4.472136,7.071068,5.099020,4.242640,6.708204,...,4.472136,5.916080,4.690416,4.472136,3.741657,5.291502,4.242640,4.242640,3.464102,5.567764
4,6.244998,6.082763,5.567764,4.795832,6.855655,5.196152,7.141428,5.099020,4.582576,6.708204,...,4.582576,6.000000,4.795832,4.690416,3.872983,5.477226,4.242640,4.242640,3.464102,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,11.000000,11.090536,11.269427,10.630146,11.313708,10.816654,11.532562,11.401754,11.401754,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.000000,11.313708,11.090536,11.180340,11.135529,10.862781
3079,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.135529,11.313708,11.180340,11.180340,11.135529,11.180340
3080,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.180340,11.445523,11.401754,11.090536,11.135529,11.313708,11.269427,11.224972,11.269427,11.180340
3081,11.180340,11.269427,11.357817,10.677078,11.401754,11.045361,11.704700,11.489125,11.532562,11.313708,...,11.180340,11.445523,11.401754,11.313708,11.135529,11.313708,11.401754,11.224972,11.313708,11.180340


In [59]:
similarity= neighbors_k

In [60]:
Dmean=np.mean(similarity[1,:])

In [61]:
round(Dmean, 2)

3.23

In [62]:
std=np.std(similarity[1,:])

In [63]:
round(std, 2)

1.37

In [64]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.91


In [65]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [66]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,3.605551,4.358899,3.316625,4.898980,5.000000,1.000000,2.449490,5.000000,2.449490,3.605551,...,3.162278,3.316625,3.872983,3.605551,2.645751,4.582576,3.605551,3.605551,1.000000,4.898980
1,4.898980,4.582576,3.605551,5.099020,5.196152,3.316625,3.464102,5.099020,2.828427,3.872983,...,4.123106,4.582576,4.000000,3.605551,3.605551,6.855655,5.099020,4.690416,3.605551,5.000000
2,5.196152,4.898980,3.605551,6.633250,5.196152,4.123106,3.872983,5.291502,3.464102,4.000000,...,4.582576,4.582576,4.795832,4.123106,3.872983,6.855655,5.196152,4.795832,5.477226,5.099020
3,5.196152,5.196152,4.000000,7.211102,5.291502,4.358899,3.872983,5.291502,4.358899,4.358899,...,4.582576,4.582576,5.567764,4.582576,4.000000,6.855655,5.385165,4.898980,5.477226,5.099020
4,5.477226,5.385165,5.567764,7.211102,5.385165,4.472136,4.123106,5.291502,6.633250,4.898980,...,4.582576,4.690416,6.082763,4.898980,4.242640,6.928203,5.385165,5.099020,5.656854,5.196152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,10.816654,11.224972,11.000000,11.313708,11.180340,11.401754,10.723805,11.489125,11.489125,10.862781,...,11.180340,10.677078,11.357817,11.489125,11.045361,11.180340,11.269427,11.090536,11.045361,11.090536
3079,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.532562,11.532562,10.862781,...,11.224972,10.677078,11.532562,11.575837,11.090536,11.224972,11.269427,11.090536,11.045361,11.135529
3080,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.575837,11.532562,10.862781,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.269427,11.269427,11.090536,11.090536,11.135529
3081,10.816654,11.357817,11.401754,11.401754,11.180340,11.401754,11.000000,11.618950,11.532562,11.224972,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.357817,11.269427,11.180340,11.090536,11.180340


In [67]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.606 4.359 3.317 4.899 5.    1.    2.449 5.    2.449 3.606 3.464 2.828
 6.928 3.162 0.    4.123 1.    5.    3.162 4.123 2.646 3.162 4.899 4.359
 6.    2.828 3.464 4.583 3.317 4.    3.742 7.    3.606 4.    4.243 3.606
 3.873 3.873 3.317 3.606 4.    1.    3.317 4.472 2.646 4.359 2.828 2.828
 3.162 3.464 3.873 3.464 5.916 3.162 4.359 1.732 3.    3.606 6.481 3.742
 3.606 3.742 3.464 2.828 5.099 4.123 3.162 3.464 3.    3.606 3.742 3.162
 1.    4.899 4.472 3.464 3.    4.472 6.856 1.    3.606 4.359 3.742 3.606
 5.831 3.464 4.359 2.    4.69  1.    2.    4.69  3.742 4.69  2.646 4.583
 3.742 3.162 1.414 3.464 1.414 6.481 0.    4.    2.646 4.359 2.828 3.873
 3.162 3.464 1.    3.606 4.359 3.    3.464 3.317 2.449 2.236 3.317 1.
 3.606 2.236 0.    4.243 4.243 3.742 3.464 4.243 4.583 2.449 3.873 0.
 2.828 2.449 3.873 1.732 5.099 3.464 0.    2.449 3.317 3.    3.606 0.
 3.    4.    0.    2.    3.606 1.414 3.742 1.414 4.    5.477 2.828 4.
 4.123 4.    4.899 0.    3.742 3.    3.873 3.162 3.464 0.    2.

In [68]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True False  True False False  True  True False  True  True  True  True
 False  True  True False  True False  True False  True  True False False
 False  True  True False  True False  True False  True False False  True
  True  True  True  True False  True  True False  True False  True  True
  True  True  True  True False  True False  True  True  True False  True
  True  True  True  True False False  True  True  True  True  True  True
  True False False  True  True False False  True  True False  True  True
 False  True False  True False  True  True False  True False  True False
  True  True  True  True  True False  True False  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True False False  True  True False False  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True False False  True False
 False False False  True  True  True  True  True  T

In [69]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.72


In [70]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   2   5   6   8   9  10  11  13  14  16  18  20  21  25  26  28  30
  32  35  36  37  38  39  41  42  44  46  47  48  49  50  51  53  55  56
  57  59  60  61  62  63  66  67  68  69  70  71  72  75  76  79  80  82
  83  85  87  89  90  92  94  96  97  98  99 100 102 104 106 107 108 109
 110 111 113 114 115 116 117 118 119 120 121 122 125 126 129 130 131 132
 133 134 135 137 138 139 140 141 142 143 144 146 147 148 149 150 151 154
 159 160 161 162 163 164 165 166 168 169 170 172 174 175 176 181 182 183
 184 185 187 188 189 190 191 193 195 196 197 198 200 201 202 203 204 205
 206 207 208 209 210 211 212 214 217 218 219 220 221 223 224 225 226 228
 230 231 232 233 237 238 239 240 241 244 245 246 248 249 250 251 252 253
 254 255 256 257 258 259 261 262 264 265 266 268 270 271 273 274 275 276
 278 279 280 281 282 284 285 286 288 289 290 291 292 293 294 295 296 297
 298 299 300 302 305 306 309 310 311 312 314 315 316 317 320 321 324 327
 328 329 33

In [71]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [72]:
y_pred_svm_ad=list(y_pred_svm)

In [73]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [74]:
len(y_pred_svm_ad)

553

In [75]:
y_ts_ad=list(y_ts)

In [76]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [77]:
len(y_ts_ad)

553

In [78]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.73

In [79]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.64

# Multi-layer Perceptron regressor

In [84]:
from sklearn.neural_network import MLPRegressor

In [90]:
param_grid ={"hidden_layer_sizes": [(400, 300, 200, 100),(100, 100, 100), (10, 10, 10),(50,)], "activation": ["tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": [0.00005,0.0005], 'max_iter': [1000, 2000]}

In [91]:
m = GridSearchCV(MLPRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [92]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [93]:
best_MLPR = m.best_estimator_

In [94]:
m.best_params_

{'activation': 'relu',
 'alpha': 5e-05,
 'hidden_layer_sizes': (400, 300, 200, 100),
 'max_iter': 1000,
 'solver': 'lbfgs'}

In [95]:
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [96]:
R2_WS = round(r2_score(y_tr, y_pred_ws_MLPR), 2)
R2_WS

0.99

In [97]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_MLPR)), 2)
RMSE_WS

0.15

In [101]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [102]:
y_pred_CV_MLPR

array([5.63223732, 5.59104909, 5.3910536 , ..., 8.25383691, 6.65215014,
       7.93132945])

In [103]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_MLPR), 2)
Q2_CV

0.61

In [104]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_MLPR)), 2)
RMSE_CV

0.71

# 9. Prediction for test set's molecules

In [24]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [25]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [26]:
Q2_TS = round(r2_score(y_ts, y_pred_MLPR), 2)
Q2_TS

0.64

In [27]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_MLPR)), 2)
RMSE_TS

0.69

# save the model to disk

In [110]:
pickle.dump(best_MLPR, open('Models/Morgan_fingerprint/HDAC6_MLPR_MF.pkl', 'wb'))

# load the model from disk

In [23]:
best_MLPR = pickle.load(open('Models/Morgan_fingerprint/HDAC6_MLPR_MF.pkl', 'rb'))

# 10. Y-randomization MLPR

In [111]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_MLPR, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 124.2min


True score =  0.61 
Y-randomization =  -0.74 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 209.5min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [28]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [29]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.162278,5.656854,4.472136,3.872983,6.708204,3.316625,6.244998,5.000000,3.464102,6.480741,...,3.162278,3.605551,4.123106,4.242640,3.162278,3.605551,3.872983,3.741657,2.645751,4.795832
2,6.164414,6.000000,5.385165,4.358899,6.782330,3.872983,6.403124,5.099020,4.242640,6.633250,...,3.162278,5.567764,4.472136,4.358899,3.605551,5.099020,3.872983,4.000000,3.464102,5.477226
3,6.164414,6.082763,5.567764,4.795832,6.855655,4.472136,7.071068,5.099020,4.242640,6.708204,...,4.472136,5.916080,4.690416,4.472136,3.741657,5.291502,4.242640,4.242640,3.464102,5.567764
4,6.244998,6.082763,5.567764,4.795832,6.855655,5.196152,7.141428,5.099020,4.582576,6.708204,...,4.582576,6.000000,4.795832,4.690416,3.872983,5.477226,4.242640,4.242640,3.464102,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,11.000000,11.090536,11.269427,10.630146,11.313708,10.816654,11.532562,11.401754,11.401754,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.000000,11.313708,11.090536,11.180340,11.135529,10.862781
3079,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.135529,11.313708,11.180340,11.180340,11.135529,11.180340
3080,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.180340,11.445523,11.401754,11.090536,11.135529,11.313708,11.269427,11.224972,11.269427,11.180340
3081,11.180340,11.269427,11.357817,10.677078,11.401754,11.045361,11.704700,11.489125,11.532562,11.313708,...,11.180340,11.445523,11.401754,11.313708,11.135529,11.313708,11.401754,11.224972,11.313708,11.180340


In [30]:
similarity= neighbors_k

In [31]:
Dmean=np.mean(similarity[1,:])

In [32]:
round(Dmean, 2)

3.23

In [33]:
std=np.std(similarity[1,:])

In [34]:
round(std, 2)

1.37

In [35]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.91


In [36]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [37]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,3.605551,4.358899,3.316625,4.898980,5.000000,1.000000,2.449490,5.000000,2.449490,3.605551,...,3.162278,3.316625,3.872983,3.605551,2.645751,4.582576,3.605551,3.605551,1.000000,4.898980
1,4.898980,4.582576,3.605551,5.099020,5.196152,3.316625,3.464102,5.099020,2.828427,3.872983,...,4.123106,4.582576,4.000000,3.605551,3.605551,6.855655,5.099020,4.690416,3.605551,5.000000
2,5.196152,4.898980,3.605551,6.633250,5.196152,4.123106,3.872983,5.291502,3.464102,4.000000,...,4.582576,4.582576,4.795832,4.123106,3.872983,6.855655,5.196152,4.795832,5.477226,5.099020
3,5.196152,5.196152,4.000000,7.211102,5.291502,4.358899,3.872983,5.291502,4.358899,4.358899,...,4.582576,4.582576,5.567764,4.582576,4.000000,6.855655,5.385165,4.898980,5.477226,5.099020
4,5.477226,5.385165,5.567764,7.211102,5.385165,4.472136,4.123106,5.291502,6.633250,4.898980,...,4.582576,4.690416,6.082763,4.898980,4.242640,6.928203,5.385165,5.099020,5.656854,5.196152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,10.816654,11.224972,11.000000,11.313708,11.180340,11.401754,10.723805,11.489125,11.489125,10.862781,...,11.180340,10.677078,11.357817,11.489125,11.045361,11.180340,11.269427,11.090536,11.045361,11.090536
3079,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.532562,11.532562,10.862781,...,11.224972,10.677078,11.532562,11.575837,11.090536,11.224972,11.269427,11.090536,11.045361,11.135529
3080,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.575837,11.532562,10.862781,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.269427,11.269427,11.090536,11.090536,11.135529
3081,10.816654,11.357817,11.401754,11.401754,11.180340,11.401754,11.000000,11.618950,11.532562,11.224972,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.357817,11.269427,11.180340,11.090536,11.180340


In [38]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.606 4.359 3.317 4.899 5.    1.    2.449 5.    2.449 3.606 3.464 2.828
 6.928 3.162 0.    4.123 1.    5.    3.162 4.123 2.646 3.162 4.899 4.359
 6.    2.828 3.464 4.583 3.317 4.    3.742 7.    3.606 4.    4.243 3.606
 3.873 3.873 3.317 3.606 4.    1.    3.317 4.472 2.646 4.359 2.828 2.828
 3.162 3.464 3.873 3.464 5.916 3.162 4.359 1.732 3.    3.606 6.481 3.742
 3.606 3.742 3.464 2.828 5.099 4.123 3.162 3.464 3.    3.606 3.742 3.162
 1.    4.899 4.472 3.464 3.    4.472 6.856 1.    3.606 4.359 3.742 3.606
 5.831 3.464 4.359 2.    4.69  1.    2.    4.69  3.742 4.69  2.646 4.583
 3.742 3.162 1.414 3.464 1.414 6.481 0.    4.    2.646 4.359 2.828 3.873
 3.162 3.464 1.    3.606 4.359 3.    3.464 3.317 2.449 2.236 3.317 1.
 3.606 2.236 0.    4.243 4.243 3.742 3.464 4.243 4.583 2.449 3.873 0.
 2.828 2.449 3.873 1.732 5.099 3.464 0.    2.449 3.317 3.    3.606 0.
 3.    4.    0.    2.    3.606 1.414 3.742 1.414 4.    5.477 2.828 4.
 4.123 4.    4.899 0.    3.742 3.    3.873 3.162 3.464 0.    2.

In [39]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True False  True False False  True  True False  True  True  True  True
 False  True  True False  True False  True False  True  True False False
 False  True  True False  True False  True False  True False False  True
  True  True  True  True False  True  True False  True False  True  True
  True  True  True  True False  True False  True  True  True False  True
  True  True  True  True False False  True  True  True  True  True  True
  True False False  True  True False False  True  True False  True  True
 False  True False  True False  True  True False  True False  True False
  True  True  True  True  True False  True False  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True False False  True  True False False  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True False False  True False
 False False False  True  True  True  True  True  T

In [40]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.72


In [41]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   2   5   6   8   9  10  11  13  14  16  18  20  21  25  26  28  30
  32  35  36  37  38  39  41  42  44  46  47  48  49  50  51  53  55  56
  57  59  60  61  62  63  66  67  68  69  70  71  72  75  76  79  80  82
  83  85  87  89  90  92  94  96  97  98  99 100 102 104 106 107 108 109
 110 111 113 114 115 116 117 118 119 120 121 122 125 126 129 130 131 132
 133 134 135 137 138 139 140 141 142 143 144 146 147 148 149 150 151 154
 159 160 161 162 163 164 165 166 168 169 170 172 174 175 176 181 182 183
 184 185 187 188 189 190 191 193 195 196 197 198 200 201 202 203 204 205
 206 207 208 209 210 211 212 214 217 218 219 220 221 223 224 225 226 228
 230 231 232 233 237 238 239 240 241 244 245 246 248 249 250 251 252 253
 254 255 256 257 258 259 261 262 264 265 266 268 270 271 273 274 275 276
 278 279 280 281 282 284 285 286 288 289 290 291 292 293 294 295 296 297
 298 299 300 302 305 306 309 310 311 312 314 315 316 317 320 321 324 327
 328 329 33

In [42]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [43]:
y_pred_MLPR_ad=list(y_pred_MLPR)

In [44]:
y_pred_MLPR_ad[:] = [x for i,x in enumerate(y_pred_MLPR_ad) if i not in out_Ad]

In [45]:
len(y_pred_MLPR_ad)

553

In [46]:
y_ts_ad=list(y_ts)

In [47]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [48]:
len(y_ts_ad)

553

In [49]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_MLPR_ad), 2)
Q2_TS

0.69

In [50]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_MLPR_ad)), 2)
RMSE_TS

0.66

# k-nearest neighbors

In [140]:
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

In [141]:
m = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [142]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [143]:
best_kNN = m.best_estimator_

In [144]:
m.best_params_

{'n_neighbors': 3}

In [145]:
y_pred_ws_kNN = best_kNN.predict(x_tr)

In [146]:
R2_WS = round(r2_score(y_tr, y_pred_ws_kNN), 2)
R2_WS

0.82

In [147]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_kNN)), 2)
RMSE_WS

0.58

In [148]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

In [149]:
y_pred_CV_kNN

array([4.605    , 5.343333 , 5.343333 , ..., 7.4083333, 7.39     ,
       7.6666665], dtype=float32)

In [150]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_kNN), 2)
Q2_CV

0.6

In [151]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_kNN)), 2)
RMSE_CV

0.71

# 9. Prediction for test set's molecules

In [152]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [153]:
y_pred_kNN = best_kNN.predict(x_ts)

In [154]:
Q2_TS = round(r2_score(y_ts, y_pred_kNN), 2)
Q2_TS

0.6

In [155]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_kNN)), 2)
RMSE_TS

0.7

# save the model to disk

In [156]:
pickle.dump(best_kNN, open('Models/Morgan_fingerprint/HDAC6_kNN_MF.pkl', 'wb'))

# load the model from disk

In [138]:
best_kNN = pickle.load(open('Models/Morgan_fingerprint/HDAC6_kNN_MF.pkl', 'rb'))

# 10. Y-randomization MLPR

In [145]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_kNN, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.4s


True score =  0.59 
Y-randomization =  -0.34 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.6s finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [157]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [158]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.162278,5.656854,4.472136,3.872983,6.708204,3.316625,6.244998,5.000000,3.464102,6.480741,...,3.162278,3.605551,4.123106,4.242640,3.162278,3.605551,3.872983,3.741657,2.645751,4.795832
2,6.164414,6.000000,5.385165,4.358899,6.782330,3.872983,6.403124,5.099020,4.242640,6.633250,...,3.162278,5.567764,4.472136,4.358899,3.605551,5.099020,3.872983,4.000000,3.464102,5.477226
3,6.164414,6.082763,5.567764,4.795832,6.855655,4.472136,7.071068,5.099020,4.242640,6.708204,...,4.472136,5.916080,4.690416,4.472136,3.741657,5.291502,4.242640,4.242640,3.464102,5.567764
4,6.244998,6.082763,5.567764,4.795832,6.855655,5.196152,7.141428,5.099020,4.582576,6.708204,...,4.582576,6.000000,4.795832,4.690416,3.872983,5.477226,4.242640,4.242640,3.464102,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,11.000000,11.090536,11.269427,10.630146,11.313708,10.816654,11.532562,11.401754,11.401754,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.000000,11.313708,11.090536,11.180340,11.135529,10.862781
3079,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.090536,11.445523,11.401754,11.090536,11.135529,11.313708,11.180340,11.180340,11.135529,11.180340
3080,11.000000,11.090536,11.269427,10.677078,11.313708,10.816654,11.532562,11.401754,11.445523,11.313708,...,11.180340,11.445523,11.401754,11.090536,11.135529,11.313708,11.269427,11.224972,11.269427,11.180340
3081,11.180340,11.269427,11.357817,10.677078,11.401754,11.045361,11.704700,11.489125,11.532562,11.313708,...,11.180340,11.445523,11.401754,11.313708,11.135529,11.313708,11.401754,11.224972,11.313708,11.180340


In [159]:
similarity= neighbors_k

In [160]:
Dmean=np.mean(similarity[1,:])

In [161]:
round(Dmean, 2)

3.23

In [162]:
std=np.std(similarity[1,:])

In [163]:
round(std, 2)

1.37

In [164]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

3.91


In [165]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [166]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,3.605551,4.358899,3.316625,4.898980,5.000000,1.000000,2.449490,5.000000,2.449490,3.605551,...,3.162278,3.316625,3.872983,3.605551,2.645751,4.582576,3.605551,3.605551,1.000000,4.898980
1,4.898980,4.582576,3.605551,5.099020,5.196152,3.316625,3.464102,5.099020,2.828427,3.872983,...,4.123106,4.582576,4.000000,3.605551,3.605551,6.855655,5.099020,4.690416,3.605551,5.000000
2,5.196152,4.898980,3.605551,6.633250,5.196152,4.123106,3.872983,5.291502,3.464102,4.000000,...,4.582576,4.582576,4.795832,4.123106,3.872983,6.855655,5.196152,4.795832,5.477226,5.099020
3,5.196152,5.196152,4.000000,7.211102,5.291502,4.358899,3.872983,5.291502,4.358899,4.358899,...,4.582576,4.582576,5.567764,4.582576,4.000000,6.855655,5.385165,4.898980,5.477226,5.099020
4,5.477226,5.385165,5.567764,7.211102,5.385165,4.472136,4.123106,5.291502,6.633250,4.898980,...,4.582576,4.690416,6.082763,4.898980,4.242640,6.928203,5.385165,5.099020,5.656854,5.196152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,10.816654,11.224972,11.000000,11.313708,11.180340,11.401754,10.723805,11.489125,11.489125,10.862781,...,11.180340,10.677078,11.357817,11.489125,11.045361,11.180340,11.269427,11.090536,11.045361,11.090536
3079,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.532562,11.532562,10.862781,...,11.224972,10.677078,11.532562,11.575837,11.090536,11.224972,11.269427,11.090536,11.045361,11.135529
3080,10.816654,11.313708,11.401754,11.357817,11.180340,11.401754,11.000000,11.575837,11.532562,10.862781,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.269427,11.269427,11.090536,11.090536,11.135529
3081,10.816654,11.357817,11.401754,11.401754,11.180340,11.401754,11.000000,11.618950,11.532562,11.224972,...,11.224972,10.770329,11.532562,11.789826,11.090536,11.357817,11.269427,11.180340,11.090536,11.180340


In [167]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.606 4.359 3.317 4.899 5.    1.    2.449 5.    2.449 3.606 3.464 2.828
 6.928 3.162 0.    4.123 1.    5.    3.162 4.123 2.646 3.162 4.899 4.359
 6.    2.828 3.464 4.583 3.317 4.    3.742 7.    3.606 4.    4.243 3.606
 3.873 3.873 3.317 3.606 4.    1.    3.317 4.472 2.646 4.359 2.828 2.828
 3.162 3.464 3.873 3.464 5.916 3.162 4.359 1.732 3.    3.606 6.481 3.742
 3.606 3.742 3.464 2.828 5.099 4.123 3.162 3.464 3.    3.606 3.742 3.162
 1.    4.899 4.472 3.464 3.    4.472 6.856 1.    3.606 4.359 3.742 3.606
 5.831 3.464 4.359 2.    4.69  1.    2.    4.69  3.742 4.69  2.646 4.583
 3.742 3.162 1.414 3.464 1.414 6.481 0.    4.    2.646 4.359 2.828 3.873
 3.162 3.464 1.    3.606 4.359 3.    3.464 3.317 2.449 2.236 3.317 1.
 3.606 2.236 0.    4.243 4.243 3.742 3.464 4.243 4.583 2.449 3.873 0.
 2.828 2.449 3.873 1.732 5.099 3.464 0.    2.449 3.317 3.    3.606 0.
 3.    4.    0.    2.    3.606 1.414 3.742 1.414 4.    5.477 2.828 4.
 4.123 4.    4.899 0.    3.742 3.    3.873 3.162 3.464 0.    2.

In [168]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True False  True False False  True  True False  True  True  True  True
 False  True  True False  True False  True False  True  True False False
 False  True  True False  True False  True False  True False False  True
  True  True  True  True False  True  True False  True False  True  True
  True  True  True  True False  True False  True  True  True False  True
  True  True  True  True False False  True  True  True  True  True  True
  True False False  True  True False False  True  True False  True  True
 False  True False  True False  True  True False  True False  True False
  True  True  True  True  True False  True False  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True False False  True  True False False  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True False False  True False
 False False False  True  True  True  True  True  T

In [169]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.72


In [170]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   2   5   6   8   9  10  11  13  14  16  18  20  21  25  26  28  30
  32  35  36  37  38  39  41  42  44  46  47  48  49  50  51  53  55  56
  57  59  60  61  62  63  66  67  68  69  70  71  72  75  76  79  80  82
  83  85  87  89  90  92  94  96  97  98  99 100 102 104 106 107 108 109
 110 111 113 114 115 116 117 118 119 120 121 122 125 126 129 130 131 132
 133 134 135 137 138 139 140 141 142 143 144 146 147 148 149 150 151 154
 159 160 161 162 163 164 165 166 168 169 170 172 174 175 176 181 182 183
 184 185 187 188 189 190 191 193 195 196 197 198 200 201 202 203 204 205
 206 207 208 209 210 211 212 214 217 218 219 220 221 223 224 225 226 228
 230 231 232 233 237 238 239 240 241 244 245 246 248 249 250 251 252 253
 254 255 256 257 258 259 261 262 264 265 266 268 270 271 273 274 275 276
 278 279 280 281 282 284 285 286 288 289 290 291 292 293 294 295 296 297
 298 299 300 302 305 306 309 310 311 312 314 315 316 317 320 321 324 327
 328 329 33

In [171]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [172]:
y_pred_kNN_ad=list(y_pred_kNN)

In [173]:
y_pred_kNN_ad[:] = [x for i,x in enumerate(y_pred_kNN_ad) if i not in out_Ad]

In [174]:
len(y_pred_kNN_ad)

553

In [175]:
y_ts_ad=list(y_ts)

In [176]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [177]:
len(y_ts_ad)

553

In [178]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_kNN_ad), 2)
Q2_TS

0.64

In [179]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_kNN_ad)), 2)
RMSE_TS

0.67