# 1. Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.neural_network import MLPRegressor

# 2.Data entry and curation work set

In [2]:
uploaded_file_ws="datasets/HDAC2_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1288 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [3]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [4]:
len(y_tr)

1288

# 3.Standardization SDF file for work set

In [5]:
records_ws = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToMolBlock(all_mols_ws[i])
    records_ws.append(record)
            
mols_ws = []
for i,record in enumerate(records_ws):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ws.append(m)
           
moldf_ws = []
for val in mols_ws:
    if val != None:
        moldf_ws.append(val)
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  1288 molecules


# 4.Data entry and curation test set

In [6]:
uploaded_file_ts="datasets/HDAC2_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  323 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [7]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [8]:
len(y_ts)

323

# 5.Standardization SDF file for test set

In [9]:
records_ts = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToMolBlock(all_mols_ts[i])
    records_ts.append(record)
            
mols_ts = []
for i,record in enumerate(records_ts):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ts.append(m)
           
moldf_ts = []
for val in mols_ts:
    if val != None:
        moldf_ts.append(val)
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  323 molecules


## Calculation MACCS Fingerprints for work set

In [10]:
from rdkit.Chem import MACCSkeys

In [11]:
fp_tr = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ws]

In [12]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [13]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [14]:
savetxt('Models/MACCS/x_tr_MACCS.csv', x_tr, delimiter=',')

In [15]:
x_tr.shape

(1288, 167)

## Calculation  MACCS Fingerprint for test set

In [16]:
fp_ts = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ts]

In [17]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [18]:
x_ts = rdkit_numpy_convert(fp_ts)

In [19]:
x_ts.shape

(323, 167)

In [20]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

# 8. Random forest model building and validation

In [21]:
seed = 42

In [22]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [23]:
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3, x_tr.shape[1] // 2],
              "n_estimators": [100, 250, 500, 1000]}

In [24]:
m = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [25]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(), n_jobs=2,
             param_grid={'max_features': [16, 23, 33, 55, 83],
                         'n_estimators': [100, 250, 500, 1000]},
             verbose=1)

In [26]:
m.best_params_
best_RF = m.best_estimator_

In [27]:
y_pred_CV_RF = cross_val_predict(best_RF, x_tr, y_tr, cv=cv)

In [28]:
y_pred_CV_RF

array([5.13801312, 6.28675251, 5.88243634, ..., 7.43714888, 7.70960394,
       7.99381461])

In [29]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_RF), 2)
Q2_CV

0.54

In [30]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_RF)), 2)
RMSE_CV

0.74

# 9. Prediction for test set's molecules

In [31]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [32]:
y_pred_rf = best_RF.predict(x_ts)

In [33]:
Q2_TS = round(r2_score(y_ts, y_pred_rf), 2)
Q2_TS

0.56

In [34]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_rf)), 2)
RMSE_TS

0.73

# save the model to disk

In [35]:
pickle.dump(best_RF, open('Models/MACCS/HDAC2_RF_MACCS.pkl', 'wb'))

# load the model from disk

In [36]:
best_RF = pickle.load(open('Models/MACCS/HDAC2_RF_MACCS.pkl', 'rb'))

# 10. Y-randomization RF model

In [37]:
permutations = 100
score, permutation_scores, pvalue = permutation_test_score(best_RF, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.2min


True score =  0.53 
Y-randomization =  -0.22 
p-value =  0.0099


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.5min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [38]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [39]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.732051,4.795832,3.605551,4.358899,2.449490,3.162278,4.358899,1.414214,3.464102,4.000000,...,1.000000,1.414214,1.000000,0.000000,0.000000,3.162278,2.000000,1.000000,1.414214,2.236068
2,3.162278,4.795832,3.741657,4.898980,2.449490,3.872983,4.472136,2.000000,3.605551,5.291502,...,1.000000,2.236068,2.449490,0.000000,1.414214,3.464102,2.000000,2.236068,1.414214,2.236068
3,3.162278,5.196152,3.872983,4.898980,4.242640,3.872983,4.582576,2.236068,3.872983,5.477226,...,1.732051,2.449490,2.828427,1.000000,1.414214,3.605551,2.449490,2.645751,1.732051,2.236068
4,3.464102,5.196152,4.123106,4.898980,4.358899,4.000000,4.795832,3.000000,3.872983,5.656854,...,2.828427,4.242640,3.464102,1.000000,1.414214,3.741657,2.449490,3.605551,2.236068,2.236068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,7.745967,7.937254,8.062258,8.306623,8.366600,8.124039,8.124039,7.745967,8.246211,8.000000,...,7.874008,7.937254,8.185352,8.366600,8.366600,8.000000,8.124039,8.124039,7.937254,8.185352
1284,7.745967,7.937254,8.306623,8.306623,8.485281,8.124039,8.185352,7.745967,8.246211,8.000000,...,7.874008,7.937254,8.185352,8.426149,8.485281,8.062258,8.185352,8.124039,7.937254,8.185352
1285,7.810250,7.937254,8.306623,8.306623,8.485281,8.185352,8.246211,7.937254,8.246211,8.000000,...,8.000000,7.937254,8.185352,8.485281,8.485281,8.124039,8.246211,8.124039,7.937254,8.185352
1286,7.874008,8.000000,8.366600,8.366600,8.602325,8.185352,8.306623,8.246211,8.366600,8.000000,...,8.062258,8.185352,8.426149,8.544003,8.544003,8.246211,8.246211,8.366600,8.124039,8.366600


In [40]:
similarity= neighbors_k

In [41]:
Dmean=np.mean(similarity[1,:])

In [42]:
round(Dmean, 2)

1.63

In [43]:
std=np.std(similarity[1,:])

In [44]:
round(std, 2)

1.19

In [45]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.22


In [46]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [47]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,313,314,315,316,317,318,319,320,321,322
0,3.316625,3.464102,1.000000,3.464102,0.000000,0.000000,3.741657,1.000000,2.449490,1.000000,...,1.000000,1.414214,3.162278,0.000000,1.414214,1.414214,3.162278,2.645751,2.449490,1.000000
1,3.316625,3.605551,4.242640,4.242640,0.000000,0.000000,4.242640,2.449490,2.828427,1.414214,...,1.414214,2.645751,3.316625,1.414214,1.732051,2.000000,3.316625,2.828427,3.162278,1.000000
2,3.316625,4.123106,4.358899,4.795832,1.000000,1.414214,4.358899,3.162278,3.162278,2.236068,...,1.732051,3.464102,3.316625,1.414214,2.000000,2.449490,3.316625,3.000000,3.162278,1.414214
3,3.464102,4.242640,4.472136,4.898980,2.000000,2.236068,4.358899,3.872983,3.162278,2.645751,...,2.000000,3.741657,3.316625,1.414214,2.000000,2.449490,3.464102,3.162278,3.316625,1.732051
4,3.605551,4.795832,4.690416,5.000000,2.000000,2.449490,4.358899,4.795832,3.316625,3.872983,...,2.236068,4.123106,3.316625,1.732051,2.236068,2.645751,3.605551,3.464102,3.316625,1.732051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,7.937254,8.485281,8.185352,8.062258,7.937254,7.681146,8.124039,8.717798,7.810250,8.185352,...,8.185352,8.306623,8.306623,8.246211,8.246211,7.937254,7.937254,7.810250,8.246211,8.306623
1284,7.937254,8.485281,8.246211,8.062258,8.000000,7.745967,8.246211,8.774964,7.874008,8.185352,...,8.246211,8.366600,8.306623,8.246211,8.366600,8.000000,8.000000,7.937254,8.306623,8.426149
1285,8.000000,8.485281,8.306623,8.246211,8.062258,7.810250,8.306623,8.774964,7.937254,8.185352,...,8.246211,8.366600,8.306623,8.306623,8.366600,8.000000,8.000000,7.937254,8.306623,8.426149
1286,8.062258,8.544003,8.366600,8.366600,8.124039,7.810250,8.426149,8.831760,8.000000,8.185352,...,8.246211,8.426149,8.306623,8.306623,8.426149,8.000000,8.062258,8.000000,8.426149,8.485281


In [48]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.317 3.464 1.    3.464 0.    0.    3.742 1.    2.449 1.    1.732 3.
 4.243 2.    1.414 1.    2.646 0.    4.123 0.    2.    3.162 1.732 3.162
 1.414 2.646 0.    3.317 0.    2.    3.    1.    2.449 4.243 1.    1.414
 2.236 1.732 0.    1.    1.    2.    2.646 1.414 3.162 1.    4.243 1.414
 1.414 0.    3.873 1.    0.    0.    0.    1.414 2.236 0.    0.    1.
 3.464 2.236 1.    2.    3.606 0.    1.732 2.    1.414 0.    1.732 0.
 0.    0.    3.317 0.    0.    2.236 2.236 1.    1.732 2.    0.    2.236
 1.414 2.646 1.    2.449 2.449 0.    1.    0.    3.162 3.464 2.    1.
 0.    0.    0.    3.464 0.    2.449 0.    2.    2.236 3.742 0.    3.
 1.414 3.    1.    1.414 1.414 0.    3.317 0.    1.    2.236 0.    1.414
 1.732 1.732 2.236 3.873 1.414 0.    2.449 1.    2.    1.    3.162 0.
 2.236 3.606 3.    4.359 2.236 0.    3.162 4.472 1.414 2.    2.236 0.
 1.    1.414 2.646 2.    1.732 0.    1.414 1.    0.    1.414 1.    1.
 2.449 2.449 2.828 2.646 2.449 2.    0.    0.    1.414 0.    0.    1.414
 0

In [49]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[False False  True False  True  True False  True False  True  True False
 False  True  True  True False  True False  True  True False  True False
  True False  True False  True  True False  True False False  True  True
 False  True  True  True  True  True False  True False  True False  True
  True  True False  True  True  True  True  True False  True  True  True
 False False  True  True False  True  True  True  True  True  True  True
  True  True False  True  True False False  True  True  True  True False
  True False  True False False  True  True  True False False  True  True
  True  True  True False  True False  True  True False False  True False
  True False  True  True  True  True False  True  True False  True  True
  True  True False False  True  True False  True  True  True False  True
 False False False False False  True False False  True  True False  True
  True  True False  True  True  True  True  True  True  True  True  True
 False False False False False  True  True  True  T

In [50]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.653250773993808


In [51]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  2   4   5   7   9  10  13  14  15  17  19  20  22  24  26  28  29  31
  34  35  37  38  39  40  41  43  45  47  48  49  51  52  53  54  55  57
  58  59  62  63  65  66  67  68  69  70  71  72  73  75  76  79  80  81
  82  84  86  89  90  91  94  95  96  97  98 100 102 103 106 108 110 111
 112 113 115 116 118 119 120 121 124 125 127 128 129 131 137 140 141 143
 144 145 147 148 149 150 151 152 153 154 155 161 162 163 164 165 166 167
 168 169 172 173 174 176 177 178 181 182 184 187 188 189 191 192 194 196
 199 200 201 205 208 209 210 211 212 214 216 217 218 219 221 223 226 228
 229 230 232 233 234 235 236 237 238 239 240 241 242 244 245 246 247 248
 249 250 252 253 254 255 256 257 260 261 262 263 264 265 267 268 269 270
 272 273 277 278 279 281 284 285 286 287 288 289 290 291 292 293 295 298
 303 304 305 307 308 309 312 313 314 316 317 318 322]


In [52]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [53]:
y_pred_rf_ad=list(y_pred_rf)

In [54]:
y_pred_rf_ad[:] = [x for i,x in enumerate(y_pred_rf_ad) if i not in out_Ad]

In [55]:
len(y_pred_rf_ad)

211

In [56]:
y_ts_ad=list(y_ts)

In [57]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [58]:
len(y_ts_ad)

211

In [59]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_rf_ad), 2)
Q2_TS

0.64

In [60]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_rf_ad)), 2)
RMSE_TS

0.69

In [61]:
x_tr

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 1., 1., 0.]], dtype=float32)