# Importing modules and functions

In [13]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# 2.Data entry and curation work set

In [14]:
uploaded_file_ws="datasets/HDAC6_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  3083 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [15]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [16]:
len(y_tr)

3083

# 3.Standardization SDF file for work set

In [17]:
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToSmiles(all_mols_ws[i])
    records.append(record)

moldf_ws = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ws.append(m)
    
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  3083 molecules


# 4.Data entry and curation test set

In [18]:
uploaded_file_ts="datasets/HDAC6_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  771 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [19]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [20]:
len(y_ts)

771

# 5.Standardization SDF file for test set

In [21]:
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index] 
records = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToSmiles(all_mols_ts[i])
    records.append(record)

moldf_ts = []
for i,record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    moldf_ts.append(m)
    
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  771 molecules


## Calculation RDKit_2D descriptors for work set

In [22]:
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
header = calc.GetDescriptorNames()

In [23]:
descr_tr= []
for m in moldf_ws:
    descr_tr.append(calc.CalcDescriptors(m))
x_tr = np.asarray(descr_tr)

In [24]:
df_RDKit_2D = pd.DataFrame(x_tr,columns=header)

In [25]:
df_RDKit_2D.head(2)

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.242419,12.242419,0.074853,-0.968233,0.872997,278.264,264.152,278.090272,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.174333,12.174333,0.042267,-0.371769,0.17684,416.909,391.709,416.161518,154.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [26]:
df_RDKit_2D.isna().mean().sort_values(ascending=False)

BCUT2D_MWLOW      0.003568
BCUT2D_LOGPLOW    0.003568
BCUT2D_CHGLO      0.003568
BCUT2D_CHGHI      0.003568
BCUT2D_MRHI       0.003568
                    ...   
SlogP_VSA8        0.000000
SlogP_VSA9        0.000000
TPSA              0.000000
EState_VSA1       0.000000
fr_urea           0.000000
Length: 209, dtype: float64

In [27]:
df_RDKit_2D=df_RDKit_2D.dropna(axis=1)
df_RDKit_2D

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.242419,12.242419,0.074853,-0.968233,0.872997,278.264,264.152,278.090272,106.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.174333,12.174333,0.042267,-0.371769,0.176840,416.909,391.709,416.161518,154.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
2,11.990951,11.990951,0.054892,-0.264842,0.815196,301.302,286.182,301.106256,114.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.873288,10.873288,0.534320,-0.534320,0.456792,213.236,202.148,213.078979,80.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.581454,2.581454,0.445824,0.445824,0.454787,272.476,240.220,272.250401,112.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,14.120048,14.120048,0.000665,-0.791896,0.586203,301.321,285.193,301.122655,114.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3079,12.247689,12.247689,0.057760,-0.384776,0.069031,516.606,484.350,516.259737,198.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3080,11.417085,11.417085,0.345049,-3.209919,0.425745,387.461,366.293,387.125277,142.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3081,12.132469,12.132469,0.021659,-0.374570,0.098503,504.595,472.339,504.259737,194.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [28]:
x_tr= df_RDKit_2D.to_numpy ()

In [29]:
# Data Standardization
from sklearn.preprocessing import StandardScaler
Scaler=StandardScaler()
x_tr=Scaler.fit_transform(x_tr)

In [30]:
x_tr.shape

(3083, 197)

In [45]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [31]:
savetxt('Models/RDKiT/x_tr_RDKiT.csv', x_tr, delimiter=',')

## Calculation  RDKit_2D descriptors for test set

In [32]:
descr_ts = []
for m in moldf_ts:
    descr_ts.append(calc.CalcDescriptors(m))
x_ts = np.asarray(descr_ts)

In [33]:
x_ts.shape

(771, 209)

In [34]:
df_RDKit_2D_ts = pd.DataFrame(x_ts,columns=header)
df_RDKit_2D_ts=df_RDKit_2D_ts.dropna(axis=1)
df_RDKit_2D_ts

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,10.462074,10.462074,0.122184,-0.789763,0.825546,204.229,192.133,204.089878,78.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.347016,12.347016,0.044100,-0.408075,0.249061,494.720,448.352,494.350843,198.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,11.705734,11.705734,0.208479,-0.548897,0.249396,446.547,416.307,446.220557,172.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13.575602,13.575602,0.046998,-0.826780,0.142426,540.661,500.341,540.294785,212.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
4,11.474395,11.474395,0.062912,-0.557235,0.285460,338.433,316.257,338.141262,126.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
766,12.491330,12.491330,0.078379,-0.334371,0.199201,464.636,428.348,464.256960,178.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0
767,13.985815,13.985815,0.000132,-0.822281,0.592558,290.298,275.178,290.117904,110.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
768,14.047607,14.047607,0.154280,-1.051567,0.597350,323.274,311.178,323.088161,120.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
769,12.260501,12.260501,0.092378,-0.391658,0.193074,396.447,372.255,396.179755,152.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [35]:
x_ts= df_RDKit_2D_ts.to_numpy ()

In [36]:
x_ts = Scaler.transform(x_ts)

In [46]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

# load the models from disk

In [37]:
best_svm = pickle.load(open('Models/RDKiT/HDAC6_SVM_RDKiT.pkl', 'rb'))

In [38]:
best_gbr = pickle.load(open('Models/RDKiT/HDAC6_GBR_RDKiT.pkl', 'rb'))

In [39]:
best_MLPR = pickle.load(open('Models/RDKiT/HDAC6_MLPR_RDKiT.pkl', 'rb'))

In [40]:
best_kNN = pickle.load(open('Models/RDKiT/HDAC6_kNN_RDKiT.pkl', 'rb'))

# Prediction for CV

In [47]:
y_pred_ws_GBR = best_gbr.predict(x_tr)

In [48]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [49]:
y_pred_con=(y_pred_ws_GBR+y_pred_ws_svm)/2

In [50]:
R2_WS = round(r2_score(y_tr, y_pred_con), 2)
R2_WS

0.97

In [51]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)), 2)
RMSE_WS

0.37

In [52]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [53]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [54]:
y_pred_CV_gbr = cross_val_predict(best_gbr, x_tr, y_tr, cv=cv)

In [55]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [56]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

# For all models

In [57]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_gbr+y_pred_CV_MLPR+y_pred_CV_kNN)/4

In [58]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.63

In [59]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

0.7

# three models: svm+gbr+MLPR

In [60]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_gbr+y_pred_CV_MLPR)/3

In [61]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.62

In [62]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

0.7

# two models: svm+gbr

In [63]:
y_pred_con_without_MLPR=(y_pred_CV_svm+y_pred_CV_gbr)/2

In [64]:
Q2_CV = round(r2_score(y_tr, y_pred_con_without_MLPR), 2)
Q2_CV

0.63

In [65]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con_without_MLPR)),2)
RMSE_CV

0.7

# Prediction for test set's molecules

In [66]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [67]:
y_pred_svm = best_svm.predict(x_ts)

In [68]:
y_pred_gbr = best_gbr.predict(x_ts)

In [69]:
y_pred_con=(y_pred_svm+y_pred_gbr)/2

In [70]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.67

In [71]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.68

# Estimating applicability domain. Method - Euclidian distances, K=1

In [72]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [73]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3073,3074,3075,3076,3077,3078,3079,3080,3081,3082
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.900746,7.490754,4.863770,4.216553,17.930073,5.799909,12.561265,5.479323,7.492610,7.368469,...,4.920031,4.072734,4.076962,5.708169,0.658189,4.072734,4.407693,4.652277,3.443594,4.879563
2,7.109359,8.714100,6.138494,4.775836,17.975319,10.456244,13.250570,5.497796,7.601954,7.723112,...,5.171291,5.859732,4.348378,6.024629,0.988269,5.060231,5.224220,6.131215,4.407693,4.903585
3,8.137948,9.820560,6.308080,4.847589,18.707184,10.577019,13.336942,5.805781,7.664309,7.915602,...,6.813833,6.497852,4.839753,6.250508,1.010266,5.063005,5.662558,6.684643,4.833030,5.560298
4,9.989728,9.854204,7.085804,4.960750,18.861732,10.596408,13.549530,5.840825,9.903274,7.977097,...,7.405154,6.540346,5.370556,6.365889,1.255771,5.138348,6.034671,6.959353,4.916375,5.952220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,59.709450,57.089840,58.331394,58.048542,60.361908,59.614731,60.295795,56.343266,59.286362,57.215279,...,57.023624,58.033653,55.505810,57.839581,57.789265,57.883804,52.422710,58.245987,52.542534,58.314781
3079,64.622200,58.628857,61.113022,65.252991,66.739883,63.430611,64.864288,59.616123,60.062920,58.115875,...,58.553734,61.865906,58.377594,59.986343,61.820324,61.826904,59.121777,59.020580,59.263229,58.379990
3080,81.340500,75.280838,78.340683,82.818199,84.909103,80.951973,81.381165,74.253502,77.427872,74.428055,...,73.744194,78.953934,72.996773,77.820648,79.473152,79.284737,70.352531,76.791962,70.447548,76.546432
3081,85.425507,79.865417,82.594971,86.833618,88.959114,85.144142,85.474640,78.858673,81.937691,78.974007,...,78.352058,83.185013,77.640060,82.157242,83.693398,83.512268,75.306541,81.202240,75.398499,80.949966


In [74]:
similarity= neighbors_k

In [75]:
Dmean=np.mean(similarity[1,:])

In [76]:
round(Dmean, 2)

3.81

In [77]:
std=np.std(similarity[1,:])

In [78]:
round(std, 2)

2.93

In [79]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

5.28


In [80]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [81]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,5.562876,9.408307,2.476244,7.263117,5.282289,1.583892,4.525869,4.086624,1.226136,5.644353,...,3.599905,5.858726,2.397434,6.870955,0.624128,5.352804,4.178713,4.182506,2.802792,6.270564
1,6.275527,9.772277,3.742494,9.391856,5.966987,6.967967,4.654638,5.942024,1.366599,5.758951,...,4.647412,7.501709,2.406615,7.081888,1.123838,8.497024,4.706113,4.375260,6.100465,6.275194
2,9.078235,10.222068,5.327841,10.445791,6.363236,7.111341,4.662265,6.425875,2.359035,5.914622,...,4.780885,8.883467,3.993276,7.182392,1.419483,8.678516,5.038717,4.899669,6.551455,6.719943
3,9.173492,10.937669,5.398200,10.565662,6.781344,7.248561,4.665929,6.427113,8.193204,6.700883,...,4.980605,9.732864,4.486266,7.539695,1.828747,8.755605,6.219784,5.480123,6.700244,6.963034
4,10.453269,11.627253,8.801223,10.939350,6.816641,9.321835,4.682167,6.652034,9.358028,7.951190,...,5.120647,9.746282,6.545360,7.592781,2.017845,8.781173,6.347568,5.648501,6.784176,7.693971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,59.781460,56.179749,57.971455,52.362705,57.781895,58.590824,57.778069,51.572186,57.697525,57.476196,...,58.058849,59.868187,56.113762,54.403896,57.757385,54.564102,58.043728,58.252922,55.523853,54.207008
3079,67.205528,60.326141,58.778210,59.597603,59.278900,60.038208,62.330917,60.609188,58.992355,59.067913,...,61.397564,62.523899,59.718113,59.075142,61.826778,58.758202,62.399639,61.650723,58.134670,59.524067
3080,84.251755,74.074989,75.902603,70.851189,77.011703,75.895538,80.048813,71.155571,74.744392,74.372246,...,79.184990,79.435165,73.884781,73.209869,79.448738,72.676376,79.293182,78.995605,73.459663,71.604591
3081,88.179581,78.844856,80.434784,75.772133,81.400314,80.516991,84.226692,76.132507,79.301971,78.991112,...,83.416595,83.711998,78.526108,77.863152,83.664680,77.495522,83.496391,83.220802,78.117104,76.416847


In [82]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[5.56300e+00 9.40800e+00 2.47600e+00 7.26300e+00 5.28200e+00 1.58400e+00
 4.52600e+00 4.08700e+00 1.22600e+00 5.64400e+00 3.68800e+00 2.49300e+00
 1.16520e+01 5.60800e+00 1.98100e+00 2.12100e+00 1.44400e+00 5.20800e+00
 1.76500e+00 4.26400e+00 1.88300e+00 6.91600e+00 9.43100e+00 4.68400e+00
 1.49200e+01 2.24000e+00 2.77300e+00 7.36600e+00 1.96200e+00 4.84400e+00
 2.91800e+00 6.29500e+00 4.33100e+00 9.20700e+00 4.91700e+00 3.02300e+00
 4.15400e+00 5.92200e+00 5.15800e+00 4.62200e+00 7.16300e+00 3.86700e+00
 3.04700e+00 1.13160e+01 1.32220e+01 2.01300e+00 3.88900e+00 1.92400e+00
 3.34400e+00 3.48100e+00 4.09700e+00 1.64800e+00 7.86500e+00 1.32260e+01
 5.01000e+00 2.94400e+00 3.82500e+00 4.52400e+00 1.37280e+01 2.89200e+00
 1.03440e+01 2.26700e+00 3.71600e+00 5.77700e+00 6.72900e+00 6.57400e+00
 3.56200e+00 1.62200e+00 1.30300e+00 3.93600e+00 3.38100e+00 2.17900e+00
 1.41400e+00 8.35800e+00 1.30980e+01 2.54800e+00 7.36000e-01 6.53300e+00
 1.25400e+01 3.85900e+00 3.29900e+00 5.27200e+00 6.

In [83]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[False False  True False False  True  True  True  True False  True  True
 False False  True  True  True  True  True  True  True False False  True
 False  True  True False  True  True  True False  True False  True  True
  True False  True  True False  True  True False False  True  True  True
  True  True  True  True False False  True  True  True  True False  True
 False  True  True False False False  True  True  True  True  True  True
  True False False  True  True False False  True  True  True False  True
 False  True  True  True  True  True  True False  True False  True False
 False  True  True False  True False  True False False  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True False  True False  True
  True  True False  True  True  True  True  True  True  True  True  True
  True False  True  True False  True  True  True False False  True  True
  True False  True  True  True  True  True  True  T

In [84]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7833981841763943


In [85]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  2   5   6   7   8  10  11  14  15  16  17  18  19  20  23  25  26  28
  29  30  32  34  35  36  38  39  41  42  45  46  47  48  49  50  51  54
  55  56  57  59  61  62  66  67  68  69  70  71  72  75  76  79  80  81
  83  85  86  87  88  89  90  92  94  97  98 100 102 105 106 107 108 109
 110 111 112 113 115 116 117 118 119 120 121 122 123 124 125 126 127 129
 131 132 133 135 136 137 138 139 140 141 142 143 144 146 147 149 150 151
 154 155 156 158 159 160 161 162 163 164 165 166 169 170 172 173 174 175
 176 178 179 180 181 182 183 184 185 186 187 189 190 191 192 193 194 195
 196 197 198 199 200 202 203 204 205 206 207 208 210 211 214 215 217 218
 219 220 221 222 223 224 225 226 228 230 231 232 233 234 236 237 238 239
 240 241 244 245 246 248 249 251 252 253 254 256 257 258 259 261 262 263
 264 265 266 268 270 271 272 273 275 278 279 280 281 282 283 284 285 286
 287 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 30

In [86]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [87]:
y_pred_con_ad=list(y_pred_con)

In [88]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [89]:
len(y_pred_con_ad)

604

In [90]:
y_ts_ad=list(y_ts)

In [91]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [92]:
len(y_ts_ad)

604

In [93]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.71

In [94]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.64