# 1. Importing modules and functions

In [16]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [17]:
import glob
# Collect the fingerprint XML definition files in alphabetical order; the
# order matters because the files are later paired positionally with names.
xml_files = sorted(glob.glob("fingerprints_xml/*.xml"))
xml_files

['fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'fingerprints_xml\\EStateFingerprinter.xml',
 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'fingerprints_xml\\Fingerprinter.xml',
 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'fingerprints_xml\\MACCSFingerprinter.xml',
 'fingerprints_xml\\PubchemFingerprinter.xml',
 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'fingerprints_xml\\SubstructureFingerprinter.xml']

In [18]:
# Short fingerprint names, listed in the same (alphabetical) order as the
# sorted XML files so the two lists can be zipped together.
FP_list = [
    "AtomPairs2DCount",
    "AtomPairs2D",
    "EState",
    "CDKextended",
    "CDK",
    "CDKgraphonly",
    "KlekotaRothCount",
    "KlekotaRoth",
    "MACCS",
    "PubChem",
    "SubstructureCount",
    "Substructure",
]

In [19]:
# Map each short fingerprint name to its XML definition file. The pairing is
# purely positional, so a missing or extra XML file would silently attach names
# to the wrong files (zip stops at the shorter input) - fail loudly instead.
if len(FP_list) != len(xml_files):
    raise ValueError(
        "Expected %d fingerprint XML files but found %d; "
        "cannot pair names with files." % (len(FP_list), len(xml_files))
    )
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

# 2. Data entry and curation of the work set

In [20]:
# Read the work-set SDF without sanitization so that problematic records can
# be detected and reported here rather than being rejected by the reader.
uploaded_file_ws="datasets/HDAC3_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []      # records that could not be parsed or sanitized
all_mols_ws =[]          # one entry per record, in file order
wrong_structure_ws=[]    # 1-based record numbers of the failed records
wrong_smiles_ws=[]       # SMILES of the failed records ('' when unparsable)
y_tr = []                # activity values, index-aligned with all_mols_ws
y_bad_index=[]           # 0-based positions of the failed records

for i, m in enumerate(supplier_ws):
    if m is None:
        # The supplier yields None for a record it cannot parse at all. Keep
        # placeholders so positions stay aligned and flag the record as failed;
        # the placeholders are removed later via y_bad_index.
        all_mols_ws.append(None)
        y_tr.append(None)
        failed_mols_ws.append(m)
        wrong_smiles_ws.append('')
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
        continue
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate while sanitization failures are recorded.
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')

# Summary table of the failed records, numbered from 1.
number_ws = [str(n) for n in range(1, len(failed_mols_ws) + 1)]
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1400 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting the activity values of substances with an incorrect structure

In [21]:
# Remove, in place, the activity values whose position was flagged as failed.
y_tr[:] = [
    value
    for position, value in enumerate(y_tr)
    if position not in y_bad_index
]

In [22]:
# Sanity check: number of activity values kept for the work set.
len(y_tr)

1400

# 3. Standardization of the SDF file for the work set

In [23]:
# Drop the structures flagged as failed, then round-trip each remaining
# molecule: RDKit Mol -> SMILES -> molvs standardize_smiles -> RDKit Mol.
all_mols_ws[:] = [mol for pos, mol in enumerate(all_mols_ws) if pos not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ws]

moldf_ws = [Chem.MolFromSmiles(standardize_smiles(smiles)) for smiles in records]

print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  1400 molecules


In [24]:
# SMILES strings of the standardized work-set molecules, in order.
records_ws = [Chem.MolToSmiles(mol) for mol in moldf_ws]

In [25]:
# Write one SMILES per line (no header, no index) to the .smi file that is
# later passed to padeldescriptor as mol_dir.
df = pd.DataFrame(data=records_ws, columns=["Smiles"])
df.to_csv('datasets/molecule_ws.smi', header=False, index=False, sep=',')

# 4. Data entry and curation of the test set

In [26]:
# Read the test-set SDF without sanitization so that problematic records can
# be detected and reported here rather than being rejected by the reader.
uploaded_file_ts="datasets/HDAC3_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []      # records that could not be parsed or sanitized
all_mols_ts =[]          # one entry per record, in file order
wrong_structure_ts=[]    # 1-based record numbers of the failed records
wrong_smiles_ts=[]       # SMILES of the failed records ('' when unparsable)
y_ts = []                # activity values, index-aligned with all_mols_ts
y_bad_index=[]           # 0-based positions of the failed records (rebinds the work-set list)
for i, m in enumerate(supplier_ts):
    if m is None:
        # The supplier yields None for a record it cannot parse at all. Keep
        # placeholders so positions stay aligned and flag the record as failed;
        # the placeholders are removed later via y_bad_index.
        all_mols_ts.append(None)
        y_ts.append(None)
        failed_mols_ts.append(m)
        wrong_smiles_ts.append('')
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
        continue
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate while sanitization failures are recorded.
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')

# Summary table of the failed records, numbered from 1.
number_ts = [str(n) for n in range(1, len(failed_mols_ts) + 1)]
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  351 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting the activity values of substances with an incorrect structure

In [27]:
# Remove, in place, the activity values whose position was flagged as failed.
y_ts[:] = [
    value
    for position, value in enumerate(y_ts)
    if position not in y_bad_index
]

In [28]:
# Sanity check: number of activity values kept for the test set.
len(y_ts)

351

# 5. Standardization of the SDF file for the test set

In [29]:
# Drop the structures flagged as failed, then round-trip each remaining
# molecule: RDKit Mol -> SMILES -> molvs standardize_smiles -> RDKit Mol.
all_mols_ts[:] = [mol for pos, mol in enumerate(all_mols_ts) if pos not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ts]

moldf_ts = [Chem.MolFromSmiles(standardize_smiles(smiles)) for smiles in records]

print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  351 molecules


In [30]:
# SMILES strings of the standardized test-set molecules, in order.
records_ts = [Chem.MolToSmiles(mol) for mol in moldf_ts]

In [31]:
# Write one SMILES per line (no header, no index) to the .smi file that is
# later passed to padeldescriptor as mol_dir.
df_ts = pd.DataFrame(data=records_ts, columns=["Smiles"])
df_ts.to_csv('datasets/molecule_ts.smi', header=False, index=False, sep=',')

In [32]:
from pathlib import Path

In [33]:
# Read the feature names (one per line of the text file); the list is used
# further down to select descriptor columns for the work and test sets.
path = Path('feature_name_rfecv_PubchemFP.txt')
feature_name_rfecv_MF = path.read_text().splitlines()

# 6. Descriptor calculation for the work set

In [17]:
# Show the fingerprint-name -> XML-file mapping used by the next cell.
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

In [19]:
from padelpy import padeldescriptor

# Fingerprint to compute; must be one of the keys of `fp`.
fingerprint = 'PubChem'

# The output CSV is named after the fingerprint, and the XML file looked up in
# `fp` tells PaDEL which descriptor type to calculate.
fingerprint_output_file = fingerprint + '.csv'
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(
    mol_dir='datasets/molecule_ws.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    detectaromaticity=True,
    removesalt=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    log=True,
)

In [34]:
# Load the work-set fingerprint table from 'PubChem.csv' (the d_file written
# by the padeldescriptor call for the work set).
descriptors_PubChem = pd.read_csv('PubChem.csv')

In [35]:
# Display the work-set fingerprint table.
descriptors_PubChem

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_molecule_ws_1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_ws_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_ws_3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_ws_4,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_ws_5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,AUTOGEN_molecule_ws_1396,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1396,AUTOGEN_molecule_ws_1397,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1397,AUTOGEN_molecule_ws_1398,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1398,AUTOGEN_molecule_ws_1399,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Remove the 'Name' identifier column so only fingerprint columns remain.
descriptors_ws = descriptors_PubChem.drop(columns='Name')

In [37]:
# Training matrix: only the columns listed in feature_name_rfecv_MF.
x_tr=descriptors_ws[feature_name_rfecv_MF]

In [38]:
# Convert the collected work-set activity values to a float32 NumPy array
# and show how many there are.
y_tr = np.array(y_tr, dtype=np.float32)
len(y_tr)

1400

# 7. Descriptor calculation for the test set

In [39]:
from padelpy import padeldescriptor

# Fingerprint to compute; must be one of the keys of `fp`.
fingerprint = 'PubChem'

# The test-set output gets a '_ts' suffix so it does not overwrite the
# work-set CSV; the XML file looked up in `fp` selects the descriptor type.
fingerprint_output_file = fingerprint + '_ts.csv'
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(
    mol_dir='datasets/molecule_ts.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    detectaromaticity=True,
    removesalt=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    log=False,
)

In [40]:
# Load the test-set fingerprint table from 'PubChem_ts.csv' (the d_file
# written by the padeldescriptor call for the test set).
descriptors_PubChem_ts = pd.read_csv('PubChem_ts.csv')

In [41]:
# Remove the 'Name' identifier column so only fingerprint columns remain.
x_ts = descriptors_PubChem_ts.drop(columns='Name')

In [42]:
# Keep a reference to the full test-set descriptor table before x_ts is
# rebound to the selected-feature subset in the next cell.
desc_ts=x_ts

In [43]:
# Test matrix: the same feature columns (feature_name_rfecv_MF) as x_tr.
x_ts=x_ts[feature_name_rfecv_MF]

In [44]:
# Display the test-set feature matrix.
x_ts

Unnamed: 0,PubchemFP1,PubchemFP2,PubchemFP12,PubchemFP13,PubchemFP16,PubchemFP19,PubchemFP20,PubchemFP24,PubchemFP34,PubchemFP116,...,PubchemFP776,PubchemFP777,PubchemFP779,PubchemFP797,PubchemFP798,PubchemFP800,PubchemFP818,PubchemFP819,PubchemFP821,PubchemFP833
0,1,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,1,1,1,1,1,1,1,0,0,0,...,0,1,1,0,0,1,0,0,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,0,0,1,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
347,0,0,1,0,1,1,0,0,0,1,...,0,0,1,1,0,0,0,0,1,0
348,0,0,1,1,1,1,0,0,0,1,...,0,0,1,1,0,0,0,0,1,0
349,1,1,1,1,1,1,1,0,0,1,...,1,1,0,1,1,0,1,1,0,0


# load the models from disk

In [45]:
# Load the saved SVM model. The context manager closes the file handle as soon
# as the model has been read (the bare open() call left it open).
# NOTE(review): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
with open('Models/Padels/HDAC3_SVM_PubChem.pkl', 'rb') as model_file:
    best_svm = pickle.load(model_file)

In [46]:
# Load the saved gradient-boosting model. The context manager closes the file
# handle as soon as the model has been read (the bare open() call left it open).
# NOTE(review): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
with open('Models/Padels/HDAC3_GBR_Pubchem_final_FS.pkl', 'rb') as model_file:
    best_gbr = pickle.load(model_file)

# Prediction for CV

In [47]:
# Fixed seed so the shuffled 5-fold split is the same on every run.
seed = 42
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

In [48]:
# Cross-validated predictions of the SVM model on the work set, using the
# 5-fold splitter `cv`.
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [49]:
# Cross-validated predictions of the gradient-boosting model on the work set,
# using the same 5-fold splitter `cv`.
y_pred_CV_gbr = cross_val_predict(best_gbr, x_tr, y_tr, cv=cv)

In [50]:
# Consensus prediction: arithmetic mean of the SVM and GBR CV predictions.
y_pred_con = 0.5 * (y_pred_CV_svm + y_pred_CV_gbr)

In [51]:
# Coefficient of determination of the consensus CV predictions, 2 decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_con), 2)
Q2_CV

0.64

In [52]:
# Root-mean-squared error of the consensus CV predictions, 2 decimals.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_con)), 2)
RMSE_CV

0.73

# Prediction for test set's molecules

In [53]:
# Convert the test features and activity values to float32 NumPy arrays.
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [54]:
# Test-set predictions of the loaded SVM model.
y_pred_svm = best_svm.predict(x_ts)

In [55]:
# Test-set predictions of the loaded gradient-boosting model.
y_pred_gbr = best_gbr.predict(x_ts)

In [56]:
# Consensus prediction: arithmetic mean of the SVM and GBR test predictions.
y_pred_con_ts = 0.5 * (y_pred_svm + y_pred_gbr)

In [57]:
# Coefficient of determination of the consensus predictions on the full test
# set, 2 decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_con_ts), 2)
Q2_TS

0.68

In [58]:
# Root-mean-squared error of the consensus predictions on the full test set,
# 2 decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_con_ts)), 2)
RMSE_TS

0.7

# Estimating the applicability domain. Method: Euclidean distance to the nearest neighbour (k = 1)

In [59]:
# Distance matrix between all work-set compounds, with every column sorted in
# ascending order: row 0 then holds each compound's smallest distance (0, the
# distance to itself) and row 1 the distance to its closest other compound.
neighbors_k = np.sort(pairwise_distances(x_tr, n_jobs=-1), axis=0)

In [60]:
# Wrap the column-sorted distance matrix in a DataFrame for display.
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,1.732051,0.000000,2.449490,2.449490,1.000000,3.000000,3.316625,3.162278,...,2.236068,1.000000,2.000000,2.000000,2.000000,0.000000,1.000000,0.000000,4.690416,2.828427
2,0.000000,0.000000,2.449490,2.828427,3.162278,3.464102,3.316625,3.605551,3.464102,3.316625,...,4.123106,2.236068,2.236068,2.449490,2.449490,0.000000,2.449490,0.000000,4.898979,2.828427
3,0.000000,1.000000,2.449490,2.828427,4.242641,3.872983,3.741657,4.000000,3.872983,3.605551,...,4.123106,2.449490,2.236068,2.449490,2.645751,1.000000,2.828427,1.000000,5.000000,3.162278
4,1.000000,2.236068,2.645751,2.828427,4.242641,4.000000,4.000000,4.123106,3.872983,3.605551,...,4.123106,4.123106,2.236068,2.645751,2.645751,1.414214,2.828427,1.414214,5.000000,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,8.602325,8.485281,8.000000,8.306624,8.831761,8.944272,8.888194,8.831761,8.944272,8.366600,...,8.246211,8.660254,8.944272,8.831761,8.831761,8.944272,9.165151,8.944272,9.055385,9.380832
1396,8.602325,8.485281,8.000000,8.306624,8.888194,8.944272,8.888194,8.888194,8.944272,8.366600,...,8.306624,8.660254,8.944272,8.944272,8.831761,9.000000,9.219544,9.000000,9.110434,9.380832
1397,8.602325,8.544004,8.000000,8.366600,8.888194,9.000000,8.888194,8.944272,9.000000,8.366600,...,8.366600,8.717798,8.944272,8.944272,8.888194,9.000000,9.219544,9.000000,9.165151,9.539392
1398,8.660254,8.544004,8.124038,8.426150,8.944272,9.000000,8.944272,8.944272,9.000000,8.426150,...,8.485281,8.717798,9.110434,9.000000,8.888194,9.000000,9.219544,9.000000,9.165151,9.591663


In [61]:
# Second name for the sorted distance matrix (same array object, no copy).
similarity= neighbors_k

In [62]:
# Mean of row 1 of the sorted matrix, i.e. of the second-smallest distance
# in each column.
Dmean = similarity[1].mean()

In [63]:
# Show Dmean rounded to two decimals.
round(Dmean, 2)

1.89

In [64]:
# Standard deviation of the same row-1 distances that Dmean averages.
std = similarity[1].std()

In [65]:
# Show std rounded to two decimals.
round(std, 2)

1.27

In [66]:
# Applicability-domain threshold: mean distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.53


In [67]:
# Distances from every work-set compound (rows) to every test compound
# (columns), each column sorted ascending so that row 0 is the distance from a
# test compound to its nearest work-set compound.
neighbors_k_ts = np.sort(pairwise_distances(x_tr, Y=x_ts, n_jobs=-1), axis=0)

In [68]:
# Wrap the column-sorted work-to-test distance matrix in a DataFrame for display.
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,341,342,343,344,345,346,347,348,349,350
0,0.000000,3.162278,5.567764,2.000000,1.732051,3.605551,1.000000,1.732051,2.828427,3.162278,...,0.000000,2.645751,2.000000,1.732051,1.732051,2.000000,1.414214,2.000000,3.464102,2.236068
1,0.000000,4.898979,5.656854,3.316625,2.449490,4.000000,3.316625,1.732051,2.828427,3.162278,...,1.000000,3.000000,2.236068,2.000000,3.316625,2.236068,2.000000,2.236068,3.605551,2.645751
2,0.000000,5.744563,5.656854,3.464102,2.828427,4.123106,3.464102,2.000000,4.000000,3.316625,...,2.236068,3.000000,2.645751,3.000000,3.316625,2.236068,2.449490,2.449490,3.605551,2.645751
3,0.000000,5.744563,5.656854,3.605551,3.464102,4.123106,3.741657,2.828427,4.242641,3.872983,...,2.449490,3.000000,2.828427,3.000000,3.316625,2.236068,2.645751,2.449490,3.605551,2.828427
4,1.000000,5.830952,5.830952,3.872983,5.099020,4.242641,4.000000,3.316625,4.472136,4.242641,...,3.464102,3.162278,4.358899,3.162278,3.741657,2.449490,2.645751,2.645751,3.605551,2.828427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,8.602325,8.426150,8.717798,9.165151,9.110434,9.219544,9.000000,9.433981,8.185353,9.433981,...,8.544004,8.485281,8.774964,9.327379,9.219544,8.888194,9.165151,9.000000,9.055385,8.888194
1396,8.602325,8.426150,8.717798,9.219544,9.219544,9.219544,9.000000,9.486833,8.246211,9.486833,...,8.544004,8.544004,8.774964,9.380832,9.219544,8.888194,9.165151,9.055385,9.110434,8.944272
1397,8.602325,8.485281,8.774964,9.380832,9.273618,9.273618,9.055385,9.486833,8.246211,9.486833,...,8.544004,8.544004,8.831761,9.380832,9.380832,8.888194,9.273618,9.110434,9.327379,8.944272
1398,8.660254,8.485281,8.831761,9.433981,9.327379,9.327379,9.055385,9.591663,8.246211,9.591663,...,8.602325,8.602325,8.888194,9.380832,9.433981,8.888194,9.327379,9.273618,9.433981,9.000000


In [69]:
# Row 0 of the sorted matrix is each test compound's smallest distance to the
# work set; round it to three decimals for the AD check and print it.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[0.    3.162 5.568 2.    1.732 3.606 1.    1.732 2.828 3.162 3.    2.
 4.123 2.236 2.236 1.732 1.    2.449 4.472 2.828 1.414 2.449 4.243 0.
 0.    1.414 1.    1.    2.646 3.162 1.414 2.    2.449 2.646 2.828 2.
 4.    2.    3.317 3.162 5.    4.123 2.828 4.359 3.162 3.    4.583 3.464
 0.    0.    1.    3.742 2.646 1.    1.414 4.359 2.    1.    0.    1.
 1.    2.    3.317 4.123 4.243 0.    3.464 2.    0.    3.317 3.317 1.414
 2.646 1.414 1.    0.    1.    1.    0.    2.236 1.    3.162 2.236 2.
 4.    0.    2.449 1.    0.    5.385 0.    5.099 2.449 2.646 0.    2.449
 1.    1.732 0.    1.414 3.742 1.    3.873 1.    1.732 0.    2.646 1.
 1.    1.414 1.    1.    1.414 1.    4.123 1.414 1.    0.    1.732 3.162
 3.    2.    1.414 2.646 1.732 2.236 2.236 3.606 1.732 2.    1.    4.
 0.    1.732 1.414 0.    0.    1.    3.606 1.    4.472 1.414 1.414 1.732
 0.    3.    1.414 1.    3.    3.742 1.    3.    1.414 1.414 0.    2.449
 1.732 1.414 2.828 1.732 2.    2.    1.    2.    0.    2.449 4.    0.
 3

In [70]:
# Boolean mask: True where the rounded nearest-neighbour distance does not
# exceed the AD threshold (compound is inside the applicability domain).
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True False False  True  True False  True  True False False False  True
 False  True  True  True  True  True False False  True  True False  True
  True  True  True  True False False  True  True  True False False  True
 False  True False False False False False False False False False False
  True  True  True False False  True  True False  True  True  True  True
  True  True False False False  True False  True  True False False  True
 False  True  True  True  True  True  True  True  True False  True  True
 False  True  True  True  True False  True False  True False  True  True
  True  True  True  True False  True False  True  True  True False  True
  True  True  True  True  True  True False  True  True  True  True False
 False  True  True False  True  True  True False  True  True  True False
  True  True  True  True  True  True False  True False  True  True  True
  True False  True  True False False  True False  True  True  True  True
  True  True False  True  True  True  True  True  T

In [71]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", np.count_nonzero(cpd_AD) / len(cpd_AD))

Coverage =  0.7236467236467237


In [72]:
# Positions of the test compounds flagged as inside the applicability domain.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  0   3   4   6   7  11  13  14  15  16  17  20  21  23  24  25  26  27
  30  31  32  35  37  48  49  50  53  54  56  57  58  59  60  61  65  67
  68  71  73  74  75  76  77  78  79  80  82  83  85  86  87  88  90  92
  94  95  96  97  98  99 101 103 104 105 107 108 109 110 111 112 113 115
 116 117 118 121 122 124 125 126 128 129 130 132 133 134 135 136 137 139
 141 142 143 144 146 147 150 152 153 154 155 156 157 159 160 161 162 163
 164 165 167 169 170 171 172 173 176 177 178 179 180 183 184 186 187 188
 189 190 191 192 193 194 196 197 198 199 201 202 203 204 205 206 207 208
 209 210 211 212 213 214 215 216 217 218 220 221 222 223 226 227 228 230
 231 232 233 234 235 237 238 239 240 242 244 245 246 247 249 250 251 252
 253 254 255 256 257 258 259 261 263 264 266 267 268 269 270 271 272 274
 275 277 280 281 282 283 284 286 287 289 291 294 296 297 298 299 300 301
 303 304 306 308 311 312 315 316 317 319 320 321 322 323 324 325 327 328
 329 330 33

In [73]:
# Positions of the test compounds outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# Prediction only for molecules included in the AD

In [74]:
# Copy the consensus test predictions into a list so that the entries of
# out-of-AD compounds can be removed from it.
y_pred_con_ad=list(y_pred_con_ts)

In [75]:
# Keep, in place, only the predictions of compounds inside the AD.
y_pred_con_ad[:] = [
    prediction
    for position, prediction in enumerate(y_pred_con_ad)
    if position not in out_Ad
]

In [76]:
# Number of predictions left after removing out-of-AD compounds.
len(y_pred_con_ad)

254

In [77]:
# Copy the observed test activities into a list so that the entries of
# out-of-AD compounds can be removed from it.
y_ts_ad=list(y_ts)

In [78]:
# Keep, in place, only the observed activities of compounds inside the AD.
y_ts_ad[:] = [
    observed
    for position, observed in enumerate(y_ts_ad)
    if position not in out_Ad
]

In [79]:
# Number of observed activities left; should equal len(y_pred_con_ad).
len(y_ts_ad)

254

In [80]:
# Coefficient of determination restricted to the test compounds inside the AD.
# NOTE(review): this rebinds Q2_TS, replacing the value computed earlier on
# the full test set.
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.73

In [81]:
# Root-mean-squared error restricted to the test compounds inside the AD.
# NOTE(review): this rebinds RMSE_TS, replacing the value computed earlier on
# the full test set.
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.62