# Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.svm import SVR

[06:53:48] Initializing Normalizer


# Data entry and curation work set

In [2]:
uploaded_file_ws="datasets/HDAC2_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1288 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [3]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [4]:
len(y_tr)

1288

# Standardization SDF file for work set

In [5]:
records_ws = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToMolBlock(all_mols_ws[i])
    records_ws.append(record)
            
mols_ws = []
for i,record in enumerate(records_ws):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ws.append(m)
           
moldf_ws = []
for val in mols_ws:
    if val != None:
        moldf_ws.append(val)
print('Kept data: ', len(moldf_ws), 'molecules')

[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharg

[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharg

[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:03] Running Normalizer
[06:54:03] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharg

[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharg

[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharg

[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharger
[06:54:04] Running Normalizer
[06:54:04] Running Uncharg

[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharg

Kept data:  1288 molecules


[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharger
[06:54:05] Running Normalizer
[06:54:05] Running Uncharg

# Data entry and curation test set

In [6]:
uploaded_file_ts="datasets/HDAC2_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  323 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [7]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [8]:
len(y_ts)

323

# Standardization SDF file for test set

In [9]:
records_ts = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToMolBlock(all_mols_ts[i])
    records_ts.append(record)
            
mols_ts = []
for i,record in enumerate(records_ts):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ts.append(m)
           
moldf_ts = []
for val in mols_ts:
    if val != None:
        moldf_ts.append(val)
print('Kept data: ', len(moldf_ts), 'molecules')

[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharger
[06:54:20] Running Normalizer
[06:54:20] Running Uncharg

[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharger
[06:54:21] Running Normalizer
[06:54:21] Running Uncharg

Kept data:  323 molecules


# Calculation MorganFingerprint for work set

In [10]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [11]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [12]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [13]:
savetxt('Models/Morgan_fingerprint/x_tr_MF.csv', x_tr, delimiter=',')

In [13]:
x_tr.shape

(1288, 1024)

# Calculation MorganFingerprint for test set

In [14]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [15]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [16]:
x_ts = rdkit_numpy_convert(fp_ts)

In [17]:
x_ts.shape

(323, 1024)

In [18]:
type(x_tr)

numpy.ndarray

In [19]:
x_tr

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [21]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [22]:
y_tr

array([4.  , 4.02, 4.02, ..., 9.33, 9.37, 9.51], dtype=float32)

# load the models from disk

In [22]:
best_svm = pickle.load(open('Models/Morgan_fingerprint/HDAC2_SVM_MF.pkl', 'rb'))

In [23]:
best_rf = pickle.load(open('Models/Morgan_fingerprint/HDAC2_RF_MF.pkl', 'rb'))

# Prediction for CV

In [24]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [25]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [26]:
y_pred_CV_rf = cross_val_predict(best_rf, x_tr, y_tr, cv=cv)

In [27]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_rf)/2

In [28]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

  y_true = check_array(y_true, ensure_2d=False, dtype=dtype)


0.61

In [29]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_con)),2)
RMSE_CV

  y_true = check_array(y_true, ensure_2d=False, dtype=dtype)


0.71

# Prediction for test set's molecules

In [44]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [45]:
y_pred_svm = best_svm.predict(x_ts)

In [46]:
y_pred_rf = best_rf.predict(x_ts)

In [47]:
y_pred_rf

array([5.05754477, 5.09423667, 4.88222423, 5.16448243, 5.96548853,
       4.85904003, 6.52622664, 5.86305917, 5.63448947, 5.30151335,
       5.85791163, 5.71388142, 4.9296615 , 4.91481497, 6.13973698,
       4.92667665, 5.36327664, 5.11219774, 5.92625296, 6.41117162,
       5.59331041, 5.83877252, 5.28647334, 5.44856339, 5.19632002,
       4.75833503, 6.23872274, 5.44326339, 5.56005167, 5.30325762,
       5.85725384, 5.32491835, 5.5107295 , 5.74321244, 6.17025007,
       5.40317246, 5.68546248, 5.54187997, 5.72614333, 5.88774604,
       5.49297864, 5.61991838, 5.41519496, 5.56469251, 6.00207923,
       6.07183274, 6.67898008, 5.49573512, 5.42085275, 5.41043916,
       5.93836324, 4.95863782, 5.97919993, 5.59686873, 5.30286439,
       5.55407767, 5.15623834, 5.60356589, 5.14552009, 5.72035918,
       6.25638369, 5.68131344, 6.2848234 , 5.92610577, 5.83999916,
       5.64270771, 5.90778497, 5.76278082, 6.87331395, 6.14463708,
       6.49636793, 5.94988682, 6.1729336 , 5.8765599 , 5.67206

In [48]:
y_pred_con=(y_pred_svm+y_pred_rf)/2

In [50]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.64

In [51]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.69

# Estimating applicability domain. Method - Euclidian distances, K=1

In [52]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [53]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3.605551,5.291502,6.324555,5.744563,3.605551,5.744563,6.782330,3.162278,5.477226,5.000000,...,3.464102,4.000000,3.872983,4.898980,3.464102,4.898980,3.000000,3.872983,2.449490,4.358899
2,3.872983,5.385165,6.403124,7.280110,4.582576,5.830952,6.782330,5.000000,5.830952,6.244998,...,5.477226,4.472136,5.830952,5.099020,3.741657,6.708204,4.000000,4.472136,3.000000,5.000000
3,4.358899,5.744563,6.403124,7.280110,5.385165,5.830952,6.928203,5.196152,5.916080,6.480741,...,5.744563,5.916080,5.916080,5.656854,3.872983,7.211102,4.472136,4.898980,3.464102,5.291502
4,4.358899,5.830952,6.480741,7.416198,5.477226,6.000000,7.000000,5.196152,6.082763,6.557438,...,5.916080,6.403124,6.164414,5.830952,3.872983,7.483315,4.472136,5.000000,4.358899,5.567764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,10.392304,10.246951,10.583005,10.862781,11.135529,10.583005,10.770329,10.246951,10.723805,10.049875,...,11.090536,10.862781,10.816654,11.313708,11.357817,10.908712,11.224972,10.862781,11.224972,11.313708
1284,10.440307,10.295630,10.723805,10.954452,11.135529,10.630146,10.770329,10.246951,10.816654,10.198039,...,11.180340,10.908712,10.862781,11.357817,11.401754,10.908712,11.269427,10.908712,11.313708,11.357817
1285,10.488089,10.344080,10.862781,11.000000,11.135529,10.816654,10.816654,10.295630,10.816654,10.246951,...,11.180340,11.045361,10.954452,11.445523,11.575837,10.954452,11.269427,10.954452,11.313708,11.445523
1286,10.583005,10.488089,10.862781,11.045361,11.180340,10.862781,10.816654,10.488089,10.908712,10.344080,...,11.489125,11.045361,11.180340,11.575837,11.575837,11.045361,11.269427,11.313708,11.445523,11.575837


In [54]:
similarity= neighbors_k

In [55]:
Dmean=np.mean(similarity[1,:])

In [56]:
round(Dmean, 2)

3.64

In [57]:
std=np.std(similarity[1,:])

In [58]:
round(std, 2)

1.29

In [59]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.28


In [60]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [61]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,313,314,315,316,317,318,319,320,321,322
0,4.123106,4.472136,3.464102,3.605551,3.872983,2.449490,4.000000,5.656854,4.795832,4.242640,...,4.690416,2.828427,4.472136,3.000000,3.872983,3.000000,6.324555,4.472136,4.690416,4.000000
1,4.242640,5.567764,6.480741,4.898980,3.872983,3.464102,7.810250,6.244998,5.291502,4.358899,...,5.099020,5.477226,4.472136,3.162278,4.123106,3.872983,6.324555,4.582576,5.916080,5.291502
2,4.242640,7.280110,6.633250,6.244998,4.000000,3.464102,8.000000,6.403124,5.291502,4.582576,...,5.099020,5.567764,4.582576,3.872983,4.795832,4.123106,6.403124,4.690416,5.916080,5.385165
3,4.242640,7.348469,6.633250,6.324555,4.358899,3.872983,8.062258,6.480741,5.291502,6.244998,...,5.291502,6.782330,4.582576,4.000000,5.196152,4.358899,6.403124,4.690416,6.082763,5.385165
4,4.472136,7.416198,6.633250,6.403124,4.472136,4.000000,8.246211,6.557438,5.291502,6.324555,...,5.291502,6.782330,4.898980,4.123106,5.656854,4.472136,6.480741,4.795832,6.244998,5.477226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,9.899495,10.862781,10.630146,10.099504,9.899495,10.392304,11.269427,10.535654,9.848858,11.357817,...,11.045361,11.090536,10.198039,11.135529,11.180340,11.224972,10.440307,9.899495,10.630146,11.357817
1284,9.949874,10.954452,10.630146,10.392304,10.049875,10.488089,11.269427,10.535654,9.949874,11.401754,...,11.045361,11.090536,10.246951,11.224972,11.224972,11.224972,10.488089,10.049875,10.630146,11.661903
1285,10.000000,10.954452,10.630146,10.440307,10.049875,10.535654,11.357817,10.535654,10.099504,11.401754,...,11.045361,11.357817,10.295630,11.224972,11.269427,11.313708,10.488089,10.148891,10.630146,11.704700
1286,10.000000,11.090536,10.677078,10.440307,10.099504,10.630146,11.357817,10.583005,10.148891,11.445523,...,11.224972,11.489125,10.295630,11.532562,11.445523,11.313708,10.677078,10.148891,10.954452,11.747340


In [62]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[4.123 4.472 3.464 3.606 3.873 2.449 4.    5.657 4.796 4.243 3.464 3.606
 3.873 3.742 4.69  2.    4.123 0.    5.292 3.    3.873 4.123 3.464 5.099
 4.243 3.606 3.    4.472 2.449 3.162 6.557 4.    3.    4.69  3.162 3.
 4.583 3.317 2.646 3.317 3.317 3.162 4.123 2.646 3.317 4.69  5.568 3.873
 4.796 4.123 5.831 2.646 2.449 2.236 0.    4.69  3.464 0.    3.464 3.873
 4.123 3.873 1.    4.123 5.099 1.    3.    3.873 2.    1.    3.    1.
 0.    3.162 6.164 2.828 2.646 3.606 3.742 3.742 4.243 3.    0.    4.
 3.162 4.    5.385 4.123 4.899 0.    3.    1.732 4.69  3.    3.317 3.162
 3.606 3.    0.    4.123 3.162 3.873 2.646 3.    5.    6.782 3.317 3.464
 5.099 3.742 1.    2.646 1.    0.    5.831 3.464 3.317 3.742 3.    4.472
 4.123 2.828 3.464 6.245 3.742 0.    4.123 3.742 3.873 2.646 3.873 0.
 4.472 4.69  3.464 7.141 5.831 2.646 4.472 6.782 2.646 3.464 4.123 2.828
 1.    3.606 3.742 3.742 3.606 4.    2.646 3.    3.162 3.    3.317 1.
 3.873 4.899 6.403 4.    3.606 3.162 3.464 3.317 3.464 3.162 3.162

In [63]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True False  True  True  True  True  True False False  True  True  True
  True  True False  True  True  True False  True  True  True  True False
  True  True  True False  True  True False  True  True False  True  True
 False  True  True  True  True  True  True  True  True False False  True
 False  True False  True  True  True  True False  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True False  True False  True  True  True False  True  True  True
  True  True  True  True  True  True  True  True False False  True  True
 False  True  True  True  True  True False  True  True  True  True False
  True  True  True False  True  True  True  True  True  True  True  True
 False False  True False False  True False False  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False False  True  True  True  True  True  T

In [64]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7523219814241486


In [65]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   2   3   4   5   6   9  10  11  12  13  15  16  17  19  20  21  22
  24  25  26  28  29  31  32  34  35  37  38  39  40  41  42  43  44  47
  49  51  52  53  54  56  57  58  59  60  61  62  63  65  66  67  68  69
  70  71  72  73  75  76  77  78  79  80  81  82  83  84  85  87  89  90
  91  93  94  95  96  97  98  99 100 101 102 103 106 107 109 110 111 112
 113 115 116 117 118 120 121 122 124 125 126 127 128 129 130 131 134 137
 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 159
 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
 178 180 182 183 184 187 188 189 190 191 192 193 194 195 196 198 199 200
 201 205 206 208 209 210 212 214 215 216 217 218 219 220 221 222 223 225
 226 228 229 230 232 235 236 237 238 240 244 245 246 247 248 249 250 251
 252 253 255 256 258 260 261 263 264 265 267 269 270 271 273 274 275 277
 278 283 285 286 288 289 290 291 292 293 294 295 298 300 302 303 304 305
 307 309 31

In [66]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [67]:
y_pred_con_ad=list(y_pred_con)

In [68]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [69]:
len(y_pred_con_ad)

243

In [70]:
y_ts_ad=list(y_ts)

In [71]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [72]:
len(y_ts_ad)

243

In [73]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.71

In [74]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.64