# Importing modules and functions

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,KFold, StratifiedKFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import joblib
import pickle
from numpy import savetxt
from IPython.display import HTML
from rdkit.Chem import PandasTools

[17:33:03] Initializing Normalizer


In [4]:
def convert_smi_to_canon_smi(smi):
    
    try:
        canon_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi),isomericSmiles = False)
    except:
        canon_smi='wrong_smiles'
    return canon_smi

In [5]:
def standart(smi):
    global m
    if smi!='wrong_smiles':
        try:
            smiles=standardize_smiles(smi)
            m = Chem.MolFromSmiles(smi)
        except:
            smi='error kekule'
    else:
        m = 'check the smiles'
    return m

# MORGAN FP_Gradient Boosting Mouse intravenous LD50, mg/kg  

## Load data and curation work set

In [6]:
# Set file path
df_ws=pd.read_csv('mouse_intravenous_LD50_WS.csv')
df_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,67-56-1,CO,0.83270,4707.7000
1,19491-72-6,CCCNCC(Br)c1ccccc1,0.89274,30857.5000
2,64790-46-1,CN1C2CCC1CC(OC(=O)C(=O)c1ccccc1)C2,0.90777,33776.0000
3,34758-84-4,COC(CN1CCN(CC(O)C(OC)c2ccccc2)CC1)c1ccccc1,0.93852,44268.0000
4,9005-66-7,CCCCCCCCCCCC(=O)OCCOCC(OCCO)C1OCC(OCCO)C1OCCO,1.01926,49968.0000
...,...,...,...,...
12049,115722-23-1,CCC(C)C(NC(=O)C(N)CC(C)C)C(=O)N1CCCC1C(=O)N1CC...,7.35180,0.0400
12050,4026-95-3,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.42650,0.0150
12051,465-19-0,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.60480,0.0100
12052,35523-89-8,NC(=O)OCC1N=C(N)N2CCC(O)(O)C23NC(N)=NC13,7.74560,0.0054


 Convert a SMILES string to canonical SMILES

In [7]:
df_ws1 = deepcopy(df_ws)
df_ws1["SMILES"] = df_ws1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ws1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,67-56-1,CO,0.83270,4707.7000
1,19491-72-6,CCCNCC(Br)c1ccccc1,0.89274,30857.5000
2,64790-46-1,CN1C2CCC1CC(OC(=O)C(=O)c1ccccc1)C2,0.90777,33776.0000
3,34758-84-4,COC(CN1CCN(CC(O)C(OC)c2ccccc2)CC1)c1ccccc1,0.93852,44268.0000
4,9005-66-7,CCCCCCCCCCCC(=O)OCCOCC(OCCO)C1OCC(OCCO)C1OCCO,1.01926,49968.0000
...,...,...,...,...
12049,115722-23-1,CCC(C)C(NC(=O)C(N)CC(C)C)C(=O)N1CCCC1C(=O)N1CC...,7.35180,0.0400
12050,4026-95-3,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.42650,0.0150
12051,465-19-0,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.60480,0.0100
12052,35523-89-8,NC(=O)OCC1N=C(N)N2CCC(O)(O)C23NC(N)=NC13,7.74560,0.0054


In [8]:
print('Original data: ', len(df_ws), 'molecules')
print('Failed data: ', len(df_ws1[df_ws1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  12054 molecules
Failed data:  0 molecules


In [9]:
index=df_ws1.index[df_ws1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ws.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for work set

In [10]:
df_ws1["Molecule"] = df_ws1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ws=df_ws1[df_ws1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  12054 molecules


In [11]:
moldf_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,67-56-1,CO,0.83270,4707.7000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
1,19491-72-6,CCCNCC(Br)c1ccccc1,0.89274,30857.5000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
2,64790-46-1,CN1C2CCC1CC(OC(=O)C(=O)c1ccccc1)C2,0.90777,33776.0000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
3,34758-84-4,COC(CN1CCN(CC(O)C(OC)c2ccccc2)CC1)c1ccccc1,0.93852,44268.0000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
4,9005-66-7,CCCCCCCCCCCC(=O)OCCOCC(OCCO)C1OCC(OCCO)C1OCCO,1.01926,49968.0000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
...,...,...,...,...,...
12049,115722-23-1,CCC(C)C(NC(=O)C(N)CC(C)C)C(=O)N1CCCC1C(=O)N1CC...,7.35180,0.0400,<rdkit.Chem.rdchem.Mol object at 0x00000188382...
12050,4026-95-3,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.42650,0.0150,<rdkit.Chem.rdchem.Mol object at 0x00000188382...
12051,465-19-0,CC12CCC(O)CC1CCC1C2CCC2(C)C(c3ccc(=O)oc3)C(O)C...,7.60480,0.0100,<rdkit.Chem.rdchem.Mol object at 0x00000188382...
12052,35523-89-8,NC(=O)OCC1N=C(N)N2CCC(O)(O)C23NC(N)=NC13,7.74560,0.0054,<rdkit.Chem.rdchem.Mol object at 0x00000188382...


In [12]:
y_tr=moldf_ws.pLD50
y_tr

0        0.83270
1        0.89274
2        0.90777
3        0.93852
4        1.01926
          ...   
12049    7.35180
12050    7.42650
12051    7.60480
12052    7.74560
12053    7.89320
Name: pLD50, Length: 12054, dtype: float64

In [13]:
moldf_ws=moldf_ws.Molecule

##  Load data and curation test set

In [14]:
df_ts=pd.read_csv('mouse_intravenous_LD50_TS.csv')
df_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,60-35-5,CC(N)=O,0.77135,9994.8000
1,358-21-4,FC(F)(F)C(F)(F)OC(F)(F)C(F)(F)F,0.94248,28995.0000
2,58-86-6,O=CC(O)C(O)C(O)CO,1.12339,11294.2000
3,61-82-5,Nc1nnc[nH]1,1.22573,4997.7000
4,79-16-3,CNC(C)=O,1.26020,4012.7000
...,...,...,...,...
3009,81131-98-8,CC(CCCCCCCC1CC(OS(=O)(=O)O)CCCCCCCCCCCC(OS(=O)...,6.73300,0.1798
3010,2001-95-8,CC1OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)N...,6.79060,0.1799
3011,124-97-0,CCC(C)C(=O)OC1C(O)C2C(CN3CC(C)CCC3C2(C)O)C2CC3...,7.09550,0.0650
3012,13602-52-3,CCC(C)C1C(=O)OCC(NC(=O)c2cnc3ccccc3n2)C(=O)NC(...,7.32040,0.0540


 Convert a SMILES string to canonical SMILES

In [15]:
df_ts1 = deepcopy(df_ts)
df_ts1["SMILES"] = df_ts1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ts1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,60-35-5,CC(N)=O,0.77135,9994.8000
1,358-21-4,FC(F)(F)C(F)(F)OC(F)(F)C(F)(F)F,0.94248,28995.0000
2,58-86-6,O=CC(O)C(O)C(O)CO,1.12339,11294.2000
3,61-82-5,Nc1nnc[nH]1,1.22573,4997.7000
4,79-16-3,CNC(C)=O,1.26020,4012.7000
...,...,...,...,...
3009,81131-98-8,CC(CCCCCCCC1CC(OS(=O)(=O)O)CCCCCCCCCCCC(OS(=O)...,6.73300,0.1798
3010,2001-95-8,CC1OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)N...,6.79060,0.1799
3011,124-97-0,CCC(C)C(=O)OC1C(O)C2C(CN3CC(C)CCC3C2(C)O)C2CC3...,7.09550,0.0650
3012,13602-52-3,CCC(C)C1C(=O)OCC(NC(=O)c2cnc3ccccc3n2)C(=O)NC(...,7.32040,0.0540


In [16]:
print('Original data: ', len(df_ts), 'molecules')
print('Failed data: ', len(df_ts1[df_ts1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  3014 molecules
Failed data:  0 molecules


In [17]:
index=df_ts1.index[df_ts1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ts.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for test set

In [18]:
df_ts1["Molecule"] = df_ts1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ts=df_ts1[df_ts1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  3014 molecules


In [19]:
moldf_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,60-35-5,CC(N)=O,0.77135,9994.8000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
1,358-21-4,FC(F)(F)C(F)(F)OC(F)(F)C(F)(F)F,0.94248,28995.0000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
2,58-86-6,O=CC(O)C(O)C(O)CO,1.12339,11294.2000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
3,61-82-5,Nc1nnc[nH]1,1.22573,4997.7000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
4,79-16-3,CNC(C)=O,1.26020,4012.7000,<rdkit.Chem.rdchem.Mol object at 0x00000188381...
...,...,...,...,...,...
3009,81131-98-8,CC(CCCCCCCC1CC(OS(=O)(=O)O)CCCCCCCCCCCC(OS(=O)...,6.73300,0.1798,<rdkit.Chem.rdchem.Mol object at 0x00000188383...
3010,2001-95-8,CC1OC(=O)C(C(C)C)NC(=O)C(C(C)C)OC(=O)C(C(C)C)N...,6.79060,0.1799,<rdkit.Chem.rdchem.Mol object at 0x00000188383...
3011,124-97-0,CCC(C)C(=O)OC1C(O)C2C(CN3CC(C)CCC3C2(C)O)C2CC3...,7.09550,0.0650,<rdkit.Chem.rdchem.Mol object at 0x00000188383...
3012,13602-52-3,CCC(C)C1C(=O)OCC(NC(=O)c2cnc3ccccc3n2)C(=O)NC(...,7.32040,0.0540,<rdkit.Chem.rdchem.Mol object at 0x00000188383...


In [20]:
y_ts=moldf_ts.pLD50
y_ts

0       0.77135
1       0.94248
2       1.12339
3       1.22573
4       1.26020
         ...   
3009    6.73300
3010    6.79060
3011    7.09550
3012    7.32040
3013    7.64080
Name: pLD50, Length: 3014, dtype: float64

In [21]:
moldf_ts=moldf_ts.Molecule

## Calculation MorganFingerprint for work set

In [22]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [23]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [24]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [25]:
savetxt('Models/MorganFingerprint/x_tr.csv', x_tr, delimiter=',')

In [26]:
x_tr.shape

(12054, 1024)

## Calculation MorganFingerprint for test set

In [27]:
fp_ts = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ts]

In [28]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [29]:
x_ts = rdkit_numpy_convert(fp_ts)

In [30]:
x_ts.shape

(3014, 1024)

In [31]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

## GradientBoostingRegressor model building and validation

In [32]:
seed = 42

In [33]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [32]:
param_grid = {'learning_rate': [0.01, 0.04],
                  'subsample'    : [0.9, 0.5],
                  'n_estimators' : [10, 100,1000],
                  'max_depth'    : [4, 10]
                 }

In [34]:
m = GridSearchCV(GradientBoostingRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [None]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [36]:
m.best_params_
best_GBR = m.best_estimator_

In [37]:
m.best_params_

{'learning_rate': 0.04,
 'max_depth': 10,
 'n_estimators': 1000,
 'subsample': 0.5}

In [35]:
y_pred_CV_GBR = cross_val_predict(best_GBR, x_tr, y_tr, cv=cv)

In [36]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.52

In [37]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.47

##  Prediction for test set's molecules

In [38]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [39]:
len(y_ts)

3014

In [40]:
y_pred_GBR = best_GBR.predict(x_ts)

In [41]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.54

In [42]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.46

## save the model to disk

In [46]:
pickle.dump(best_GBR, open('Models/MorganFingerprint/LD50_mouse_intravenous_GBR_MFP.pkl', 'wb'))

## load the model from disk

In [34]:
best_GBR = pickle.load(open('Models/MorganFingerprint/LD50_mouse_intravenous_GBR_MFP.pkl', 'rb'))

##  Y-randomization GBR model

In [31]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_GBR, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 412.8min


True score =  0.52 
Y-randomization =  -0.21 
p-value =  0.0196


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 634.1min finished


##  Estimating applicability domain. Method - Euclidian distances, K=1

In [43]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [44]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.449490,3.162278,4.358899,5.385165,2.236068,3.741657,1.732051,3.316625,4.242640,...,3.162278,5.196152,9.110434,7.071068,1.000000,3.605551,3.872983,3.605551,6.164414,7.280110
2,2.236068,2.828427,4.123106,4.690416,6.164414,2.449490,3.872983,2.449490,4.000000,4.582576,...,3.464102,7.483315,9.165152,7.483315,1.732051,7.810250,4.242640,4.358899,6.324555,7.615773
3,2.236068,3.162278,4.123106,4.898980,6.164414,2.449490,4.123106,2.449490,4.123106,4.582576,...,4.472136,7.615773,9.165152,8.426149,3.162278,7.874008,4.358899,4.472136,6.403124,7.937254
4,2.236068,3.605551,4.123106,4.898980,6.164414,2.645751,4.123106,2.449490,4.123106,4.690416,...,5.196152,7.615773,9.219544,8.544003,3.316625,8.000000,4.472136,5.291502,6.403124,8.124039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.916375,12.041595,12.165525,12.000000,12.369317,11.789826,11.401754,11.789826,12.083046,12.041595,...,13.341664,12.884099,13.152946,13.266500,12.124355,13.152946,12.884099,12.845233,12.369317,13.190906
12050,11.958261,12.124355,12.288206,12.124355,12.409674,11.958261,11.401754,11.916375,12.206555,12.165525,...,13.379088,13.000000,13.152946,13.304134,12.247449,13.266500,12.961481,12.922848,12.449900,13.341664
12051,12.083046,12.409674,12.328828,12.409674,12.609520,12.000000,11.704700,11.958261,12.206555,12.206555,...,13.564660,13.038404,13.152946,13.341664,12.288206,13.304134,13.076696,13.114877,12.489996,13.453624
12052,12.165525,12.489996,12.489996,12.806249,13.076696,12.124355,12.449900,12.124355,12.369317,12.288206,...,13.601471,13.038404,13.266500,13.453624,12.369317,13.638182,13.228757,13.266500,12.569805,13.527749


In [45]:
similarity= neighbors_k

In [46]:
Dmean=np.mean(similarity[1,:])

In [47]:
round(Dmean, 2)

3.6

In [48]:
std=np.std(similarity[1,:])

In [49]:
round(std, 2)

1.18

In [50]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.19


In [51]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [52]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,3.162278,2.828427,4.123106,2.645751,2.828427,3.464102,3.464102,2.000000,2.645751,...,3.162278,6.557438,4.898980,3.464102,3.872983,3.605551,4.898980,3.000000,3.162278,5.656854
1,2.236068,3.162278,3.000000,4.242640,3.000000,3.000000,3.464102,3.872983,2.236068,2.828427,...,4.795832,6.782330,5.744563,3.605551,5.291502,5.385165,5.385165,7.141428,3.316625,5.916080
2,2.236068,3.316625,3.000000,4.242640,3.162278,3.000000,3.464102,4.000000,2.236068,3.316625,...,5.000000,7.615773,5.744563,3.741657,5.291502,6.855655,5.385165,7.549834,7.937254,6.082763
3,2.645751,3.316625,3.316625,4.242640,3.162278,3.162278,4.123106,4.000000,2.645751,3.316625,...,5.099020,7.745967,5.830952,3.872983,5.477226,7.000000,5.477226,8.485281,8.000000,6.082763
4,2.828427,3.464102,3.464102,4.358899,3.162278,3.162278,4.242640,4.000000,2.645751,3.316625,...,5.099020,7.810250,5.916080,4.000000,5.477226,7.000000,5.477226,8.485281,8.062258,6.244998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.789826,12.288206,11.958261,11.789826,11.958261,11.916375,11.958261,12.000000,11.874342,11.958261,...,12.328828,12.449900,13.266500,12.206555,12.409674,12.727922,11.789826,13.304134,12.922848,12.165525
12050,11.916375,12.328828,11.958261,11.958261,12.083046,11.916375,12.083046,12.165525,12.041595,12.124355,...,12.449900,12.529964,13.416408,12.328828,12.449900,12.806249,11.832160,13.304134,12.961481,12.288206
12051,11.958261,12.449900,12.000000,12.247449,12.124355,12.041595,12.247449,12.206555,12.165525,12.206555,...,12.529964,12.727922,13.674794,12.369317,12.649111,12.922848,12.000000,13.341664,12.961481,12.569805
12052,12.124355,12.529964,12.124355,12.247449,12.288206,12.165525,12.727922,12.328828,12.206555,12.247449,...,12.609520,12.727922,13.747727,12.449900,12.806249,12.922848,12.083046,13.490738,13.152946,12.569805


In [53]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    3.162 2.828 ... 3.    3.162 5.657]


In [54]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True False]


In [55]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD),2))

Coverage =  0.74


In [56]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3009 3011 3012]


In [57]:
out_Ad=list(np.where(cpd_AD == 0)[0])

## Prediction only for molecules included in  AD

In [58]:
y_pred_GBR_ad=list(y_pred_GBR)

In [59]:
y_pred_GBR_ad[:] = [x for i,x in enumerate(y_pred_GBR_ad) if i not in out_Ad]

In [60]:
len(y_pred_GBR_ad)

2242

In [61]:
y_ts_ad=list(y_ts)

In [62]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [63]:
len(y_ts_ad)

2242

In [64]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.64

In [65]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.41

# MORGAN FP_SVM model building and validation

In [70]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [71]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [72]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=2, cv=cv, verbose=1)

In [73]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [74]:
svm.best_params_
best_svm = svm.best_estimator_

In [67]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [68]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.51

In [76]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.47

## Prediction for test set's molecules

In [70]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [71]:
y_pred_svm = best_svm.predict(x_ts)

In [72]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.52

In [77]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.47

save the model to disk

In [110]:
pickle.dump(best_svm, open('Models/MorganFingerprint/LD50_mouse_intravenous_SVM_MF.pkl', 'wb'))

load the model from disk

In [66]:
best_svm = pickle.load(open('Models/MorganFingerprint/LD50_mouse_intravenous_SVM_MF.pkl', 'rb'))

## Y-randomization SVM model

In [33]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_svm, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 90.7min


True score =  0.51 
Y-randomization =  -0.19 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 144.3min finished


## Estimating applicability domain. Method - Euclidian distances, K=1

In [78]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [79]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.449490,3.162278,4.358899,5.385165,2.236068,3.741657,1.732051,3.316625,4.242640,...,3.162278,5.196152,9.110434,7.071068,1.000000,3.605551,3.872983,3.605551,6.164414,7.280110
2,2.236068,2.828427,4.123106,4.690416,6.164414,2.449490,3.872983,2.449490,4.000000,4.582576,...,3.464102,7.483315,9.165152,7.483315,1.732051,7.810250,4.242640,4.358899,6.324555,7.615773
3,2.236068,3.162278,4.123106,4.898980,6.164414,2.449490,4.123106,2.449490,4.123106,4.582576,...,4.472136,7.615773,9.165152,8.426149,3.162278,7.874008,4.358899,4.472136,6.403124,7.937254
4,2.236068,3.605551,4.123106,4.898980,6.164414,2.645751,4.123106,2.449490,4.123106,4.690416,...,5.196152,7.615773,9.219544,8.544003,3.316625,8.000000,4.472136,5.291502,6.403124,8.124039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.916375,12.041595,12.165525,12.000000,12.369317,11.789826,11.401754,11.789826,12.083046,12.041595,...,13.341664,12.884099,13.152946,13.266500,12.124355,13.152946,12.884099,12.845233,12.369317,13.190906
12050,11.958261,12.124355,12.288206,12.124355,12.409674,11.958261,11.401754,11.916375,12.206555,12.165525,...,13.379088,13.000000,13.152946,13.304134,12.247449,13.266500,12.961481,12.922848,12.449900,13.341664
12051,12.083046,12.409674,12.328828,12.409674,12.609520,12.000000,11.704700,11.958261,12.206555,12.206555,...,13.564660,13.038404,13.152946,13.341664,12.288206,13.304134,13.076696,13.114877,12.489996,13.453624
12052,12.165525,12.489996,12.489996,12.806249,13.076696,12.124355,12.449900,12.124355,12.369317,12.288206,...,13.601471,13.038404,13.266500,13.453624,12.369317,13.638182,13.228757,13.266500,12.569805,13.527749


In [80]:
similarity= neighbors_k

In [81]:
Dmean=np.mean(similarity[1,:])

In [82]:
round(Dmean, 2)

3.6

In [83]:
std=np.std(similarity[1,:])

In [84]:
round(std, 2)

1.18

In [85]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.19


In [86]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [87]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,3.162278,2.828427,4.123106,2.645751,2.828427,3.464102,3.464102,2.000000,2.645751,...,3.162278,6.557438,4.898980,3.464102,3.872983,3.605551,4.898980,3.000000,3.162278,5.656854
1,2.236068,3.162278,3.000000,4.242640,3.000000,3.000000,3.464102,3.872983,2.236068,2.828427,...,4.795832,6.782330,5.744563,3.605551,5.291502,5.385165,5.385165,7.141428,3.316625,5.916080
2,2.236068,3.316625,3.000000,4.242640,3.162278,3.000000,3.464102,4.000000,2.236068,3.316625,...,5.000000,7.615773,5.744563,3.741657,5.291502,6.855655,5.385165,7.549834,7.937254,6.082763
3,2.645751,3.316625,3.316625,4.242640,3.162278,3.162278,4.123106,4.000000,2.645751,3.316625,...,5.099020,7.745967,5.830952,3.872983,5.477226,7.000000,5.477226,8.485281,8.000000,6.082763
4,2.828427,3.464102,3.464102,4.358899,3.162278,3.162278,4.242640,4.000000,2.645751,3.316625,...,5.099020,7.810250,5.916080,4.000000,5.477226,7.000000,5.477226,8.485281,8.062258,6.244998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.789826,12.288206,11.958261,11.789826,11.958261,11.916375,11.958261,12.000000,11.874342,11.958261,...,12.328828,12.449900,13.266500,12.206555,12.409674,12.727922,11.789826,13.304134,12.922848,12.165525
12050,11.916375,12.328828,11.958261,11.958261,12.083046,11.916375,12.083046,12.165525,12.041595,12.124355,...,12.449900,12.529964,13.416408,12.328828,12.449900,12.806249,11.832160,13.304134,12.961481,12.288206
12051,11.958261,12.449900,12.000000,12.247449,12.124355,12.041595,12.247449,12.206555,12.165525,12.206555,...,12.529964,12.727922,13.674794,12.369317,12.649111,12.922848,12.000000,13.341664,12.961481,12.569805
12052,12.124355,12.529964,12.124355,12.247449,12.288206,12.165525,12.727922,12.328828,12.206555,12.247449,...,12.609520,12.727922,13.747727,12.449900,12.806249,12.922848,12.083046,13.490738,13.152946,12.569805


In [88]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    3.162 2.828 ... 3.    3.162 5.657]


In [89]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True False]


In [90]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7438619774386198


In [91]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3009 3011 3012]


In [92]:
out_Ad=list(np.where(cpd_AD == 0)[0])

##  Prediction only for molecules included in  AD

In [93]:
y_pred_svm_ad=list(y_pred_svm)

In [94]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [95]:
len(y_pred_svm_ad)

2242

In [96]:
y_ts_ad=list(y_ts)

In [97]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [98]:
len(y_ts_ad)

2242

In [99]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.61

In [100]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.42

# Consensus

##  load the models from disk

In [101]:
best_svm = pickle.load(open('Models/MorganFingerprint/LD50_mouse_intravenous_SVM_MF.pkl', 'rb'))

In [102]:
best_GBR = pickle.load(open('Models/MorganFingerprint/LD50_mouse_intravenous_GBR_MFP.pkl', 'rb'))

## Prediction for CV

In [103]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [113]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [114]:
y_pred_CV_GBR = cross_val_predict(best_GBR, x_tr, y_tr, cv=cv)

In [115]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_GBR)/2

In [116]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.54

In [117]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_con)),2)
RMSE_CV

0.46

## Prediction for test set's molecules

In [118]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [119]:
y_pred_svm = best_svm.predict(x_ts)

In [120]:
y_pred_GBR = best_GBR.predict(x_ts)

In [121]:
y_pred_GBR

array([2.65314081, 3.00667763, 1.68043968, ..., 6.60570482, 6.02856091,
       3.74930424])

In [122]:
y_pred_con=(y_pred_svm+y_pred_GBR)/2

In [123]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.55

In [124]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.45

## Estimating applicability domain. Method - Euclidian distances, K=1

In [125]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [126]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.449490,3.162278,4.358899,5.385165,2.236068,3.741657,1.732051,3.316625,4.242640,...,3.162278,5.196152,9.110434,7.071068,1.000000,3.605551,3.872983,3.605551,6.164414,7.280110
2,2.236068,2.828427,4.123106,4.690416,6.164414,2.449490,3.872983,2.449490,4.000000,4.582576,...,3.464102,7.483315,9.165152,7.483315,1.732051,7.810250,4.242640,4.358899,6.324555,7.615773
3,2.236068,3.162278,4.123106,4.898980,6.164414,2.449490,4.123106,2.449490,4.123106,4.582576,...,4.472136,7.615773,9.165152,8.426149,3.162278,7.874008,4.358899,4.472136,6.403124,7.937254
4,2.236068,3.605551,4.123106,4.898980,6.164414,2.645751,4.123106,2.449490,4.123106,4.690416,...,5.196152,7.615773,9.219544,8.544003,3.316625,8.000000,4.472136,5.291502,6.403124,8.124039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.916375,12.041595,12.165525,12.000000,12.369317,11.789826,11.401754,11.789826,12.083046,12.041595,...,13.341664,12.884099,13.152946,13.266500,12.124355,13.152946,12.884099,12.845233,12.369317,13.190906
12050,11.958261,12.124355,12.288206,12.124355,12.409674,11.958261,11.401754,11.916375,12.206555,12.165525,...,13.379088,13.000000,13.152946,13.304134,12.247449,13.266500,12.961481,12.922848,12.449900,13.341664
12051,12.083046,12.409674,12.328828,12.409674,12.609520,12.000000,11.704700,11.958261,12.206555,12.206555,...,13.564660,13.038404,13.152946,13.341664,12.288206,13.304134,13.076696,13.114877,12.489996,13.453624
12052,12.165525,12.489996,12.489996,12.806249,13.076696,12.124355,12.449900,12.124355,12.369317,12.288206,...,13.601471,13.038404,13.266500,13.453624,12.369317,13.638182,13.228757,13.266500,12.569805,13.527749


In [127]:
similarity= neighbors_k

In [128]:
Dmean=np.mean(similarity[1,:])

In [129]:
round(Dmean, 2)

3.6

In [130]:
std=np.std(similarity[1,:])

In [131]:
round(std, 2)

1.18

In [132]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

4.19


In [133]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [134]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,3.162278,2.828427,4.123106,2.645751,2.828427,3.464102,3.464102,2.000000,2.645751,...,3.162278,6.557438,4.898980,3.464102,3.872983,3.605551,4.898980,3.000000,3.162278,5.656854
1,2.236068,3.162278,3.000000,4.242640,3.000000,3.000000,3.464102,3.872983,2.236068,2.828427,...,4.795832,6.782330,5.744563,3.605551,5.291502,5.385165,5.385165,7.141428,3.316625,5.916080
2,2.236068,3.316625,3.000000,4.242640,3.162278,3.000000,3.464102,4.000000,2.236068,3.316625,...,5.000000,7.615773,5.744563,3.741657,5.291502,6.855655,5.385165,7.549834,7.937254,6.082763
3,2.645751,3.316625,3.316625,4.242640,3.162278,3.162278,4.123106,4.000000,2.645751,3.316625,...,5.099020,7.745967,5.830952,3.872983,5.477226,7.000000,5.477226,8.485281,8.000000,6.082763
4,2.828427,3.464102,3.464102,4.358899,3.162278,3.162278,4.242640,4.000000,2.645751,3.316625,...,5.099020,7.810250,5.916080,4.000000,5.477226,7.000000,5.477226,8.485281,8.062258,6.244998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,11.789826,12.288206,11.958261,11.789826,11.958261,11.916375,11.958261,12.000000,11.874342,11.958261,...,12.328828,12.449900,13.266500,12.206555,12.409674,12.727922,11.789826,13.304134,12.922848,12.165525
12050,11.916375,12.328828,11.958261,11.958261,12.083046,11.916375,12.083046,12.165525,12.041595,12.124355,...,12.449900,12.529964,13.416408,12.328828,12.449900,12.806249,11.832160,13.304134,12.961481,12.288206
12051,11.958261,12.449900,12.000000,12.247449,12.124355,12.041595,12.247449,12.206555,12.165525,12.206555,...,12.529964,12.727922,13.674794,12.369317,12.649111,12.922848,12.000000,13.341664,12.961481,12.569805
12052,12.124355,12.529964,12.124355,12.247449,12.288206,12.165525,12.727922,12.328828,12.206555,12.247449,...,12.609520,12.727922,13.747727,12.449900,12.806249,12.922848,12.083046,13.490738,13.152946,12.569805


In [135]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    3.162 2.828 ... 3.    3.162 5.657]


In [136]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True False]


In [137]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7438619774386198


In [138]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3009 3011 3012]


In [139]:
out_Ad=list(np.where(cpd_AD == 0)[0])

## Prediction only for molecules included in  AD

In [140]:
y_pred_con_ad=list(y_pred_con)

In [141]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [142]:
len(y_pred_con_ad)

2242

In [143]:
y_ts_ad=list(y_ts)

In [144]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [145]:
len(y_ts_ad)

2242

In [148]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.64

In [149]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.4

# MACCS -RF

## Calculation MACCS Fingerprints for work set

In [150]:
from rdkit.Chem import MACCSkeys

In [151]:
fp_tr = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ws]

In [152]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [153]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [154]:
savetxt('Models/MACCS/x_tr_MACCS.csv', x_tr, delimiter=',')

In [155]:
x_tr.shape

(12054, 167)

## Calculation  MACCS Fingerprint for test set

In [156]:
fp_ts = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ts]

In [157]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [158]:
x_ts = rdkit_numpy_convert(fp_ts)

In [159]:
x_ts.shape

(3014, 167)

In [160]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

##  Random forest model building and validation¶

In [161]:
seed = 42

In [162]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [46]:
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3, x_tr.shape[1] // 2],
              "n_estimators": [100, 250, 500, 1000]}

In [47]:
m = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [165]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [166]:
m.best_params_
best_RF = m.best_estimator_

In [167]:
m.best_params_

{'max_features': 55, 'n_estimators': 1000}

In [164]:
y_pred_CV_RF = cross_val_predict(best_RF, x_tr, y_tr, cv=cv)

In [165]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_RF), 2)
Q2_CV

0.53

In [167]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_RF)), 2)
RMSE_CV

0.46

##  Prediction for test set's molecules

In [168]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [169]:
len(y_ts)

3014

In [170]:
y_pred_rf = best_RF.predict(x_ts)

In [171]:
Q2_TS = round(r2_score(y_ts, y_pred_rf), 2)
Q2_TS

0.55

In [173]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_rf)), 2)
RMSE_TS

0.45

## save the model to disk

In [176]:
pickle.dump(best_RF, open('Models/MACCS/LD50_mouse_introvenus_RF_MACCS.pkl', 'wb'))

## load the model from disk

In [163]:
best_RF = pickle.load(open('Models/MACCS/LD50_mouse_introvenus_RF_MACCS.pkl', 'rb'))

##  Y-randomization RF model

In [49]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_RF, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=10,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 39.7min


True score =  0.53 
Y-randomization =  -0.14 
p-value =  0.0196


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 66.2min finished


##  Estimating applicability domain. Method - Euclidian distances, K=1

In [174]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [175]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.000000,1.414214,3.316625,3.464102,2.449490,2.236068,2.828427,2.000000,3.000000,...,1.000000,1.000000,4.358899,2.000000,1.414214,2.449490,1.000000,0.000000,3.872983,3.872983
2,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,2.828427,2.645751,3.000000,...,1.000000,4.000000,4.582576,2.000000,1.414214,2.449490,1.414214,1.414214,3.872983,4.000000
3,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,3.000000,2.828427,3.741657,...,1.414214,4.000000,4.582576,2.449490,1.414214,3.464102,1.732051,1.732051,3.872983,4.242640
4,2.449490,2.828427,2.000000,3.464102,3.741657,2.828427,2.645751,3.162278,2.828427,3.872983,...,1.732051,4.358899,4.690416,2.449490,2.000000,3.741657,2.000000,2.000000,4.000000,4.242640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,9.110434,9.110434,8.246211,8.544003,9.055386,9.000000,8.717798,8.831760,8.426149,8.306623,...,8.774964,8.944272,9.165152,8.944272,8.717798,8.717798,8.831760,8.831760,8.831760,9.219544
12050,9.165152,9.165152,8.366600,8.544003,9.055386,9.000000,8.774964,8.888194,8.426149,8.306623,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.831760,8.831760,9.273619
12051,9.165152,9.219544,8.426149,8.602325,9.055386,9.055386,8.774964,8.944272,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.888194,8.888194,9.273619
12052,9.219544,9.219544,8.426149,8.602325,9.110434,9.110434,8.831760,9.000000,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.888194,8.717798,9.000000,8.944272,8.888194,9.273619


In [176]:
similarity= neighbors_k

In [177]:
Dmean=np.mean(similarity[1,:])

In [178]:
round(Dmean, 2)

1.92

In [179]:
std=np.std(similarity[1,:])

In [180]:
round(std, 2)

1.05

In [181]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.45


In [182]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [183]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,1.414214,1.000000,2.236068,1.732051,1.000000,1.732051,2.645751,2.000000,2.828427,...,1.000000,2.000000,0.000000,2.449490,2.000000,0.000000,3.605551,1.414214,2.000000,2.000000
1,2.449490,1.732051,1.414214,2.236068,2.645751,1.414214,2.000000,2.645751,2.000000,2.828427,...,3.162278,2.828427,1.000000,3.162278,3.872983,1.000000,3.872983,2.000000,2.000000,2.236068
2,2.645751,1.732051,1.732051,2.645751,2.828427,1.732051,2.236068,3.162278,2.236068,2.828427,...,3.162278,4.358899,1.000000,3.316625,3.872983,3.872983,3.872983,2.000000,4.582576,2.236068
3,2.828427,1.732051,2.000000,2.645751,2.828427,2.000000,2.236068,3.316625,2.449490,3.162278,...,3.162278,4.582576,1.414214,3.872983,4.242640,3.872983,4.000000,2.449490,4.582576,2.449490
4,2.828427,1.732051,2.236068,2.828427,3.000000,2.236068,2.236068,3.464102,2.645751,3.316625,...,3.162278,4.582576,1.414214,3.872983,4.898980,3.872983,4.000000,2.449490,4.582576,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,8.831760,9.327379,8.888194,8.888194,8.831760,8.888194,8.366600,8.831760,9.055386,9.165152,...,8.888194,8.944272,8.774964,8.485281,8.544003,9.000000,8.485281,8.888194,8.602325,8.888194
12050,8.888194,9.327379,8.944272,8.944272,8.831760,8.944272,8.366600,8.831760,9.055386,9.219544,...,8.888194,8.944272,8.774964,8.544003,8.544003,9.055386,8.485281,8.944272,8.602325,8.888194
12051,8.888194,9.380832,9.000000,8.944272,8.831760,9.000000,8.426149,8.944272,9.110434,9.219544,...,8.888194,8.944272,8.831760,8.660254,8.544003,9.055386,8.544003,8.944272,8.660254,8.888194
12052,8.944272,9.433981,9.000000,9.000000,8.888194,9.000000,8.485281,8.944272,9.165152,9.273619,...,8.888194,8.944272,8.831760,8.717798,8.660254,9.055386,8.602325,8.944272,8.660254,8.944272


In [184]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    1.414 1.    ... 1.414 2.    2.   ]


In [185]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True  True]


In [186]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD),2))

Coverage =  0.72


In [187]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3011 3012 3013]


In [188]:
out_Ad=list(np.where(cpd_AD == 0)[0])

## Prediction only for molecules included in  AD

In [193]:
y_pred_rf_ad=list(y_pred_rf)

In [194]:
y_pred_rf_ad[:] = [x for i,x in enumerate(y_pred_rf_ad) if i not in out_Ad]

In [195]:
len(y_pred_rf_ad)

2177

In [196]:
y_ts_ad=list(y_ts)

In [197]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [198]:
len(y_ts_ad)

2177

In [201]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_rf_ad), 2)
Q2_TS

0.64

In [200]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_rf_ad)), 2)
RMSE_TS

0.41

# MACCS FP_SVM model building and validation

In [200]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [201]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [202]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=2, cv=cv, verbose=1)

In [203]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [204]:
svm.best_params_
best_svm = svm.best_estimator_

In [203]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [204]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.53

In [206]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.46

## Prediction for test set's molecules

In [207]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [208]:
y_pred_svm = best_svm.predict(x_ts)

In [209]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.55

In [210]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.46

save the model to disk

In [212]:
pickle.dump(best_svm, open('Models/MACCS/LD50_mouse_introvenus_SVM_MACCS.pkl', 'wb'))

load the model from disk

In [202]:
best_svm = pickle.load(open('Models/MACCS/LD50_mouse_introvenus_SVM_MACCS.pkl', 'rb'))

## Y-randomization SVM model

In [51]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_svm, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=10,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 13.7min


True score =  0.53 
Y-randomization =  -0.1 
p-value =  0.0196


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed: 19.3min finished


## Estimating applicability domain. Method - Euclidian distances, K=1

In [211]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [212]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.000000,1.414214,3.316625,3.464102,2.449490,2.236068,2.828427,2.000000,3.000000,...,1.000000,1.000000,4.358899,2.000000,1.414214,2.449490,1.000000,0.000000,3.872983,3.872983
2,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,2.828427,2.645751,3.000000,...,1.000000,4.000000,4.582576,2.000000,1.414214,2.449490,1.414214,1.414214,3.872983,4.000000
3,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,3.000000,2.828427,3.741657,...,1.414214,4.000000,4.582576,2.449490,1.414214,3.464102,1.732051,1.732051,3.872983,4.242640
4,2.449490,2.828427,2.000000,3.464102,3.741657,2.828427,2.645751,3.162278,2.828427,3.872983,...,1.732051,4.358899,4.690416,2.449490,2.000000,3.741657,2.000000,2.000000,4.000000,4.242640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,9.110434,9.110434,8.246211,8.544003,9.055386,9.000000,8.717798,8.831760,8.426149,8.306623,...,8.774964,8.944272,9.165152,8.944272,8.717798,8.717798,8.831760,8.831760,8.831760,9.219544
12050,9.165152,9.165152,8.366600,8.544003,9.055386,9.000000,8.774964,8.888194,8.426149,8.306623,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.831760,8.831760,9.273619
12051,9.165152,9.219544,8.426149,8.602325,9.055386,9.055386,8.774964,8.944272,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.888194,8.888194,9.273619
12052,9.219544,9.219544,8.426149,8.602325,9.110434,9.110434,8.831760,9.000000,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.888194,8.717798,9.000000,8.944272,8.888194,9.273619


In [213]:
similarity= neighbors_k

In [214]:
Dmean=np.mean(similarity[1,:])

In [215]:
round(Dmean, 2)

1.92

In [216]:
std=np.std(similarity[1,:])

In [217]:
round(std, 2)

1.05

In [218]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.45


In [219]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [220]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,1.414214,1.000000,2.236068,1.732051,1.000000,1.732051,2.645751,2.000000,2.828427,...,1.000000,2.000000,0.000000,2.449490,2.000000,0.000000,3.605551,1.414214,2.000000,2.000000
1,2.449490,1.732051,1.414214,2.236068,2.645751,1.414214,2.000000,2.645751,2.000000,2.828427,...,3.162278,2.828427,1.000000,3.162278,3.872983,1.000000,3.872983,2.000000,2.000000,2.236068
2,2.645751,1.732051,1.732051,2.645751,2.828427,1.732051,2.236068,3.162278,2.236068,2.828427,...,3.162278,4.358899,1.000000,3.316625,3.872983,3.872983,3.872983,2.000000,4.582576,2.236068
3,2.828427,1.732051,2.000000,2.645751,2.828427,2.000000,2.236068,3.316625,2.449490,3.162278,...,3.162278,4.582576,1.414214,3.872983,4.242640,3.872983,4.000000,2.449490,4.582576,2.449490
4,2.828427,1.732051,2.236068,2.828427,3.000000,2.236068,2.236068,3.464102,2.645751,3.316625,...,3.162278,4.582576,1.414214,3.872983,4.898980,3.872983,4.000000,2.449490,4.582576,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,8.831760,9.327379,8.888194,8.888194,8.831760,8.888194,8.366600,8.831760,9.055386,9.165152,...,8.888194,8.944272,8.774964,8.485281,8.544003,9.000000,8.485281,8.888194,8.602325,8.888194
12050,8.888194,9.327379,8.944272,8.944272,8.831760,8.944272,8.366600,8.831760,9.055386,9.219544,...,8.888194,8.944272,8.774964,8.544003,8.544003,9.055386,8.485281,8.944272,8.602325,8.888194
12051,8.888194,9.380832,9.000000,8.944272,8.831760,9.000000,8.426149,8.944272,9.110434,9.219544,...,8.888194,8.944272,8.831760,8.660254,8.544003,9.055386,8.544003,8.944272,8.660254,8.888194
12052,8.944272,9.433981,9.000000,9.000000,8.888194,9.000000,8.485281,8.944272,9.165152,9.273619,...,8.888194,8.944272,8.831760,8.717798,8.660254,9.055386,8.602325,8.944272,8.660254,8.944272


In [221]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    1.414 1.    ... 1.414 2.    2.   ]


In [222]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True  True]


In [223]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7222959522229595


In [224]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3011 3012 3013]


In [225]:
out_Ad=list(np.where(cpd_AD == 0)[0])

##  Prediction only for molecules included in  AD

In [226]:
y_pred_svm_ad=list(y_pred_svm)

In [227]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [228]:
len(y_pred_svm_ad)

2177

In [229]:
y_ts_ad=list(y_ts)

In [230]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [231]:
len(y_ts_ad)

2177

In [232]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.63

In [233]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.41

# Consensus

##  load the models from disk

In [237]:
best_svm = pickle.load(open('Models/MACCS/LD50_mouse_introvenus_SVM_MACCS.pkl', 'rb'))

In [238]:
best_rf = pickle.load(open('Models/MACCS/LD50_mouse_introvenus_RF_MACCS.pkl', 'rb'))

## Prediction for CV

In [239]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [240]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [241]:
y_pred_CV_rf = cross_val_predict(best_rf, x_tr, y_tr, cv=cv)

In [242]:
y_pred_con=(y_pred_CV_svm+y_pred_CV_rf)/2

In [243]:
Q2_CV = round(r2_score(y_tr, y_pred_con), 2)
Q2_CV

0.54

In [244]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_con)),2)
RMSE_CV

0.46

## Prediction for test set's molecules

In [245]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [246]:
y_pred_svm = best_svm.predict(x_ts)

In [247]:
y_pred_rf = best_rf.predict(x_ts)

In [248]:
y_pred_rf

array([2.20281439, 2.51684199, 2.18160521, ..., 5.236939  , 5.19370257,
       5.49704032])

In [249]:
y_pred_con=(y_pred_svm+y_pred_rf)/2

In [250]:
Q2_TS = round(r2_score(y_ts, y_pred_con), 2)
Q2_TS

0.56

In [251]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_con)), 2)
RMSE_TS

0.45

## Estimating applicability domain. Method - Euclidian distances, K=1

In [252]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [253]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12044,12045,12046,12047,12048,12049,12050,12051,12052,12053
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2.000000,2.000000,1.414214,3.316625,3.464102,2.449490,2.236068,2.828427,2.000000,3.000000,...,1.000000,1.000000,4.358899,2.000000,1.414214,2.449490,1.000000,0.000000,3.872983,3.872983
2,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,2.828427,2.645751,3.000000,...,1.000000,4.000000,4.582576,2.000000,1.414214,2.449490,1.414214,1.414214,3.872983,4.000000
3,2.000000,2.645751,1.732051,3.316625,3.605551,2.645751,2.449490,3.000000,2.828427,3.741657,...,1.414214,4.000000,4.582576,2.449490,1.414214,3.464102,1.732051,1.732051,3.872983,4.242640
4,2.449490,2.828427,2.000000,3.464102,3.741657,2.828427,2.645751,3.162278,2.828427,3.872983,...,1.732051,4.358899,4.690416,2.449490,2.000000,3.741657,2.000000,2.000000,4.000000,4.242640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,9.110434,9.110434,8.246211,8.544003,9.055386,9.000000,8.717798,8.831760,8.426149,8.306623,...,8.774964,8.944272,9.165152,8.944272,8.717798,8.717798,8.831760,8.831760,8.831760,9.219544
12050,9.165152,9.165152,8.366600,8.544003,9.055386,9.000000,8.774964,8.888194,8.426149,8.306623,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.831760,8.831760,9.273619
12051,9.165152,9.219544,8.426149,8.602325,9.055386,9.055386,8.774964,8.944272,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.774964,8.717798,8.888194,8.888194,8.888194,9.273619
12052,9.219544,9.219544,8.426149,8.602325,9.110434,9.110434,8.831760,9.000000,8.485281,8.366600,...,8.774964,8.944272,9.219544,8.944272,8.888194,8.717798,9.000000,8.944272,8.888194,9.273619


In [254]:
similarity= neighbors_k

In [255]:
Dmean=np.mean(similarity[1,:])

In [256]:
round(Dmean, 2)

1.92

In [257]:
std=np.std(similarity[1,:])

In [258]:
round(std, 2)

1.05

In [259]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.45


In [260]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [261]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3004,3005,3006,3007,3008,3009,3010,3011,3012,3013
0,2.000000,1.414214,1.000000,2.236068,1.732051,1.000000,1.732051,2.645751,2.000000,2.828427,...,1.000000,2.000000,0.000000,2.449490,2.000000,0.000000,3.605551,1.414214,2.000000,2.000000
1,2.449490,1.732051,1.414214,2.236068,2.645751,1.414214,2.000000,2.645751,2.000000,2.828427,...,3.162278,2.828427,1.000000,3.162278,3.872983,1.000000,3.872983,2.000000,2.000000,2.236068
2,2.645751,1.732051,1.732051,2.645751,2.828427,1.732051,2.236068,3.162278,2.236068,2.828427,...,3.162278,4.358899,1.000000,3.316625,3.872983,3.872983,3.872983,2.000000,4.582576,2.236068
3,2.828427,1.732051,2.000000,2.645751,2.828427,2.000000,2.236068,3.316625,2.449490,3.162278,...,3.162278,4.582576,1.414214,3.872983,4.242640,3.872983,4.000000,2.449490,4.582576,2.449490
4,2.828427,1.732051,2.236068,2.828427,3.000000,2.236068,2.236068,3.464102,2.645751,3.316625,...,3.162278,4.582576,1.414214,3.872983,4.898980,3.872983,4.000000,2.449490,4.582576,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,8.831760,9.327379,8.888194,8.888194,8.831760,8.888194,8.366600,8.831760,9.055386,9.165152,...,8.888194,8.944272,8.774964,8.485281,8.544003,9.000000,8.485281,8.888194,8.602325,8.888194
12050,8.888194,9.327379,8.944272,8.944272,8.831760,8.944272,8.366600,8.831760,9.055386,9.219544,...,8.888194,8.944272,8.774964,8.544003,8.544003,9.055386,8.485281,8.944272,8.602325,8.888194
12051,8.888194,9.380832,9.000000,8.944272,8.831760,9.000000,8.426149,8.944272,9.110434,9.219544,...,8.888194,8.944272,8.831760,8.660254,8.544003,9.055386,8.544003,8.944272,8.660254,8.888194
12052,8.944272,9.433981,9.000000,9.000000,8.888194,9.000000,8.485281,8.944272,9.165152,9.273619,...,8.888194,8.944272,8.831760,8.717798,8.660254,9.055386,8.602325,8.944272,8.660254,8.944272


In [262]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[2.    1.414 1.    ... 1.414 2.    2.   ]


In [263]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True ...  True  True  True]


In [264]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7222959522229595


In [265]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 3011 3012 3013]


In [266]:
out_Ad=list(np.where(cpd_AD == 0)[0])

## Prediction only for molecules included in  AD

In [267]:
y_pred_con_ad=list(y_pred_con)

In [268]:
y_pred_con_ad[:] = [x for i,x in enumerate(y_pred_con_ad) if i not in out_Ad]

In [269]:
len(y_pred_con_ad)

2177

In [270]:
y_ts_ad=list(y_ts)

In [271]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [272]:
len(y_ts_ad)

2177

In [273]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_con_ad), 2)
Q2_TS

0.64

In [274]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_con_ad)), 2)
RMSE_TS

0.4