In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm

In [3]:
def convert_smi_to_canon_smi(smi):
    """Return the non-isomeric canonical SMILES for ``smi``.

    Parameters
    ----------
    smi : str
        Input SMILES string.

    Returns
    -------
    str
        Canonical SMILES written by RDKit with ``isomericSmiles=False``, or
        the sentinel string ``'wrong_smiles'`` when the input cannot be
        parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # An unparsable string is reported by returning None rather than by
        # raising, so handle it explicitly instead of relying on the
        # follow-up MolToSmiles call to fail.
        if mol is None:
            return 'wrong_smiles'
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception:
        # Non-string input (e.g. NaN coming from pandas) or conversion errors.
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate.
        return 'wrong_smiles'

In [4]:
def standart(smi):
    """Standardize a SMILES string with MolVS and parse it into an RDKit molecule.

    Parameters
    ----------
    smi : str
        SMILES string, or the sentinel ``'wrong_smiles'``.

    Returns
    -------
    rdkit.Chem.rdchem.Mol, str or None
        The molecule built from the standardized SMILES; the string
        ``'check the smiles'`` when ``smi`` is the ``'wrong_smiles'``
        sentinel; ``None`` when standardization or parsing fails.
    """
    if smi == 'wrong_smiles':
        return 'check the smiles'
    try:
        # Build the molecule from the *standardized* SMILES; parsing the raw
        # input would throw the standardization result away.
        return Chem.MolFromSmiles(standardize_smiles(smi))
    except Exception:
        # No module-level state is kept, so a failure can never hand back the
        # molecule produced for a previous row.
        return None

# Load and curate the work set

In [9]:
df_ws=pd.read_csv('datasets/rat_LD50_WS.csv')
df_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_intravenous_LD50
0,TOX-3990,178,acetamide,CC(N)=O,CC(N)=O,DLFVBJFMPXGRIB-UHFFFAOYSA-N,0.674442
1,TOX-2407,180,propan-2-one,CC(C)=O,CC(C)=O,CSCPPACGZOOCGX-UHFFFAOYSA-N,1.023664
2,TOX-8193,12025,pyrrolidin-2-one,O=C1CCCN1,O=C1CCCN1,HNJBEVLQSNELDL-UHFFFAOYSA-N,1.072628
3,TOX-4318,1030,"propane-1,2-diol",CC(O)CO,CC(O)CO,DNIAPMSPPWPWGF-UHFFFAOYSA-N,1.073618
4,TOX-8477,99937,"2,3,3a,5,6,6a-hexahydrofuro[3,2-b]furan-3,6-diol",OC1COC2C(O)COC12,OC1COC2C(O)COC12,KLDXJTOLSGUMSJ-UHFFFAOYSA-N,1.123382
...,...,...,...,...,...,...,...
1853,TOX-43158,116224,2-diethoxyphosphorylsulfanylethyl-ethyl-methyl...,CCOP(=O)(OCC)SCC[S+](C)CC,CCOP(=O)(OCC)SCC[S+](C)CC,YDYNMVNUGKEYOS-UHFFFAOYSA-N,7.232647
1854,TOX-2075,14955,"9,10,21,25-tetramethoxy-15,15,30,30-tetramethy...",COc1ccc2cc1Oc1cc3c(cc1OC)CC[N+](C)(C)C3Cc1ccc(...,COc1ccc2cc1Oc1cc3c(cc1OC)CC[N+](C)(C)C3Cc1ccc(...,JFXBEKISTKFVAB-UHFFFAOYSA-N,7.270733
1855,TOX-43154,116216,2-diethoxyphosphorylsulfanylethyl(diethyl)sulf...,CCOP(=O)(OCC)SCC[S+](CC)CC,CCOP(=O)(OCC)SCC[S+](CC)CC,QACHTXIQQRWDTM-UHFFFAOYSA-N,7.458497
1856,TOX-43157,116222,2-dimethoxyphosphorylsulfanylethyl-ethyl-(2-et...,CCSCC[S+](CC)CCSP(=O)(OC)OC,CCSCC[S+](CC)CCSP(=O)(OC)OC,GICPMZCWCHOATC-UHFFFAOYSA-N,7.805466


## Load and curate the test set

In [13]:
df_ts=pd.read_csv('datasets/rat_LD50_TS.csv')
df_ts

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_intravenous_LD50
0,TOX-69105,9887295,"1-[6-(benzenesulfonyl)-3-hydroxy-2,2-dimethyl-...",CC1(C)Oc2ccc(S(=O)(=O)c3ccccc3)cc2C(N2CCCC2=O)C1O,CC1(C)Oc2ccc(S(=O)(=O)c3ccccc3)cc2C(N2CCCC2=O)C1O,LKAQWOWWTKFLNX-UHFFFAOYSA-N,0.496458
1,TOX-5811,8172,2-[2-(2-hydroxyethoxy)ethoxy]ethanol,OCCOCCOCCO,OCCOCCOCCO,ZIBGPFATKBEMQZ-UHFFFAOYSA-N,1.108409
2,TOX-2835,4101,"1,3,5,7-tetrazatricyclo[3.3.1.13,7]decane",C1N2CN3CN1CN(C2)C3,C1N2CN3CN1CN(C2)C3,VKYKSIONXSXAKP-UHFFFAOYSA-N,1.182929
3,TOX-3906,174,"ethane-1,2-diol",OCCO,OCCO,LYCAIKOWRPUZTN-UHFFFAOYSA-N,1.279650
4,TOX-5749,8087,1-(2-hydroxypropoxy)propan-2-ol,CC(O)COCC(C)O,CC(O)COCC(C)O,AZUXKVXMJOIAOF-UHFFFAOYSA-N,1.364244
...,...,...,...,...,...,...,...
460,TOX-44743,76419053,"[4-[4-[4-[3,5-dihydroxy-6-(hydroxymethyl)-4-[3...",CC1(C)CCC2(C(=O)OC3OC(CO)C(O)C(OC4OC(CO)C(O)C(...,CC1(C)CCC2(C(=O)OC3OC(CO)C(O)C(OC4OC(CO)C(O)C(...,UZQJVUCHXGYFLQ-UHFFFAOYSA-N,6.425276
461,TOX-5571,7871,2-[fluoro(methyl)phosphoryl]oxypropane,CC(C)OP(C)(=O)F,CC(C)OP(C)(=O)F,DYAHQFWOVKZOOW-UHFFFAOYSA-N,6.555355
462,TOX-5132,7305,"3-[fluoro(methyl)phosphoryl]oxy-2,2-dimethylbu...",CC(OP(C)(=O)F)C(C)(C)C,CC(OP(C)(=O)F)C(C)(C)C,GRXKLBBBQUKJJZ-UHFFFAOYSA-N,6.612129
463,TOX-23193,102302,"2-[ethoxy(methyl)phosphoryl]sulfanyl-N,N-dimet...",CCOP(C)(=O)SCCN(C)C,CCOP(C)(=O)SCCN(C)C,PKDYQTANBZBIRM-UHFFFAOYSA-N,7.094383


## Standardization for the work set

In [17]:
# Turn every canonical SMILES of the work set into a molecule object.
df_ws["Molecule"] = df_ws["Canonical_SMILES"].apply(standart)
# No rows are removed in this cell, so this reports the full work-set size.
print('Kept data: ', len(df_ws), 'molecules')

Kept data:  1858 molecules


In [19]:
df_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_intravenous_LD50,Molecule
0,TOX-3990,178,acetamide,CC(N)=O,CC(N)=O,DLFVBJFMPXGRIB-UHFFFAOYSA-N,0.674442,<rdkit.Chem.rdchem.Mol object at 0x0000024B079...
1,TOX-2407,180,propan-2-one,CC(C)=O,CC(C)=O,CSCPPACGZOOCGX-UHFFFAOYSA-N,1.023664,<rdkit.Chem.rdchem.Mol object at 0x0000024B079...
2,TOX-8193,12025,pyrrolidin-2-one,O=C1CCCN1,O=C1CCCN1,HNJBEVLQSNELDL-UHFFFAOYSA-N,1.072628,<rdkit.Chem.rdchem.Mol object at 0x0000024B079...
3,TOX-4318,1030,"propane-1,2-diol",CC(O)CO,CC(O)CO,DNIAPMSPPWPWGF-UHFFFAOYSA-N,1.073618,<rdkit.Chem.rdchem.Mol object at 0x0000024B079...
4,TOX-8477,99937,"2,3,3a,5,6,6a-hexahydrofuro[3,2-b]furan-3,6-diol",OC1COC2C(O)COC12,OC1COC2C(O)COC12,KLDXJTOLSGUMSJ-UHFFFAOYSA-N,1.123382,<rdkit.Chem.rdchem.Mol object at 0x0000024B079...
...,...,...,...,...,...,...,...,...
1853,TOX-43158,116224,2-diethoxyphosphorylsulfanylethyl-ethyl-methyl...,CCOP(=O)(OCC)SCC[S+](C)CC,CCOP(=O)(OCC)SCC[S+](C)CC,YDYNMVNUGKEYOS-UHFFFAOYSA-N,7.232647,<rdkit.Chem.rdchem.Mol object at 0x0000024B07A...
1854,TOX-2075,14955,"9,10,21,25-tetramethoxy-15,15,30,30-tetramethy...",COc1ccc2cc1Oc1cc3c(cc1OC)CC[N+](C)(C)C3Cc1ccc(...,COc1ccc2cc1Oc1cc3c(cc1OC)CC[N+](C)(C)C3Cc1ccc(...,JFXBEKISTKFVAB-UHFFFAOYSA-N,7.270733,<rdkit.Chem.rdchem.Mol object at 0x0000024B07A...
1855,TOX-43154,116216,2-diethoxyphosphorylsulfanylethyl(diethyl)sulf...,CCOP(=O)(OCC)SCC[S+](CC)CC,CCOP(=O)(OCC)SCC[S+](CC)CC,QACHTXIQQRWDTM-UHFFFAOYSA-N,7.458497,<rdkit.Chem.rdchem.Mol object at 0x0000024B07A...
1856,TOX-43157,116222,2-dimethoxyphosphorylsulfanylethyl-ethyl-(2-et...,CCSCC[S+](CC)CCSP(=O)(OC)OC,CCSCC[S+](CC)CCSP(=O)(OC)OC,GICPMZCWCHOATC-UHFFFAOYSA-N,7.805466,<rdkit.Chem.rdchem.Mol object at 0x0000024B07A...


In [24]:
y_tr=df_ws.rat_intravenous_LD50	
y_tr

0       0.674442
1       1.023664
2       1.072628
3       1.073618
4       1.123382
          ...   
1853    7.232647
1854    7.270733
1855    7.458497
1856    7.805466
1857    7.842020
Name: rat_intravenous_LD50, Length: 1858, dtype: float64

In [26]:
# Single-column frame with the work-set SMILES strings (the `SMILES` column,
# not `Canonical_SMILES`); it is exported as PaDEL input in a later cell.
df = df_ws[["SMILES"]]
df

Unnamed: 0,SMILES
0,CC(N)=O
1,CC(C)=O
2,O=C1CCCN1
3,CC(O)CO
4,OC1COC2C(O)COC12
...,...
1853,CCOP(=O)(OCC)SCC[S+](C)CC
1854,COc1ccc2cc1Oc1cc3c(cc1OC)CC[N+](C)(C)C3Cc1ccc(...
1855,CCOP(=O)(OCC)SCC[S+](CC)CC
1856,CCSCC[S+](CC)CCSP(=O)(OC)OC


In [13]:
df.to_csv('datasets/molecule_ws.smi', sep=',', index=False, header=False)

In [14]:
moldf_ws=df_ws.Molecule

# Standardization for the test set

In [30]:
# Turn every canonical SMILES of the test set into a molecule object.
df_ts["Molecule"] = df_ts["Canonical_SMILES"].apply(standart)
# Keep the rows whose SMILES is not the 'wrong_smiles' sentinel.
# NOTE(review): nothing visible in this notebook writes that sentinel into
# the SMILES column, so this filter looks like a no-op - confirm.
moldf_ts = df_ts.loc[df_ts['SMILES'].ne('wrong_smiles')]
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  465 molecules


In [31]:
moldf_ts

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_intravenous_LD50,Molecule
0,TOX-69105,9887295,"1-[6-(benzenesulfonyl)-3-hydroxy-2,2-dimethyl-...",CC1(C)Oc2ccc(S(=O)(=O)c3ccccc3)cc2C(N2CCCC2=O)C1O,CC1(C)Oc2ccc(S(=O)(=O)c3ccccc3)cc2C(N2CCCC2=O)C1O,LKAQWOWWTKFLNX-UHFFFAOYSA-N,0.496458,<rdkit.Chem.rdchem.Mol object at 0x0000024B07A...
1,TOX-5811,8172,2-[2-(2-hydroxyethoxy)ethoxy]ethanol,OCCOCCOCCO,OCCOCCOCCO,ZIBGPFATKBEMQZ-UHFFFAOYSA-N,1.108409,<rdkit.Chem.rdchem.Mol object at 0x0000024B09B...
2,TOX-2835,4101,"1,3,5,7-tetrazatricyclo[3.3.1.13,7]decane",C1N2CN3CN1CN(C2)C3,C1N2CN3CN1CN(C2)C3,VKYKSIONXSXAKP-UHFFFAOYSA-N,1.182929,<rdkit.Chem.rdchem.Mol object at 0x0000024B09B...
3,TOX-3906,174,"ethane-1,2-diol",OCCO,OCCO,LYCAIKOWRPUZTN-UHFFFAOYSA-N,1.279650,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A7...
4,TOX-5749,8087,1-(2-hydroxypropoxy)propan-2-ol,CC(O)COCC(C)O,CC(O)COCC(C)O,AZUXKVXMJOIAOF-UHFFFAOYSA-N,1.364244,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A7...
...,...,...,...,...,...,...,...,...
460,TOX-44743,76419053,"[4-[4-[4-[3,5-dihydroxy-6-(hydroxymethyl)-4-[3...",CC1(C)CCC2(C(=O)OC3OC(CO)C(O)C(OC4OC(CO)C(O)C(...,CC1(C)CCC2(C(=O)OC3OC(CO)C(O)C(OC4OC(CO)C(O)C(...,UZQJVUCHXGYFLQ-UHFFFAOYSA-N,6.425276,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A9...
461,TOX-5571,7871,2-[fluoro(methyl)phosphoryl]oxypropane,CC(C)OP(C)(=O)F,CC(C)OP(C)(=O)F,DYAHQFWOVKZOOW-UHFFFAOYSA-N,6.555355,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A9...
462,TOX-5132,7305,"3-[fluoro(methyl)phosphoryl]oxy-2,2-dimethylbu...",CC(OP(C)(=O)F)C(C)(C)C,CC(OP(C)(=O)F)C(C)(C)C,GRXKLBBBQUKJJZ-UHFFFAOYSA-N,6.612129,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A9...
463,TOX-23193,102302,"2-[ethoxy(methyl)phosphoryl]sulfanyl-N,N-dimet...",CCOP(C)(=O)SCCN(C)C,CCOP(C)(=O)SCCN(C)C,PKDYQTANBZBIRM-UHFFFAOYSA-N,7.094383,<rdkit.Chem.rdchem.Mol object at 0x0000024B0A9...


In [34]:
y_ts=moldf_ts.rat_intravenous_LD50
y_ts

0      0.496458
1      1.108409
2      1.182929
3      1.279650
4      1.364244
         ...   
460    6.425276
461    6.555355
462    6.612129
463    7.094383
464    7.582023
Name: rat_intravenous_LD50, Length: 465, dtype: float64

In [36]:
# Single-column frame with the test-set SMILES strings.
# NOTE: this rebinds `df_ts`, which until here held the full test table, so
# the earlier cells that read other columns of `df_ts` cannot be re-run after
# this one without reloading the CSV.
df_ts = moldf_ts[["SMILES"]]
df_ts

Unnamed: 0,SMILES
0,CC1(C)Oc2ccc(S(=O)(=O)c3ccccc3)cc2C(N2CCCC2=O)C1O
1,OCCOCCOCCO
2,C1N2CN3CN1CN(C2)C3
3,OCCO
4,CC(O)COCC(C)O
...,...
460,CC1(C)CCC2(C(=O)OC3OC(CO)C(O)C(OC4OC(CO)C(O)C(...
461,CC(C)OP(C)(=O)F
462,CC(OP(C)(=O)F)C(C)(C)C
463,CCOP(C)(=O)SCCN(C)C


In [38]:
df_ts.to_csv('datasets/molecule_ts.smi', sep=',', index=False, header=False)

# Fingerprint calculation for the work set

In [41]:
import glob

# Descriptor-definition XML files, in alphabetical order.
xml_files = sorted(glob.glob("fingerprints_xml/*.xml"))
xml_files

['fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'fingerprints_xml\\EStateFingerprinter.xml',
 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'fingerprints_xml\\Fingerprinter.xml',
 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'fingerprints_xml\\MACCSFingerprinter.xml',
 'fingerprints_xml\\PubchemFingerprinter.xml',
 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'fingerprints_xml\\SubstructureFingerprinter.xml']

In [43]:
# Short fingerprint labels. They are paired positionally with the sorted
# `xml_files` list in the next cell, so the order here must match the
# alphabetical order of the XML file names.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [49]:
# Map each fingerprint label to its XML definition file. The pairing is purely
# positional, and zip() silently truncates to the shorter input, so a missing
# or extra file in the directory would otherwise yield a wrong mapping
# without any error.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files, found {len(xml_files)}"
    )
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

In [51]:
from padelpy import padeldescriptor

# Fingerprint type computed for the work set; it selects the XML definition
# file and names the output CSV.
fingerprint = 'AtomPairs2D'

fingerprint_output_file = f'{fingerprint}.csv'
fingerprint_descriptortypes = fp[fingerprint]

# Run PaDEL on the exported work-set SMILES file and write the fingerprint
# table to `fingerprint_output_file`.
padeldescriptor(
    mol_dir='datasets/molecule_ws.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    removesalt=True,
    threads=2,
    log=True,
)

In [52]:
descriptors_AtomPairs2D = pd.read_csv('AtomPairs2D.csv')

In [53]:
descriptors_AtomPairs2D

Unnamed: 0,Name,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780
0,AUTOGEN_molecule_ws_1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_ws_2,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_ws_3,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_ws_4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_ws_5,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,AUTOGEN_molecule_ws_1854,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1854,AUTOGEN_molecule_ws_1855,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1855,AUTOGEN_molecule_ws_1856,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1856,AUTOGEN_molecule_ws_1857,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
descriptors_AtomPairs2D[descriptors_AtomPairs2D. isna(). any(axis=1)]

Unnamed: 0,Name,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780


In [58]:
x_tr_AtomPairs2D = descriptors_AtomPairs2D.drop('Name', axis=1)

In [60]:
# Convert the work-set fingerprints and targets to float32 NumPy arrays for
# the estimators below.
x_tr, y_tr = (
    np.array(values, dtype=np.float32) for values in (x_tr_AtomPairs2D, y_tr)
)

In [62]:
savetxt('models/Padels/x_tr_AtomPairs2D.csv', x_tr_AtomPairs2D, delimiter=',')

## Descriptor calculation for test set

In [64]:
from padelpy import padeldescriptor

# Same fingerprint type as for the work set; the output CSV gets a `_ts`
# suffix.
fingerprint = 'AtomPairs2D'

fingerprint_output_file = f'{fingerprint}_ts.csv'
fingerprint_descriptortypes = fp[fingerprint]

# Run PaDEL on the exported test-set SMILES file with the same processing
# options as the work set (only `log` differs).
padeldescriptor(
    mol_dir='datasets/molecule_ts.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    removesalt=True,
    threads=2,
    log=False,
)

In [65]:
descriptors_AtomPairs2D_ts = pd.read_csv('AtomPairs2D_ts.csv')

In [68]:
descriptors_AtomPairs2D_ts[descriptors_AtomPairs2D_ts.isna().any(axis=1)]

Unnamed: 0,Name,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780


In [70]:
x_ts = descriptors_AtomPairs2D_ts.drop('Name', axis=1)
x_ts

Unnamed: 0,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,AD2D10,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
461,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
462,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
463,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
x_ts.shape

(465, 780)

# CatBoostRegressor

In [76]:
cv=KFold(n_splits=5, random_state=42, shuffle=True)

In [78]:
%%time
model = CatBoostRegressor()
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [100,500, 1000]
              }

grid = GridSearchCV(estimator=model, param_grid = parameters, n_jobs=-1, cv = cv)
grid.fit(x_tr, y_tr, verbose=False)

CPU times: total: 4min 55s
Wall time: 15min 8s


In [79]:
best_CatBR = grid.best_estimator_

In [80]:
grid.best_params_

{'depth': 10, 'iterations': 500, 'learning_rate': 0.05}

In [81]:
y_pred_ws_GBR = best_CatBR.predict(x_tr)

In [82]:
R2_WS = round(r2_score(y_tr, y_pred_ws_GBR), 2)
R2_WS

0.81

In [60]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_GBR)), 2)
RMSE_WS

0.47

In [84]:
params={'verbose': False}

In [86]:
%%time
# Out-of-fold predictions of the tuned CatBoost model on the work set, using
# the same `cv` splitter as the grid search; `params` ({'verbose': False}) is
# forwarded to each fit call.
# NOTE(review): newer scikit-learn releases rename the `fit_params` keyword to
# `params` - confirm against the installed version before upgrading.
y_pred_CV_CatBR = cross_val_predict(best_CatBR, x_tr, y_tr, cv=cv, fit_params=params)



CPU times: total: 19min 46s
Wall time: 1min 42s


In [87]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_CatBR), 2)
Q2_CV

0.47

In [88]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_CatBR)), 2)
RMSE_CV

0.72

# save the model to disk

In [107]:
# Persist the tuned CatBoost model. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): this path uses 'Models/' while other cells write to 'models/';
# the two differ on case-sensitive file systems - confirm which is intended.
with open('Models/Padels/Toxicity_CatBoost_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_CatBR, model_file)

# load the model from disk

In [44]:
# Restore the tuned CatBoost model; the context manager closes the file
# handle once loading is done.
# SECURITY: unpickling can execute arbitrary code - only load model files
# produced by this notebook or another trusted source.
with open('Models/Padels/Toxicity_CatBoost_AtomPairs2D.pkl', 'rb') as model_file:
    best_CatBR = pickle.load(model_file)

# 9. Prediction for test set's molecules

In [110]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [112]:
y_pred_GBR = best_CatBR.predict(x_ts)

In [114]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.41

In [116]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.76

# 11. Estimating the applicability domain. Method: Euclidean distances, K=1

In [119]:
# Pairwise distances within the work set, each column sorted in ascending
# order: row 0 holds the zero self-distances, row 1 the nearest-neighbour
# distance of every compound.
neighbors_k = np.sort(pairwise_distances(x_tr, n_jobs=-1), axis=0)

In [121]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.414214,1.414214,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,2.236068,1.000000,...,0.000000,1.000000,4.000000,0.000000,2.000000,0.000000,0.000000,0.000000,1.732051,1.732051
2,1.414214,1.414214,1.000000,1.414214,1.414214,1.000000,1.000000,1.000000,2.449490,1.414214,...,0.000000,1.732051,4.242640,1.732051,2.000000,0.000000,1.732051,0.000000,3.464102,3.316625
3,1.732051,1.732051,1.414214,1.414214,1.414214,1.414214,1.000000,1.732051,2.645751,1.732051,...,1.732051,1.732051,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
4,1.732051,1.732051,1.732051,1.732051,1.414214,1.732051,1.414214,1.732051,2.828427,1.732051,...,2.236068,2.000000,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.848858,9.949874,9.746795,9.899495,9.695360,9.695360,9.643651,10.099504,9.273619,9.746795,...,8.660254,8.831760,9.949874,8.944272,8.888194,9.848858,8.944272,9.848858,10.099504,10.049875
1854,9.899495,10.000000,9.797959,9.949874,9.746795,9.695360,9.643651,10.148891,9.273619,9.746795,...,8.717798,8.944272,10.000000,9.055386,9.055386,9.949874,9.055386,9.949874,10.148891,10.099504
1855,10.198039,10.295630,10.099504,10.295630,10.099504,10.148891,10.099504,10.440307,9.643651,10.198039,...,9.055386,9.000000,10.246951,9.055386,9.219544,10.000000,9.055386,10.000000,10.246951,10.198039
1856,10.440307,10.535654,10.344080,10.535654,10.344080,10.198039,10.148891,10.677078,9.797959,10.344080,...,9.110434,9.055386,10.488089,9.110434,9.273619,10.198039,9.110434,10.198039,10.488089,10.440307


In [123]:
similarity= neighbors_k

In [125]:
Dmean=np.mean(similarity[1,:])

In [127]:
round(Dmean, 2)

1.58

In [129]:
std=np.std(similarity[1,:])

In [130]:
round(std, 2)

1.39

In [133]:
# Applicability-domain threshold: mean nearest-neighbour distance of the work
# set plus half of its standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.27


In [135]:
# Work-set-to-test-set distances (one column per test compound), each column
# sorted in ascending order so that row 0 is the distance to the closest
# work-set compound.
neighbors_k_ts = np.sort(pairwise_distances(x_tr, Y=x_ts, n_jobs=-1), axis=0)

In [137]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
0,3.605551,1.000000,1.732051,1.414214,1.000000,1.000000,1.414214,2.645751,0.000000,0.000000,...,0.000000,2.449490,3.464102,3.162278,0.000000,0.000000,3.741657,3.741657,3.605551,3.741657
1,3.605551,2.236068,1.732051,1.414214,1.414214,1.000000,1.414214,2.828427,0.000000,2.000000,...,0.000000,2.449490,3.741657,3.162278,1.000000,0.000000,3.872983,4.123106,3.872983,3.741657
2,3.741657,2.449490,2.000000,1.732051,1.414214,1.000000,1.414214,3.000000,1.000000,2.236068,...,1.000000,2.449490,3.741657,3.316625,1.000000,0.000000,3.872983,4.123106,4.000000,3.741657
3,3.872983,2.449490,2.000000,1.732051,1.414214,1.000000,1.732051,3.000000,1.000000,2.828427,...,1.000000,2.449490,3.741657,3.316625,1.414214,0.000000,3.872983,4.242640,4.000000,4.000000
4,3.872983,2.449490,2.236068,1.732051,1.732051,1.000000,1.732051,3.162278,2.000000,2.828427,...,1.414214,2.449490,3.872983,3.464102,1.732051,0.000000,3.872983,4.242640,4.000000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.000000,9.433981,9.949874,10.000000,9.539392,9.219544,9.591663,9.486833,9.486833,9.746795,...,9.110434,8.717798,9.848858,9.055386,9.110434,9.165152,10.099504,10.049875,9.591663,9.539392
1854,9.055386,9.539392,10.000000,10.049875,9.539392,9.219544,9.643651,9.643651,9.643651,9.797959,...,9.165152,8.831760,9.949874,9.165152,9.219544,9.273619,10.246951,10.198039,9.643651,9.539392
1855,9.219544,10.000000,10.344080,10.392304,10.000000,9.486833,9.949874,9.746795,9.695360,9.797959,...,9.539392,9.000000,10.000000,9.219544,9.486833,9.433981,10.392304,10.246951,9.797959,9.797959
1856,9.219544,10.148891,10.583005,10.630146,10.148891,9.539392,10.198039,9.797959,9.848858,10.148891,...,9.695360,9.110434,10.198039,9.273619,9.539392,9.486833,10.440307,10.344080,9.899495,9.797959


In [139]:
similarity_ts = neighbors_k_ts
# Row 0 of the column-sorted matrix: distance from each test compound to its
# nearest work-set compound.
cpd_AD = similarity_ts[0, :]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[3.606 1.    1.732 1.414 1.    1.    1.414 2.646 0.    0.    0.    2.646
 1.732 0.    3.162 3.464 2.646 1.414 4.    1.732 3.742 2.236 6.083 3.
 2.449 3.    2.236 3.162 1.    1.    1.732 2.449 1.414 2.    0.    0.
 0.    4.    2.    2.449 3.162 1.    1.732 2.236 3.464 2.236 3.162 1.
 2.    1.    1.    1.    0.    3.606 3.464 2.236 2.236 5.099 1.    2.646
 2.    0.    2.    2.236 0.    1.732 2.449 1.    3.742 2.236 2.449 0.
 3.    4.    4.796 2.828 2.    3.606 0.    1.    2.    1.    0.    1.414
 2.828 3.464 1.732 3.606 4.243 0.    1.    2.646 2.449 0.    3.606 1.
 0.    4.899 2.646 2.    3.    2.449 1.732 2.236 2.646 3.606 1.732 4.
 0.    1.    1.732 2.    3.    1.    2.    3.464 0.    0.    2.646 3.606
 2.    0.    2.    0.    1.414 0.    3.    1.732 1.414 3.162 1.732 1.732
 2.236 2.236 0.    5.477 2.    3.    2.236 1.    0.    2.449 5.    0.
 2.449 1.    2.449 1.732 0.    1.732 0.    2.236 0.    3.606 2.828 3.
 4.796 2.646 0.    4.    2.449 4.123 3.464 2.236 3.    0.    2.    0.
 0.  

In [141]:
# True where the (rounded) nearest-neighbour distance is within the AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[False  True  True  True  True  True  True False  True  True  True False
  True  True False False False  True False  True False  True False False
 False False  True False  True  True  True False  True  True  True  True
  True False  True False False  True  True  True False  True False  True
  True  True  True  True  True False False  True  True False  True False
  True  True  True  True  True  True False  True False  True False  True
 False False False False  True False  True  True  True  True  True  True
 False False  True False False  True  True False False  True False  True
  True False False  True False False  True  True False False  True False
  True  True  True  True False  True  True False  True  True False False
  True  True  True  True  True  True False  True  True False  True  True
  True  True  True False  True False  True  True  True False False  True
 False  True False  True  True  True  True  True  True False False False
 False False  True False False False False  True Fa

In [143]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.mean(), 2))

Coverage =  0.71


In [145]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  1   2   3   4   5   6   8   9  10  12  13  17  19  21  26  28  29  30
  32  33  34  35  36  38  41  42  43  45  47  48  49  50  51  52  55  56
  58  60  61  62  63  64  65  67  69  71  76  78  79  80  81  82  83  86
  89  90  93  95  96  99 102 103 106 108 109 110 111 113 114 116 117 120
 121 122 123 124 125 127 128 130 131 132 133 134 136 138 139 140 143 145
 147 148 149 150 151 152 158 163 165 166 167 168 169 173 174 177 178 180
 181 182 185 186 188 189 190 191 193 194 195 196 197 198 200 202 203 204
 205 206 207 210 211 213 214 216 217 219 220 224 225 226 227 229 230 231
 232 233 234 236 237 239 240 241 242 243 244 246 247 248 249 250 251 252
 253 255 256 258 259 260 262 263 264 265 266 268 269 270 273 275 276 277
 278 279 280 281 283 284 285 286 287 288 290 291 292 293 294 295 296 297
 299 300 302 303 304 306 307 308 309 311 314 315 316 317 318 319 320 321
 322 323 324 325 329 330 331 332 333 334 335 336 337 338 339 341 343 344
 345 346 34

In [147]:
# Positions of the test compounds that fall outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# 12. Prediction only for molecules included in the AD

In [150]:
y_pred_GBR_ad=list(y_pred_GBR)

In [152]:
# Keep only the CatBoost predictions for compounds inside the AD.
# Rebuilt from `y_pred_GBR` on every run so that re-executing this cell cannot
# filter an already-filtered list with stale positions; the set makes each
# membership test O(1).
out_of_domain = set(out_Ad)
y_pred_GBR_ad = [x for i, x in enumerate(y_pred_GBR) if i not in out_of_domain]

In [154]:
len(y_pred_GBR_ad)

329

In [156]:
y_ts_ad=list(y_ts)

In [158]:
# Keep only the observed values for compounds inside the AD.
# Rebuilt from `y_ts` on every run so that re-executing this cell cannot
# filter an already-filtered list with stale positions; the set makes each
# membership test O(1).
out_of_domain = set(out_Ad)
y_ts_ad = [x for i, x in enumerate(y_ts) if i not in out_of_domain]

In [160]:
len(y_ts_ad)

329

In [162]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.49

In [164]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.66

# SVM model building and validation

In [167]:
# SVR search space: C in 10^0..10^4 and gamma in 10^-6..10^-1, one value per
# decade.
param_grid = {
    "C": [10 ** exponent for exponent in range(5)],
    "gamma": [10 ** exponent for exponent in range(-6, 0)],
}

In [169]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [171]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [173]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [174]:
# Keep the best estimator found by the grid search. The selected parameters
# are displayed by the next cell, so the bare `svm.best_params_` expression
# that used to precede this line had no effect and was removed.
best_svm = svm.best_estimator_

In [175]:
svm.best_params_

{'C': 1, 'gamma': 0.1}

In [176]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [177]:
R2_WS = round(r2_score(y_tr, y_pred_ws_svm), 2)
R2_WS

0.7

In [178]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_svm)), 2)
RMSE_WS

0.54

In [179]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [180]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.43

In [181]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.74

# 9. Prediction for test set's molecules

In [183]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [184]:
y_pred_svm = best_svm.predict(x_ts)

In [185]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.38

In [186]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.78

# Save the model to disk

In [188]:
# Persist the tuned SVR model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('models/Padels/Toxicity_SVM_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_svm, model_file)

# Load the model from disk

In [98]:
# Restore the tuned SVR model; the context manager closes the file handle
# once loading is done.
# SECURITY: unpickling can execute arbitrary code - only load model files
# produced by this notebook or another trusted source.
with open('models/Padels/Toxicity_SVM_AtomPairs2D.pkl', 'rb') as model_file:
    best_svm = pickle.load(model_file)

# 11. Estimating the applicability domain. Method: Euclidean distances, K=1

In [190]:
# Pairwise distances within the work set, each column sorted in ascending
# order: row 0 holds the zero self-distances, row 1 the nearest-neighbour
# distance of every compound. The inputs are the same as in the CatBoost
# section, so this recomputes the same matrix.
neighbors_k = np.sort(pairwise_distances(x_tr, n_jobs=-1), axis=0)

In [191]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.414214,1.414214,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,2.236068,1.000000,...,0.000000,1.000000,4.000000,0.000000,2.000000,0.000000,0.000000,0.000000,1.732051,1.732051
2,1.414214,1.414214,1.000000,1.414214,1.414214,1.000000,1.000000,1.000000,2.449490,1.414214,...,0.000000,1.732051,4.242640,1.732051,2.000000,0.000000,1.732051,0.000000,3.464102,3.316625
3,1.732051,1.732051,1.414214,1.414214,1.414214,1.414214,1.000000,1.732051,2.645751,1.732051,...,1.732051,1.732051,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
4,1.732051,1.732051,1.732051,1.732051,1.414214,1.732051,1.414214,1.732051,2.828427,1.732051,...,2.236068,2.000000,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.848858,9.949874,9.746795,9.899495,9.695360,9.695360,9.643651,10.099504,9.273619,9.746795,...,8.660254,8.831760,9.949874,8.944272,8.888194,9.848858,8.944272,9.848858,10.099504,10.049875
1854,9.899495,10.000000,9.797959,9.949874,9.746795,9.695360,9.643651,10.148891,9.273619,9.746795,...,8.717798,8.944272,10.000000,9.055386,9.055386,9.949874,9.055386,9.949874,10.148891,10.099504
1855,10.198039,10.295630,10.099504,10.295630,10.099504,10.148891,10.099504,10.440307,9.643651,10.198039,...,9.055386,9.000000,10.246951,9.055386,9.219544,10.000000,9.055386,10.000000,10.246951,10.198039
1856,10.440307,10.535654,10.344080,10.535654,10.344080,10.198039,10.148891,10.677078,9.797959,10.344080,...,9.110434,9.055386,10.488089,9.110434,9.273619,10.198039,9.110434,10.198039,10.488089,10.440307


In [192]:
similarity= neighbors_k

In [193]:
Dmean=np.mean(similarity[1,:])

In [194]:
round(Dmean, 2)

1.58

In [195]:
std=np.std(similarity[1,:])

In [196]:
round(std, 2)

1.39

In [197]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.27


In [198]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [199]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
0,3.605551,1.000000,1.732051,1.414214,1.000000,1.000000,1.414214,2.645751,0.000000,0.000000,...,0.000000,2.449490,3.464102,3.162278,0.000000,0.000000,3.741657,3.741657,3.605551,3.741657
1,3.605551,2.236068,1.732051,1.414214,1.414214,1.000000,1.414214,2.828427,0.000000,2.000000,...,0.000000,2.449490,3.741657,3.162278,1.000000,0.000000,3.872983,4.123106,3.872983,3.741657
2,3.741657,2.449490,2.000000,1.732051,1.414214,1.000000,1.414214,3.000000,1.000000,2.236068,...,1.000000,2.449490,3.741657,3.316625,1.000000,0.000000,3.872983,4.123106,4.000000,3.741657
3,3.872983,2.449490,2.000000,1.732051,1.414214,1.000000,1.732051,3.000000,1.000000,2.828427,...,1.000000,2.449490,3.741657,3.316625,1.414214,0.000000,3.872983,4.242640,4.000000,4.000000
4,3.872983,2.449490,2.236068,1.732051,1.732051,1.000000,1.732051,3.162278,2.000000,2.828427,...,1.414214,2.449490,3.872983,3.464102,1.732051,0.000000,3.872983,4.242640,4.000000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.000000,9.433981,9.949874,10.000000,9.539392,9.219544,9.591663,9.486833,9.486833,9.746795,...,9.110434,8.717798,9.848858,9.055386,9.110434,9.165152,10.099504,10.049875,9.591663,9.539392
1854,9.055386,9.539392,10.000000,10.049875,9.539392,9.219544,9.643651,9.643651,9.643651,9.797959,...,9.165152,8.831760,9.949874,9.165152,9.219544,9.273619,10.246951,10.198039,9.643651,9.539392
1855,9.219544,10.000000,10.344080,10.392304,10.000000,9.486833,9.949874,9.746795,9.695360,9.797959,...,9.539392,9.000000,10.000000,9.219544,9.486833,9.433981,10.392304,10.246951,9.797959,9.797959
1856,9.219544,10.148891,10.583005,10.630146,10.148891,9.539392,10.198039,9.797959,9.848858,10.148891,...,9.695360,9.110434,10.198039,9.273619,9.539392,9.486833,10.440307,10.344080,9.899495,9.797959


In [200]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[3.606 1.    1.732 1.414 1.    1.    1.414 2.646 0.    0.    0.    2.646
 1.732 0.    3.162 3.464 2.646 1.414 4.    1.732 3.742 2.236 6.083 3.
 2.449 3.    2.236 3.162 1.    1.    1.732 2.449 1.414 2.    0.    0.
 0.    4.    2.    2.449 3.162 1.    1.732 2.236 3.464 2.236 3.162 1.
 2.    1.    1.    1.    0.    3.606 3.464 2.236 2.236 5.099 1.    2.646
 2.    0.    2.    2.236 0.    1.732 2.449 1.    3.742 2.236 2.449 0.
 3.    4.    4.796 2.828 2.    3.606 0.    1.    2.    1.    0.    1.414
 2.828 3.464 1.732 3.606 4.243 0.    1.    2.646 2.449 0.    3.606 1.
 0.    4.899 2.646 2.    3.    2.449 1.732 2.236 2.646 3.606 1.732 4.
 0.    1.    1.732 2.    3.    1.    2.    3.464 0.    0.    2.646 3.606
 2.    0.    2.    0.    1.414 0.    3.    1.732 1.414 3.162 1.732 1.732
 2.236 2.236 0.    5.477 2.    3.    2.236 1.    0.    2.449 5.    0.
 2.449 1.    2.449 1.732 0.    1.732 0.    2.236 0.    3.606 2.828 3.
 4.796 2.646 0.    4.    2.449 4.123 3.464 2.236 3.    0.    2.    0.
 0.  

In [201]:
# True where the (rounded) nearest-neighbour distance is within the AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[False  True  True  True  True  True  True False  True  True  True False
  True  True False False False  True False  True False  True False False
 False False  True False  True  True  True False  True  True  True  True
  True False  True False False  True  True  True False  True False  True
  True  True  True  True  True False False  True  True False  True False
  True  True  True  True  True  True False  True False  True False  True
 False False False False  True False  True  True  True  True  True  True
 False False  True False False  True  True False False  True False  True
  True False False  True False False  True  True False False  True False
  True  True  True  True False  True  True False  True  True False False
  True  True  True  True  True  True False  True  True False  True  True
  True  True  True False  True False  True  True  True False False  True
 False  True False  True  True  True  True  True  True False False False
 False False  True False False False False  True Fa

In [202]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.71


In [203]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  1   2   3   4   5   6   8   9  10  12  13  17  19  21  26  28  29  30
  32  33  34  35  36  38  41  42  43  45  47  48  49  50  51  52  55  56
  58  60  61  62  63  64  65  67  69  71  76  78  79  80  81  82  83  86
  89  90  93  95  96  99 102 103 106 108 109 110 111 113 114 116 117 120
 121 122 123 124 125 127 128 130 131 132 133 134 136 138 139 140 143 145
 147 148 149 150 151 152 158 163 165 166 167 168 169 173 174 177 178 180
 181 182 185 186 188 189 190 191 193 194 195 196 197 198 200 202 203 204
 205 206 207 210 211 213 214 216 217 219 220 224 225 226 227 229 230 231
 232 233 234 236 237 239 240 241 242 243 244 246 247 248 249 250 251 252
 253 255 256 258 259 260 262 263 264 265 266 268 269 270 273 275 276 277
 278 279 280 281 283 284 285 286 287 288 290 291 292 293 294 295 296 297
 299 300 302 303 304 306 307 308 309 311 314 315 316 317 318 319 320 321
 322 323 324 325 329 330 331 332 333 334 335 336 337 338 339 341 343 344
 345 346 34

In [204]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in the AD

In [206]:
y_pred_svm_ad=list(y_pred_svm)

In [207]:
# Keep only the SVR predictions for compounds inside the AD.
# Rebuilt from `y_pred_svm` on every run so that re-executing this cell cannot
# filter an already-filtered list with stale positions; the set makes each
# membership test O(1).
out_of_domain = set(out_Ad)
y_pred_svm_ad = [x for i, x in enumerate(y_pred_svm) if i not in out_of_domain]

In [208]:
len(y_pred_svm_ad)

329

In [209]:
y_ts_ad=list(y_ts)

In [210]:
# Keep only the observed values for compounds inside the AD.
# Rebuilt from `y_ts` on every run so that re-executing this cell cannot
# filter an already-filtered list with stale positions; the set makes each
# membership test O(1).
out_of_domain = set(out_Ad)
y_ts_ad = [x for i, x in enumerate(y_ts) if i not in out_of_domain]

In [211]:
len(y_ts_ad)

329

In [212]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.48

In [213]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.67

# Multi-layer Perceptron regressor

In [216]:
from sklearn.neural_network import MLPRegressor

In [217]:
# Hyper-parameter grid for the MLP regressor:
# 2 layer layouts x 2 activations x 2 solvers, with a single max_iter setting.
param_grid = {
    "hidden_layer_sizes": [(400, 300, 200, 100), (10, 10)],
    "activation": ["tanh", "relu"],
    "solver": ["sgd", "adam"],
    "max_iter": [2000],
}

In [218]:
# Exhaustive search over `param_grid` for the MLP regressor, using the `cv`
# setting defined earlier in the notebook and all available cores.
# A fixed random_state makes the weight initialisation and the stochastic
# solvers' shuffling reproducible, so the selected model and the reported
# metrics do not change between kernel restarts.
# NOTE(review): `m` is also the global that standart() assigns to; this
# rebinding replaces that value until standart() is called again.
m = GridSearchCV(MLPRegressor(random_state=42), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [219]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [220]:
best_MLPR = m.best_estimator_

In [221]:
m.best_params_

{'activation': 'relu',
 'hidden_layer_sizes': (400, 300, 200, 100),
 'max_iter': 2000,
 'solver': 'sgd'}

In [222]:
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [223]:
# R^2 of the tuned MLP regressor on the training data (x_tr), rounded to 2 decimals.
R2_WS = round(r2_score(y_true=y_tr, y_pred=y_pred_ws_MLPR), 2)
R2_WS

0.9

In [224]:
# RMSE of the tuned MLP regressor on the training data (x_tr), rounded to 2 decimals.
RMSE_WS = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_ws_MLPR)), 2)
RMSE_WS

0.3

In [225]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [226]:
y_pred_CV_MLPR

array([1.6828514, 1.5546615, 2.0665581, ..., 6.2097273, 7.50357  ,
       7.374093 ], dtype=float32)

In [227]:
# R^2 of the cross-validated MLP predictions against the training targets.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_MLPR), 2)
Q2_CV

0.32

In [228]:
# RMSE of the cross-validated MLP predictions against the training targets.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_MLPR)), 2)
RMSE_CV

0.81

# 9. Prediction for test set's molecules

In [230]:
# Cast the test descriptors and test targets to float32 NumPy arrays.
x_ts, y_ts = (np.array(values, dtype=np.float32) for values in (x_ts, y_ts))

In [231]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [232]:
# R^2 of the MLP predictions on the full test set, rounded to 2 decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_MLPR), 2)
Q2_TS

0.3

In [233]:
# RMSE of the MLP predictions on the full test set, rounded to 2 decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_MLPR)), 2)
RMSE_TS

0.83

# save the model to disk

In [427]:
# Persist the tuned MLP regressor. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('models/Padels/Toxicity_MLPR_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_MLPR, model_file)

# load the model from disk

In [141]:
# Restore the MLP regressor from disk. The context manager closes the file
# handle promptly instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open('models/Padels/Toxicity_MLPR_AtomPairs2D.pkl', 'rb') as model_file:
    best_MLPR = pickle.load(model_file)

# 11. Estimating the applicability domain. Method: Euclidean distance, k = 1

In [237]:
# Pairwise distances between all training compounds, with every column sorted
# in ascending order (row 0 then holds each compound's zero distance to itself).
neighbors_k = np.sort(pairwise_distances(x_tr, n_jobs=-1), axis=0)

In [238]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.414214,1.414214,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,2.236068,1.000000,...,0.000000,1.000000,4.000000,0.000000,2.000000,0.000000,0.000000,0.000000,1.732051,1.732051
2,1.414214,1.414214,1.000000,1.414214,1.414214,1.000000,1.000000,1.000000,2.449490,1.414214,...,0.000000,1.732051,4.242640,1.732051,2.000000,0.000000,1.732051,0.000000,3.464102,3.316625
3,1.732051,1.732051,1.414214,1.414214,1.414214,1.414214,1.000000,1.732051,2.645751,1.732051,...,1.732051,1.732051,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
4,1.732051,1.732051,1.732051,1.732051,1.414214,1.732051,1.414214,1.732051,2.828427,1.732051,...,2.236068,2.000000,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.848858,9.949874,9.746795,9.899495,9.695360,9.695360,9.643651,10.099504,9.273619,9.746795,...,8.660254,8.831760,9.949874,8.944272,8.888194,9.848858,8.944272,9.848858,10.099504,10.049875
1854,9.899495,10.000000,9.797959,9.949874,9.746795,9.695360,9.643651,10.148891,9.273619,9.746795,...,8.717798,8.944272,10.000000,9.055386,9.055386,9.949874,9.055386,9.949874,10.148891,10.099504
1855,10.198039,10.295630,10.099504,10.295630,10.099504,10.148891,10.099504,10.440307,9.643651,10.198039,...,9.055386,9.000000,10.246951,9.055386,9.219544,10.000000,9.055386,10.000000,10.246951,10.198039
1856,10.440307,10.535654,10.344080,10.535654,10.344080,10.198039,10.148891,10.677078,9.797959,10.344080,...,9.110434,9.055386,10.488089,9.110434,9.273619,10.198039,9.110434,10.198039,10.488089,10.440307


In [239]:
similarity= neighbors_k

In [240]:
Dmean=np.mean(similarity[1,:])

In [241]:
round(Dmean, 2)

1.58

In [242]:
std=np.std(similarity[1,:])

In [243]:
round(std, 2)

1.39

In [244]:
# AD threshold = Dmean + Z * std with Z = 0.5; Dmean and std are taken from
# row 1 of the column-sorted training distance matrix.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.27


In [245]:
# Train-by-test distance matrix with every column (one per test compound)
# sorted in ascending order, so row 0 is the distance to the closest training compound.
neighbors_k_ts = np.sort(pairwise_distances(x_tr, Y=x_ts, n_jobs=-1), axis=0)

In [246]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
0,3.605551,1.000000,1.732051,1.414214,1.000000,1.000000,1.414214,2.645751,0.000000,0.000000,...,0.000000,2.449490,3.464102,3.162278,0.000000,0.000000,3.741657,3.741657,3.605551,3.741657
1,3.605551,2.236068,1.732051,1.414214,1.414214,1.000000,1.414214,2.828427,0.000000,2.000000,...,0.000000,2.449490,3.741657,3.162278,1.000000,0.000000,3.872983,4.123106,3.872983,3.741657
2,3.741657,2.449490,2.000000,1.732051,1.414214,1.000000,1.414214,3.000000,1.000000,2.236068,...,1.000000,2.449490,3.741657,3.316625,1.000000,0.000000,3.872983,4.123106,4.000000,3.741657
3,3.872983,2.449490,2.000000,1.732051,1.414214,1.000000,1.732051,3.000000,1.000000,2.828427,...,1.000000,2.449490,3.741657,3.316625,1.414214,0.000000,3.872983,4.242640,4.000000,4.000000
4,3.872983,2.449490,2.236068,1.732051,1.732051,1.000000,1.732051,3.162278,2.000000,2.828427,...,1.414214,2.449490,3.872983,3.464102,1.732051,0.000000,3.872983,4.242640,4.000000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.000000,9.433981,9.949874,10.000000,9.539392,9.219544,9.591663,9.486833,9.486833,9.746795,...,9.110434,8.717798,9.848858,9.055386,9.110434,9.165152,10.099504,10.049875,9.591663,9.539392
1854,9.055386,9.539392,10.000000,10.049875,9.539392,9.219544,9.643651,9.643651,9.643651,9.797959,...,9.165152,8.831760,9.949874,9.165152,9.219544,9.273619,10.246951,10.198039,9.643651,9.539392
1855,9.219544,10.000000,10.344080,10.392304,10.000000,9.486833,9.949874,9.746795,9.695360,9.797959,...,9.539392,9.000000,10.000000,9.219544,9.486833,9.433981,10.392304,10.246951,9.797959,9.797959
1856,9.219544,10.148891,10.583005,10.630146,10.148891,9.539392,10.198039,9.797959,9.848858,10.148891,...,9.695360,9.110434,10.198039,9.273619,9.539392,9.486833,10.440307,10.344080,9.899495,9.797959


In [247]:
# Row 0 of the column-sorted train-by-test matrix: the distance from each test
# compound to its nearest training compound, shown to 3 decimals.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[3.606 1.    1.732 1.414 1.    1.    1.414 2.646 0.    0.    0.    2.646
 1.732 0.    3.162 3.464 2.646 1.414 4.    1.732 3.742 2.236 6.083 3.
 2.449 3.    2.236 3.162 1.    1.    1.732 2.449 1.414 2.    0.    0.
 0.    4.    2.    2.449 3.162 1.    1.732 2.236 3.464 2.236 3.162 1.
 2.    1.    1.    1.    0.    3.606 3.464 2.236 2.236 5.099 1.    2.646
 2.    0.    2.    2.236 0.    1.732 2.449 1.    3.742 2.236 2.449 0.
 3.    4.    4.796 2.828 2.    3.606 0.    1.    2.    1.    0.    1.414
 2.828 3.464 1.732 3.606 4.243 0.    1.    2.646 2.449 0.    3.606 1.
 0.    4.899 2.646 2.    3.    2.449 1.732 2.236 2.646 3.606 1.732 4.
 0.    1.    1.732 2.    3.    1.    2.    3.464 0.    0.    2.646 3.606
 2.    0.    2.    0.    1.414 0.    3.    1.732 1.414 3.162 1.732 1.732
 2.236 2.236 0.    5.477 2.    3.    2.236 1.    0.    2.449 5.    0.
 2.449 1.    2.449 1.732 0.    1.732 0.    2.236 0.    3.606 2.828 3.
 4.796 2.646 0.    4.    2.449 4.123 3.464 2.236 3.    0.    2.    0.
 0.  

In [248]:
# A test compound is inside the AD when its (rounded) nearest-neighbour
# distance does not exceed the model's AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[False  True  True  True  True  True  True False  True  True  True False
  True  True False False False  True False  True False  True False False
 False False  True False  True  True  True False  True  True  True  True
  True False  True False False  True  True  True False  True False  True
  True  True  True  True  True False False  True  True False  True False
  True  True  True  True  True  True False  True False  True False  True
 False False False False  True False  True  True  True  True  True  True
 False False  True False False  True  True False False  True False  True
  True False False  True False False  True  True False False  True False
  True  True  True  True False  True  True False  True  True False False
  True  True  True  True  True  True False  True  True False  True  True
  True  True  True False  True False  True  True  True False False  True
 False  True False  True  True  True  True  True  True False False False
 False False  True False False False False  True Fa

In [249]:
# Share of test compounds flagged as inside the applicability domain (AD).
print("Coverage = ", round(np.count_nonzero(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.71


In [250]:
# Positions (0-based) of the test compounds whose AD flag is set.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  1   2   3   4   5   6   8   9  10  12  13  17  19  21  26  28  29  30
  32  33  34  35  36  38  41  42  43  45  47  48  49  50  51  52  55  56
  58  60  61  62  63  64  65  67  69  71  76  78  79  80  81  82  83  86
  89  90  93  95  96  99 102 103 106 108 109 110 111 113 114 116 117 120
 121 122 123 124 125 127 128 130 131 132 133 134 136 138 139 140 143 145
 147 148 149 150 151 152 158 163 165 166 167 168 169 173 174 177 178 180
 181 182 185 186 188 189 190 191 193 194 195 196 197 198 200 202 203 204
 205 206 207 210 211 213 214 216 217 219 220 224 225 226 227 229 230 231
 232 233 234 236 237 239 240 241 242 243 244 246 247 248 249 250 251 252
 253 255 256 258 259 260 262 263 264 265 266 268 269 270 273 275 276 277
 278 279 280 281 283 284 285 286 287 288 290 291 292 293 294 295 296 297
 299 300 302 303 304 306 307 308 309 311 314 315 316 317 318 319 320 321
 322 323 324 325 329 330 331 332 333 334 335 336 337 338 339 341 343 344
 345 346 34

In [251]:
# Positions of the test compounds that fall outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# 12. Prediction only for molecules included in the AD

In [253]:
y_pred_MLPR_ad=list(y_pred_MLPR)

In [254]:
# Keep only the MLP predictions for test compounds inside the applicability domain.
# The list is rebuilt from the full prediction vector so the cell is idempotent:
# an in-place `[:] =` filter would drop further (wrong) entries on every re-run.
# A set turns each membership test from O(len(out_Ad)) into O(1).
_outside_ad = set(out_Ad)
y_pred_MLPR_ad = [pred for idx, pred in enumerate(y_pred_MLPR) if idx not in _outside_ad]

In [255]:
len(y_pred_MLPR_ad)

329

In [256]:
y_ts_ad=list(y_ts)

In [257]:
# Keep only the observed test values for compounds inside the applicability domain.
# The list is rebuilt from the full `y_ts` vector so the cell is idempotent:
# an in-place `[:] =` filter would drop further (wrong) entries on every re-run.
# A set turns each membership test from O(len(out_Ad)) into O(1).
_outside_ad = set(out_Ad)
y_ts_ad = [obs for idx, obs in enumerate(y_ts) if idx not in _outside_ad]

In [258]:
len(y_ts_ad)

329

In [259]:
# R^2 of the MLP predictions on the AD-filtered test compounds, rounded to 2 decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad), 2)
Q2_TS

0.41

In [260]:
# RMSE of the MLP predictions on the AD-filtered test compounds, rounded to 2 decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad)), 2)
RMSE_TS

0.72

# k-nearest neighbors

In [262]:
from sklearn.neighbors import KNeighborsRegressor

In [263]:
# Candidate neighbourhood sizes for the kNN regressor: every k from 1 to 30.
k_range = [*range(1, 31)]
param_grid = {"n_neighbors": k_range}

In [264]:
# Exhaustive search over `param_grid` for the kNN regressor, using the `cv`
# setting defined earlier in the notebook and all available cores.
m = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    n_jobs=-1,
    cv=cv,
    verbose=1,
)

In [265]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [266]:
best_kNN = m.best_estimator_

In [267]:
m.best_params_

{'n_neighbors': 11}

In [268]:
y_pred_ws_kNN = best_kNN.predict(x_tr)

In [269]:
# R^2 of the tuned kNN regressor on the training data (x_tr), rounded to 2 decimals.
R2_WS = round(r2_score(y_true=y_tr, y_pred=y_pred_ws_kNN), 2)
R2_WS

0.49

In [270]:
# RMSE of the tuned kNN regressor on the training data (x_tr), rounded to 2 decimals.
RMSE_WS = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_ws_kNN)), 2)
RMSE_WS

0.7

In [271]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

In [273]:
y_pred_CV_kNN

array([2.111545 , 1.951085 , 2.0695126, ..., 5.1267195, 5.0729685,
       5.053592 ], dtype=float32)

In [276]:
# R^2 of the cross-validated kNN predictions against the training targets.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_kNN), 2)
Q2_CV

0.35

In [280]:
# RMSE of the cross-validated kNN predictions against the training targets.
RMSE_CV = round(np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_kNN)), 2)
RMSE_CV

0.79

# 9. Prediction for test set's molecules

In [282]:
# Cast the test descriptors and test targets to float32 NumPy arrays.
x_ts, y_ts = (np.array(values, dtype=np.float32) for values in (x_ts, y_ts))

In [283]:
y_pred_kNN = best_kNN.predict(x_ts)

In [284]:
# R^2 of the kNN predictions on the full test set, rounded to 2 decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_kNN), 2)
Q2_TS

0.31

In [293]:
# RMSE of the kNN predictions on the full test set, rounded to 2 decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_kNN)), 2)
RMSE_TS

0.82

# save the model to disk

In [429]:
# Persist the tuned kNN regressor. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaking the handle.
with open('models/Padels/Toxicity_kNN_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_kNN, model_file)

# load the model from disk

In [184]:
# Restore the kNN regressor from disk. The context manager closes the file
# handle promptly instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open('models/Padels/Toxicity_kNN_AtomPairs2D.pkl', 'rb') as model_file:
    best_kNN = pickle.load(model_file)

# 11. Estimating the applicability domain. Method: Euclidean distance, k = 1

In [323]:
# Pairwise distances between all training compounds, with every column sorted
# in ascending order (row 0 then holds each compound's zero distance to itself).
neighbors_k = np.sort(pairwise_distances(x_tr, n_jobs=-1), axis=0)

In [324]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.414214,1.414214,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,2.236068,1.000000,...,0.000000,1.000000,4.000000,0.000000,2.000000,0.000000,0.000000,0.000000,1.732051,1.732051
2,1.414214,1.414214,1.000000,1.414214,1.414214,1.000000,1.000000,1.000000,2.449490,1.414214,...,0.000000,1.732051,4.242640,1.732051,2.000000,0.000000,1.732051,0.000000,3.464102,3.316625
3,1.732051,1.732051,1.414214,1.414214,1.414214,1.414214,1.000000,1.732051,2.645751,1.732051,...,1.732051,1.732051,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
4,1.732051,1.732051,1.732051,1.732051,1.414214,1.732051,1.414214,1.732051,2.828427,1.732051,...,2.236068,2.000000,4.358899,2.000000,2.000000,1.414214,2.000000,1.414214,3.464102,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.848858,9.949874,9.746795,9.899495,9.695360,9.695360,9.643651,10.099504,9.273619,9.746795,...,8.660254,8.831760,9.949874,8.944272,8.888194,9.848858,8.944272,9.848858,10.099504,10.049875
1854,9.899495,10.000000,9.797959,9.949874,9.746795,9.695360,9.643651,10.148891,9.273619,9.746795,...,8.717798,8.944272,10.000000,9.055386,9.055386,9.949874,9.055386,9.949874,10.148891,10.099504
1855,10.198039,10.295630,10.099504,10.295630,10.099504,10.148891,10.099504,10.440307,9.643651,10.198039,...,9.055386,9.000000,10.246951,9.055386,9.219544,10.000000,9.055386,10.000000,10.246951,10.198039
1856,10.440307,10.535654,10.344080,10.535654,10.344080,10.198039,10.148891,10.677078,9.797959,10.344080,...,9.110434,9.055386,10.488089,9.110434,9.273619,10.198039,9.110434,10.198039,10.488089,10.440307


In [325]:
similarity= neighbors_k

In [326]:
Dmean=np.mean(similarity[1,:])

In [327]:
round(Dmean, 2)

1.58

In [328]:
std=np.std(similarity[1,:])

In [329]:
round(std, 2)

1.39

In [330]:
# AD threshold = Dmean + Z * std with Z = 0.5; Dmean and std are taken from
# row 1 of the column-sorted training distance matrix.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.27


In [331]:
# Train-by-test distance matrix with every column (one per test compound)
# sorted in ascending order, so row 0 is the distance to the closest training compound.
neighbors_k_ts = np.sort(pairwise_distances(x_tr, Y=x_ts, n_jobs=-1), axis=0)

In [332]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
0,3.605551,1.000000,1.732051,1.414214,1.000000,1.000000,1.414214,2.645751,0.000000,0.000000,...,0.000000,2.449490,3.464102,3.162278,0.000000,0.000000,3.741657,3.741657,3.605551,3.741657
1,3.605551,2.236068,1.732051,1.414214,1.414214,1.000000,1.414214,2.828427,0.000000,2.000000,...,0.000000,2.449490,3.741657,3.162278,1.000000,0.000000,3.872983,4.123106,3.872983,3.741657
2,3.741657,2.449490,2.000000,1.732051,1.414214,1.000000,1.414214,3.000000,1.000000,2.236068,...,1.000000,2.449490,3.741657,3.316625,1.000000,0.000000,3.872983,4.123106,4.000000,3.741657
3,3.872983,2.449490,2.000000,1.732051,1.414214,1.000000,1.732051,3.000000,1.000000,2.828427,...,1.000000,2.449490,3.741657,3.316625,1.414214,0.000000,3.872983,4.242640,4.000000,4.000000
4,3.872983,2.449490,2.236068,1.732051,1.732051,1.000000,1.732051,3.162278,2.000000,2.828427,...,1.414214,2.449490,3.872983,3.464102,1.732051,0.000000,3.872983,4.242640,4.000000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,9.000000,9.433981,9.949874,10.000000,9.539392,9.219544,9.591663,9.486833,9.486833,9.746795,...,9.110434,8.717798,9.848858,9.055386,9.110434,9.165152,10.099504,10.049875,9.591663,9.539392
1854,9.055386,9.539392,10.000000,10.049875,9.539392,9.219544,9.643651,9.643651,9.643651,9.797959,...,9.165152,8.831760,9.949874,9.165152,9.219544,9.273619,10.246951,10.198039,9.643651,9.539392
1855,9.219544,10.000000,10.344080,10.392304,10.000000,9.486833,9.949874,9.746795,9.695360,9.797959,...,9.539392,9.000000,10.000000,9.219544,9.486833,9.433981,10.392304,10.246951,9.797959,9.797959
1856,9.219544,10.148891,10.583005,10.630146,10.148891,9.539392,10.198039,9.797959,9.848858,10.148891,...,9.695360,9.110434,10.198039,9.273619,9.539392,9.486833,10.440307,10.344080,9.899495,9.797959


In [333]:
# Row 0 of the column-sorted train-by-test matrix: the distance from each test
# compound to its nearest training compound, shown to 3 decimals.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[3.606 1.    1.732 1.414 1.    1.    1.414 2.646 0.    0.    0.    2.646
 1.732 0.    3.162 3.464 2.646 1.414 4.    1.732 3.742 2.236 6.083 3.
 2.449 3.    2.236 3.162 1.    1.    1.732 2.449 1.414 2.    0.    0.
 0.    4.    2.    2.449 3.162 1.    1.732 2.236 3.464 2.236 3.162 1.
 2.    1.    1.    1.    0.    3.606 3.464 2.236 2.236 5.099 1.    2.646
 2.    0.    2.    2.236 0.    1.732 2.449 1.    3.742 2.236 2.449 0.
 3.    4.    4.796 2.828 2.    3.606 0.    1.    2.    1.    0.    1.414
 2.828 3.464 1.732 3.606 4.243 0.    1.    2.646 2.449 0.    3.606 1.
 0.    4.899 2.646 2.    3.    2.449 1.732 2.236 2.646 3.606 1.732 4.
 0.    1.    1.732 2.    3.    1.    2.    3.464 0.    0.    2.646 3.606
 2.    0.    2.    0.    1.414 0.    3.    1.732 1.414 3.162 1.732 1.732
 2.236 2.236 0.    5.477 2.    3.    2.236 1.    0.    2.449 5.    0.
 2.449 1.    2.449 1.732 0.    1.732 0.    2.236 0.    3.606 2.828 3.
 4.796 2.646 0.    4.    2.449 4.123 3.464 2.236 3.    0.    2.    0.
 0.  

In [334]:
# A test compound is inside the AD when its (rounded) nearest-neighbour
# distance does not exceed the model's AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[False  True  True  True  True  True  True False  True  True  True False
  True  True False False False  True False  True False  True False False
 False False  True False  True  True  True False  True  True  True  True
  True False  True False False  True  True  True False  True False  True
  True  True  True  True  True False False  True  True False  True False
  True  True  True  True  True  True False  True False  True False  True
 False False False False  True False  True  True  True  True  True  True
 False False  True False False  True  True False False  True False  True
  True False False  True False False  True  True False False  True False
  True  True  True  True False  True  True False  True  True False False
  True  True  True  True  True  True False  True  True False  True  True
  True  True  True False  True False  True  True  True False False  True
 False  True False  True  True  True  True  True  True False False False
 False False  True False False False False  True Fa

In [335]:
# Share of test compounds flagged as inside the applicability domain (AD).
print("Coverage = ", round(np.count_nonzero(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.71


In [336]:
# Positions (0-based) of the test compounds whose AD flag is set.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [  1   2   3   4   5   6   8   9  10  12  13  17  19  21  26  28  29  30
  32  33  34  35  36  38  41  42  43  45  47  48  49  50  51  52  55  56
  58  60  61  62  63  64  65  67  69  71  76  78  79  80  81  82  83  86
  89  90  93  95  96  99 102 103 106 108 109 110 111 113 114 116 117 120
 121 122 123 124 125 127 128 130 131 132 133 134 136 138 139 140 143 145
 147 148 149 150 151 152 158 163 165 166 167 168 169 173 174 177 178 180
 181 182 185 186 188 189 190 191 193 194 195 196 197 198 200 202 203 204
 205 206 207 210 211 213 214 216 217 219 220 224 225 226 227 229 230 231
 232 233 234 236 237 239 240 241 242 243 244 246 247 248 249 250 251 252
 253 255 256 258 259 260 262 263 264 265 266 268 269 270 273 275 276 277
 278 279 280 281 283 284 285 286 287 288 290 291 292 293 294 295 296 297
 299 300 302 303 304 306 307 308 309 311 314 315 316 317 318 319 320 321
 322 323 324 325 329 330 331 332 333 334 335 336 337 338 339 341 343 344
 345 346 34

In [337]:
# Positions of the test compounds that fall outside the applicability domain.
out_Ad = list(np.flatnonzero(cpd_AD == 0))

# 12. Prediction only for molecules included in the AD

In [339]:
y_pred_kNN_ad=list(y_pred_kNN)

In [340]:
# Keep only the kNN predictions for test compounds inside the applicability domain.
# The list is rebuilt from the full prediction vector so the cell is idempotent:
# an in-place `[:] =` filter would drop further (wrong) entries on every re-run.
# A set turns each membership test from O(len(out_Ad)) into O(1).
_outside_ad = set(out_Ad)
y_pred_kNN_ad = [pred for idx, pred in enumerate(y_pred_kNN) if idx not in _outside_ad]

In [341]:
len(y_pred_kNN_ad)

329

In [342]:
y_ts_ad=list(y_ts)

In [343]:
# Keep only the observed test values for compounds inside the applicability domain.
# The list is rebuilt from the full `y_ts` vector so the cell is idempotent:
# an in-place `[:] =` filter would drop further (wrong) entries on every re-run.
# A set turns each membership test from O(len(out_Ad)) into O(1).
_outside_ad = set(out_Ad)
y_ts_ad = [obs for idx, obs in enumerate(y_ts) if idx not in _outside_ad]

In [344]:
len(y_ts_ad)

329

In [345]:
# R^2 of the kNN predictions on the AD-filtered test compounds, rounded to 2 decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_kNN_ad), 2)
Q2_TS

0.44

In [346]:
# RMSE of the kNN predictions on the AD-filtered test compounds, rounded to 2 decimals.
RMSE_TS = round(np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_kNN_ad)), 2)
RMSE_TS

0.7