In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm

In [3]:
def convert_smi_to_canon_smi(smi):
    """Return the non-isomeric canonical SMILES for ``smi``.

    Parameters
    ----------
    smi : str
        Input SMILES string.

    Returns
    -------
    str
        SMILES written by RDKit with ``isomericSmiles=False``, or the sentinel
        string ``'wrong_smiles'`` when the input cannot be converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit reports an unparsable SMILES by returning None, not by raising.
            return 'wrong_smiles'
        return Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception:
        # Only ordinary errors (e.g. a non-string input) map to the sentinel;
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return 'wrong_smiles'

In [4]:
def standart(smi):
    """Standardize a SMILES string with MolVS and return it as an RDKit molecule.

    Parameters
    ----------
    smi : str
        SMILES string, or the sentinel ``'wrong_smiles'``.

    Returns
    -------
    rdkit.Chem.Mol or str
        The molecule parsed from the *standardized* SMILES. Marker strings are
        returned instead of a molecule when no structure is available:
        ``'check the smiles'`` if ``smi`` is the ``'wrong_smiles'`` sentinel,
        ``'error kekule'`` if standardization or parsing fails.
    """
    if smi == 'wrong_smiles':
        return 'check the smiles'
    try:
        # Parse the standardized form; previously it was computed and then
        # discarded in favour of the raw input string.
        mol = Chem.MolFromSmiles(standardize_smiles(smi))
    except Exception:
        # Return an explicit marker. The earlier implementation fell through
        # and returned a module-level variable that still held the molecule
        # of a previous call (or was undefined on the first call).
        return 'error kekule'
    if mol is None:
        # MolFromSmiles signals a parse failure with None rather than an exception.
        return 'error kekule'
    return mol

# Load and curate the work set

In [6]:
df_ws=pd.read_csv('datasets/rat_oral_LD50_WS.csv')
df_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_oral_LD50
0,TOX-8327,12283,"6-methyl-1H-pyrimidine-2,4-dione",Cc1cc(=O)[nH]c(=O)[nH]1,Cc1cc(=O)[nH]c(=O)[nH]1,SHVCSCWHWMSGTE-UHFFFAOYSA-N,0.291207
1,TOX-27223,34042,"4-(1,3-benzodioxol-5-yl)butan-2-ylhydrazine",CC(CCc1ccc2c(c1)OCO2)NN,CC(CCc1ccc2c(c1)OCO2)NN,IBWPUTAKVGZXRB-UHFFFAOYSA-N,0.440661
2,TOX-5723,8058,hexane,CCCCCC,CCCCCC,VLKZOEOYAKHREP-UHFFFAOYSA-N,0.537456
3,TOX-72438,94201,"bis(2-ethylhexyl) cyclohexene-1,2-dicarboxylate",CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,ZVPBHZIVOWGPMT-UHFFFAOYSA-N,0.539248
4,TOX-4318,1030,"propane-1,2-diol",CC(O)CO,CC(O)CO,DNIAPMSPPWPWGF-UHFFFAOYSA-N,0.580326
...,...,...,...,...,...,...,...
7869,TOX-34548,39729,"2,3,7,8-tetrabromodibenzo-p-dioxin",Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,JZLQUWSWOJPCAK-UHFFFAOYSA-N,6.698777
7870,TOX-2774,3890,"propan-2-yl 7-[3,5-dihydroxy-2-(3-hydroxy-5-ph...",CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,GGXICVAJURFBLW-UHFFFAOYSA-N,6.937118
7871,TOX-57286,3323,"5-[2-[7a-methyl-1-[7,7,7-trifluoro-6-hydroxy-6...",C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,XPYGGHVSFMUHLH-UHFFFAOYSA-N,7.099681
7872,TOX-6661,9426,"2-[(6,7-dimethoxy-1,2,3,4-tetrahydroisoquinoli...",CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(...,CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(...,AUVVAXYIELKVAI-UHFFFAOYSA-N,7.602647


## Load and curate the test set

In [8]:
df_ts=pd.read_csv('datasets/rat_oral_LD50_TS.csv')
df_ts

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_oral_LD50
0,TOX-17137,297,methane,C,C,VNWKTOKETHGBQD-UHFFFAOYSA-N,0.017765
1,TOX-107853,17827,2-(cyclohexylamino)ethanol,OCCNC1CCCCC1,OCCNC1CCCCC1,MGUMZJAQENFQKN-UHFFFAOYSA-N,0.572835
2,TOX-44620,974,oxamic acid,NC(=O)C(=O)O,NC(=O)C(=O)O,SOWBFZRMHSNYGE-UHFFFAOYSA-N,0.624488
3,TOX-11368,62433,pentyl pentanoate,CCCCCOC(=O)CCCC,CCCCCOC(=O)CCCC,FGPPDYNPZTUNIU-UHFFFAOYSA-N,0.686956
4,TOX-7935,11429,"butane-1,2-diol",CCC(O)CO,CCC(O)CO,BMRWNKZVCUKKSR-UHFFFAOYSA-N,0.750711
...,...,...,...,...,...,...,...
1964,TOX-31209,123453,"N-(2,8,9-trioxa-5-aza-1-silabicyclo[3.3.3]unde...",c1ccc(NC[Si]23OCCN(CCO2)CCO3)cc1,c1ccc(NC[Si]23OCCN(CCO2)CCO3)cc1,YGHSPISFNKUQEY-UHFFFAOYSA-N,5.931904
1965,TOX-63960,91771,"3-[3-[4-(4-bromophenyl)phenyl]-1,2,3,4-tetrahy...",O=c1sc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,O=c1sc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,VSVAQRUUFVBBFS-UHFFFAOYSA-N,5.991624
1966,TOX-33005,38439,"1,2,3,7,8-pentachlorodibenzo-p-dioxin",Clc1cc2c(cc1Cl)Oc1c(cc(Cl)c(Cl)c1Cl)O2,Clc1cc2c(cc1Cl)Oc1c(cc(Cl)c(Cl)c1Cl)O2,FSPZPQQWDODWAU-UHFFFAOYSA-N,6.238094
1967,TOX-24685,198007,3-ethyl-N-[methoxy(methylsulfanyl)phosphoryl]-...,CCN1CC(C)OC1=NP(=O)(OC)SC,CCN1CC(C)OC1=NP(=O)(OC)SC,JCYANUVOCLKHHY-UHFFFAOYSA-N,6.401876


## Standardization of the work set

In [10]:
# Run every canonical SMILES of the work set through `standart` and store the
# result next to its source row.
df_ws["Molecule"] = df_ws["Canonical_SMILES"].apply(standart)
print('Kept data: ', len(df_ws), 'molecules')

[15:30:38] Unusual charge on atom 8 number of radical electrons set to zero
[15:30:38] Unusual charge on atom 0 number of radical electrons set to zero
[15:30:38] Unusual charge on atom 16 number of radical electrons set to zero
[15:30:38] Unusual charge on atom 16 number of radical electrons set to zero


Kept data:  7874 molecules


In [11]:
df_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_oral_LD50,Molecule
0,TOX-8327,12283,"6-methyl-1H-pyrimidine-2,4-dione",Cc1cc(=O)[nH]c(=O)[nH]1,Cc1cc(=O)[nH]c(=O)[nH]1,SHVCSCWHWMSGTE-UHFFFAOYSA-N,0.291207,<rdkit.Chem.rdchem.Mol object at 0x00000224A7D...
1,TOX-27223,34042,"4-(1,3-benzodioxol-5-yl)butan-2-ylhydrazine",CC(CCc1ccc2c(c1)OCO2)NN,CC(CCc1ccc2c(c1)OCO2)NN,IBWPUTAKVGZXRB-UHFFFAOYSA-N,0.440661,<rdkit.Chem.rdchem.Mol object at 0x00000224A7D...
2,TOX-5723,8058,hexane,CCCCCC,CCCCCC,VLKZOEOYAKHREP-UHFFFAOYSA-N,0.537456,<rdkit.Chem.rdchem.Mol object at 0x00000224A7D...
3,TOX-72438,94201,"bis(2-ethylhexyl) cyclohexene-1,2-dicarboxylate",CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,ZVPBHZIVOWGPMT-UHFFFAOYSA-N,0.539248,<rdkit.Chem.rdchem.Mol object at 0x00000224A7D...
4,TOX-4318,1030,"propane-1,2-diol",CC(O)CO,CC(O)CO,DNIAPMSPPWPWGF-UHFFFAOYSA-N,0.580326,<rdkit.Chem.rdchem.Mol object at 0x00000224A6F...
...,...,...,...,...,...,...,...,...
7869,TOX-34548,39729,"2,3,7,8-tetrabromodibenzo-p-dioxin",Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,JZLQUWSWOJPCAK-UHFFFAOYSA-N,6.698777,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
7870,TOX-2774,3890,"propan-2-yl 7-[3,5-dihydroxy-2-(3-hydroxy-5-ph...",CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,GGXICVAJURFBLW-UHFFFAOYSA-N,6.937118,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
7871,TOX-57286,3323,"5-[2-[7a-methyl-1-[7,7,7-trifluoro-6-hydroxy-6...",C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,XPYGGHVSFMUHLH-UHFFFAOYSA-N,7.099681,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
7872,TOX-6661,9426,"2-[(6,7-dimethoxy-1,2,3,4-tetrahydroisoquinoli...",CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(...,CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(...,AUVVAXYIELKVAI-UHFFFAOYSA-N,7.602647,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...


In [12]:
y_tr=df_ws.rat_oral_LD50	
y_tr

0       0.291207
1       0.440661
2       0.537456
3       0.539248
4       0.580326
          ...   
7869    6.698777
7870    6.937118
7871    7.099681
7872    7.602647
7873    9.541127
Name: rat_oral_LD50, Length: 7874, dtype: float64

In [13]:
df = pd.DataFrame(df_ws, columns=["SMILES"])
df

Unnamed: 0,SMILES
0,Cc1cc(=O)[nH]c(=O)[nH]1
1,CC(CCc1ccc2c(c1)OCO2)NN
2,CCCCCC
3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1
4,CC(O)CO
...,...
7869,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2
7870,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1
7871,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...
7872,CCC1CN2CCc3cc(OC)c(OC)cc3C2CC1CC1NCCc2cc(OC)c(...


In [14]:
df.to_csv('datasets/molecule_ws.smi', sep=',', index=False, header=False)

In [15]:
moldf_ws=df_ws.Molecule

# Standardization of the test set

In [17]:
df_ts["Molecule"] = df_ts.apply(lambda x: standart(x.Canonical_SMILES), axis=1)
# `standart` puts a marker string in place of a molecule when it has no
# structure to return, so a row is valid only if it really holds an RDKit Mol.
# The check on the SMILES column is kept for inputs that were pre-processed
# with `convert_smi_to_canon_smi`; on its own it never removed a failed row.
is_valid = (df_ts['SMILES'] != 'wrong_smiles') & df_ts['Molecule'].map(
    lambda mol: isinstance(mol, Chem.Mol)
)
moldf_ts = df_ts[is_valid]
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  1969 molecules


In [18]:
moldf_ts

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical_SMILES,InChIKey,rat_oral_LD50,Molecule
0,TOX-17137,297,methane,C,C,VNWKTOKETHGBQD-UHFFFAOYSA-N,0.017765,<rdkit.Chem.rdchem.Mol object at 0x00000224A7D...
1,TOX-107853,17827,2-(cyclohexylamino)ethanol,OCCNC1CCCCC1,OCCNC1CCCCC1,MGUMZJAQENFQKN-UHFFFAOYSA-N,0.572835,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
2,TOX-44620,974,oxamic acid,NC(=O)C(=O)O,NC(=O)C(=O)O,SOWBFZRMHSNYGE-UHFFFAOYSA-N,0.624488,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
3,TOX-11368,62433,pentyl pentanoate,CCCCCOC(=O)CCCC,CCCCCOC(=O)CCCC,FGPPDYNPZTUNIU-UHFFFAOYSA-N,0.686956,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
4,TOX-7935,11429,"butane-1,2-diol",CCC(O)CO,CCC(O)CO,BMRWNKZVCUKKSR-UHFFFAOYSA-N,0.750711,<rdkit.Chem.rdchem.Mol object at 0x00000224A6E...
...,...,...,...,...,...,...,...,...
1964,TOX-31209,123453,"N-(2,8,9-trioxa-5-aza-1-silabicyclo[3.3.3]unde...",c1ccc(NC[Si]23OCCN(CCO2)CCO3)cc1,c1ccc(NC[Si]23OCCN(CCO2)CCO3)cc1,YGHSPISFNKUQEY-UHFFFAOYSA-N,5.931904,<rdkit.Chem.rdchem.Mol object at 0x00000224A6F...
1965,TOX-63960,91771,"3-[3-[4-(4-bromophenyl)phenyl]-1,2,3,4-tetrahy...",O=c1sc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,O=c1sc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,VSVAQRUUFVBBFS-UHFFFAOYSA-N,5.991624,<rdkit.Chem.rdchem.Mol object at 0x00000224A6F...
1966,TOX-33005,38439,"1,2,3,7,8-pentachlorodibenzo-p-dioxin",Clc1cc2c(cc1Cl)Oc1c(cc(Cl)c(Cl)c1Cl)O2,Clc1cc2c(cc1Cl)Oc1c(cc(Cl)c(Cl)c1Cl)O2,FSPZPQQWDODWAU-UHFFFAOYSA-N,6.238094,<rdkit.Chem.rdchem.Mol object at 0x00000224A6F...
1967,TOX-24685,198007,3-ethyl-N-[methoxy(methylsulfanyl)phosphoryl]-...,CCN1CC(C)OC1=NP(=O)(OC)SC,CCN1CC(C)OC1=NP(=O)(OC)SC,JCYANUVOCLKHHY-UHFFFAOYSA-N,6.401876,<rdkit.Chem.rdchem.Mol object at 0x00000224A6F...


In [19]:
y_ts=moldf_ts.rat_oral_LD50
y_ts

0       0.017765
1       0.572835
2       0.624488
3       0.686956
4       0.750711
          ...   
1964    5.931904
1965    5.991624
1966    6.238094
1967    6.401876
1968    7.206791
Name: rat_oral_LD50, Length: 1969, dtype: float64

In [20]:
# Keep only the SMILES column of the retained test-set rows; this frame is what
# gets exported for descriptor calculation.
# NOTE(review): this rebinds `df_ts` to a SMILES-only frame, so the cell that
# reads `Canonical_SMILES` from `df_ts` cannot be re-run without reloading the CSV.
df_ts = pd.DataFrame(moldf_ts, columns=["SMILES"])
df_ts

Unnamed: 0,SMILES
0,C
1,OCCNC1CCCCC1
2,NC(=O)C(=O)O
3,CCCCCOC(=O)CCCC
4,CCC(O)CO
...,...
1964,c1ccc(NC[Si]23OCCN(CCO2)CCO3)cc1
1965,O=c1sc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...
1966,Clc1cc2c(cc1Cl)Oc1c(cc(Cl)c(Cl)c1Cl)O2
1967,CCN1CC(C)OC1=NP(=O)(OC)SC


In [20]:
df_ts.to_csv('datasets/molecule_ts.smi', sep=',', index=False, header=False)

# Fingerprint calculation for the work set

In [151]:
import glob

# PaDEL fingerprint descriptor definitions, in alphabetical order.
xml_files = sorted(glob.glob("fingerprints_xml/*.xml"))
xml_files

['fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'fingerprints_xml\\EStateFingerprinter.xml',
 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'fingerprints_xml\\Fingerprinter.xml',
 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'fingerprints_xml\\MACCSFingerprinter.xml',
 'fingerprints_xml\\PubchemFingerprinter.xml',
 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'fingerprints_xml\\SubstructureFingerprinter.xml']

In [152]:
# Short names for the fingerprint types. They are paired with `xml_files`
# by position, so the order here must follow the sorted XML file names.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [153]:
# Map each short fingerprint name to its PaDEL descriptor XML file.
# The pairing is purely positional and `zip` silently drops unmatched entries,
# so fail loudly when the number of XML files found differs from the name list.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files, found {len(xml_files)}"
    )
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

In [154]:
from padelpy import padeldescriptor

# Fingerprint type to compute; must be one of the keys of `fp`.
fingerprint = 'AtomPairs2D'

# Output CSV is named after the fingerprint type.
fingerprint_output_file = ''.join([fingerprint,'.csv'])
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ws.smi',
                d_file=fingerprint_output_file,
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                # Descriptor rows are later matched to the target values purely
                # by position; with several worker threads PaDEL only guarantees
                # input order in the output file when asked to retain it.
                retainorder=True,
                removesalt=True,
                log=True,
                fingerprints=True)

In [37]:
descriptors_AtomPairs2D = pd.read_csv('AtomPairs2D.csv')

In [39]:
descriptors_AtomPairs2D

Unnamed: 0,Name,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780
0,AUTOGEN_molecule_ws_1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_ws_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_ws_3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_ws_4,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_ws_5,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,AUTOGEN_molecule_ws_7870,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7870,AUTOGEN_molecule_ws_7871,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7871,AUTOGEN_molecule_ws_7872,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7872,AUTOGEN_molecule_ws_7873,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
x_tr_AtomPairs2D = descriptors_AtomPairs2D.drop('Name', axis=1)

In [43]:
# Cast the fingerprint table and the target values to float32 NumPy arrays.
x_tr = np.array(x_tr_AtomPairs2D, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

In [265]:
savetxt('models/Padels/x_tr_AtomPairs2D.csv', x_tr_AtomPairs2D, delimiter=',')

## Descriptor calculation for test set

In [161]:
from padelpy import padeldescriptor

# Fingerprint type to compute; must be one of the keys of `fp`.
fingerprint = 'AtomPairs2D'

# The '_ts' suffix keeps the test-set output separate from the work-set CSV.
fingerprint_output_file = ''.join([fingerprint,'_ts.csv'])
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='datasets/molecule_ts.smi',
                d_file=fingerprint_output_file,
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                # Descriptor rows are later matched to the target values purely
                # by position; with several worker threads PaDEL only guarantees
                # input order in the output file when asked to retain it.
                retainorder=True,
                removesalt=True,
                log=False,
                fingerprints=True)

In [45]:
descriptors_AtomPairs2D_ts = pd.read_csv('AtomPairs2D_ts.csv')

In [47]:
x_ts = descriptors_AtomPairs2D_ts.drop('Name', axis=1)
x_ts

Unnamed: 0,AD2D1,AD2D2,AD2D3,AD2D4,AD2D5,AD2D6,AD2D7,AD2D8,AD2D9,AD2D10,...,AD2D771,AD2D772,AD2D773,AD2D774,AD2D775,AD2D776,AD2D777,AD2D778,AD2D779,AD2D780
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1965,1,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1966,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1967,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
x_ts.shape

(1969, 780)

# CatBoostRegressor

In [53]:
cv=KFold(n_splits=5, random_state=42, shuffle=True)

In [55]:
%%time
# Base estimator; the tuned settings come from the grid below.
model = CatBoostRegressor()
# Search space: 3 depths x 3 learning rates x 3 iteration counts = 27 candidates.
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [100,500, 1000]
              }

# Exhaustive search over `parameters`, evaluated on the `cv` splitter using all cores.
grid = GridSearchCV(estimator=model, param_grid = parameters, n_jobs=-1, cv = cv)
# verbose=False is handed on as a fit parameter — presumably to silence
# CatBoost's per-iteration log; confirm against the CatBoost fit signature.
grid.fit(x_tr, y_tr, verbose=False)

CPU times: total: 12min 18s
Wall time: 23min 18s


In [56]:
best_CatBR = grid.best_estimator_

In [57]:
grid.best_params_

{'depth': 10, 'iterations': 1000, 'learning_rate': 0.05}

In [58]:
y_pred_ws_GBR = best_CatBR.predict(x_tr)

In [59]:
R2_WS = round(r2_score(y_tr, y_pred_ws_GBR), 2)
R2_WS

0.73

In [60]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_GBR)), 2)
RMSE_WS

0.47

In [61]:
params={'verbose': False}

In [62]:
%%time
# Out-of-fold predictions for the work set with the tuned model, using the same
# `cv` splitter as the grid search. `params` holds {'verbose': False} —
# presumably forwarded to the estimator's fit; TODO confirm the installed
# scikit-learn accepts the `params` keyword.
y_pred_CV_CatBR = cross_val_predict(best_CatBR, x_tr, y_tr, cv=cv, params=params)

CPU times: total: 57min 57s
Wall time: 4min 55s


In [62]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_CatBR), 2)
Q2_CV

0.47

In [63]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_CatBR)), 2)
RMSE_CV

0.65

# save the model to disk

In [71]:
# Persist the tuned CatBoost model. The context manager closes the file handle
# even if pickling fails; the bare open() call never closed it.
# NOTE(review): other cells write under 'models/' (lower case) — the two
# spellings are different directories on a case-sensitive file system.
with open('Models/Padels/Toxicity_CatBoost_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_CatBR, model_file)

# load the model from disk

In [44]:
# Reload the AtomPairs2D CatBoost model written by the save cell above.
# The previous path pointed at a model stored under 'Models/MACCS/', which is
# not the file this notebook saves and would replace the estimator trained here.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('Models/Padels/Toxicity_CatBoost_AtomPairs2D.pkl', 'rb') as model_file:
    best_CatBR = pickle.load(model_file)

# 9. Prediction for test set's molecules

In [73]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [75]:
y_pred_GBR = best_CatBR.predict(x_ts)

In [77]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.48

In [79]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.64

# 11. Estimating applicability domain. Method - Euclidean distances, K=1

In [75]:
# Distance matrix between all pairs of work-set compounds (pairwise_distances
# uses the Euclidean metric unless told otherwise).
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
# Sort every column in place (axis 0): row 0 then holds each compound's zero
# distance to itself and row 1 the distance to its closest other compound.
neighbors_k.sort(0)

In [76]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7864,7865,7866,7867,7868,7869,7870,7871,7872,7873
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,2.236068,1.732051,2.000000,3.316625,1.732051,3.741657,0.000000,3.316625,1.732051,1.000000
2,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,2.645751,0.000000,0.000000,1.000000,...,2.449490,2.645751,2.000000,3.464102,1.732051,3.872983,0.000000,3.316625,2.236068,1.000000
3,1.414214,2.449490,0.000000,0.000000,0.000000,1.732051,2.645751,0.000000,0.000000,1.000000,...,3.000000,2.828427,2.449490,3.605551,2.000000,4.242640,0.000000,3.464102,2.236068,1.000000
4,1.732051,2.449490,0.000000,0.000000,1.000000,1.732051,2.645751,0.000000,0.000000,2.645751,...,3.162278,3.000000,2.449490,3.605551,2.000000,4.472136,0.000000,3.464102,2.236068,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.148891,9.695360,10.488089,10.000000,10.583005,10.392304,9.899495,9.949874,9.899495,10.908712,...,10.000000,10.295630,10.000000,9.797959,9.949874,10.816654,9.949874,9.899495,9.848858,10.049875
7870,10.198039,9.848858,10.630146,10.000000,10.677078,10.392304,10.535654,9.949874,10.000000,10.954452,...,10.630146,10.770329,10.049875,9.848858,10.000000,10.862781,10.000000,9.949874,9.848858,10.049875
7871,10.246951,9.899495,10.677078,10.049875,10.723805,10.440307,10.583005,10.000000,10.049875,10.954452,...,10.677078,10.816654,10.049875,9.899495,10.049875,10.862781,10.148891,10.049875,9.899495,10.295630
7872,10.862781,10.344080,10.862781,10.630146,11.000000,10.816654,10.630146,10.583005,10.535654,11.135529,...,10.770329,10.908712,10.630146,9.899495,10.770329,10.862781,10.770329,10.049875,10.630146,10.908712


In [77]:
similarity= neighbors_k

In [78]:
Dmean=np.mean(similarity[1,:])

In [79]:
round(Dmean, 2)

1.34

In [80]:
std=np.std(similarity[1,:])

In [81]:
round(std, 2)

1.34

In [82]:
# Applicability-domain threshold: mean nearest-neighbour distance of the work
# set plus half a standard deviation of those distances.
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.02


In [83]:
# Distances from every work-set compound (rows) to every test compound (columns).
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
# Sort each column in place so that row 0 holds the smallest distance per test compound.
neighbors_k_ts.sort(0)

In [84]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
0,0.000000,0.000000,1.414214,0.000000,0.000000,0.000000,0.000000,2.236068,1.000000,0.000000,...,0.000000,1.732051,2.000000,1.000000,0.000000,2.449490,3.605551,1.732051,2.645751,3.316625
1,1.000000,0.000000,1.732051,0.000000,0.000000,0.000000,1.000000,2.449490,1.414214,0.000000,...,0.000000,1.732051,2.000000,2.000000,2.000000,3.605551,3.872983,2.449490,3.000000,3.741657
2,1.000000,0.000000,1.732051,0.000000,1.000000,0.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,1.732051,2.000000,2.000000,2.449490,3.605551,3.872983,2.645751,3.162278,3.872983
3,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,2.449490,2.449490,3.605551,3.872983,3.605551,3.162278,3.872983
4,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.732051,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,3.000000,2.449490,3.605551,4.000000,3.872983,3.464102,3.872983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.723805,10.049875,10.630146,10.000000,10.488089,10.099504,10.535654,10.049875,10.392304,10.535654,...,10.295630,10.049875,9.746795,10.198039,10.295630,10.246951,10.198039,10.488089,10.198039,10.198039
7870,10.862781,10.148891,10.723805,10.099504,10.583005,10.099504,10.535654,10.630146,10.440307,10.677078,...,10.677078,10.099504,9.746795,10.295630,10.295630,10.246951,10.440307,10.583005,10.630146,10.198039
7871,10.908712,10.198039,10.770329,10.148891,10.630146,10.148891,10.583005,10.770329,10.488089,10.723805,...,10.723805,10.148891,9.797959,10.535654,10.392304,10.392304,10.488089,10.816654,10.677078,10.440307
7872,11.090536,10.583005,11.045361,10.630146,10.908712,10.723805,11.045361,10.816654,10.908712,10.908712,...,11.000000,10.816654,10.488089,10.862781,10.908712,10.862781,10.535654,10.954452,10.862781,10.862781


In [85]:
similarity_ts= neighbors_k_ts
# First row of the column-sorted matrix: distance of each test compound to its
# nearest work-set compound.
cpd_AD=similarity_ts[0,:]
# The rounded distances are what gets compared against the AD threshold later.
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[0.    0.    1.414 ... 1.732 2.646 3.317]


In [86]:
# Boolean mask over the test compounds: True when the nearest-neighbour
# distance does not exceed the applicability-domain threshold.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True ...  True False False]


In [87]:
# Coverage = fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(np.mean(cpd_AD), 2))

Coverage =  0.73


In [88]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 1962 1963 1966]


In [89]:
# Positions of the test compounds that fall outside the applicability domain.
out_Ad = list(np.flatnonzero(~cpd_AD))

# 12. Prediction only for molecules included in the AD

In [91]:
y_pred_GBR_ad=list(y_pred_GBR)

In [92]:
y_pred_GBR_ad[:] = [x for i,x in enumerate(y_pred_GBR_ad) if i not in out_Ad]

In [93]:
len(y_pred_GBR_ad)

1442

In [94]:
y_ts_ad=list(y_ts)

In [95]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [96]:
len(y_ts_ad)

1442

In [97]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.5

In [98]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.62

# SVM model building and validation

In [139]:
def _powers_of_ten(exponents):
    """Return ``10 ** e`` for every exponent ``e`` in ``exponents``."""
    return [10 ** e for e in exponents]


# SVR search space: C covers 10^0 .. 10^4, gamma covers 10^-6 .. 10^-1.
param_grid = {
    "C": _powers_of_ten(range(0, 5)),
    "gamma": _powers_of_ten(range(-6, 0)),
}

In [141]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [143]:
# Grid search for the SVR. The C given here is only the constructor value:
# each candidate from `param_grid` sets its own C and gamma, while epsilon
# stays at 0.2 for every fit.
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [145]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [146]:
svm.best_params_
best_svm = svm.best_estimator_

In [147]:
svm.best_params_

{'C': 1, 'gamma': 0.1}

In [148]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [149]:
R2_WS = round(r2_score(y_tr, y_pred_ws_svm), 2)
R2_WS

0.69

In [150]:
RMSE_WS=round(np.sqrt(mean_squared_error(y_tr, y_pred_ws_svm)), 2)
RMSE_WS

0.5

In [151]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [152]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.44

In [153]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.67

# 9. Prediction for test set's molecules

In [155]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [156]:
y_pred_svm = best_svm.predict(x_ts)

In [157]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.44

In [158]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.67

# save the model to disk

In [160]:
# Persist the tuned SVR; the context manager guarantees the file handle is
# closed, which the bare open() call did not.
with open('models/Padels/Toxicity_SVM_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_svm, model_file)

# load the model from disk

In [98]:
# Reload the tuned SVR; the context manager closes the file handle after reading.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('models/Padels/Toxicity_SVM_AtomPairs2D.pkl', 'rb') as model_file:
    best_svm = pickle.load(model_file)

# 11. Estimating applicability domain. Method - Euclidean distances, K=1

In [162]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [163]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7864,7865,7866,7867,7868,7869,7870,7871,7872,7873
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,2.236068,1.732051,2.000000,3.316625,1.732051,3.741657,0.000000,3.316625,1.732051,1.000000
2,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,2.645751,0.000000,0.000000,1.000000,...,2.449490,2.645751,2.000000,3.464102,1.732051,3.872983,0.000000,3.316625,2.236068,1.000000
3,1.414214,2.449490,0.000000,0.000000,0.000000,1.732051,2.645751,0.000000,0.000000,1.000000,...,3.000000,2.828427,2.449490,3.605551,2.000000,4.242640,0.000000,3.464102,2.236068,1.000000
4,1.732051,2.449490,0.000000,0.000000,1.000000,1.732051,2.645751,0.000000,0.000000,2.645751,...,3.162278,3.000000,2.449490,3.605551,2.000000,4.472136,0.000000,3.464102,2.236068,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.148891,9.695360,10.488089,10.000000,10.583005,10.392304,9.899495,9.949874,9.899495,10.908712,...,10.000000,10.295630,10.000000,9.797959,9.949874,10.816654,9.949874,9.899495,9.848858,10.049875
7870,10.198039,9.848858,10.630146,10.000000,10.677078,10.392304,10.535654,9.949874,10.000000,10.954452,...,10.630146,10.770329,10.049875,9.848858,10.000000,10.862781,10.000000,9.949874,9.848858,10.049875
7871,10.246951,9.899495,10.677078,10.049875,10.723805,10.440307,10.583005,10.000000,10.049875,10.954452,...,10.677078,10.816654,10.049875,9.899495,10.049875,10.862781,10.148891,10.049875,9.899495,10.295630
7872,10.862781,10.344080,10.862781,10.630146,11.000000,10.816654,10.630146,10.583005,10.535654,11.135529,...,10.770329,10.908712,10.630146,9.899495,10.770329,10.862781,10.770329,10.049875,10.630146,10.908712


In [164]:
similarity= neighbors_k

In [165]:
Dmean=np.mean(similarity[1,:])

In [166]:
round(Dmean, 2)

1.34

In [167]:
std=np.std(similarity[1,:])

In [168]:
round(std, 2)

1.34

In [169]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.02


In [170]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [171]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
0,0.000000,0.000000,1.414214,0.000000,0.000000,0.000000,0.000000,2.236068,1.000000,0.000000,...,0.000000,1.732051,2.000000,1.000000,0.000000,2.449490,3.605551,1.732051,2.645751,3.316625
1,1.000000,0.000000,1.732051,0.000000,0.000000,0.000000,1.000000,2.449490,1.414214,0.000000,...,0.000000,1.732051,2.000000,2.000000,2.000000,3.605551,3.872983,2.449490,3.000000,3.741657
2,1.000000,0.000000,1.732051,0.000000,1.000000,0.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,1.732051,2.000000,2.000000,2.449490,3.605551,3.872983,2.645751,3.162278,3.872983
3,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,2.449490,2.449490,3.605551,3.872983,3.605551,3.162278,3.872983
4,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.732051,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,3.000000,2.449490,3.605551,4.000000,3.872983,3.464102,3.872983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.723805,10.049875,10.630146,10.000000,10.488089,10.099504,10.535654,10.049875,10.392304,10.535654,...,10.295630,10.049875,9.746795,10.198039,10.295630,10.246951,10.198039,10.488089,10.198039,10.198039
7870,10.862781,10.148891,10.723805,10.099504,10.583005,10.099504,10.535654,10.630146,10.440307,10.677078,...,10.677078,10.099504,9.746795,10.295630,10.295630,10.246951,10.440307,10.583005,10.630146,10.198039
7871,10.908712,10.198039,10.770329,10.148891,10.630146,10.148891,10.583005,10.770329,10.488089,10.723805,...,10.723805,10.148891,9.797959,10.535654,10.392304,10.392304,10.488089,10.816654,10.677078,10.440307
7872,11.090536,10.583005,11.045361,10.630146,10.908712,10.723805,11.045361,10.816654,10.908712,10.908712,...,11.000000,10.816654,10.488089,10.862781,10.908712,10.862781,10.535654,10.954452,10.862781,10.862781


In [172]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[0.    0.    1.414 ... 1.732 2.646 3.317]


In [173]:
# Boolean mask over the test compounds: True when the nearest-neighbour
# distance does not exceed the applicability-domain threshold.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True ...  True False False]


In [174]:
# Coverage = fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(np.mean(cpd_AD), 2))

Coverage =  0.73


In [175]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    2 ... 1962 1963 1966]


In [176]:
# Positions of the test compounds that fall outside the applicability domain.
out_Ad = list(np.flatnonzero(~cpd_AD))

# 12. Prediction only for molecules included in the AD

In [178]:
y_pred_svm_ad=list(y_pred_svm)

In [179]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [180]:
len(y_pred_svm_ad)

1442

In [181]:
y_ts_ad=list(y_ts)

In [182]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [183]:
len(y_ts_ad)

1442

In [184]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.48

In [185]:
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.63

# Multi-layer Perceptron regressor

In [188]:
from sklearn.neural_network import MLPRegressor

In [189]:
# MLP search space: 2 architectures x 2 activations x 2 solvers (max_iter fixed) = 8 candidates.
param_grid = dict(
    hidden_layer_sizes=[(400, 300, 200, 100), (10, 10)],
    activation=["tanh", "relu"],
    solver=["sgd", "adam"],
    max_iter=[2000],
)

In [190]:
# Exhaustive grid search over param_grid using the splitter `cv` (defined earlier in the
# notebook, outside this section); with no `scoring` given, candidates are ranked by the
# estimator's own score method.
# NOTE(review): `m` is also the global that standart() assigns to, so calling standart()
# after this cell replaces the search object.
# NOTE(review): MLPRegressor() is created without random_state, so weight initialisation
# differs between runs and the selected model may not be reproducible.
m = GridSearchCV(MLPRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [191]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [192]:
# Best-scoring candidate; GridSearchCV refits it on the whole of x_tr / y_tr (refit is left at its default).
best_MLPR = m.best_estimator_

In [193]:
m.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': (400, 300, 200, 100),
 'max_iter': 2000,
 'solver': 'sgd'}

In [194]:
# Predictions for the same data the model was fitted on (goodness of fit, not a validation estimate).
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [195]:
# R^2 of the fitted MLP on its own training data, rounded to two decimals.
R2_WS = round(r2_score(y_true=y_tr, y_pred=y_pred_ws_MLPR), ndigits=2)
R2_WS

0.66

In [196]:
# Root-mean-squared error of the fitted MLP on its own training data, rounded to two decimals.
RMSE_WS = round(
    np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_ws_MLPR)),
    ndigits=2,
)
RMSE_WS

0.52

In [197]:
# Out-of-fold predictions: the tuned configuration is re-fitted on each training split of `cv`
# and each sample is predicted by the model that did not see it.
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [198]:
y_pred_CV_MLPR

array([1.6814723, 2.8099287, 1.7698616, ..., 4.358991 , 2.59744  ,
       3.5373187], dtype=float32)

In [199]:
# Cross-validated R^2 computed from the out-of-fold MLP predictions, rounded to two decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_MLPR), ndigits=2)
Q2_CV

0.36

In [200]:
# Cross-validated RMSE computed from the out-of-fold MLP predictions, rounded to two decimals.
RMSE_CV = round(
    np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_MLPR)),
    ndigits=2,
)
RMSE_CV

0.71

# 9. Prediction for the test-set molecules

In [202]:
# Cast the test descriptors and targets to float32 ndarrays.
x_ts, y_ts = (np.array(values, dtype=np.float32) for values in (x_ts, y_ts))

In [203]:
# Predict the external test set with the tuned MLP.
y_pred_MLPR = best_MLPR.predict(x_ts)

In [204]:
# R^2 of the MLP on the full test set, rounded to two decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_MLPR), ndigits=2)
Q2_TS

0.36

In [205]:
# RMSE of the MLP on the full test set, rounded to two decimals.
RMSE_TS = round(
    np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_MLPR)),
    ndigits=2,
)
RMSE_TS

0.72

# save the model to disk

In [207]:
# Persist the tuned MLP. The context manager guarantees the file is flushed and closed
# even if pickling raises, instead of leaving the handle to the garbage collector.
with open('models/Padels/Toxicity_MLPR_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_MLPR, model_file)

# load the model from disk

In [141]:
# Reload the tuned MLP from disk; the context manager closes the file handle promptly.
# NOTE: unpickling can execute arbitrary code, so only load model files you created or trust.
with open('models/Padels/Toxicity_MLPR_AtomPairs2D.pkl', 'rb') as model_file:
    best_MLPR = pickle.load(model_file)

# 11. Estimating the applicability domain. Method: Euclidean distances, K=1

In [210]:
# Full pairwise distance matrix of the training descriptors (pairwise_distances defaults to
# the Euclidean metric).
# NOTE(review): nothing in this AD section uses the fitted model, only x_tr / x_ts, so the
# result could be computed once and shared by every model section.
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
# In-place ascending sort down each column: row 0 becomes the zero self-distance and
# row 1 the distance to the nearest other training compound.
neighbors_k.sort(0)

In [211]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7864,7865,7866,7867,7868,7869,7870,7871,7872,7873
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,2.236068,1.732051,2.000000,3.316625,1.732051,3.741657,0.000000,3.316625,1.732051,1.000000
2,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,2.645751,0.000000,0.000000,1.000000,...,2.449490,2.645751,2.000000,3.464102,1.732051,3.872983,0.000000,3.316625,2.236068,1.000000
3,1.414214,2.449490,0.000000,0.000000,0.000000,1.732051,2.645751,0.000000,0.000000,1.000000,...,3.000000,2.828427,2.449490,3.605551,2.000000,4.242640,0.000000,3.464102,2.236068,1.000000
4,1.732051,2.449490,0.000000,0.000000,1.000000,1.732051,2.645751,0.000000,0.000000,2.645751,...,3.162278,3.000000,2.449490,3.605551,2.000000,4.472136,0.000000,3.464102,2.236068,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.148891,9.695360,10.488089,10.000000,10.583005,10.392304,9.899495,9.949874,9.899495,10.908712,...,10.000000,10.295630,10.000000,9.797959,9.949874,10.816654,9.949874,9.899495,9.848858,10.049875
7870,10.198039,9.848858,10.630146,10.000000,10.677078,10.392304,10.535654,9.949874,10.000000,10.954452,...,10.630146,10.770329,10.049875,9.848858,10.000000,10.862781,10.000000,9.949874,9.848858,10.049875
7871,10.246951,9.899495,10.677078,10.049875,10.723805,10.440307,10.583005,10.000000,10.049875,10.954452,...,10.677078,10.816654,10.049875,9.899495,10.049875,10.862781,10.148891,10.049875,9.899495,10.295630
7872,10.862781,10.344080,10.862781,10.630146,11.000000,10.816654,10.630146,10.583005,10.535654,11.135529,...,10.770329,10.908712,10.630146,9.899495,10.770329,10.862781,10.770329,10.049875,10.630146,10.908712


In [212]:
similarity= neighbors_k

In [213]:
# Mean nearest-neighbour distance within the training set: row 1 of the column-sorted
# matrix (row 0 is each compound's zero distance to itself).
Dmean=np.mean(similarity[1,:])

In [214]:
round(Dmean, 2)

1.34

In [215]:
# Standard deviation of the same nearest-neighbour distances (np.std default, i.e. population std).
std=np.std(similarity[1,:])

In [216]:
round(std, 2)

1.34

In [217]:
# AD cut-off: mean nearest-neighbour distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.02


In [218]:
# Distances between every training compound (rows) and every test compound (columns),
# using pairwise_distances' default Euclidean metric.
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
# In-place ascending sort down each column: row 0 becomes each test compound's distance
# to its nearest training compound.
neighbors_k_ts.sort(0)

In [219]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
0,0.000000,0.000000,1.414214,0.000000,0.000000,0.000000,0.000000,2.236068,1.000000,0.000000,...,0.000000,1.732051,2.000000,1.000000,0.000000,2.449490,3.605551,1.732051,2.645751,3.316625
1,1.000000,0.000000,1.732051,0.000000,0.000000,0.000000,1.000000,2.449490,1.414214,0.000000,...,0.000000,1.732051,2.000000,2.000000,2.000000,3.605551,3.872983,2.449490,3.000000,3.741657
2,1.000000,0.000000,1.732051,0.000000,1.000000,0.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,1.732051,2.000000,2.000000,2.449490,3.605551,3.872983,2.645751,3.162278,3.872983
3,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,2.449490,2.449490,3.605551,3.872983,3.605551,3.162278,3.872983
4,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.732051,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,3.000000,2.449490,3.605551,4.000000,3.872983,3.464102,3.872983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.723805,10.049875,10.630146,10.000000,10.488089,10.099504,10.535654,10.049875,10.392304,10.535654,...,10.295630,10.049875,9.746795,10.198039,10.295630,10.246951,10.198039,10.488089,10.198039,10.198039
7870,10.862781,10.148891,10.723805,10.099504,10.583005,10.099504,10.535654,10.630146,10.440307,10.677078,...,10.677078,10.099504,9.746795,10.295630,10.295630,10.246951,10.440307,10.583005,10.630146,10.198039
7871,10.908712,10.198039,10.770329,10.148891,10.630146,10.148891,10.583005,10.770329,10.488089,10.723805,...,10.723805,10.148891,9.797959,10.535654,10.392304,10.392304,10.488089,10.816654,10.677078,10.440307
7872,11.090536,10.583005,11.045361,10.630146,10.908712,10.723805,11.045361,10.816654,10.908712,10.908712,...,11.000000,10.816654,10.488089,10.862781,10.908712,10.862781,10.535654,10.954452,10.862781,10.862781


In [220]:
# Row 0 of the column-sorted matrix: each test compound's distance to its nearest training compound.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[0.    0.    1.414 ... 1.732 2.646 3.317]


In [221]:
# Boolean mask: True where the (rounded) nearest-neighbour distance is within the AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True ...  True False False]


In [222]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.mean(), 2))

Coverage =  0.73


In [223]:
# Positions (in test-set order) of the compounds flagged as inside the AD.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [   0    1    2 ... 1962 1963 1966]


In [224]:
# Positions of the test compounds that fall outside the AD.
out_Ad = list(np.flatnonzero(~cpd_AD))

# 12. Prediction only for molecules included in the AD

In [226]:
y_pred_MLPR_ad=list(y_pred_MLPR)

In [227]:
# Keep only the predictions for compounds inside the applicability domain.
# The list is rebuilt from the full prediction vector with the boolean AD mask, so
# re-running this cell always gives the same result (filtering an already-filtered
# list by index would drop further entries) and no per-element list scan is needed.
y_pred_MLPR_ad = [pred for pred, inside_ad in zip(y_pred_MLPR, cpd_AD) if inside_ad]

In [228]:
len(y_pred_MLPR_ad)

1442

In [229]:
y_ts_ad=list(y_ts)

In [230]:
# Keep only the observed values for compounds inside the applicability domain.
# The list is rebuilt from the full target vector with the boolean AD mask, so
# re-running this cell always gives the same result (filtering an already-filtered
# list by index would drop further entries) and no per-element list scan is needed.
y_ts_ad = [obs for obs, inside_ad in zip(y_ts, cpd_AD) if inside_ad]

In [231]:
len(y_ts_ad)

1442

In [232]:
# R^2 of the MLP on the in-AD test compounds, rounded to two decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad), ndigits=2)
Q2_TS

0.42

In [233]:
# RMSE of the MLP on the in-AD test compounds, rounded to two decimals.
RMSE_TS = round(
    np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_MLPR_ad)),
    ndigits=2,
)
RMSE_TS

0.67

# k-nearest neighbors

In [235]:
from sklearn.neighbors import KNeighborsRegressor

In [236]:
# Candidate neighbourhood sizes for kNN: k = 1 .. 30.
k_range = [k for k in range(1, 31)]
param_grid = {"n_neighbors": k_range}

In [237]:
# Exhaustive grid search over the kNN neighbourhood sizes using the splitter `cv` (defined
# earlier in the notebook, outside this section); with no `scoring` given, candidates are
# ranked by the estimator's own score method.
# NOTE(review): `m` is also the global that standart() assigns to, so calling standart()
# after this cell replaces the search object.
m = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [238]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [239]:
# Best-scoring candidate; GridSearchCV refits it on the whole of x_tr / y_tr (refit is left at its default).
best_kNN = m.best_estimator_

In [240]:
m.best_params_

{'n_neighbors': 10}

In [241]:
# Predictions for the same data the model was fitted on (goodness of fit, not a validation estimate).
y_pred_ws_kNN = best_kNN.predict(x_tr)

In [242]:
# R^2 of the fitted kNN model on its own training data, rounded to two decimals.
R2_WS = round(r2_score(y_true=y_tr, y_pred=y_pred_ws_kNN), ndigits=2)
R2_WS

0.48

In [243]:
# RMSE of the fitted kNN model on its own training data, rounded to two decimals.
RMSE_WS = round(
    np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_ws_kNN)),
    ndigits=2,
)
RMSE_WS

0.65

In [244]:
# Out-of-fold predictions: the tuned configuration is re-fitted on each training split of `cv`
# and each sample is predicted by the model that did not see it.
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

In [245]:
y_pred_CV_kNN

array([1.4828202, 2.3836198, 1.481155 , ..., 3.2345767, 2.7517295,
       2.0904446], dtype=float32)

In [246]:
# Cross-validated R^2 computed from the out-of-fold kNN predictions, rounded to two decimals.
Q2_CV = round(r2_score(y_true=y_tr, y_pred=y_pred_CV_kNN), ndigits=2)
Q2_CV

0.35

In [247]:
# Cross-validated RMSE computed from the out-of-fold kNN predictions, rounded to two decimals.
RMSE_CV = round(
    np.sqrt(mean_squared_error(y_true=y_tr, y_pred=y_pred_CV_kNN)),
    ndigits=2,
)
RMSE_CV

0.72

# 9. Prediction for the test-set molecules

In [249]:
# Cast the test descriptors and targets to float32 ndarrays.
x_ts, y_ts = (np.array(values, dtype=np.float32) for values in (x_ts, y_ts))

In [250]:
# Predict the external test set with the tuned kNN model.
y_pred_kNN = best_kNN.predict(x_ts)

In [251]:
# R^2 of the kNN model on the full test set, rounded to two decimals.
Q2_TS = round(r2_score(y_true=y_ts, y_pred=y_pred_kNN), ndigits=2)
Q2_TS

0.35

In [252]:
# RMSE of the kNN model on the full test set, rounded to two decimals.
RMSE_TS = round(
    np.sqrt(mean_squared_error(y_true=y_ts, y_pred=y_pred_kNN)),
    ndigits=2,
)
RMSE_TS

0.72

# save the model to disk

In [254]:
# Persist the tuned kNN model. The context manager guarantees the file is flushed and closed
# even if pickling raises, instead of leaving the handle to the garbage collector.
with open('models/Padels/Toxicity_kNN_AtomPairs2D.pkl', 'wb') as model_file:
    pickle.dump(best_kNN, model_file)

# load the model from disk

In [184]:
# Reload the tuned kNN model from disk; the context manager closes the file handle promptly.
# NOTE: unpickling can execute arbitrary code, so only load model files you created or trust.
with open('models/Padels/Toxicity_kNN_AtomPairs2D.pkl', 'rb') as model_file:
    best_kNN = pickle.load(model_file)

# 11. Estimating the applicability domain. Method: Euclidean distances, K=1

In [256]:
# Full pairwise distance matrix of the training descriptors (pairwise_distances defaults to
# the Euclidean metric).
# NOTE(review): nothing in this AD section uses the fitted model, only x_tr / x_ts, so the
# result could be computed once and shared by every model section.
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
# In-place ascending sort down each column: row 0 becomes the zero self-distance and
# row 1 the distance to the nearest other training compound.
neighbors_k.sort(0)

In [257]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7864,7865,7866,7867,7868,7869,7870,7871,7872,7873
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,2.236068,1.732051,2.000000,3.316625,1.732051,3.741657,0.000000,3.316625,1.732051,1.000000
2,1.000000,2.449490,0.000000,0.000000,0.000000,0.000000,2.645751,0.000000,0.000000,1.000000,...,2.449490,2.645751,2.000000,3.464102,1.732051,3.872983,0.000000,3.316625,2.236068,1.000000
3,1.414214,2.449490,0.000000,0.000000,0.000000,1.732051,2.645751,0.000000,0.000000,1.000000,...,3.000000,2.828427,2.449490,3.605551,2.000000,4.242640,0.000000,3.464102,2.236068,1.000000
4,1.732051,2.449490,0.000000,0.000000,1.000000,1.732051,2.645751,0.000000,0.000000,2.645751,...,3.162278,3.000000,2.449490,3.605551,2.000000,4.472136,0.000000,3.464102,2.236068,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.148891,9.695360,10.488089,10.000000,10.583005,10.392304,9.899495,9.949874,9.899495,10.908712,...,10.000000,10.295630,10.000000,9.797959,9.949874,10.816654,9.949874,9.899495,9.848858,10.049875
7870,10.198039,9.848858,10.630146,10.000000,10.677078,10.392304,10.535654,9.949874,10.000000,10.954452,...,10.630146,10.770329,10.049875,9.848858,10.000000,10.862781,10.000000,9.949874,9.848858,10.049875
7871,10.246951,9.899495,10.677078,10.049875,10.723805,10.440307,10.583005,10.000000,10.049875,10.954452,...,10.677078,10.816654,10.049875,9.899495,10.049875,10.862781,10.148891,10.049875,9.899495,10.295630
7872,10.862781,10.344080,10.862781,10.630146,11.000000,10.816654,10.630146,10.583005,10.535654,11.135529,...,10.770329,10.908712,10.630146,9.899495,10.770329,10.862781,10.770329,10.049875,10.630146,10.908712


In [258]:
similarity= neighbors_k

In [259]:
# Mean nearest-neighbour distance within the training set: row 1 of the column-sorted
# matrix (row 0 is each compound's zero distance to itself).
Dmean=np.mean(similarity[1,:])

In [260]:
round(Dmean, 2)

1.34

In [261]:
# Standard deviation of the same nearest-neighbour distances (np.std default, i.e. population std).
std=np.std(similarity[1,:])

In [262]:
round(std, 2)

1.34

In [263]:
# AD cut-off: mean nearest-neighbour distance plus half a standard deviation.
model_AD_limit = Dmean + 0.5 * std
print(np.round(model_AD_limit, 2))

2.02


In [264]:
# Distances between every training compound (rows) and every test compound (columns),
# using pairwise_distances' default Euclidean metric.
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
# In-place ascending sort down each column: row 0 becomes each test compound's distance
# to its nearest training compound.
neighbors_k_ts.sort(0)

In [265]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968
0,0.000000,0.000000,1.414214,0.000000,0.000000,0.000000,0.000000,2.236068,1.000000,0.000000,...,0.000000,1.732051,2.000000,1.000000,0.000000,2.449490,3.605551,1.732051,2.645751,3.316625
1,1.000000,0.000000,1.732051,0.000000,0.000000,0.000000,1.000000,2.449490,1.414214,0.000000,...,0.000000,1.732051,2.000000,2.000000,2.000000,3.605551,3.872983,2.449490,3.000000,3.741657
2,1.000000,0.000000,1.732051,0.000000,1.000000,0.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,1.732051,2.000000,2.000000,2.449490,3.605551,3.872983,2.645751,3.162278,3.872983
3,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.414214,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,2.449490,2.449490,3.605551,3.872983,3.605551,3.162278,3.872983
4,1.000000,0.000000,2.236068,0.000000,1.000000,1.000000,1.732051,2.449490,1.414214,0.000000,...,2.449490,2.000000,2.000000,3.000000,2.449490,3.605551,4.000000,3.872983,3.464102,3.872983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7869,10.723805,10.049875,10.630146,10.000000,10.488089,10.099504,10.535654,10.049875,10.392304,10.535654,...,10.295630,10.049875,9.746795,10.198039,10.295630,10.246951,10.198039,10.488089,10.198039,10.198039
7870,10.862781,10.148891,10.723805,10.099504,10.583005,10.099504,10.535654,10.630146,10.440307,10.677078,...,10.677078,10.099504,9.746795,10.295630,10.295630,10.246951,10.440307,10.583005,10.630146,10.198039
7871,10.908712,10.198039,10.770329,10.148891,10.630146,10.148891,10.583005,10.770329,10.488089,10.723805,...,10.723805,10.148891,9.797959,10.535654,10.392304,10.392304,10.488089,10.816654,10.677078,10.440307
7872,11.090536,10.583005,11.045361,10.630146,10.908712,10.723805,11.045361,10.816654,10.908712,10.908712,...,11.000000,10.816654,10.488089,10.862781,10.908712,10.862781,10.535654,10.954452,10.862781,10.862781


In [266]:
# Row 0 of the column-sorted matrix: each test compound's distance to its nearest training compound.
similarity_ts = neighbors_k_ts
cpd_AD = similarity_ts[0]
cpd_value = cpd_AD.round(3)
print(cpd_value)

[0.    0.    1.414 ... 1.732 2.646 3.317]


In [267]:
# Boolean mask: True where the (rounded) nearest-neighbour distance is within the AD limit.
cpd_AD = cpd_value <= model_AD_limit
print(cpd_AD)

[ True  True  True ...  True False False]


In [268]:
# Fraction of test compounds that fall inside the applicability domain.
print("Coverage = ", round(cpd_AD.mean(), 2))

Coverage =  0.73


In [269]:
# Positions (in test-set order) of the compounds flagged as inside the AD.
print("Indices of substances included in AD = ", np.flatnonzero(cpd_AD))

Indices of substances included in AD =  [   0    1    2 ... 1962 1963 1966]


In [270]:
# Positions of the test compounds that fall outside the AD.
out_Ad = list(np.flatnonzero(~cpd_AD))

# 12. Prediction only for molecules included in the AD

In [272]:
y_pred_kNN_ad=list(y_pred_kNN)

In [273]:
# Keep only the predictions for compounds inside the applicability domain.
# The list is rebuilt from the full prediction vector with the boolean AD mask, so
# re-running this cell always gives the same result (filtering an already-filtered
# list by index would drop further entries) and no per-element list scan is needed.
y_pred_kNN_ad = [pred for pred, inside_ad in zip(y_pred_kNN, cpd_AD) if inside_ad]

In [274]:
len(y_pred_kNN_ad)

1442

In [275]:
y_ts_ad=list(y_ts)

In [276]:
# Keep only the observed values for compounds inside the applicability domain.
# The list is rebuilt from the full target vector with the boolean AD mask, so
# re-running this cell always gives the same result (filtering an already-filtered
# list by index would drop further entries) and no per-element list scan is needed.
y_ts_ad = [obs for obs, inside_ad in zip(y_ts, cpd_AD) if inside_ad]

In [277]:
len(y_ts_ad)

1442

In [278]:
# R^2 of the kNN model on the in-AD test compounds, rounded to two decimals.
Q2_TS = round(r2_score(y_true=y_ts_ad, y_pred=y_pred_kNN_ad), ndigits=2)
Q2_TS

0.36

In [279]:
# RMSE of the kNN model on the in-AD test compounds, rounded to two decimals.
RMSE_TS = round(
    np.sqrt(mean_squared_error(y_true=y_ts_ad, y_pred=y_pred_kNN_ad)),
    ndigits=2,
)
RMSE_TS

0.7