# 1. Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import MACCSkeys
from copy import deepcopy
from rdkit.ML.Descriptors import MoleculeDescriptors
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from padelpy import from_sdf
import shap
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
def convert_smi_to_canon_smi(smi):
    
    try:
        canon_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi),isomericSmiles = False)
    except:
        canon_smi='wrong_smiles'
    return canon_smi

In [3]:
def standart(smi):
    global m
    if smi!='wrong_smiles':
        try:
            smiles=standardize_smiles(smi)
            m = Chem.MolFromSmiles(smi)
        except:
            smi='error kekule'
    else:
        m = 'check the smiles'
    return m

## Load data and curation work set

In [4]:
# Set file path
df_ws=pd.read_csv('rat_oral_LD50_WS.csv')
df_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000
2,110-54-3,CCCCCC,0.537460,24980.0000
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000
4,57-55-6,CC(O)CO,0.580330,19989.0000
...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596
7708,130209-82-4,CC(C)OC(=O)CCC/C=C/CC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500
7709,83805-11-2,C=C1/C(=C\C=C2/CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(...,7.099700,0.0417
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199


 Convert a SMILES string to canonical SMILES

In [5]:
df_ws1 = deepcopy(df_ws)
df_ws1["SMILES"] = df_ws1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ws1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000
2,110-54-3,CCCCCC,0.537460,24980.0000
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000
4,57-55-6,CC(O)CO,0.580330,19989.0000
...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596
7708,130209-82-4,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500
7709,83805-11-2,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,7.099700,0.0417
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199


In [6]:
print('Original data: ', len(df_ws), 'molecules')
print('Failed data: ', len(df_ws1[df_ws1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  7712 molecules
Failed data:  0 molecules


In [7]:
index=df_ws1.index[df_ws1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ws.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for work set

In [8]:
df_ws1["Molecule"] = df_ws1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ws=df_ws1[df_ws1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  7712 molecules


In [9]:
moldf_ws

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,626-48-2,Cc1cc(=O)[nH]c(=O)[nH]1,0.291207,64463.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
1,27849-94-1,CC(CCc1ccc2c(c1)OCO2)NN,0.440660,75449.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
2,110-54-3,CCCCCC,0.537460,24980.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
3,1330-92-3,CCCCC(CC)COC(=O)C1=C(C(=O)OCC(CC)CCCC)CCCC1,0.539250,113917.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
4,57-55-6,CC(O)CO,0.580330,19989.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BE8...
...,...,...,...,...,...
7707,56073-10-0,O=c1oc2ccccc2c(O)c1C1CC(c2ccc(-c3ccc(Br)cc3)cc...,6.514700,0.1596,<rdkit.Chem.rdchem.Mol object at 0x00000231BDB...
7708,130209-82-4,CC(C)OC(=O)CCCC=CCC1C(O)CC(O)C1CCC(O)CCc1ccccc1,6.937100,0.0500,<rdkit.Chem.rdchem.Mol object at 0x00000231BDB...
7709,83805-11-2,C=C1C(=CC=C2CCCC3(C)C2CCC3C(C)CCCC(O)(C(F)(F)F...,7.099700,0.0417,<rdkit.Chem.rdchem.Mol object at 0x00000231BDB...
7710,1746-01-6,Clc1cc2c(cc1Cl)Oc1cc(Cl)c(Cl)cc1O2,7.206800,0.0199,<rdkit.Chem.rdchem.Mol object at 0x00000231BDB...


In [10]:
y_tr=moldf_ws.pLD50
y_tr

0       0.291207
1       0.440660
2       0.537460
3       0.539250
4       0.580330
          ...   
7707    6.514700
7708    6.937100
7709    7.099700
7710    7.206800
7711    7.602600
Name: pLD50, Length: 7712, dtype: float64

In [11]:
moldf_ws=moldf_ws.Molecule

##  Load data and curation test set

In [12]:
df_ts=pd.read_csv('rat_oral_LD50_TS.csv')
df_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,7782-40-3,C,0.017765,15388.8000
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000
...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435
1926,128606-48-4,CCOP(=S)(OCC)O/C(C)=C/C(=O)OC,6.282400,0.1399
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992


 Convert a SMILES string to canonical SMILES

In [13]:
df_ts1 = deepcopy(df_ts)
df_ts1["SMILES"] = df_ts1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ts1

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg"
0,7782-40-3,C,0.017765,15388.8000
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000
...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435
1926,128606-48-4,CCOP(=S)(OCC)OC(C)=CC(=O)OC,6.282400,0.1399
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992


In [14]:
print('Original data: ', len(df_ts), 'molecules')
print('Failed data: ', len(df_ts1[df_ts1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  1929 molecules
Failed data:  0 molecules


In [15]:
index=df_ts1.index[df_ts1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ts.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for test set

In [16]:
df_ts1["Molecule"] = df_ts1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ts=df_ts1[df_ts1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  1929 molecules


In [17]:
moldf_ts

Unnamed: 0,CAS_Number,SMILES,pLD50,"LD50,mg/kg",Molecule
0,7782-40-3,C,0.017765,15388.8000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
1,2842-38-8,OCCNC1CCCCC1,0.572840,38274.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
2,66257-53-2,NC(=O)C(=O)O,0.624490,21133.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
3,2173-56-0,CCCCCOC(=O)CCCC,0.686960,35395.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
4,4726-93-6,O=C1CCCCC(=O)N1,0.750180,22586.0000,<rdkit.Chem.rdchem.Mol object at 0x00000231BD9...
...,...,...,...,...,...
1924,3385-03-3,CC1(C)OC2CC3C4CC(F)C5=CC(=O)C=CC5(C)C4C(O)CC3(...,5.939000,0.4997,<rdkit.Chem.rdchem.Mol object at 0x00000231BDC...
1925,2338-29-6,FC(F)(F)c1nc2c(Cl)c(Cl)c(Cl)c(Cl)c2[nH]1,6.121300,0.2435,<rdkit.Chem.rdchem.Mol object at 0x00000231BDC...
1926,128606-48-4,CCOP(=S)(OCC)OC(C)=CC(=O)OC,6.282400,0.1399,<rdkit.Chem.rdchem.Mol object at 0x00000231BDC...
1927,50585-41-6,Brc1cc2c(cc1Br)Oc1cc(Br)c(Br)cc1O2,6.698800,0.0992,<rdkit.Chem.rdchem.Mol object at 0x00000231BDC...


In [18]:
y_ts=moldf_ts.pLD50
y_ts

0       0.017765
1       0.572840
2       0.624490
3       0.686960
4       0.750180
          ...   
1924    5.939000
1925    6.121300
1926    6.282400
1927    6.698800
1928    9.541100
Name: pLD50, Length: 1929, dtype: float64

In [19]:
moldf_ts=moldf_ts.Molecule

## Calculation MACCS Fingerprints for work set

In [20]:
fp_tr = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ws]

In [21]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [22]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [23]:
savetxt('models/MACCS/x_tr_MACCS.csv', x_tr, delimiter=',')

In [24]:
x_tr.shape

(7712, 167)

## Calculation  MACCS Fingerprint for test set

In [25]:
fp_ts = [MACCSkeys.GenMACCSKeys(m) for m in moldf_ts]

In [26]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [27]:
x_ts = rdkit_numpy_convert(fp_ts)

In [28]:
x_ts.shape

(1929, 167)

In [29]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

 ## GradientBoostingRegressor model building and validation

In [33]:
seed = 42

In [34]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [35]:
param_grid = {'learning_rate': [0.02,0.05],
                  'subsample'    : [0.9, 0.5, 0.1],
                  'n_estimators' : [100,500,1000],
                  'max_depth'    : [4, 10]
                 }

In [36]:
m = GridSearchCV(GradientBoostingRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [37]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [38]:
best_GBR = m.best_estimator_

In [39]:
m.best_params_

{'learning_rate': 0.02, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.5}

In [40]:
y_pred_ws_GBR = best_GBR.predict(x_tr)

In [41]:
R2_WS = round(r2_score(y_tr, y_pred_ws_GBR), 2)
R2_WS

0.95

In [42]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_GBR)), 2)
RMSE_WS

0.39

In [43]:
y_pred_CV_GBR = cross_val_predict(best_GBR, x_tr, y_tr, cv=cv)

In [44]:
y_pred_CV_GBR

array([1.55499859, 2.50630934, 1.45290274, ..., 3.68584762, 5.38351459,
       2.76041539])

In [45]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.58

In [46]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.64

# 9. Prediction for test set's molecules

In [47]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)
len(y_ts)

1929

In [48]:
y_pred_GBR = best_GBR.predict(x_ts)

In [49]:
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.6

In [50]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.63

# save the model to disk

In [51]:
pickle.dump(best_GBR, open('models/MACCS/Toxicity_GBR_MACCS.pkl', 'wb'))

# load the model from disk

In [20]:
best_GBR = pickle.load(open('models/MACCS/Toxicity_GBR_MACCS.pkl', 'rb'))

# 10. Y-randomization GradientBoostingRegressor model

In [52]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_GBR, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  6.7min


True score =  0.58 
Y-randomization =  -0.22 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.5min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [53]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [54]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7702,7703,7704,7705,7706,7707,7708,7709,7710,7711
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,4.000000,1.414214,2.236068,2.000000,2.645751,3.000000,2.645751,0.000000,0.000000,...,0.000000,1.000000,1.732051,3.162278,2.645751,2.000000,3.162278,2.449490,0.000000,3.000000
2,1.732051,4.242640,1.414214,2.236068,2.000000,2.828427,3.162278,2.828427,0.000000,0.000000,...,0.000000,2.449490,2.236068,3.162278,3.741657,2.449490,3.162278,2.449490,0.000000,3.162278
3,2.449490,4.358899,1.732051,2.449490,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.000000,4.242640,3.464102,5.000000,2.449490,3.316625,3.000000,0.000000,3.162278
4,2.645751,4.690416,1.732051,2.645751,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.162278,4.358899,3.464102,5.000000,2.449490,3.316625,3.464102,0.000000,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,8.246211,8.602325,9.219544,8.944272,9.165152,8.888194,9.000000,9.380832,9.110434,9.327379,...,9.000000,8.774964,8.544003,8.774964,8.306623,9.000000,8.774964,9.110434,9.000000,8.544003
7708,8.246211,8.602325,9.273619,9.000000,9.219544,8.888194,9.000000,9.380832,9.165152,9.327379,...,9.055386,8.831760,8.544003,8.774964,8.366600,9.000000,8.831760,9.110434,9.055386,8.544003
7709,8.306623,8.602325,9.327379,9.000000,9.219544,8.944272,9.000000,9.486833,9.219544,9.539392,...,9.110434,8.831760,8.602325,8.774964,8.426149,9.000000,8.831760,9.219544,9.110434,8.602325
7710,8.485281,8.717798,9.380832,9.000000,9.219544,8.944272,9.055386,9.539392,9.273619,9.539392,...,9.219544,8.888194,8.602325,8.888194,8.426149,9.000000,8.944272,9.273619,9.219544,8.774964


In [55]:
similarity= neighbors_k

In [56]:
Dmean=np.mean(similarity[1,:])

In [57]:
round(Dmean, 2)

2.03

In [58]:
std=np.std(similarity[1,:])

In [59]:
round(std, 2)

1.11

In [60]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.59


In [61]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [62]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928
0,1.414214,2.236068,3.000000,0.000000,2.449490,2.000000,0.000000,1.414214,2.000000,2.236068,...,2.236068,2.000000,0.000000,3.162278,1.000000,0.000000,0.000000,2.449490,1.414214,2.236068
1,1.732051,2.828427,3.162278,0.000000,2.449490,2.000000,0.000000,1.732051,2.000000,2.828427,...,2.645751,2.645751,0.000000,3.605551,1.414214,1.732051,0.000000,2.449490,1.414214,2.449490
2,1.732051,3.000000,3.162278,0.000000,3.162278,2.000000,0.000000,1.732051,2.449490,2.828427,...,2.828427,3.316625,0.000000,3.605551,2.236068,2.449490,0.000000,2.645751,1.414214,2.449490
3,1.732051,3.162278,3.316625,0.000000,3.872983,2.000000,1.414214,1.732051,2.645751,2.828427,...,3.000000,3.605551,0.000000,3.605551,2.449490,2.645751,0.000000,2.645751,1.414214,2.645751
4,1.732051,3.162278,3.316625,0.000000,4.000000,2.000000,1.414214,2.000000,2.828427,2.828427,...,3.000000,4.000000,1.000000,3.741657,2.449490,2.828427,1.000000,2.645751,1.414214,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,9.165152,9.110434,8.660254,9.110434,8.602325,9.273619,9.110434,9.165152,9.110434,8.831760,...,8.944272,8.944272,8.660254,8.774964,9.000000,9.055386,8.660254,8.660254,9.110434,9.055386
7708,9.273619,9.165152,8.717798,9.165152,8.602325,9.273619,9.110434,9.219544,9.110434,8.888194,...,9.000000,9.110434,8.717798,8.774964,9.000000,9.055386,8.717798,8.660254,9.110434,9.055386
7709,9.380832,9.219544,8.717798,9.219544,8.660254,9.327379,9.327379,9.219544,9.165152,8.944272,...,9.055386,9.110434,8.831760,8.774964,9.055386,9.110434,8.831760,8.660254,9.110434,9.110434
7710,9.380832,9.273619,8.831760,9.273619,8.831760,9.486833,9.327379,9.327379,9.219544,9.000000,...,9.273619,9.110434,8.944272,8.774964,9.055386,9.165152,8.944272,8.660254,9.219544,9.219544


In [63]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[1.414 2.236 3.    ... 2.449 1.414 2.236]


In [64]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False ...  True  True  True]


In [65]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.69


In [66]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    3 ... 1926 1927 1928]


In [67]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [68]:
y_pred_GBR_ad=list(y_pred_GBR)

In [69]:
y_pred_GBR_ad[:] = [x for i,x in enumerate(y_pred_GBR_ad) if i not in out_Ad]

In [70]:
len(y_pred_GBR_ad)

1332

In [71]:
y_ts_ad=list(y_ts)

In [72]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [73]:
len(y_ts_ad)

1332

In [74]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_GBR_ad), 2)
Q2_TS

0.66

In [75]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_GBR_ad)), 2)
RMSE_TS

0.61

# SVM model building and validation

In [30]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [31]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [32]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [33]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [34]:
svm.best_params_
best_svm = svm.best_estimator_

In [35]:
svm.best_params_

{'C': 1, 'gamma': 0.1}

In [36]:
y_pred_ws_svm = best_svm.predict(x_tr)

In [37]:
R2_WS = round(r2_score(y_tr, y_pred_ws_svm), 2)
R2_WS

0.85

In [38]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_svm)), 2)
RMSE_WS

0.49

In [39]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [40]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.56

In [41]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.65

# 9. Prediction for test set's molecules

In [42]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [43]:
y_pred_svm = best_svm.predict(x_ts)

In [44]:
Q2_TS = round(r2_score(y_ts, y_pred_svm), 2)
Q2_TS

0.58

In [45]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_svm)), 2)
RMSE_TS

0.64

save the model to disk

In [46]:
pickle.dump(best_svm, open('models/MACCS/Toxicity_SVM_MACCS.pkl', 'wb'))

load the model from disk

In [105]:
best_svm = pickle.load(open('models/MACCS/Toxicity_SVM_MACCS.pkl', 'rb'))

# 10. Y-randomization SVM model

In [92]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_svm, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(3),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  5.7min


True score =  0.559 
Y-randomization =  -0.1 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [93]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [94]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7702,7703,7704,7705,7706,7707,7708,7709,7710,7711
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,4.000000,1.414214,2.236068,2.000000,2.645751,3.000000,2.645751,0.000000,0.000000,...,0.000000,1.000000,1.732051,3.162278,2.645751,2.000000,3.162278,2.449490,0.000000,3.000000
2,1.732051,4.242640,1.414214,2.236068,2.000000,2.828427,3.162278,2.828427,0.000000,0.000000,...,0.000000,2.449490,2.236068,3.162278,3.741657,2.449490,3.162278,2.449490,0.000000,3.162278
3,2.449490,4.358899,1.732051,2.449490,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.000000,4.242640,3.464102,5.000000,2.449490,3.316625,3.000000,0.000000,3.162278
4,2.645751,4.690416,1.732051,2.645751,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.162278,4.358899,3.464102,5.000000,2.449490,3.316625,3.464102,0.000000,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,8.246211,8.602325,9.219544,8.944272,9.165152,8.888194,9.000000,9.380832,9.110434,9.327379,...,9.000000,8.774964,8.544003,8.774964,8.306623,9.000000,8.774964,9.110434,9.000000,8.544003
7708,8.246211,8.602325,9.273619,9.000000,9.219544,8.888194,9.000000,9.380832,9.165152,9.327379,...,9.055386,8.831760,8.544003,8.774964,8.366600,9.000000,8.831760,9.110434,9.055386,8.544003
7709,8.306623,8.602325,9.327379,9.000000,9.219544,8.944272,9.000000,9.486833,9.219544,9.539392,...,9.110434,8.831760,8.602325,8.774964,8.426149,9.000000,8.831760,9.219544,9.110434,8.602325
7710,8.485281,8.717798,9.380832,9.000000,9.219544,8.944272,9.055386,9.539392,9.273619,9.539392,...,9.219544,8.888194,8.602325,8.888194,8.426149,9.000000,8.944272,9.273619,9.219544,8.774964


In [95]:
similarity= neighbors_k

In [96]:
Dmean=np.mean(similarity[1,:])

In [97]:
round(Dmean, 2)

2.03

In [98]:
std=np.std(similarity[1,:])

In [99]:
round(std, 2)

1.11

In [100]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.59


In [101]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [102]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928
0,1.414214,2.236068,3.000000,0.000000,2.449490,2.000000,0.000000,1.414214,2.000000,2.236068,...,2.236068,2.000000,0.000000,3.162278,1.000000,0.000000,0.000000,2.449490,1.414214,2.236068
1,1.732051,2.828427,3.162278,0.000000,2.449490,2.000000,0.000000,1.732051,2.000000,2.828427,...,2.645751,2.645751,0.000000,3.605551,1.414214,1.732051,0.000000,2.449490,1.414214,2.449490
2,1.732051,3.000000,3.162278,0.000000,3.162278,2.000000,0.000000,1.732051,2.449490,2.828427,...,2.828427,3.316625,0.000000,3.605551,2.236068,2.449490,0.000000,2.645751,1.414214,2.449490
3,1.732051,3.162278,3.316625,0.000000,3.872983,2.000000,1.414214,1.732051,2.645751,2.828427,...,3.000000,3.605551,0.000000,3.605551,2.449490,2.645751,0.000000,2.645751,1.414214,2.645751
4,1.732051,3.162278,3.316625,0.000000,4.000000,2.000000,1.414214,2.000000,2.828427,2.828427,...,3.000000,4.000000,1.000000,3.741657,2.449490,2.828427,1.000000,2.645751,1.414214,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,9.165152,9.110434,8.660254,9.110434,8.602325,9.273619,9.110434,9.165152,9.110434,8.831760,...,8.944272,8.944272,8.660254,8.774964,9.000000,9.055386,8.660254,8.660254,9.110434,9.055386
7708,9.273619,9.165152,8.717798,9.165152,8.602325,9.273619,9.110434,9.219544,9.110434,8.888194,...,9.000000,9.110434,8.717798,8.774964,9.000000,9.055386,8.717798,8.660254,9.110434,9.055386
7709,9.380832,9.219544,8.717798,9.219544,8.660254,9.327379,9.327379,9.219544,9.165152,8.944272,...,9.055386,9.110434,8.831760,8.774964,9.055386,9.110434,8.831760,8.660254,9.110434,9.110434
7710,9.380832,9.273619,8.831760,9.273619,8.831760,9.486833,9.327379,9.327379,9.219544,9.000000,...,9.273619,9.110434,8.944272,8.774964,9.055386,9.165152,8.944272,8.660254,9.219544,9.219544


In [103]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[1.414 2.236 3.    ... 2.449 1.414 2.236]


In [104]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False ...  True  True  True]


In [105]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.69


In [106]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    3 ... 1926 1927 1928]


In [107]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [108]:
y_pred_svm_ad=list(y_pred_svm)

In [109]:
y_pred_svm_ad[:] = [x for i,x in enumerate(y_pred_svm_ad) if i not in out_Ad]

In [110]:
len(y_pred_svm_ad)

1332

In [111]:
y_ts_ad=list(y_ts)

In [112]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [113]:
len(y_ts_ad)

1332

In [114]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_svm_ad), 2)
Q2_TS

0.64

In [115]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_svm_ad)), 2)
RMSE_TS

0.62

# Multi-layer Perceptron regressor

In [116]:
from sklearn.neural_network import MLPRegressor

In [117]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [118]:
param_grid ={"hidden_layer_sizes": [(400, 300, 200, 100),(100, 100, 100), (10, 10, 10),(50,)], "activation": ["tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": [0.00005,0.0005], 'max_iter': [1000, 2000]}

In [119]:
m = GridSearchCV(MLPRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [120]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [121]:
best_MLPR = m.best_estimator_

In [122]:
m.best_params_

{'activation': 'relu',
 'alpha': 5e-05,
 'hidden_layer_sizes': (400, 300, 200, 100),
 'max_iter': 1000,
 'solver': 'adam'}

In [123]:
y_pred_ws_MLPR = best_MLPR.predict(x_tr)

In [124]:
R2_WS = round(r2_score(y_tr, y_pred_ws_MLPR), 2)
R2_WS

0.97

In [125]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_MLPR)), 2)
RMSE_WS

0.32

In [126]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [127]:
y_pred_CV_MLPR

array([1.4264392, 2.7139287, 1.4933684, ..., 5.6859965, 5.301824 ,
       2.8890176], dtype=float32)

In [128]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_MLPR), 2)
Q2_CV

0.47

In [129]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_MLPR)), 2)
RMSE_CV

0.68

# 9. Prediction for test set's molecules

In [130]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [131]:
y_pred_MLPR = best_MLPR.predict(x_ts)

In [132]:
Q2_TS = round(r2_score(y_ts, y_pred_MLPR), 2)
Q2_TS

0.51

In [133]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_MLPR)), 2)
RMSE_TS

0.66

# save the model to disk

In [134]:
pickle.dump(best_MLPR, open('models/MACCS/Toxicity_MLPR_MACCS.pkl', 'wb'))

# load the model from disk

In [83]:
best_MLPR = pickle.load(open('models/MACCS/Toxicity_MLPR_MACCS.pkl', 'rb'))

# 10. Y-randomization MLPR

In [135]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_MLPR, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 19.1min


True score =  0.48 
Y-randomization =  -0.51 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 32.7min finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [136]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [137]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7702,7703,7704,7705,7706,7707,7708,7709,7710,7711
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,4.000000,1.414214,2.236068,2.000000,2.645751,3.000000,2.645751,0.000000,0.000000,...,0.000000,1.000000,1.732051,3.162278,2.645751,2.000000,3.162278,2.449490,0.000000,3.000000
2,1.732051,4.242640,1.414214,2.236068,2.000000,2.828427,3.162278,2.828427,0.000000,0.000000,...,0.000000,2.449490,2.236068,3.162278,3.741657,2.449490,3.162278,2.449490,0.000000,3.162278
3,2.449490,4.358899,1.732051,2.449490,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.000000,4.242640,3.464102,5.000000,2.449490,3.316625,3.000000,0.000000,3.162278
4,2.645751,4.690416,1.732051,2.645751,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.162278,4.358899,3.464102,5.000000,2.449490,3.316625,3.464102,0.000000,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,8.246211,8.602325,9.219544,8.944272,9.165152,8.888194,9.000000,9.380832,9.110434,9.327379,...,9.000000,8.774964,8.544003,8.774964,8.306623,9.000000,8.774964,9.110434,9.000000,8.544003
7708,8.246211,8.602325,9.273619,9.000000,9.219544,8.888194,9.000000,9.380832,9.165152,9.327379,...,9.055386,8.831760,8.544003,8.774964,8.366600,9.000000,8.831760,9.110434,9.055386,8.544003
7709,8.306623,8.602325,9.327379,9.000000,9.219544,8.944272,9.000000,9.486833,9.219544,9.539392,...,9.110434,8.831760,8.602325,8.774964,8.426149,9.000000,8.831760,9.219544,9.110434,8.602325
7710,8.485281,8.717798,9.380832,9.000000,9.219544,8.944272,9.055386,9.539392,9.273619,9.539392,...,9.219544,8.888194,8.602325,8.888194,8.426149,9.000000,8.944272,9.273619,9.219544,8.774964


In [138]:
similarity= neighbors_k

In [139]:
Dmean=np.mean(similarity[1,:])

In [140]:
round(Dmean, 2)

2.03

In [141]:
std=np.std(similarity[1,:])

In [142]:
round(std, 2)

1.11

In [143]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.59


In [144]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [145]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928
0,1.414214,2.236068,3.000000,0.000000,2.449490,2.000000,0.000000,1.414214,2.000000,2.236068,...,2.236068,2.000000,0.000000,3.162278,1.000000,0.000000,0.000000,2.449490,1.414214,2.236068
1,1.732051,2.828427,3.162278,0.000000,2.449490,2.000000,0.000000,1.732051,2.000000,2.828427,...,2.645751,2.645751,0.000000,3.605551,1.414214,1.732051,0.000000,2.449490,1.414214,2.449490
2,1.732051,3.000000,3.162278,0.000000,3.162278,2.000000,0.000000,1.732051,2.449490,2.828427,...,2.828427,3.316625,0.000000,3.605551,2.236068,2.449490,0.000000,2.645751,1.414214,2.449490
3,1.732051,3.162278,3.316625,0.000000,3.872983,2.000000,1.414214,1.732051,2.645751,2.828427,...,3.000000,3.605551,0.000000,3.605551,2.449490,2.645751,0.000000,2.645751,1.414214,2.645751
4,1.732051,3.162278,3.316625,0.000000,4.000000,2.000000,1.414214,2.000000,2.828427,2.828427,...,3.000000,4.000000,1.000000,3.741657,2.449490,2.828427,1.000000,2.645751,1.414214,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,9.165152,9.110434,8.660254,9.110434,8.602325,9.273619,9.110434,9.165152,9.110434,8.831760,...,8.944272,8.944272,8.660254,8.774964,9.000000,9.055386,8.660254,8.660254,9.110434,9.055386
7708,9.273619,9.165152,8.717798,9.165152,8.602325,9.273619,9.110434,9.219544,9.110434,8.888194,...,9.000000,9.110434,8.717798,8.774964,9.000000,9.055386,8.717798,8.660254,9.110434,9.055386
7709,9.380832,9.219544,8.717798,9.219544,8.660254,9.327379,9.327379,9.219544,9.165152,8.944272,...,9.055386,9.110434,8.831760,8.774964,9.055386,9.110434,8.831760,8.660254,9.110434,9.110434
7710,9.380832,9.273619,8.831760,9.273619,8.831760,9.486833,9.327379,9.327379,9.219544,9.000000,...,9.273619,9.110434,8.944272,8.774964,9.055386,9.165152,8.944272,8.660254,9.219544,9.219544


In [146]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[1.414 2.236 3.    ... 2.449 1.414 2.236]


In [147]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False ...  True  True  True]


In [148]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.69


In [149]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    3 ... 1926 1927 1928]


In [150]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [151]:
y_pred_MLPR_ad=list(y_pred_MLPR)

In [152]:
y_pred_MLPR_ad[:] = [x for i,x in enumerate(y_pred_MLPR_ad) if i not in out_Ad]

In [153]:
len(y_pred_MLPR_ad)

1332

In [154]:
y_ts_ad=list(y_ts)

In [155]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [156]:
len(y_ts_ad)

1332

In [157]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_MLPR_ad), 2)
Q2_TS

0.58

In [158]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_MLPR_ad)), 2)
RMSE_TS

0.65

# k-nearest neighbors

In [159]:
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

In [160]:
m = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [161]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [162]:
best_kNN = m.best_estimator_

In [163]:
m.best_params_

{'n_neighbors': 7}

In [164]:
y_pred_ws_kNN = best_kNN.predict(x_tr)

In [165]:
R2_WS = round(r2_score(y_tr, y_pred_ws_kNN), 2)
R2_WS

0.65

In [166]:
RMSE_WS=round(np.sqrt(mean_absolute_error(y_tr, y_pred_ws_kNN)), 2)
RMSE_WS

0.61

In [167]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

In [168]:
y_pred_CV_kNN

array([1.6700743, 2.6441638, 1.4905128, ..., 3.4748428, 4.457106 ,
       2.7543454], dtype=float32)

In [169]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_kNN), 2)
Q2_CV

0.5

In [170]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_kNN)), 2)
RMSE_CV

0.67

# 9. Prediction for test set's molecules

In [171]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [172]:
y_pred_kNN = best_kNN.predict(x_ts)

In [173]:
Q2_TS = round(r2_score(y_ts, y_pred_kNN), 2)
Q2_TS

0.53

In [174]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_kNN)), 2)
RMSE_TS

0.65

# save the model to disk

In [175]:
pickle.dump(best_kNN, open('models/MACCS/Toxicity_kNN_MACCS.pkl', 'wb'))

# load the model from disk

In [190]:
best_kNN = pickle.load(open('models/MACCS/Toxicity_kNN_MACCS.pkl', 'rb'))

# 10. Y-randomization kNN

In [176]:
permutations = 50
score, permutation_scores, pvalue = permutation_test_score(best_kNN, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=seed)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s


True score =  0.5 
Y-randomization =  -0.14 
p-value =  0.0196


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.7s finished


# 11. Estimating applicability domain. Method - Euclidian distances, K=1

In [177]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [178]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7702,7703,7704,7705,7706,7707,7708,7709,7710,7711
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.000000,4.000000,1.414214,2.236068,2.000000,2.645751,3.000000,2.645751,0.000000,0.000000,...,0.000000,1.000000,1.732051,3.162278,2.645751,2.000000,3.162278,2.449490,0.000000,3.000000
2,1.732051,4.242640,1.414214,2.236068,2.000000,2.828427,3.162278,2.828427,0.000000,0.000000,...,0.000000,2.449490,2.236068,3.162278,3.741657,2.449490,3.162278,2.449490,0.000000,3.162278
3,2.449490,4.358899,1.732051,2.449490,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.000000,4.242640,3.464102,5.000000,2.449490,3.316625,3.000000,0.000000,3.162278
4,2.645751,4.690416,1.732051,2.645751,2.449490,3.162278,3.162278,3.000000,0.000000,0.000000,...,0.000000,3.162278,4.358899,3.464102,5.000000,2.449490,3.316625,3.464102,0.000000,3.316625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,8.246211,8.602325,9.219544,8.944272,9.165152,8.888194,9.000000,9.380832,9.110434,9.327379,...,9.000000,8.774964,8.544003,8.774964,8.306623,9.000000,8.774964,9.110434,9.000000,8.544003
7708,8.246211,8.602325,9.273619,9.000000,9.219544,8.888194,9.000000,9.380832,9.165152,9.327379,...,9.055386,8.831760,8.544003,8.774964,8.366600,9.000000,8.831760,9.110434,9.055386,8.544003
7709,8.306623,8.602325,9.327379,9.000000,9.219544,8.944272,9.000000,9.486833,9.219544,9.539392,...,9.110434,8.831760,8.602325,8.774964,8.426149,9.000000,8.831760,9.219544,9.110434,8.602325
7710,8.485281,8.717798,9.380832,9.000000,9.219544,8.944272,9.055386,9.539392,9.273619,9.539392,...,9.219544,8.888194,8.602325,8.888194,8.426149,9.000000,8.944272,9.273619,9.219544,8.774964


In [179]:
similarity= neighbors_k

In [180]:
Dmean=np.mean(similarity[1,:])

In [181]:
round(Dmean, 2)

2.03

In [182]:
std=np.std(similarity[1,:])

In [183]:
round(std, 2)

1.11

In [184]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

2.59


In [185]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [186]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928
0,1.414214,2.236068,3.000000,0.000000,2.449490,2.000000,0.000000,1.414214,2.000000,2.236068,...,2.236068,2.000000,0.000000,3.162278,1.000000,0.000000,0.000000,2.449490,1.414214,2.236068
1,1.732051,2.828427,3.162278,0.000000,2.449490,2.000000,0.000000,1.732051,2.000000,2.828427,...,2.645751,2.645751,0.000000,3.605551,1.414214,1.732051,0.000000,2.449490,1.414214,2.449490
2,1.732051,3.000000,3.162278,0.000000,3.162278,2.000000,0.000000,1.732051,2.449490,2.828427,...,2.828427,3.316625,0.000000,3.605551,2.236068,2.449490,0.000000,2.645751,1.414214,2.449490
3,1.732051,3.162278,3.316625,0.000000,3.872983,2.000000,1.414214,1.732051,2.645751,2.828427,...,3.000000,3.605551,0.000000,3.605551,2.449490,2.645751,0.000000,2.645751,1.414214,2.645751
4,1.732051,3.162278,3.316625,0.000000,4.000000,2.000000,1.414214,2.000000,2.828427,2.828427,...,3.000000,4.000000,1.000000,3.741657,2.449490,2.828427,1.000000,2.645751,1.414214,2.645751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,9.165152,9.110434,8.660254,9.110434,8.602325,9.273619,9.110434,9.165152,9.110434,8.831760,...,8.944272,8.944272,8.660254,8.774964,9.000000,9.055386,8.660254,8.660254,9.110434,9.055386
7708,9.273619,9.165152,8.717798,9.165152,8.602325,9.273619,9.110434,9.219544,9.110434,8.888194,...,9.000000,9.110434,8.717798,8.774964,9.000000,9.055386,8.717798,8.660254,9.110434,9.055386
7709,9.380832,9.219544,8.717798,9.219544,8.660254,9.327379,9.327379,9.219544,9.165152,8.944272,...,9.055386,9.110434,8.831760,8.774964,9.055386,9.110434,8.831760,8.660254,9.110434,9.110434
7710,9.380832,9.273619,8.831760,9.273619,8.831760,9.486833,9.327379,9.327379,9.219544,9.000000,...,9.273619,9.110434,8.944272,8.774964,9.055386,9.165152,8.944272,8.660254,9.219544,9.219544


In [187]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[1.414 2.236 3.    ... 2.449 1.414 2.236]


In [188]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True False ...  True  True  True]


In [189]:
print("Coverage = ", round(sum(cpd_AD) / len(cpd_AD), 2))

Coverage =  0.69


In [190]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [   0    1    3 ... 1926 1927 1928]


In [191]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# 12. Prediction only for molecules included in  AD

In [192]:
y_pred_kNN_ad=list(y_pred_kNN)

In [193]:
y_pred_kNN_ad[:] = [x for i,x in enumerate(y_pred_kNN_ad) if i not in out_Ad]

In [194]:
len(y_pred_kNN_ad)

1332

In [195]:
y_ts_ad=list(y_ts)

In [196]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [197]:
len(y_ts_ad)

1332

In [198]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_kNN_ad), 2)
Q2_TS

0.59

In [199]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_kNN_ad)), 2)
RMSE_TS

0.64