# 1. Importing modules and functions

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
import numpy as np
import pandas as pd
from copy import deepcopy
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,KFold, StratifiedKFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import joblib
import pickle
from numpy import savetxt
from IPython.display import HTML
from rdkit.Chem import PandasTools

[07:40:51] Initializing Normalizer


In [2]:
def convert_smi_to_canon_smi(smi):
    
    try:
        canon_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi),isomericSmiles = False)
    except:
        canon_smi='wrong_smiles'
    return canon_smi

In [3]:
def standart(smi):
    global m
    if smi!='wrong_smiles':
        try:
            smiles=standardize_smiles(smi)
            m = Chem.MolFromSmiles(smi)
        except:
            smi='error kekule'
    else:
        m = 'check the smiles'
    return m

# MORGAN FP_Gradient Boosting Rat oral LD50, mg/kg  

## Load data and curation work set

In [4]:
# Set file path
df_ws=pd.read_csv('Acute Toxicity_rat_intravenous_LD50.csv')
df_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical SMILES,InChIKey,rat_intravenous_LD50
0,TOX-145,785,"benzene-1,4-diol",Oc1ccc(O)cc1,Oc1ccc(O)cc1,QIGBRXMKCJKVMJ-UHFFFAOYSA-N,2.981137
1,TOX-245,5453,tris(aziridin-1-yl)-sulfanylidene-lambda5-phos...,S=P(N1CC1)(N1CC1)N1CC1,S=P(N1CC1)(N1CC1)N1CC1,FOCVUCIESVLUNU-UHFFFAOYSA-N,4.303848
2,TOX-1292,219402,"2-(3,4-dihydroxy-5-oxooxolan-2-yl)-2-hydroxyac...",O=CC(O)C1OC(=O)C(O)C1O,O=CC(O)C1OC(=O)C(O)C1O,UYUXSRADSPPKRZ-UHFFFAOYSA-N,1.740669
3,TOX-1311,896,N-[2-(5-methoxy-1H-indol-3-yl)ethyl]acetamide,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,DRLFMBDRBRZALE-UHFFFAOYSA-N,2.814567
4,TOX-1342,5538,"3,7-dimethyl-9-(2,6,6-trimethylcyclohexen-1-yl...",CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,SHGAZHPCJJPHSC-UHFFFAOYSA-N,3.602699
...,...,...,...,...,...,...,...
2318,TOX-109688,48082,"(2,2-dimethyl-3-morpholin-4-ylpropyl) 2-cycloh...",CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,IQJJJMCKXQKLKR-UHFFFAOYSA-N,3.772993
2319,TOX-109792,14991,(1-methylpiperidin-3-yl) 2-cyclohexyl-2-hydrox...,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,LCFBCKSQWVQIBY-UHFFFAOYSA-N,4.193820
2320,TOX-109817,216337,"7-[3-[cyclohexyl(methyl)amino]propyl]-1,3-dime...",CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,YSSDOJFBBPZKBV-UHFFFAOYSA-N,3.510175
2321,TOX-113362,5342,"4-[2-(benzenesulfinyl)ethyl]-1,2-diphenylpyraz...",O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,MBGGBVCUIVRRBF-UHFFFAOYSA-N,3.419388


 Convert a SMILES string to canonical SMILES

In [5]:
df_ws1 = deepcopy(df_ws)
df_ws1["SMILES"] = df_ws1.apply(lambda x: convert_smi_to_canon_smi(x.SMILES), axis=1)
df_ws1

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical SMILES,InChIKey,rat_intravenous_LD50
0,TOX-145,785,"benzene-1,4-diol",Oc1ccc(O)cc1,Oc1ccc(O)cc1,QIGBRXMKCJKVMJ-UHFFFAOYSA-N,2.981137
1,TOX-245,5453,tris(aziridin-1-yl)-sulfanylidene-lambda5-phos...,S=P(N1CC1)(N1CC1)N1CC1,S=P(N1CC1)(N1CC1)N1CC1,FOCVUCIESVLUNU-UHFFFAOYSA-N,4.303848
2,TOX-1292,219402,"2-(3,4-dihydroxy-5-oxooxolan-2-yl)-2-hydroxyac...",O=CC(O)C1OC(=O)C(O)C1O,O=CC(O)C1OC(=O)C(O)C1O,UYUXSRADSPPKRZ-UHFFFAOYSA-N,1.740669
3,TOX-1311,896,N-[2-(5-methoxy-1H-indol-3-yl)ethyl]acetamide,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,DRLFMBDRBRZALE-UHFFFAOYSA-N,2.814567
4,TOX-1342,5538,"3,7-dimethyl-9-(2,6,6-trimethylcyclohexen-1-yl...",CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,SHGAZHPCJJPHSC-UHFFFAOYSA-N,3.602699
...,...,...,...,...,...,...,...
2318,TOX-109688,48082,"(2,2-dimethyl-3-morpholin-4-ylpropyl) 2-cycloh...",CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,IQJJJMCKXQKLKR-UHFFFAOYSA-N,3.772993
2319,TOX-109792,14991,(1-methylpiperidin-3-yl) 2-cyclohexyl-2-hydrox...,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,LCFBCKSQWVQIBY-UHFFFAOYSA-N,4.193820
2320,TOX-109817,216337,"7-[3-[cyclohexyl(methyl)amino]propyl]-1,3-dime...",CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,YSSDOJFBBPZKBV-UHFFFAOYSA-N,3.510175
2321,TOX-113362,5342,"4-[2-(benzenesulfinyl)ethyl]-1,2-diphenylpyraz...",O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,MBGGBVCUIVRRBF-UHFFFAOYSA-N,3.419388


In [6]:
print('Original data: ', len(df_ws), 'molecules')
print('Failed data: ', len(df_ws1[df_ws1['SMILES']=='wrong_smiles']), 'molecules')

Original data:  2323 molecules
Failed data:  0 molecules


In [7]:
index=df_ws1.index[df_ws1['SMILES']=='wrong_smiles'].tolist()
wrong_smiles=df_ws.iloc[index]
wrong_smiles=wrong_smiles.SMILES
number=[x+1 for x in index]
bad_molecules = pd.DataFrame({'No. failed smiles in original set': number, 'SMILES of wrong structure: ': wrong_smiles}, index=None)
bad_molecules = bad_molecules.set_index('No. failed smiles in original set')
bad_molecules

Unnamed: 0_level_0,SMILES of wrong structure:
No. failed smiles in original set,Unnamed: 1_level_1


##  Standardization  for work set

In [8]:
df_ws1["Molecule"] = df_ws1.apply(lambda x: standart(x.SMILES), axis=1)
moldf_ws=df_ws1[df_ws1['SMILES']!='wrong_smiles']
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  2323 molecules


In [9]:
moldf_ws

Unnamed: 0,TAID,Pubchem CID,IUPAC Name,SMILES,Canonical SMILES,InChIKey,rat_intravenous_LD50,Molecule
0,TOX-145,785,"benzene-1,4-diol",Oc1ccc(O)cc1,Oc1ccc(O)cc1,QIGBRXMKCJKVMJ-UHFFFAOYSA-N,2.981137,<rdkit.Chem.rdchem.Mol object at 0x000001CE02A...
1,TOX-245,5453,tris(aziridin-1-yl)-sulfanylidene-lambda5-phos...,S=P(N1CC1)(N1CC1)N1CC1,S=P(N1CC1)(N1CC1)N1CC1,FOCVUCIESVLUNU-UHFFFAOYSA-N,4.303848,<rdkit.Chem.rdchem.Mol object at 0x000001CE029...
2,TOX-1292,219402,"2-(3,4-dihydroxy-5-oxooxolan-2-yl)-2-hydroxyac...",O=CC(O)C1OC(=O)C(O)C1O,O=CC(O)C1OC(=O)C(O)C1O,UYUXSRADSPPKRZ-UHFFFAOYSA-N,1.740669,<rdkit.Chem.rdchem.Mol object at 0x000001CE029...
3,TOX-1311,896,N-[2-(5-methoxy-1H-indol-3-yl)ethyl]acetamide,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,COc1ccc2[nH]cc(CCNC(C)=O)c2c1,DRLFMBDRBRZALE-UHFFFAOYSA-N,2.814567,<rdkit.Chem.rdchem.Mol object at 0x000001CE029...
4,TOX-1342,5538,"3,7-dimethyl-9-(2,6,6-trimethylcyclohexen-1-yl...",CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC(=O)O,SHGAZHPCJJPHSC-UHFFFAOYSA-N,3.602699,<rdkit.Chem.rdchem.Mol object at 0x000001CE029...
...,...,...,...,...,...,...,...,...
2318,TOX-109688,48082,"(2,2-dimethyl-3-morpholin-4-ylpropyl) 2-cycloh...",CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,CC(C)(COC(=O)C(c1ccccc1)C1CCCCC1)CN1CCOCC1,IQJJJMCKXQKLKR-UHFFFAOYSA-N,3.772993,<rdkit.Chem.rdchem.Mol object at 0x000001CE02A...
2319,TOX-109792,14991,(1-methylpiperidin-3-yl) 2-cyclohexyl-2-hydrox...,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,CN1CCCC(OC(=O)C(O)(c2ccccc2)C2CCCCC2)C1,LCFBCKSQWVQIBY-UHFFFAOYSA-N,4.193820,<rdkit.Chem.rdchem.Mol object at 0x000001CE02A...
2320,TOX-109817,216337,"7-[3-[cyclohexyl(methyl)amino]propyl]-1,3-dime...",CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,CN(CCCn1cnc2c1c(=O)n(C)c(=O)n2C)C1CCCCC1,YSSDOJFBBPZKBV-UHFFFAOYSA-N,3.510175,<rdkit.Chem.rdchem.Mol object at 0x000001CE02A...
2321,TOX-113362,5342,"4-[2-(benzenesulfinyl)ethyl]-1,2-diphenylpyraz...",O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,O=C1C(CCS(=O)c2ccccc2)C(=O)N(c2ccccc2)N1c1ccccc1,MBGGBVCUIVRRBF-UHFFFAOYSA-N,3.419388,<rdkit.Chem.rdchem.Mol object at 0x000001CE02A...


In [10]:
y_tr=moldf_ws.rat_intravenous_LD50
y_tr

0       2.981137
1       4.303848
2       1.740669
3       2.814567
4       3.602699
          ...   
2318    3.772993
2319    4.193820
2320    3.510175
2321    3.419388
2322    2.380587
Name: rat_intravenous_LD50, Length: 2323, dtype: float64

In [11]:
moldf_ws=moldf_ws.Molecule

## Calculation MorganFingerprint for work set

In [12]:
fp_tr = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2,nBits=1024,useFeatures=False,useChirality = False) for m in moldf_ws]

In [13]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [14]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [15]:
savetxt('Models/MorganFingerprint/x_tr.csv', x_tr, delimiter=',')

In [16]:
x_tr.shape

(2323, 1024)

 # GradientBoostingRegressor model building and validation

In [17]:
seed = 42

In [18]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [19]:
param_grid = {'learning_rate': [0.02,0.05],
                  'subsample'    : [0.9, 0.5, 0.1],
                  'n_estimators' : [100,500,1000],
                  'max_depth'    : [4, 10]
                 }

In [20]:
m = GridSearchCV(GradientBoostingRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [21]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [22]:
best_GBR = m.best_estimator_

In [23]:
m.best_params_

{'learning_rate': 0.02,
 'max_depth': 10,
 'n_estimators': 1000,
 'subsample': 0.5}

In [24]:
y_pred_CV_GBR = cross_val_predict(best_GBR, x_tr, y_tr, cv=cv)

In [25]:
y_pred_CV_GBR

array([3.27185673, 3.37535948, 3.88466752, ..., 3.94031735, 3.31027461,
       2.31942743])

In [26]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.57

In [27]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.64

# save the model to disk

In [28]:
pickle.dump(best_GBR, open('models/MorganFingerprint/Toxicity_GBR_MF.pkl', 'wb'))

# load the model from disk

In [17]:
best_GBR = pickle.load(open('models/MorganFingerprint/Toxicity_GBR_MF.pkl', 'rb'))

# SVM model building and validation

In [29]:
param_grid = {"C": [10 ** i for i in range(0, 5)],
              "gamma": [10 ** i for i in range(-6, 0)]}

In [30]:
seed = 42
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [31]:
svm = GridSearchCV(SVR(C=1.0, epsilon=0.2), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [32]:
svm.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [33]:
svm.best_params_
best_svm = svm.best_estimator_

In [34]:
svm.best_params_

{'C': 10, 'gamma': 0.01}

In [35]:
y_pred_CV_svm = cross_val_predict(best_svm, x_tr, y_tr, cv=cv)

In [36]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_svm), 2)
Q2_CV

0.51

In [37]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_svm)), 2)
RMSE_CV

0.69

In [38]:
pickle.dump(best_svm, open('models/MorganFingerprint/Toxicity_SVM_MF.pkl', 'wb'))

load the model from disk

In [28]:
best_svm = pickle.load(open('models/MorganFingerprint/Toxicity_SVM_MF.pkl', 'rb'))

# Multi-layer Perceptron regressor

In [39]:
from sklearn.neural_network import MLPRegressor

In [40]:
param_grid ={"hidden_layer_sizes": [(400, 300, 200, 100),(100, 100, 100), (10, 10, 10),(50,)], "activation": ["tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": [0.00005,0.0005], 'max_iter': [1000, 2000]}

In [41]:
m = GridSearchCV(MLPRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [42]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [43]:
best_MLPR = m.best_estimator_

In [44]:
m.best_params_

{'activation': 'relu',
 'alpha': 5e-05,
 'hidden_layer_sizes': (400, 300, 200, 100),
 'max_iter': 1000,
 'solver': 'lbfgs'}

In [45]:
y_pred_CV_MLPR = cross_val_predict(best_MLPR, x_tr, y_tr, cv=cv)

In [46]:
y_pred_CV_MLPR

array([2.75848247, 3.14829669, 3.09001928, ..., 3.56123037, 3.18038449,
       1.42166565])

In [47]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_MLPR), 2)
Q2_CV

0.51

In [48]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_MLPR)), 2)
RMSE_CV

0.69

# save the model to disk

In [49]:
pickle.dump(best_MLPR, open('models/MorganFingerprint/Toxicity_MLPR_MF.pkl', 'wb'))

# load the model from disk

In [32]:
best_MLPR = pickle.load(open('models/MorganFingerprint/Toxicity_MLPR_MF.pkl', 'rb'))

# k-nearest neighbors

In [50]:
from sklearn.neighbors import KNeighborsRegressor

In [51]:
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

In [52]:
m = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1, cv=cv, verbose=1)

In [53]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [54]:
best_kNN = m.best_estimator_

In [55]:
m.best_params_

{'n_neighbors': 3}

In [56]:
y_pred_CV_kNN = cross_val_predict(best_kNN, x_tr, y_tr, cv=cv)

In [57]:
y_pred_CV_kNN

array([1.94606953, 1.75459522, 1.92709073, ..., 3.05900402, 3.20418276,
       1.56011162])

In [58]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_kNN), 2)
Q2_CV

0.13

In [59]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_kNN)), 2)
RMSE_CV

0.92

# save the model to disk

In [60]:
pickle.dump(best_kNN, open('models/MorganFingerprint/Toxicity_kNN_MF.pkl', 'wb'))

# load the model from disk

In [18]:
best_kNN = pickle.load(open('models/MorganFingerprint/Toxicity_kNN_MF.pkl', 'rb'))

# CatBoostRegressor

In [61]:
from catboost import CatBoostRegressor

In [62]:
cv=KFold(n_splits=5, random_state=42, shuffle=True)

In [63]:
%%time
model = CatBoostRegressor()
parameters = {'depth' : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [100,500, 1000]
              }

grid = GridSearchCV(estimator=model, param_grid = parameters, n_jobs=-1, cv = cv)
grid.fit(x_tr, y_tr, verbose=False)

CPU times: total: 8min 5s
Wall time: 15min 24s


In [64]:
best_CatBR = grid.best_estimator_

In [65]:
grid.best_params_

{'depth': 10, 'iterations': 1000, 'learning_rate': 0.05}

In [66]:
%%time
y_pred_CV_CatBR = cross_val_predict(best_CatBR, x_tr, y_tr, cv=cv, verbose=False)

0:	learn: 0.9657005	total: 34.4ms	remaining: 34.4s
1:	learn: 0.9576367	total: 69.9ms	remaining: 34.9s
2:	learn: 0.9473788	total: 105ms	remaining: 34.9s
3:	learn: 0.9355760	total: 141ms	remaining: 35.2s
4:	learn: 0.9278315	total: 198ms	remaining: 39.5s
5:	learn: 0.9173400	total: 234ms	remaining: 38.8s
6:	learn: 0.9111047	total: 275ms	remaining: 39s
7:	learn: 0.9028262	total: 310ms	remaining: 38.4s
8:	learn: 0.8931775	total: 349ms	remaining: 38.4s
9:	learn: 0.8857264	total: 394ms	remaining: 39s
10:	learn: 0.8768929	total: 431ms	remaining: 38.7s
11:	learn: 0.8699847	total: 472ms	remaining: 38.8s
12:	learn: 0.8626230	total: 518ms	remaining: 39.4s
13:	learn: 0.8575442	total: 559ms	remaining: 39.4s
14:	learn: 0.8507854	total: 604ms	remaining: 39.6s
15:	learn: 0.8438561	total: 643ms	remaining: 39.5s
16:	learn: 0.8362567	total: 685ms	remaining: 39.6s
17:	learn: 0.8288236	total: 721ms	remaining: 39.4s
18:	learn: 0.8217924	total: 770ms	remaining: 39.8s
19:	learn: 0.8176151	total: 811ms	remaining

In [67]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_CatBR), 2)
Q2_CV

0.58

In [68]:
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_CatBR)), 2)
RMSE_CV

0.64

# save the model to disk

In [69]:
pickle.dump(best_CatBR, open('Models/MorganFingerprint/Toxicity_CatBoost_MF.pkl', 'wb'))

# load the model from disk

In [17]:
best_CatBR = pickle.load(open('Models/MorganFingerprint/Toxicity_CatBoost_MF.pkl', 'rb'))