In [7]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, PandasTools, Descriptors
import urllib.request
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, DotProduct, WhiteKernel, RBF, ConstantKernel
url = 'https://raw.githubusercontent.com/onecoinbuybus/Database_chemoinformatics/master/water%20solubility.txt'
urllib.request.urlretrieve(url, 'water_solubility.txt') 
df = pd.read_csv('water_solubility.txt', sep=',') 
PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
df.columns = ['ID', 'm_sol', 'p_sol', 'SMILES', 'ROMol']
df=df.sort_values('m_sol',ascending=True)
mols=list(map(lambda x: Chem.MolFromSmiles(x), df['SMILES']))
fingerprint=np.array(list(map(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, 2, 2048), mols)))
X=fingerprint
y=df['m_sol']

116    -11.600
519     -9.332
117     -9.160
119     -9.150
427     -9.018
         ...  
710      1.120
1139     1.144
819      1.340
801      1.570
361      1.580
Name: m_sol, Length: 1144, dtype: float64

In [8]:
scaled_X = X
scaled_y = ((y - y.mean(axis=0)) / y.std(axis=0, ddof=1)).values
#minimizie y=-y
y_all=scaled_y.copy()
y_all

array([-4.07440723, -2.99260522, -2.9105638 , ...,  2.09777884,
        2.20748539,  2.21225524])

In [3]:
X_train=scaled_X[:20]
y_train=scaled_y[:20]

In [4]:
import warnings
import matplotlib.figure as figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn import model_selection
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, DotProduct, WhiteKernel, RBF, ConstantKernel
warnings.filterwarnings('ignore')

do_maximization = False  

In [5]:
def bayesianoptimization(x,y,x_candidates):
    GPR = GaussianProcessRegressor(ConstantKernel() * RBF() + WhiteKernel(), alpha=0)
    GPR.fit(x, y)
    y_pred, y_pred_std = GPR.predict(x_candidates, return_std=True)
    #probability of improvement(PI)calculation
    pi= norm.cdf((y_pred - max(y)) / y_pred_std)
    good_num = np.where(pi == max(pi))[0][0]
    good_x_candidate = x_candidates[good_num, :]
    
    return good_num, good_x_candidate,max(pi)

In [6]:
iter_num=1000
for iteration in range(iter_num):
    print(['iter_num',iteration+1])
    good_num, good_x_candidate,pi = bayesianoptimization(X_train, y_train, scaled_X)
    X_train = np.append(X_train, np.reshape(scaled_X[good_num, :], (1, X_train.shape[1])), 0)
    y_train = np.append(y_train, scaled_y[good_num])
    scaled_X = np.delete(scaled_X, good_num, 0)
    scaled_y = np.delete(scaled_y, good_num)
    print('y_max=',y_train.max())
    print('good_num',good_num,'pi',pi)
    if y_train.max()==y_all.max():
        break

['iter_num', 1]
y_max= -2.441210547637646
good_num 19 pi 0.31308916467242953
['iter_num', 2]
y_max= -2.441210547637646
good_num 14 pi 0.36931852066895016
['iter_num', 3]
y_max= -2.365369930562295
good_num 21 pi 0.3633247239082699
['iter_num', 4]
y_max= -2.080609877769937
good_num 36 pi 0.8627193511267175
['iter_num', 5]
y_max= -0.20748973151267264
good_num 439 pi 0.5206313624517727
['iter_num', 6]
y_max= -0.20748973151267264
good_num 217 pi 0.03272842273925012
['iter_num', 7]
y_max= -0.20748973151267264
good_num 318 pi 0.06265204935610601
['iter_num', 8]
y_max= -0.20748973151267264
good_num 188 pi 0.13382228602243335
['iter_num', 9]
y_max= -0.20748973151267264
good_num 283 pi 0.10259342549110934
['iter_num', 10]
y_max= -0.20748973151267264
good_num 274 pi 0.06899736961538758
['iter_num', 11]
y_max= -0.20748973151267264
good_num 193 pi 0.10111815739957314
['iter_num', 12]
y_max= -0.20748973151267264
good_num 146 pi 0.07968837340392626
['iter_num', 13]
y_max= -0.20748973151267264
good_nu