In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import gpflow
import tensorflow as tf
from gpflow.ci_utils import ci_niter
from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import mean_absolute_error as mae

In [None]:
# Read the training and test tables from the local data folder.
train_data = pd.read_csv('data/train_koc.csv')
test_data = pd.read_csv('data/test_koc.csv')

# Preview the first rows of the training table.
train_data.head()

In [None]:
def _mordred_frame(smiles_column, calculator):
    """Return the Mordred descriptor table for a column of SMILES strings.

    Parameters
    ----------
    smiles_column : iterable of str
        SMILES strings, one per molecule.
    calculator : mordred.Calculator
        Calculator used to evaluate the descriptors.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more SMILES (``MolFromSmiles`` returned
        ``None``); the message lists the first offending strings.
    """
    smiles = list(smiles_column)
    molecules = [Chem.MolFromSmiles(smi) for smi in smiles]
    unparsable = [smi for smi, mol in zip(smiles, molecules) if mol is None]
    if unparsable:
        raise ValueError(
            f"RDKit could not parse {len(unparsable)} SMILES string(s), "
            f"e.g. {unparsable[:5]}"
        )
    return calculator.pandas(molecules)


# A single 2D-only calculator is shared by both splits so the two descriptor
# tables are produced with the same configuration.
calc = Calculator(descriptors, ignore_3D=True)
train_md = _mordred_frame(train_data['new_smile'], calc)
test_md = _mordred_frame(test_data['new_smile'], calc)
test_md.head()

In [None]:
# Cast every descriptor column to float64 and attach the target column.
train_md = train_md.astype('float64')
test_md = test_md.astype('float64')
train_md['Observed'] = train_data['Observed']
test_md['Observed'] = test_data['Observed']

# Pool both splits, turn +/-inf into NaN and drop every column containing NaN.
# NOTE(review): the column filters below are computed on train and test
# together, so the test rows influence feature selection — confirm intended.
data = pd.concat([train_md, test_md], ignore_index=True)
data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Correlation filter: column j is discarded when ANY earlier column i < j has
# corr(i, j) >= 0.9 (even if column i is itself discarded). The strict upper
# triangle of the thresholded matrix encodes exactly those (i, j) pairs.
# NOTE(review): the test uses the signed correlation, so strongly negative
# correlations are kept — confirm this is intended.
corr = data.drop(columns=['Observed']).corr()
columns = ~np.triu(corr.to_numpy() >= 0.9, k=1).any(axis=0)
selected_columns = corr.columns[columns]

# Low-cardinality filter: keep only descriptors with at least three distinct
# values. A new frame is built instead of dropping in place on a slice of
# `data`, which would raise SettingWithCopyWarning.
f_data = data[selected_columns]
informative = [
    name for name in f_data.columns
    if len(f_data[name].value_counts()) >= 3
]
f_data = f_data[informative]
selected_columns = f_data.columns

# Number of descriptors that survive both filters.
len(selected_columns)

In [None]:
import pickle

# Serialise the retained descriptor column names to disk.
with open("koc_col.dump", "wb") as dump_file:
    pickle.dump(selected_columns, dump_file)

In [None]:
# Modelling table: the selected descriptors plus the target as last column.
# The explicit copy detaches it from `data`, so adding the column is a plain
# assignment on an independent frame (no SettingWithCopyWarning).
f_data = data[selected_columns].copy()
f_data['Observed'] = data['Observed']

In [None]:
from sklearn.model_selection import ShuffleSplit

# One seeded 75/25 shuffle split of the modelling table.
sss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=50)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(f_data))

# reset_index returns fresh frames, avoiding in-place edits on iloc slices.
train_data = f_data.iloc[train_index].reset_index(drop=True)
test = f_data.iloc[test_index].reset_index(drop=True)

# Persist both partitions; the next cell reads them back from these files.
train_data.to_csv('data/train_koc_rand_mo.csv', index=False)
test.to_csv('data/test_koc_rand_mo.csv', index=False)

In [None]:
# Reload the persisted random split.
train_data = pd.read_csv('data/train_koc_rand_mo.csv')
test_data = pd.read_csv('data/test_koc_rand_mo.csv')

# Feature matrices: every column except the target.
x_train = train_data.drop(columns=['Observed']).to_numpy()
x_test = test_data.drop(columns=['Observed']).to_numpy()

# Targets as (n_samples, 1) column vectors.
y_train = train_data['Observed'].to_numpy().reshape(-1, 1)
y_test = test_data['Observed'].to_numpy().reshape(-1, 1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
# Note: x_train / x_test are overwritten, so re-running this cell alone would
# scale already-scaled data.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Display the (n_samples, n_features) shape of the standardised training matrix.
x_train.shape

In [None]:
import sklearn.gaussian_process as gp

# Covariance: scaled RBF plus a white-noise term. The tuples are the bounds
# within which the optimiser may move each hyperparameter.
kernel = (
    gp.kernels.ConstantKernel(1.0, (1e-1, 1e3))
    * gp.kernels.RBF(1.0, (1e-3, 1e7))
    + gp.kernels.WhiteKernel()
)

# `alpha` is not passed, so the library default applies; the target is
# normalised internally and the optimiser is restarted 30 times.
m_1 = gp.GaussianProcessRegressor(
    kernel=kernel,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=30,
    normalize_y=True,
    random_state=10,
)
m_1.fit(x_train, y_train)

# Show the kernel with its fitted hyperparameters.
print(m_1.kernel_)

In [None]:
import pickle

# Serialise the GPR fitted on the random split.
with open("koc_rand.dump", "wb") as dump_file:
    pickle.dump(m_1, dump_file)

In [None]:
# Predictive mean and standard deviation for both splits.
# Despite their names, the y_var_* arrays hold standard deviations
# (return_std=True), not variances.
(y_pred_train, y_var_train), (y_pred_test, y_var_test) = (
    m_1.predict(features, return_std=True) for features in (x_train, x_test)
)

In [None]:
# (R2 train, R2 test, RMSE train, RMSE test) for the random-split model.
(
    r2_score(y_train, y_pred_train),
    r2_score(y_test, y_pred_test),
    np.sqrt(mean_squared_error(y_train, y_pred_train)),
    np.sqrt(mean_squared_error(y_test, y_pred_test)),
)

In [None]:
# (MAE train, MAE test) for the random-split model.
tuple(
    mae(observed, predicted)
    for observed, predicted in ((y_train, y_pred_train), (y_test, y_pred_test))
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the predictive standard deviations (the y_var_* arrays come
# from predict(..., return_std=True)) for the random-split model.
plt.figure(figsize=(8, 5))
plt.hist(y_var_train, bins=20, label='Training')
plt.hist(y_var_test, bins=100, label='Test')

# Dashed line at the largest training sigma; ymax=70 is a hard-coded height.
plt.vlines(x=y_var_train.max(), ymin=0, ymax=70, linestyles='--')

plt.xlabel('Sigma', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.tight_layout()

# savefig has no `figsize` option (recent Matplotlib rejects the unknown
# keyword); the figure size is the one set by plt.figure above.
plt.savefig('koc_rand_var_plt_mor.png', dpi=300)
#plt.show()

In [None]:
# Predicted vs. observed values for both splits; labelled so the figure can be
# read on its own and the two series can be told apart.
plt.scatter(y_train, y_pred_train, label='Training')
plt.scatter(y_test, y_pred_test, label='Test')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.legend();

In [None]:
# Attach the predictive mean ('pred') and the predictive standard deviation
# (stored under the column name 'var') to each split.
for frame, mean, sigma in (
    (train_data, y_pred_train, y_var_train),
    (test_data, y_pred_test, y_var_test),
):
    frame['pred'] = mean
    frame['var'] = sigma

# Write the annotated tables to Excel workbooks.
train_data.to_excel('train_data_rand_pred_mor.xlsx', index=False)
test_data.to_excel('test_data_rand_pred_mor.xlsx', index=False)

In [None]:
# Seeded draw of the five rows that initialise the active-learning run.
np.random.seed(100)
data_md = f_data.copy()

# The upper bound is the actual table length instead of a hard-coded 752, so
# the draw stays valid if the number of rows changes.
# NOTE(review): randint samples with replacement; a repeated index would put a
# duplicated row into `train`. Left as is to keep the seeded draw unchanged.
r_sample = np.random.randint(len(data_md), size=5)

# Seed rows, and the remaining pool the loop will query from.
train = data_md.loc[r_sample].reset_index(drop=True)
train_left = data_md.drop(index=r_sample).reset_index(drop=True)

In [None]:
# Targets and features of the seed set. The shared scaler `sc` is RE-FITTED
# here on the seed rows, replacing the statistics learned on the random split.
y_train = train['Observed'].to_numpy().reshape(-1, 1)
x_train = sc.fit_transform(train.drop(columns=['Observed']).to_numpy())

# Same kernel family and regressor settings as the random-split model.
kernel = (
    gp.kernels.ConstantKernel(1.0, (1e-1, 1e3))
    * gp.kernels.RBF(1.0, (1e-3, 1e7))
    + gp.kernels.WhiteKernel()
)
m = gp.GaussianProcessRegressor(
    kernel=kernel,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=30,
    normalize_y=True,
    random_state=10,
)
m.fit(x_train, y_train)
print(m.kernel_)

# Mean and standard deviation on the seed rows, then R2 as a one-element tuple.
y_pred_train, y_var_train = m.predict(x_train, return_std=True)
(r2_score(y_train, y_pred_train),)

In [None]:
# Active-learning loop: repeatedly move the pool row with the largest
# predictive standard deviation into the training set and refit the model.
#
# 559 queries on top of the 5 seed rows give 564 training rows.
# NOTE(review): presumably chosen to match the size of the 75 % random
# split — confirm.
n_queries = 559

# Fail fast: at least one row must remain in the pool after the last query,
# otherwise the run would crash only after many expensive refits. This also
# catches an accidental re-run of the cell on an already-depleted pool.
if len(train_left) <= n_queries:
    raise ValueError(
        f"Pool has {len(train_left)} rows but {n_queries} queries were requested"
    )

# Score the initial pool with the seed model (`sc` and `m` come from the
# previous cell). y_var holds standard deviations (return_std=True).
x_data = sc.transform(train_left.drop(columns=['Observed']).values)
y_pred, y_var = m.predict(x_data, return_std=True)

for _ in range(n_queries):
    # Position of the most uncertain pool row.
    ind = int(np.argmax(y_var))

    # Move that row from the pool to the training set.
    train = pd.concat([train, train_left.iloc[[ind]]], ignore_index=True)
    train_left = (
        train_left.drop(index=train_left.index[ind]).reset_index(drop=True)
    )

    # The scaler is re-fitted on the grown training set every iteration.
    x_train = sc.fit_transform(train.drop(columns=['Observed']).values)
    y_train = train['Observed'].values.reshape(-1, 1)

    # Fresh model with the same kernel family and settings each iteration.
    kernel = (
        gp.kernels.ConstantKernel(1.0, (1e-1, 1e3))
        * gp.kernels.RBF(1.0, (1e-3, 1e7))
        + gp.kernels.WhiteKernel()
    )
    m = gp.GaussianProcessRegressor(
        kernel=kernel,
        optimizer='fmin_l_bfgs_b',
        n_restarts_optimizer=30,
        normalize_y=True,
        random_state=10,
    )
    m.fit(x_train, y_train)

    # Re-score the remaining pool; y_var drives the next query.
    x_data = sc.transform(train_left.drop(columns=['Observed']).values)
    y_pred, y_var = m.predict(x_data, return_std=True)

# Only the final model's training-set predictions and the final pool targets
# are used by later cells, so they are computed once after the loop.
y_data = train_left['Observed'].values.reshape(-1, 1)
y_train_pred, y_var_train = m.predict(x_train, return_std=True)

In [None]:
# Attach flattened predictive means ('pred') and standard deviations (stored
# under the column name 'var') to the selected rows and to the remaining pool.
for frame, mean, sigma in (
    (train, y_train_pred, y_var_train),
    (train_left, y_pred, y_var),
):
    frame['pred'] = np.ravel(mean)
    frame['var'] = np.ravel(sigma)

# Write both annotated tables to Excel workbooks.
train.to_excel('train_data_ac_pred_koc_mor.xlsx', index=False)
train_left.to_excel('test_data_ac_pred_koc_mor.xlsx', index=False)

In [None]:
# (RMSE on the selected training rows, RMSE on the remaining pool).
tuple(
    np.sqrt(mean_squared_error(observed, predicted))
    for observed, predicted in ((y_train, y_train_pred), (y_data, y_pred))
)

In [None]:
# (MAE on the selected training rows, MAE on the remaining pool).
tuple(
    mae(observed, predicted)
    for observed, predicted in ((y_train, y_train_pred), (y_data, y_pred))
)

In [None]:
import pickle

# Serialise the final active-learning GPR.
with open("koc_ac.dump", "wb") as dump_file:
    pickle.dump(m, dump_file)

In [None]:
#train_v=pd.read_excel('train_ac_var_koc_mor.xlsx')
# Histogram of the predictive standard deviations of the final
# active-learning model: selected training rows vs. the remaining pool.
plt.figure(figsize=(8, 5))
plt.hist(y_var_train, bins=20, label='Training')
plt.hist(y_var, bins=20, label='Test')

# Dashed line at the largest training sigma; ymax=70 is a hard-coded height.
plt.vlines(x=y_var_train.max(), ymin=0, ymax=70, linestyles='--')

plt.xlabel('Sigma', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.tight_layout()

# savefig has no `figsize` option (recent Matplotlib rejects the unknown
# keyword); the figure size is the one set by plt.figure above.
plt.savefig('koc_ac_var_plt_mor.png', dpi=300)
#plt.show()
