In [None]:
import xgboost as xgb
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer, QuantileTransformer
import category_encoders as ce
from hyperopt import hp, tpe, Trials, STATUS_OK, fmin
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem
import shap
from sklearn.model_selection import cross_validate
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge,ARDRegression,BayesianRidge,ElasticNet,GammaRegressor,HuberRegressor
from sklearn.linear_model import Lasso, LassoLars, LinearRegression, LogisticRegression, PassiveAggressiveRegressor,Ridge,SGDRegressor

In [None]:
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw table, then keep only the first of any rows that are identical
# once the target and the two monomer-type labels are left out of the comparison.
data = pd.read_csv('Rej_0416.csv')
hh = data.drop(columns=['salt rejection', 'Monomer A1 type', 'Monomer A2 type'])
data = data.loc[~hh.duplicated(keep='first')].reset_index(drop=True)

In [None]:
class morgan_fp:
    """Callable that converts a SMILES string into a Morgan fingerprint vector.

    Parameters
    ----------
    radius : int
        Radius passed to RDKit's Morgan fingerprint routine.
    length : int
        Number of bits in the fingerprint, i.e. the length of the returned array.
    """

    def __init__(self, radius, length):
        # Values drawn from a search space may arrive as floats (e.g. 2048.0);
        # normalise them once here so RDKit always receives integers.
        self.radius = int(radius)
        self.length = int(length)

    def __call__(self, smiles):
        """Return the fingerprint of ``smiles`` as a float32 array of 0/1 values.

        Raises
        ------
        ValueError
            If RDKit cannot parse ``smiles`` into a molecule.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail here
            # with the offending string instead of deep inside RDKit.
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, self.length)
        # The bit string is a sequence of '0'/'1' characters; cast it to floats.
        return np.array(list(fp.ToBitString())).astype('float32')

In [None]:
# Object-typed columns, minus the monomer type labels and SMILES strings,
# form the categorical feature list.
categorical_features = (
    data.select_dtypes(include=['object'])
    .drop(columns=['Monomer A1 type', 'a-smile', 'Monomer A2 type', 'b-smile'])
    .columns
)
# Every int64/float64 column except the target forms the numeric feature list.
numeric_features = (
    data.select_dtypes(include=['int64', 'float64'])
    .drop(columns=['salt rejection'])
    .columns
)

In [None]:
from sklearn.model_selection import ShuffleSplit

# One random 80/20 hold-out split; the fixed seed keeps it reproducible.
sss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=50)
# n_splits=1, so the generator yields exactly one (train, test) pair of positions.
train_index, test_index = next(sss.split(data))

# Build fresh, re-indexed frames rather than resetting the index of a slice in place.
train_data = data.iloc[train_index].reset_index(drop=True)
test = data.iloc[test_index].reset_index(drop=True)

# Persist both parts so later cells can reload exactly this split.
train_data.to_csv('salt_train_data.csv', index=False)
test.to_csv('salt_test.csv', index=False)

In [None]:
def conv_data_pd(data, data_t, fp, en, scaler):
    """Build the model input matrix for ``data`` using transformers fitted on ``data_t``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to convert. Must contain 'a-smile', 'b-smile', 'salt rejection',
        'Monomer A1 type' and 'Monomer A2 type'. NOTE: this frame is modified in
        place — the columns 'a-fp' and 'b-fp' (per-row fingerprint arrays) are
        added to it.
    data_t : pandas.DataFrame
        Reference rows on which ``en`` and ``scaler`` are (re)fitted on every
        call. It may or may not already carry 'a-fp'/'b-fp' columns.
    fp : callable
        Maps a SMILES string to a 1-D numeric array of fixed length.
    en : encoder with ``fit_transform(X, y)`` / ``transform(X, y)``
        Categorical encoder.
    scaler : object with ``fit(X)`` / ``transform(X)``
        Scaler applied to the columns listed in the module-level
        ``numeric_features``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Feature frame with columns ordered as: fingerprint bits ``f_0..f_{2L-1}``
        (monomer A then monomer B), the encoded non-numeric columns, the scaled
        numeric columns; and the 'salt rejection' values of ``data``.
    """
    base_cols = ['Monomer A1 type', 'a-smile', 'Monomer A2 type', 'b-smile', 'salt rejection']
    fp_cols = ['a-fp', 'b-fp']

    # Fingerprints are stored on the caller's frame (existing behaviour that
    # downstream cells rely on) and stacked into 2-D arrays for the model.
    data['a-fp'] = data['a-smile'].apply(fp)
    data['b-fp'] = data['b-smile'].apply(fp)
    x_a = np.array(list(data['a-fp']))
    x_b = np.array(list(data['b-fp']))

    # Rows whose b-smile is exactly 'C' get an all-zero B fingerprint.
    # NOTE(review): 'C' presumably marks "no second monomer" — confirm with the data owner.
    # The mask is positional, so it is correct for any index on ``data``.
    no_b = (data['b-smile'] == 'C').to_numpy()
    x_b[no_b] = 0

    # Fit encoder and scaler on the reference frame. The fingerprint columns
    # exist on data_t only if it went through this function before, so their
    # absence is tolerated instead of raising KeyError.
    X_train = data_t.drop(columns=base_cols).drop(columns=fp_cols, errors='ignore')
    Y_train = data_t['salt rejection'].copy()
    encoded_train = en.fit_transform(X_train, Y_train)
    fitted_scaler = scaler.fit(encoded_train[numeric_features])

    # Apply the fitted transformers to the rows being converted.
    x = data.drop(columns=base_cols + fp_cols)
    y = data['salt rejection'].copy()
    # NOTE(review): the target is passed to transform() as in the original code;
    # confirm this is intended for validation/test rows with the chosen encoder.
    encoded = en.transform(x, y)
    col_pd = encoded.drop(columns=numeric_features)

    # Give every piece the same index so the column-wise concat lines rows up
    # even when ``data`` does not have a default 0..n-1 index.
    scaled = np.asarray(fitted_scaler.transform(encoded[numeric_features]))
    num_pd = pd.DataFrame(data=scaled, columns=numeric_features, index=encoded.index)

    fp_matrix = np.concatenate((x_a, x_b), axis=1)
    fp_pd = pd.DataFrame(
        data=fp_matrix,
        columns=[f'f_{i}' for i in range(2 * x_a.shape[1])],
        index=encoded.index,
    )

    x_pd = pd.concat([fp_pd, col_pd, num_pd], axis=1)
    return x_pd, y.values

In [None]:
# Radius-1, 2048-bit fingerprints, used by the model-selection sweep.
fp = morgan_fp(radius=1, length=2048)

# Reload the persisted split from disk.
train_data, test_data = (
    pd.read_csv(path) for path in ('salt_train_data.csv', 'salt_test.csv')
)

In [None]:
from sklearn.model_selection import KFold
# Shuffled K-fold splitter with a fixed seed, shared by the model-selection sweep
# and the hyperparameter search; the number of folds is left at the library default.
kf = KFold(shuffle=True, random_state=10)

In [None]:
# Candidate regressors, categorical encoders and numeric scalers. Every
# (model, scaler, encoder) combination is cross-validated on train_data.
models = [CatBoostRegressor(verbose=False, random_state=10), XGBRegressor(random_state=10)]
encoder = [
    ce.backward_difference.BackwardDifferenceEncoder(cols=categorical_features),
    ce.basen.BaseNEncoder(cols=categorical_features),
    ce.binary.BinaryEncoder(cols=categorical_features),
    ce.helmert.HelmertEncoder(cols=categorical_features),
    ce.james_stein.JamesSteinEncoder(cols=categorical_features),
    ce.one_hot.OneHotEncoder(cols=categorical_features),
    ce.m_estimate.MEstimateEncoder(cols=categorical_features),
    ce.sum_coding.SumEncoder(cols=categorical_features),
]
scaler = [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(), PowerTransformer()]

# One record per combination; the frame is built once at the end so the metric
# columns get a numeric dtype instead of being grown cell by cell as objects.
records = []
for model in models:
    for sc in scaler:
        for en in encoder:
            t_rmse, t_r2, v_rmse, v_r2 = [], [], [], []
            for train_index, test_index in kf.split(train_data):
                # KFold yields row positions, so select with iloc (loc would
                # only work while train_data has a default 0..n-1 index).
                train = train_data.iloc[train_index].reset_index(drop=True)
                val = train_data.iloc[test_index].reset_index(drop=True)

                # Encoder and scaler are fitted on the training fold in both
                # calls; the training fold is converted first, as before.
                x_train_pd, y_train = conv_data_pd(train, train, fp, en, sc)
                x_train = x_train_pd.values
                x_val_pd, y_val = conv_data_pd(val, train, fp, en, sc)
                x_val = x_val_pd.values

                model.fit(x_train, y_train)
                y_val_pred = model.predict(x_val)
                y_train_pred = model.predict(x_train)

                t_rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
                v_rmse.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
                t_r2.append(r2_score(y_train, y_train_pred))
                v_r2.append(r2_score(y_val, y_val_pred))

            # 'test_*' holds the mean over the validation folds.
            records.append({
                'train_rmse': np.mean(t_rmse),
                'train_r2': np.mean(t_r2),
                'test_rmse': np.mean(v_rmse),
                'test_r2': np.mean(v_r2),
                'name': model.__class__.__name__,
                'encoder': en.__class__.__name__,
                'scaler': sc.__class__.__name__,
            })

results = pd.DataFrame(
    records,
    columns=['train_rmse', 'train_r2', 'test_rmse', 'test_r2', 'name', 'encoder', 'scaler'],
)

In [None]:
# Rank the combinations by mean validation R² (best first), export, and preview.
results = results.sort_values('test_r2', ascending=False)
results.to_excel('salt_model_selection.xlsx', index=False)
results.head()

In [None]:
# Search space for the tuning run, one row per tuned quantity:
# (name, hyperopt sampler, sampler arguments). Each hyperopt label is the same
# string as its dictionary key; quniform rows carry (low, high, step).
_search_dims = [
    ('max_delta_step', hp.uniform, (0, 10)),
    ('learning_rate', hp.uniform, (0, 1)),
    ('max_depth', hp.quniform, (1, 6, 1)),
    ('min_child_weight', hp.uniform, (1, 100)),
    ('subsample', hp.uniform, (0.0, 1.0)),
    ('reg_alpha', hp.uniform, (0, 100)),
    ('gamma', hp.uniform, (0, 100)),
    ('reg_lambda', hp.uniform, (0, 100)),
    ('n_estimators', hp.quniform, (1, 100, 1)),
    ('fp_length', hp.quniform, (10, 5048, 1)),
]
space = {name: sampler(name, *bounds) for name, sampler, bounds in _search_dims}

In [None]:
import pickle
# Scaler and categorical encoder used for every hyperparameter trial;
# fit() below reads both as module-level names.
sc = RobustScaler()
en = ce.helmert.HelmertEncoder(cols = categorical_features)
def fit(params):
    """Cross-validate one XGBoost configuration on ``train_data``.

    Parameters
    ----------
    params : dict
        Must provide 'fp_length' plus the XGBoost settings 'max_delta_step',
        'learning_rate', 'max_depth', 'min_child_weight', 'subsample',
        'reg_alpha', 'gamma', 'reg_lambda' and 'n_estimators'. An optional
        'fp_radius' selects the fingerprint radius (0 when absent).

    Returns
    -------
    (float, float)
        Mean validation RMSE and mean training RMSE over the folds of ``kf``.

    Uses the module-level ``kf``, ``train_data``, ``en`` and ``sc``.
    """
    # Radius stays 0 unless the search space supplies one.
    fp = morgan_fp(params.get('fp_radius', 0), params['fp_length'])
    model = XGBRegressor(
        max_delta_step=params['max_delta_step'],
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        min_child_weight=params['min_child_weight'],
        subsample=params['subsample'],
        reg_alpha=params['reg_alpha'],
        gamma=params['gamma'],
        reg_lambda=params['reg_lambda'],
        n_estimators=params['n_estimators'],
        random_state=10,
    )

    t_rmse, v_rmse = [], []
    for train_index, test_index in kf.split(train_data):
        # KFold yields row positions, so select with iloc rather than loc.
        train = train_data.iloc[train_index].reset_index(drop=True)
        val = train_data.iloc[test_index].reset_index(drop=True)

        # Encoder and scaler are fitted on the training fold in both calls.
        x_train_pd, y_train = conv_data_pd(train, train, fp, en, sc)
        x_val_pd, y_val = conv_data_pd(val, train, fp, en, sc)
        x_train, x_val = x_train_pd.values, x_val_pd.values

        model.fit(x_train, y_train)
        t_rmse.append(np.sqrt(mean_squared_error(y_train, model.predict(x_train))))
        v_rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(x_val))))

    return np.mean(v_rmse), np.mean(t_rmse)

def objective(params):
    """Hyperopt objective: score one sampled configuration and log it.

    Increments the module-level ``ITERATION`` counter, casts the integer-valued
    entries of ``params`` in place, evaluates them with ``fit``, appends one row
    to the CSV at ``out_file`` and re-pickles ``bayes_trial`` to 'rej_1.p'.

    Parameters
    ----------
    params : dict
        One sample drawn from the search space.

    Returns
    -------
    dict
        'loss' (mean validation RMSE), 'train_loss', 'params', 'iteration'
        and 'status'.
    """
    global ITERATION
    ITERATION += 1

    # Quantised samples may arrive as floats, but these entries must be ints.
    # 'fp_radius' is optional: only cast names the search space actually
    # provides, otherwise a missing key aborts the very first evaluation.
    for name in ['max_depth', 'n_estimators', 'fp_radius', 'fp_length']:
        if name in params:
            params[name] = int(params[name])

    loss, train_loss = fit(params)

    # Context managers make sure each row is flushed and the handles are closed
    # after every evaluation.
    with open(out_file, 'a', newline='') as off_connection:
        csv.writer(off_connection).writerow([loss, train_loss, params, ITERATION])
    with open("rej_1.p", "wb") as trials_file:
        pickle.dump(bayes_trial, trials_file)

    return {'loss': loss, 'train_loss': train_loss, 'params': params,
            'iteration': ITERATION, 'status': STATUS_OK}

In [None]:
import csv
out_file = 'rej.csv'
# Start a fresh trial log that contains only the header row. The context manager
# closes the file even if the write fails, and newline='' lets the csv module
# control line endings itself.
with open(out_file, 'w', newline='') as off_connection:
    writer = csv.writer(off_connection)
    writer.writerow(['loss', 'train_loss', 'params', 'iteration'])

In [None]:
# Suggestion algorithm handed to fmin (hyperopt's TPE).
tpe_algo = tpe.suggest
# Trials container passed to fmin; objective() also pickles it after every evaluation.
bayes_trial = Trials()

In [None]:
##%%capture
# Reset the evaluation counter that objective() increments, then run the search
# for 3000 evaluations with a fixed random state.
global ITERATION
ITERATION =0
# NOTE(review): rstate is a legacy np.random.RandomState; some hyperopt releases
# expect a np.random.Generator instead — confirm against the installed version.
best = fmin(fn = objective, space =space, algo = tpe_algo, trials = bayes_trial, max_evals=3000, rstate= np.random.RandomState(50)) 

In [None]:
# Load the trial log and order it so the lowest-loss trial sits in row 0.
result = (
    pd.read_csv('rej.csv')
    .sort_values('loss', ascending=True)
    .reset_index(drop=True)
)
result.head()

In [None]:
import ast
# The 'params' column stores each trial's dict as text; parse row 0 back into a dict.
params = ast.literal_eval(result.loc[0, 'params'])
params

In [None]:
# Rebuild the preprocessing objects with the selected trial's settings.
# The fingerprint radius is 0 unless the stored params carry an 'fp_radius'.
fp = morgan_fp(params.get('fp_radius', 0), params['fp_length'])
sc = RobustScaler()
en = ce.helmert.HelmertEncoder(cols=categorical_features)
x_train_pd, y_train = conv_data_pd(train_data, train_data, fp, en, sc)
x_train = x_train_pd.values

# Same seed as the cross-validated fit() used during the search, so the final
# model is trained under the configuration that was actually scored.
model = XGBRegressor(
    max_delta_step=params['max_delta_step'],
    learning_rate=params['learning_rate'],
    max_depth=params['max_depth'],
    min_child_weight=params['min_child_weight'],
    subsample=params['subsample'],
    reg_alpha=params['reg_alpha'],
    gamma=params['gamma'],
    reg_lambda=params['reg_lambda'],
    n_estimators=params['n_estimators'],
    random_state=10,
)

# Train first, then persist: saving needs a fitted model, so the save call
# has to come after fit().
model.fit(x_train, y_train)
model.save_model('rej_model.model')

# Score the training set and the held-out test set; encoder and scaler are
# fitted on train_data in both conversions.
y_train_pred = model.predict(x_train)
x_test_pd, y_test = conv_data_pd(test_data, train_data, fp, en, sc)
x_test = x_test_pd.values
y_test_pred = model.predict(x_test)

# (test R², train R²)
r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)

In [None]:
# RMSE on the held-out test set and on the training set, in that order.
np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))

In [None]:
# Parity plot: actual values on x, predictions on y, with the y = x diagonal
# drawn as a dashed reference line.
fig, ax = plt.subplots()
ax.scatter(y_train, y_train_pred, label='Train')
ax.scatter(y_test, y_test_pred, label='Test')
ax.set_xlim(0, 110)
ax.set_ylim(0, 110)
xx = yy = range(0, 110)
ax.plot(xx, yy, '--', linewidth=2)
ax.legend()

In [None]:
# Attach the predictions to each split as a 'pred' column (in place) and
# export both frames to Excel.
for frame, preds, path in (
    (train_data, y_train_pred, 'salt_train_pred.xlsx'),
    (test_data, y_test_pred, 'salt_test_pred.xlsx'),
):
    frame['pred'] = preds
    frame.to_excel(path, index=False)