In [1]:
import time
import copy
import pickle

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
# Inject CSS so the notebook's .container element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
def display_df(df):
    """Show ``df`` in the cell output as an HTML table.

    The frame is converted with its ``to_html()`` method and the resulting
    markup is passed to IPython's ``display``. Nothing is returned.
    """
    table_markup = df.to_html()
    display(HTML(table_markup))

In [None]:
def rmse(estimator, X_eval, y_eval):
    """Root-mean-squared error of ``estimator`` on an evaluation set.

    Parameters
    ----------
    estimator : object exposing ``predict(X)``.
    X_eval : features passed to ``estimator.predict``.
    y_eval : reference targets compared against the predictions.

    Returns
    -------
    The square root of ``mean_squared_error(y_eval, predictions)``.
    """
    predictions = estimator.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    return np.sqrt(mse)


def r2(estimator, X_eval, y_eval):
    """Coefficient of determination of ``estimator`` on an evaluation set.

    Predicts on ``X_eval`` and returns ``r2_score(y_eval, predictions)``.
    """
    predictions = estimator.predict(X_eval)
    score = r2_score(y_eval, predictions)
    return score


def peason_r(estimator, X_eval, y_eval):
    """Pearson correlation between ``y_eval`` and the estimator's predictions.

    Parameters
    ----------
    estimator : object exposing ``predict(X)``.
    X_eval : features passed to ``estimator.predict``.
    y_eval : reference targets.

    Returns
    -------
    Element ``[0, 1]`` of ``np.corrcoef(y_eval, predictions)``.
    """
    predictions = estimator.predict(X_eval)
    corr_matrix = np.corrcoef(y_eval, predictions)
    # The off-diagonal entry pairs the first argument with the second.
    return corr_matrix[0, 1]


def peason_r_metric(y_true, y_pred):
    """Pearson correlation between true and predicted values.

    Takes the ``(y_true, y_pred)`` argument order and returns element
    ``[0, 1]`` of ``np.corrcoef(y_true, y_pred)``.
    """
    corr_matrix = np.corrcoef(y_true, y_pred)
    return corr_matrix[0, 1]

# Scorer built from peason_r_metric with make_scorer, for use as a `scoring=` argument
# (e.g. in cross_val_score).
peason_r_score = make_scorer(peason_r_metric)

In [None]:
def whole_to_int(a_dict):
    """Return a deep copy of ``a_dict`` with whole-valued numbers cast to ``int``.

    Only values that are *exactly* whole are converted (``5.0 -> 5``). A
    tolerance-based comparison is deliberately avoided: with ``np.isclose``'s
    default tolerances a small hyper-parameter such as ``1e-9`` compares equal
    to ``0`` and a large one such as ``100000.4`` compares equal to its rounded
    value, so both would be silently truncated by ``int()``.

    Parameters
    ----------
    a_dict : dict
        Mapping of parameter name to value. It is not modified.

    Returns
    -------
    dict
        Copy of ``a_dict``. Whole floats and NumPy integers become Python
        ``int``; booleans, non-numeric values, fractional floats, ``nan`` and
        ``inf`` are left as they are.
    """
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        # Unwrap 0-d arrays so they are treated like the scalar they hold.
        if isinstance(v, np.ndarray) and v.ndim == 0:
            v = v.item()
        # bool is a subclass of int; keep flags as flags.
        if isinstance(v, (bool, np.bool_)):
            continue
        if isinstance(v, (int, np.integer)):
            new_dict[k] = int(v)
        elif isinstance(v, (float, np.floating)) and float(v).is_integer():
            # is_integer() is False for nan/inf, so int() cannot overflow here.
            new_dict[k] = int(v)
    return new_dict


def hyperopt_reg(regressor,
                 params_tuned,
                 X_train, y_train,
                 num_eval,
                 params_fixed=None,
                 rstate=None,
                 cv=10,
                 scoring="neg_mean_squared_error"):
    """Tune ``regressor`` with hyperopt (TPE) and refit it on the best setting.

    Each trial sets ``params_fixed`` plus the sampled parameters on
    ``regressor``, scores it with ``cross_val_score`` and reports the negated
    mean score as the loss. After the search, the best parameters are passed
    through ``whole_to_int``, applied to the estimator, and the estimator is
    fitted on the full training data.

    Parameters
    ----------
    regressor : estimator supporting ``set_params`` and ``fit``.
        It is reconfigured in place on every trial.
    params_tuned : hyperopt search space passed to ``fmin``.
    X_train, y_train : training data used for cross-validation and the final fit.
    num_eval : int
        Number of trials (``max_evals`` of ``fmin``).
    params_fixed : dict, optional
        Parameters applied on every trial alongside the sampled ones.
    rstate : optional
        Seed; when given it is wrapped in ``np.random.RandomState``.
    cv : optional
        Forwarded to ``cross_val_score`` (default 10, the previous fixed value).
    scoring : optional
        Forwarded to ``cross_val_score``. Must be a "greater is better" score
        because the loss is its negation (default ``"neg_mean_squared_error"``).

    Returns
    -------
    trials : hyperopt ``Trials`` object holding the search history.
    best_params : dict returned by ``fmin`` after ``whole_to_int``.
    best_model : the result of ``regressor.set_params(...)`` fitted on
        ``X_train, y_train``.
    """
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {}

    def objective(params):
        regressor.set_params(**params_fixed, **params)
        mean_score = cross_val_score(regressor, X_train, y_train, cv=cv, scoring=scoring).mean()
        # fmin minimises, so flip the sign of the higher-is-better score.
        return {"loss": -mean_score, "status": STATUS_OK}

    if rstate is not None:
        # NOTE(review): some hyperopt releases may expect a numpy Generator
        # (np.random.default_rng) here instead of RandomState - confirm against
        # the installed version.
        rstate = np.random.RandomState(rstate)

    trials = Trials()
    best_params = fmin(objective,
                       params_tuned,
                       algo=tpe.suggest,
                       max_evals=num_eval,
                       trials=trials,
                       rstate=rstate)

    best_params = whole_to_int(best_params)
    best_model = regressor.set_params(**params_fixed, **best_params)
    best_model.fit(X_train, y_train)

    # The reported time covers the search and the final refit.
    time_elapse = time.time() - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_params, best_model

# Load train/test

In [3]:
pdXY = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_108ft_imputed_std.csv")

# Identifier / target / bookkeeping columns; every other column is used as a feature.
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = sorted(col for col in pdXY.columns if col not in PDY_COLS)
print("PDX_COLS", len(PDX_COLS))

print(pdXY.shape)
display_df(pdXY.head())

# Row masks for the two partitions recorded in the "train_test" column.
is_train = pdXY["train_test"] == "train"
is_test = pdXY["train_test"] == "test"

X_train = pdXY.loc[is_train, PDX_COLS].copy().values
y_train = pdXY.loc[is_train, "dG"].copy().values
print(X_train.shape, y_train.shape)

X_test = pdXY.loc[is_test, PDX_COLS].copy().values
y_test = pdXY.loc[is_test, "dG"].copy().values
print(X_test.shape, y_test.shape)

PDX_COLS 108
(1859, 114)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan3,BalabanJ,BertzCT,Chi1n,Chi2v,Chi3n,HallKierAlpha,Ipc,Kappa2,Kappa3,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA7,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState10,VSA_EState8,VSA_EState9,FractionCSP3,NHOHCount,NOCount,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,fr_Al_OH,fr_ArN,fr_Ar_N,fr_Ar_OH,fr_C_O,fr_NH0,fr_NH1,fr_NH2,fr_Ndealkylation1,fr_Ndealkylation2,fr_allylic_oxid,fr_amide,fr_aniline,fr_aryl_methyl,fr_bicyclic,fr_ester,fr_ether,fr_halogen,fr_ketone,fr_ketone_Topliss,fr_methoxy,fr_para_hydroxylation,fr_piperdine,fr_piperzine,fr_pyridine,fr_quatN,fr_unbrch_alkane,new_id,smiles,dG,code,train_test,smiles_len
0,0.018971,0.179801,0.19104,0.21839,-1.632057,0.746804,0.119087,0.981708,-0.002618,0.91829,1.560917,-0.910241,-1.435985,-1.433772,-1.167884,0.546802,-1.704112,-1.520472,-1.208338,-0.871227,-0.156709,-0.577777,-0.672744,-0.444487,0.004721,-1.160296,0.453844,-0.330714,-0.323921,-0.560798,-1.078734,-0.677036,-1.114388,-0.932611,-0.6087,-1.013132,-0.43761,-1.069899,-1.105732,-0.143045,-0.622478,-0.43835,-0.789679,-0.604122,-0.506707,-1.586722,-1.160637,-0.507705,-0.266748,0.394597,-0.850078,-0.646872,-0.462422,-0.50163,-0.660608,-1.014957,-0.648486,0.083669,0.129555,-1.13974,-0.486688,-0.268596,-0.063084,-1.347869,-1.625341,-0.857802,-1.101096,-0.604598,-0.604303,-0.86914,-0.374174,0.456599,-0.030372,-0.937519,-1.22264,-1.242349,-0.328874,-0.496693,-0.503179,-0.714789,-0.855646,-0.299129,-0.256078,-0.703104,-0.262846,-0.956279,-1.110068,-0.55131,-0.350703,-0.217298,-0.362574,-0.342941,-0.588631,-0.664382,-0.628417,-0.315003,-0.309631,-0.670258,-0.423642,-0.540072,-0.486097,-0.416394,0.967193,-0.324205,-0.230989,-0.608495,-0.245001,-0.448421,_0,O=c1ccc2ccccc2o1,-6.870199,labeled,train,16
1,-0.27568,0.121336,0.457887,-0.246223,-2.04976,-0.49526,1.01947,-0.529795,-0.002618,-2.157913,2.387556,-1.681858,-1.837054,-1.737157,-1.477192,1.298238,-3.429915,-1.653979,-1.002428,-1.369513,-0.821336,0.85933,-0.672744,-0.444487,-0.227192,0.297771,-0.575571,-0.330714,-0.323921,-1.291225,-1.450801,-0.529797,-1.114388,-0.429049,-0.567618,-1.013132,-0.43761,-0.696092,-1.105732,-1.667611,-0.622478,-1.107641,-0.789679,-0.604122,-0.506707,-1.000327,-0.205833,-0.507705,-0.266748,-0.582617,-0.740764,-0.646872,0.167367,-0.10443,-1.108339,-1.23767,-0.362907,-0.807273,-0.877849,-1.13974,-0.998946,-0.268596,-0.385139,-1.404581,0.372028,-0.857802,-1.101096,-0.604598,-0.604303,-0.86914,-1.305226,-0.814698,-1.45221,-0.937519,-1.22264,-1.026838,-0.328874,-0.496693,-0.503179,-1.805533,-1.496401,-0.299129,-0.256078,-0.703104,-0.262846,1.228372,-1.110068,-0.55131,-0.350703,-0.217298,-0.362574,-0.342941,-0.588631,-0.664382,-0.628417,-0.785295,-0.309631,-0.670258,-0.423642,2.674755,2.837173,-0.416394,-0.521875,-0.324205,-0.230989,-0.608495,-0.245001,-0.448421,_1,CC(=O)C(C)=O,-5.405213,labeled,train,12
2,-0.27568,0.121336,0.457887,-0.246223,-2.04976,-0.49526,1.01947,-0.529795,-0.002618,-2.157913,2.387556,-1.681858,-1.837054,-1.737157,-1.477192,1.298238,-3.429915,-1.653979,-1.002428,-1.369513,-0.821336,0.85933,-0.672744,-0.444487,-0.227192,0.297771,-0.575571,-0.330714,-0.323921,-1.291225,-1.450801,-0.529797,-1.114388,-0.429049,-0.567618,-1.013132,-0.43761,-0.696092,-1.105732,-1.667611,-0.622478,-1.107641,-0.789679,-0.604122,-0.506707,-1.000327,-0.205833,-0.507705,-0.266748,-0.582617,-0.740764,-0.646872,0.167367,-0.10443,-1.108339,-1.23767,-0.362907,-0.807273,-0.877849,-1.13974,-0.998946,-0.268596,-0.385139,-1.404581,0.372028,-0.857802,-1.101096,-0.604598,-0.604303,-0.86914,-1.305226,-0.814698,-1.45221,-0.937519,-1.22264,-1.026838,-0.328874,-0.496693,-0.503179,-1.805533,-1.496401,-0.299129,-0.256078,-0.703104,-0.262846,1.228372,-1.110068,-0.55131,-0.350703,-0.217298,-0.362574,-0.342941,-0.588631,-0.664382,-0.628417,-0.785295,-0.309631,-0.670258,-0.423642,2.674755,2.837173,-0.416394,-0.521875,-0.324205,-0.230989,-0.608495,-0.245001,-0.448421,_2,CC(=O)C(C)=O,-5.405213,labeled,train,12
3,-0.642382,0.535413,-0.266457,0.042427,-1.882679,-1.191778,-0.464524,-1.377408,-0.8625,-1.993649,1.687857,-1.522245,-1.692395,-1.633269,-1.371193,0.868846,-2.392272,-1.643971,-1.249294,-0.217407,0.547559,-0.577777,-0.672744,-0.444487,-0.227192,-1.160296,-0.575571,-0.330714,-0.323921,-1.291225,-0.706667,-1.381153,-1.114388,-0.368295,-1.363238,-1.013132,-0.43761,-1.069899,-1.105732,-0.877444,0.666167,-1.107641,-0.789679,1.066174,-0.506707,-1.068943,-1.160637,-0.507705,-0.266748,-0.582617,-0.56497,-0.646872,-1.092211,-0.108941,-1.108339,-1.23767,0.01945,-0.807273,-0.877849,-1.13974,0.185466,-0.268596,-0.385139,-1.314605,-1.625341,0.386552,-1.101096,-0.604598,-0.604303,-0.86914,-0.374174,-0.814698,-0.741291,-0.937519,-1.22264,-1.242349,-0.328874,-0.496693,-0.503179,-1.260161,-1.129165,-0.299129,-0.256078,-0.703104,2.935302,-0.956279,-1.110068,-0.55131,-0.350703,-0.217298,-0.362574,-0.342941,-0.588631,-0.664382,-0.628417,-0.785295,-0.309631,-0.670258,-0.423642,-0.540072,-0.486097,-0.416394,-0.521875,-0.324205,-0.230989,-0.608495,-0.245001,-0.448421,_3,Oc1ccc(O)cc1,-12.18078,labeled,train,12
4,-0.537129,0.541791,-0.237348,1.394206,-0.80286,-0.227081,0.302139,-0.20344,1.52606,1.236861,0.640051,-0.64113,-0.86308,-0.711719,-0.770728,0.579832,-0.139942,-0.731403,-0.66278,-0.146662,0.564611,0.942849,0.493615,-0.444487,-0.227192,-1.160296,1.564607,0.800576,-0.323921,-0.836219,-1.238475,0.226469,-0.14567,-0.420816,-0.183227,0.538965,-0.43761,-0.346091,-0.316469,-0.57132,-0.622478,0.117868,0.208087,-0.604122,0.780966,-0.487526,0.130372,0.432392,-0.266748,-0.582617,0.421086,-0.646872,-1.092211,-0.436022,0.865092,-0.772193,-0.692132,0.480273,0.193706,-0.584106,0.258195,0.22431,-0.246554,-0.704034,0.039133,1.008728,0.113128,-0.604598,-0.604303,-0.86914,-1.305226,1.727896,-0.030372,0.472844,0.201324,-0.380305,-0.328874,-0.496693,-0.503179,-0.714789,-1.321973,1.407884,3.80879,2.44264,-0.262846,-0.956279,1.375884,-0.55131,2.385574,-0.217298,-0.362574,-0.342941,-0.588631,0.787377,0.755703,-0.785295,-0.309631,-0.670258,-0.423642,-0.540072,-0.486097,-0.416394,-0.521875,-0.324205,-0.230989,-0.608495,-0.245001,-0.448421,_4,Cc1c(CCO)sc[n+]1Cc1cnc(C)nc1N,-4.753736,labeled,train,29


(1381, 108) (1381,)
(240, 108) (240,)
(238, 108) (238,)
(478, 108) (478,)


In [4]:
# Number of feature columns (PDX_COLS) used to build the X matrices.
len(PDX_COLS)

108

# Linear regression

In [None]:
# Untuned ordinary-least-squares fit on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Metrics on the data the model was fitted on.
print("Train RMSE:", rmse(lr, X_train, y_train))
print("Train Pearson's R:", peason_r(lr, X_train, y_train))

# Metrics on the held-out test split.
print("Test RMSE:", rmse(lr, X_test, y_test))
print("Test Pearson's R:", peason_r(lr, X_test, y_test))

In [None]:
ridge = Ridge()

# Search the regularisation strength on a log scale between 1e-10 and 1e10.
params = {"alpha": hp.loguniform("alpha", np.log(1e-10), np.log(1e10)),}

num_eval = 100

trials, best_params, best_model = hyperopt_reg(ridge, params, X_train, y_train, num_eval)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Use a context manager so the file is flushed and closed even if dump() raises.
with open("models/lr/lr_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

### Predict for the train and test sets

In [None]:
# NOTE(review): unconditional stop - this raises AssertionError, so nothing below
# runs while the line is present. It looks like a manual guard; confirm before removing.
assert False


pdXY = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_108ft_imputed_std.csv")

PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = sorted([col for col in pdXY.columns if col not in PDY_COLS])
print("PDX_COLS", len(PDX_COLS))

print(pdXY.shape)
display_df(pdXY.head())

# Unpickling executes code from the file: only load model files you created yourself.
# The context manager closes the handle instead of leaving it to the garbage collector.
with open("models/lr/lr_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
pdXY["pred"] = model.predict(pdXY[PDX_COLS].values)

# Keep the identifier/target columns plus the prediction for the export.
df00 = pdXY[PDY_COLS+["pred"]]
print(df00.shape)
display_df(df00.head())

df00.to_csv("results/lr/train_test_pred.csv", index=False)

# Random Forest

In [None]:
# Search space: every dimension is drawn with quniform and cast to int via scope.int.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 14, 1)),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 2, 20, 2)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 2, 20, 2)),
    "max_features": scope.int(hp.quniform("max_features", 10, 60, 5)),
}

# Applied unchanged on every trial.
params_fixed = {
    "n_estimators": 1000
}


num_eval = 100

rf = RandomForestRegressor()

trials, best_params, best_model = hyperopt_reg(rf, params, X_train, y_train, num_eval, params_fixed=params_fixed)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Use a context manager so the file is flushed and closed even if dump() raises.
with open("models/rf/rf_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

### Predict for the train and test sets

In [None]:
# NOTE(review): unconditional stop - this raises AssertionError, so nothing below
# runs while the line is present. It looks like a manual guard; confirm before removing.
assert False

pdXY = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_108ft_imputed_std.csv")

PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = sorted([col for col in pdXY.columns if col not in PDY_COLS])
print("PDX_COLS", len(PDX_COLS))

print(pdXY.shape)
display_df(pdXY.head())

# Unpickling executes code from the file: only load model files you created yourself.
# The context manager closes the handle instead of leaving it to the garbage collector.
with open("models/rf/rf_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
pdXY["pred"] = model.predict(pdXY[PDX_COLS].values)

# Keep the identifier/target columns plus the prediction for the export.
df00 = pdXY[PDY_COLS+["pred"]]
print(df00.shape)
display_df(df00.head())

df00.to_csv("results/rf/train_test_pred.csv", index=False)

# XGBoost

In [None]:
# NOTE(review): unconditional stop - this raises AssertionError, so nothing below
# runs while the line is present. It looks like a manual guard; confirm before removing.
assert False

params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 8, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 10, 1)),
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.00001), np.log(100)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.0001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

# Applied unchanged on every trial.
params_fixed = {
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 300
}


num_eval = 100

xgb = XGBRegressor()

trials, best_params, best_model = hyperopt_reg(xgb, params, X_train, y_train, num_eval, params_fixed=params_fixed)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Use a context manager so the file is flushed and closed even if dump() raises.
# The "xbg" spelling is kept on purpose: the NCI prediction cell reads this exact path.
with open("models/xgb/xbg_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

## Predict on the NCI dataset

In [None]:
pdXY = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_108ft_imputed_std.csv")

PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = sorted([col for col in pdXY.columns if col not in PDY_COLS])
print("PDX_COLS", len(PDX_COLS))
# Upper-cased SMILES of every labeled molecule, used below to exclude them from the NCI set.
# NOTE(review): upper-casing makes the match case-insensitive, so SMILES strings that
# differ only by letter case are treated as the same molecule - confirm this is intended.
# dropna() guards .upper() against missing SMILES entries.
smiles_train = pdXY["smiles"].dropna().unique().tolist()
smiles_train = [s.upper() for s in smiles_train]
del pdXY

pdXY_nci = pd.read_csv("data/process/pdXY_nci_rdkit_descriptors_108ft_imputed_std.csv")
print("pdXY_nci", pdXY_nci.shape)

# remove some rows having infinite feature values
# (turn +/-inf into NaN in all feature columns at once, then drop those rows)
pdXY_nci[PDX_COLS] = pdXY_nci[PDX_COLS].replace([np.inf, -np.inf], np.nan)

# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
pdXY_nci = pdXY_nci.dropna(subset=PDX_COLS).copy()

pdXY_nci["in_train_test"] = pdXY_nci["smiles"].str.upper().isin(smiles_train).astype(int)

# remove train/test
print("remove train/test")
print("pdXY_nci", pdXY_nci.shape)
pdXY_nci = pdXY_nci[pdXY_nci["in_train_test"]==0].copy()
print("pdXY_nci", pdXY_nci.shape)


# Unpickling executes code from the file: only load model files you created yourself.
model_path = "models/xgb/xbg_01.pkl"
model = pd.read_pickle(model_path)
pdXY_nci["pred_AChE_2306"] = model.predict(pdXY_nci[PDX_COLS].values)

# Ascending sort: the smallest predicted values come first.
pdY_nci = pdXY_nci[["new_id", "smiles", "pred_AChE_2306", "in_train_test"]].sort_values(by="pred_AChE_2306")
print("pdY_nci", pdY_nci.shape)
display_df(pdY_nci.head(50))


pdY_clean = pd.read_csv("data/process/nci_clean.csv")
pdY_clean = pdY_clean[['smiles', 'source', 'id', 'code', 'new_id', 'smiles_len',]]
print("pdY_clean", pdY_clean.shape)
display_df(pdY_clean.head())

# Left join keeps every row of nci_clean; rows without a prediction get NaN.
pdY_nci = pdY_clean.merge(pdY_nci[["new_id", "pred_AChE_2306"]], how="left", on="new_id")
print("pdY_nci", pdY_nci.shape)
display_df(pdY_nci.head(50))


pdY_nci.drop(["new_id"], axis=1).to_excel("results/xgb/nci_AChE_2306_results.xlsx", index=False)