In [None]:
import time
import copy
import pickle

import numpy as np
import pandas as pd
from IPython.core.display import display, HTML
# Inject a CSS override so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
def display_df(df):
    """Show ``df`` in the cell output as an HTML table built by ``df.to_html()``.

    Nothing is returned; the function exists only for its display side effect.
    """
    table_markup = df.to_html()
    display(HTML(table_markup))

In [None]:
def rmse(estimator, X_eval, y_eval):
    """Root-mean-squared error of ``estimator.predict(X_eval)`` against ``y_eval``."""
    predictions = estimator.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    return np.sqrt(mse)


def r2(estimator, X_eval, y_eval):
    """Coefficient of determination of ``estimator.predict(X_eval)`` against ``y_eval``."""
    predictions = estimator.predict(X_eval)
    score = r2_score(y_eval, predictions)
    return score


def peason_r(estimator, X_eval, y_eval):
    """Pearson correlation between ``y_eval`` and ``estimator.predict(X_eval)``.

    The value is the off-diagonal entry of the 2x2 matrix returned by
    ``np.corrcoef``.
    """
    predictions = estimator.predict(X_eval)
    correlation_matrix = np.corrcoef(y_eval, predictions)
    return correlation_matrix[0, 1]


def peason_r_metric(y_true, y_pred):
    """Pearson correlation between two sequences of values (``np.corrcoef`` off-diagonal)."""
    correlation_matrix = np.corrcoef(y_true, y_pred)
    return correlation_matrix[0, 1]

# Wrap the Pearson metric with make_scorer so it can be passed as a `scoring=` argument.
peason_r_score = make_scorer(peason_r_metric)

In [None]:
def whole_to_int(a_dict):
    """Return a deep copy of ``a_dict`` with exactly-whole numbers cast to ``int``.

    Sampled hyperparameters such as tree depth come back as floats like
    ``5.0`` but must be handed to the estimator as integers.

    Only values that are *exactly* integral are converted. A tolerance-based
    check combined with ``int()`` truncation silently corrupts parameters:
    ``1e-10`` would become ``0``, ``2.9999999`` would become ``2`` and
    ``123456.4`` would become ``123456``.

    Parameters
    ----------
    a_dict : dict
        Mapping of parameter name to value. It is not modified.

    Returns
    -------
    dict
        Copy of ``a_dict``. Whole floats and NumPy integers are replaced by
        Python ``int``; booleans, non-whole floats, ``inf``/``nan`` and
        non-numeric values are left untouched.
    """
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        # bool is a subclass of int; keep flags as flags.
        if isinstance(v, (bool, np.bool_)):
            continue
        if isinstance(v, (int, np.integer)):
            new_dict[k] = int(v)
        elif isinstance(v, (float, np.floating)) and float(v).is_integer():
            # is_integer() is False for inf/nan, so int() cannot overflow here.
            new_dict[k] = int(v)
    return new_dict


def hyperopt_reg(regressor,
                 params_tuned,
                 X_train, y_train,
                 num_eval,
                 params_fixed=None,
                 rstate=None,
                 cv=10,
                 scoring="neg_mean_squared_error"):
    """Tune ``regressor`` with hyperopt's TPE search, then refit it on the training data.

    Each trial sets ``params_fixed`` plus the sampled parameters on
    ``regressor``, scores it with ``cross_val_score`` and reports the negated
    mean score as the loss to minimise.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is
        reconfigured in place on every trial, and the object returned as
        ``best_model`` is whatever ``regressor.set_params`` returns.
    params_tuned : dict
        hyperopt search space, keyed by parameter name.
    X_train, y_train : array-like
        Training features and targets.
    num_eval : int
        Number of trials (``max_evals`` for ``fmin``).
    params_fixed : dict, optional
        Parameters applied unchanged on every trial.
    rstate : int, optional
        Seed for the search. ``None`` leaves hyperopt's default in place.
    cv : int or CV splitter, optional
        Passed to ``cross_val_score``. Defaults to 10 folds.
    scoring : str or callable, optional
        Passed to ``cross_val_score``. It must follow the "greater is better"
        convention (e.g. ``"neg_mean_squared_error"``, ``"r2"`` or a
        ``make_scorer`` object) because the loss is its negation.

    Returns
    -------
    tuple
        ``(trials, best_params, best_model)``: the hyperopt ``Trials`` object,
        the best sampled parameters after ``whole_to_int``, and the estimator
        refit on ``(X_train, y_train)`` with those parameters.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    time_start = time.perf_counter()
    if params_fixed is None:
        params_fixed = {}

    def objective(params):
        regressor.set_params(**params_fixed, **params)
        mean_score = cross_val_score(regressor, X_train, y_train, cv=cv, scoring=scoring).mean()
        # hyperopt minimises, scikit-learn scorers are maximised: flip the sign.
        return {"loss": -mean_score, "status": STATUS_OK}

    if rstate is not None:
        # NOTE(review): newer hyperopt releases may expect a numpy Generator
        # (np.random.default_rng) here instead of a RandomState - confirm
        # against the installed version before relying on seeding.
        rstate = np.random.RandomState(rstate)

    trials = Trials()
    best_params = fmin(objective,
                       params_tuned,
                       algo=tpe.suggest,
                       max_evals=num_eval,
                       trials=trials,
                       rstate=rstate)

    # fmin reports raw sampled values (floats for quantised parameters).
    best_params = whole_to_int(best_params)
    best_model = regressor.set_params(**params_fixed, **best_params)
    best_model.fit(X_train, y_train)

    time_elapse = time.perf_counter() - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    return trials, best_params, best_model

# Load train/test

In [None]:
# Load the preprocessed descriptor table; feature and metadata columns are separated in a later cell.
pdXY = pd.read_csv("data/process/pdXY_rdkit_descriptors_105ft_clean.csv")
pdXY.head()

In [None]:
# (n_rows, n_columns) of the loaded table.
pdXY.shape

In [None]:
# Identifier, bookkeeping and target columns; every other column is used as a feature.
PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]
PDX_COLS = [name for name in pdXY.columns if name not in PDY_COLS]
print(len(PDX_COLS))

# Rows flagged "train" become the fitting arrays.
X_train = pdXY.loc[pdXY["train_test"].eq("train"), PDX_COLS].to_numpy(copy=True)
y_train = pdXY.loc[pdXY["train_test"].eq("train"), "dG"].to_numpy(copy=True)
print(X_train.shape, y_train.shape)

# Rows flagged "test" are held out for evaluation.
X_test = pdXY.loc[pdXY["train_test"].eq("test"), PDX_COLS].to_numpy(copy=True)
y_test = pdXY.loc[pdXY["train_test"].eq("test"), "dG"].to_numpy(copy=True)
print(X_test.shape, y_test.shape)

In [None]:
# Preview the first rows of the full table.
pdXY.head()

In [None]:
# Feature matrix for every row, regardless of the train/test flag.
X_all = pdXY[PDX_COLS].copy().values

# Linear regression

In [None]:
# Ordinary least squares with default settings, used as the baseline model.
lr = LinearRegression().fit(X_train, y_train)

# In-sample fit quality.
print("Train RMSE:", rmse(lr, X_train, y_train))
print("Train Pearson's R:", peason_r(lr, X_train, y_train))

# Held-out fit quality.
print("Test RMSE:", rmse(lr, X_test, y_test))
print("Test Pearson's R:", peason_r(lr, X_test, y_test))

In [None]:
# Save the baseline model's test-set predictions next to the true dG values.
y_test_pred = lr.predict(X_test)
test_pred_df = pd.DataFrame({"dG": y_test, "pred": y_test_pred})
test_pred_df.to_csv("results/lr_default/test_pred.csv", index=False)

In [None]:
ridge = Ridge()

# Search the ridge penalty on a log scale from 1e-10 to 1e10.
params = {"alpha": hp.loguniform("alpha", np.log(1e-10), np.log(1e10)),}

num_eval = 100

trials, best_params, best_model = hyperopt_reg(ridge, params, X_train, y_train, num_eval)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Context manager guarantees the pickle is flushed and the handle closed
# before any later cell reads the file back.
with open("models/lr_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Reload the tuned ridge model from disk; the context manager closes the file handle.
with open("models/lr_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
y_test_pred = model.predict(X_test)
test_pred_df = pd.DataFrame({"dG": y_test, "pred": y_test_pred})
test_pred_df.to_csv("results/lr/test_pred.csv", index=False)

# Random Forest

In [None]:
# Search space: every parameter is sampled on a quantised grid and cast to int.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 14, 1)),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 2, 20, 2)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 2, 20, 2)),
    "max_features": scope.int(hp.quniform("max_features", 10, 60, 5)),
}

# Held constant across all trials.
params_fixed = {
    "n_estimators": 1000
}


num_eval = 100

rf = RandomForestRegressor()

trials, best_params, best_model = hyperopt_reg(rf, params, X_train, y_train, num_eval, params_fixed=params_fixed)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Context manager guarantees the pickle is flushed and the handle closed
# before any later cell reads the file back.
with open("models/rf_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Reload the tuned random forest from disk; the context manager closes the file handle.
with open("models/rf_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
y_test_pred = model.predict(X_test)
test_pred_df = pd.DataFrame({"dG": y_test, "pred": y_test_pred})
test_pred_df.to_csv("results/rf/test_pred.csv", index=False)

In [None]:
pdXY_pred = pdXY[["smiles", "code", "dG"] + PDX_COLS].copy()
# Row masks: True where the partial-charge descriptor is not +inf.
ii1 = pdXY_pred["MaxAbsPartialCharge"] != np.inf
ii2 = pdXY_pred["MinPartialCharge"] != np.inf
ii3 = pdXY_pred["MinAbsPartialCharge"] != np.inf
ii4 = pdXY_pred["MaxPartialCharge"] != np.inf

# .copy() detaches the filtered frame so that adding the "pred" column below
# does not raise pandas' SettingWithCopyWarning.
pdXY_pred = pdXY_pred[ii1 & ii2 & ii3 & ii4].copy()

# Reload the tuned random forest; the context manager closes the file handle.
with open("models/rf_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
pdXY_pred["pred"] = model.predict(pdXY_pred[PDX_COLS].values)
pdXY_pred[["smiles", "code", "dG"] + ["pred"]].to_csv("results/rf/pred_rf_01.csv", index=False)

In [None]:
# Count missing values in each of the exported columns.
pdXY_pred[["smiles", "code", "dG"] + ["pred"]].isnull().sum()

In [None]:
# Preview the prediction frame, including the new "pred" column.
pdXY_pred.head()

In [None]:
# Per-feature maximum, sorted ascending (largest values listed last).
pdXY_pred[PDX_COLS].max().sort_values()

In [None]:
# Read the random-forest predictions back from disk. `pred` stays bound to this
# frame until a later cell reassigns it.
pred = pd.read_csv("results/rf/pred_rf_01.csv")
pred.head()

In [None]:
# Distinct values of the "code" column present in the predictions.
pred["code"].unique()

In [None]:
# Rows coded 'vietherbs', ordered by ascending predicted value.
vietherbs = pred.loc[pred["code"].eq('vietherbs'), ["smiles", "pred"]].sort_values("pred")
vietherbs.to_csv("results/rf/vietherbs.csv", index=False)
vietherbs.head()

In [None]:
# Rows coded 'chembl_27', ordered by ascending predicted value.
chembl_27 = pred.loc[pred["code"].eq('chembl_27'), ["smiles", "pred"]].sort_values("pred")
# NOTE(review): the sibling export goes to results/rf/; this one lands directly
# in results/ - confirm whether results/rf/chembl_27.csv was intended.
chembl_27.to_csv("results/chembl_27.csv", index=False)
chembl_27.head()

# XGBoost

In [None]:
# Search space for the boosted-tree model; integer parameters are cast via scope.int.
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 8, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)),
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.2, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.00001), np.log(100)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.0001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

# Held constant across all trials.
# NOTE(review): "gpu_hist"/"gpu_predictor" need a GPU-enabled XGBoost build and
# are reportedly replaced by device="cuda" in newer releases - confirm against
# the installed version.
params_fixed = {
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 300
}


num_eval = 100

xgb = XGBRegressor()

trials, best_params, best_model = hyperopt_reg(xgb, params, X_train, y_train, num_eval, params_fixed=params_fixed)
print("best_params:", best_params)

print("Train RMSE:", rmse(best_model, X_train, y_train))
print("Train Pearson's R:", peason_r(best_model, X_train, y_train))

print("Test RMSE:", rmse(best_model, X_test, y_test))
print("Test Pearson's R:", peason_r(best_model, X_test, y_test))

# Context manager guarantees the pickle is flushed and the handle closed
# before any later cell reads the file back.
with open("models/xbg_01.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Reload the tuned XGBoost model from disk; the context manager closes the file handle.
with open("models/xbg_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
y_test_pred = model.predict(X_test)
test_pred_df = pd.DataFrame({"dG": y_test, "pred": y_test_pred})
test_pred_df.to_csv("results/xgb/test_pred.csv", index=False)

In [None]:
# Range (min, max) of the true dG values in the test set.
test_pred_df["dG"].min(), test_pred_df["dG"].max()

In [None]:
pdXY_pred = pdXY[["smiles", "code", "dG"] + PDX_COLS].copy()
# Row masks: True where the partial-charge descriptor is not +inf.
ii1 = pdXY_pred["MaxAbsPartialCharge"] != np.inf
ii2 = pdXY_pred["MinPartialCharge"] != np.inf
ii3 = pdXY_pred["MinAbsPartialCharge"] != np.inf
ii4 = pdXY_pred["MaxPartialCharge"] != np.inf

# .copy() detaches the filtered frame so that adding the "pred" column below
# does not raise pandas' SettingWithCopyWarning.
pdXY_pred = pdXY_pred[ii1 & ii2 & ii3 & ii4].copy()

# Reload the tuned XGBoost model; the context manager closes the file handle.
with open("models/xbg_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
pdXY_pred["pred"] = model.predict(pdXY_pred[PDX_COLS].values)
# The file name keeps its "rf" infix; only the results/xgb/ directory marks it as XGBoost output.
pdXY_pred[["smiles", "code", "dG"] + ["pred"]].to_csv("results/xgb/pred_rf_01.csv", index=False)

In [None]:
# Load the XGBoost predictions explicitly. The `pred` frame read earlier in the
# notebook comes from results/rf/, so reusing it here would export random-forest
# scores under results/xgb/.
pred = pd.read_csv("results/xgb/pred_rf_01.csv")
vietherbs = pred.loc[pred["code"] == 'vietherbs', ["smiles", "pred"]].sort_values(by="pred")
vietherbs.to_csv("results/xgb/vietherbs.csv", index=False)
vietherbs.head()

In [None]:
# Range (min, max) of the predicted values for the 'vietherbs' rows.
vietherbs["pred"].min(), vietherbs["pred"].max()

In [None]:
# Load the XGBoost predictions explicitly. The `pred` frame read earlier in the
# notebook comes from results/rf/, so reusing it here would export random-forest
# scores under results/xgb/.
pred = pd.read_csv("results/xgb/pred_rf_01.csv")
chembl_27 = pred.loc[pred["code"] == 'chembl_27', ["smiles", "pred"]].sort_values(by="pred")
chembl_27.to_csv("results/xgb/chembl_27.csv", index=False)
chembl_27.head()

In [None]:
# Range (min, max) of the predicted values for the 'chembl_27' rows.
chembl_27["pred"].min(), chembl_27["pred"].max()

In [None]:
# (n_rows, n_columns) of the 'chembl_27' export.
chembl_27.shape

In [None]:
# Whitespace-separated file without a header: column 0 is renamed to "smiles",
# column 1 to "chembl_id". The raw string keeps "\s" as a regex token instead of
# an invalid string escape sequence.
chembl_27_id = pd.read_table("../AchE_ML/data/raw/chembl_27.smi", sep=r"\s+", header=None)
chembl_27_id = chembl_27_id.rename(columns={0: "smiles", 1: "chembl_id"})
print("chembl_27_id", chembl_27_id.shape)
# One id per SMILES, so the left merge below cannot multiply prediction rows.
chembl_27_id = chembl_27_id.drop_duplicates(subset=["smiles"])
print("chembl_27_id", chembl_27_id.shape)

chembl_27 = pd.read_csv("results/xgb/chembl_27.csv")
print("chembl_27", chembl_27.shape)
# Left merge keeps every prediction row even when no id matches its SMILES.
chembl_27 = chembl_27.merge(chembl_27_id, how="left", on="smiles")
print("chembl_27", chembl_27.shape)
chembl_27.to_csv("results/xgb/chembl_27_with_id.csv", index=False)

In [None]:
# Score a separate descriptor table with the tuned XGBoost model, using the
# same feature columns as training.
nirmatrelvir = pd.read_csv("data/process/pdXY_nirm_rdkit_descriptors_105ft_clean.csv")
# The context manager closes the model file once it has been unpickled.
with open("models/xbg_01.pkl", "rb") as model_file:
    model = pickle.load(model_file)
nirmatrelvir["pred"] = model.predict(nirmatrelvir[PDX_COLS].values)
nirmatrelvir[["smiles", "code", "dG"] + ["pred"]].to_csv("results/xgb/nirmatrelvir.csv", index=False)

In [None]:
# Render the identifier, target and prediction columns as an HTML table.
display_df(nirmatrelvir[["smiles", "code", "dG"] + ["pred"]])