In [8]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import pandas as pd
import lightgbm as lgbm

### Read toxic and non-toxic data

In [9]:
toxic_df = pd.read_csv('/home/liujin/Offtarget_drugsafety/offtarget_application/Toxicity_prediction/toxic_predict_data/toxic_offtarget_profile.csv')
not_toxic_df = pd.read_csv('/home/liujin/Offtarget_drugsafety/offtarget_application/Toxicity_prediction/toxic_predict_data/nontoxic_offtarget_profile.csv')
toxic_df['label'] = 1
not_toxic_df['label'] = 0

df = pd.concat([toxic_df, not_toxic_df], axis=0)
print(toxic_df.shape, not_toxic_df.shape, df.shape)

data_df = df.drop(['smiles','label'], axis=1)
print(data_df.shape)

train_x, test_x, train_y, test_y = train_test_split(data_df, df['label'], test_size=0.2, random_state=999)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
print(data_df.shape)
print(len(df['label']))

(877, 244) (1229, 244) (2106, 244)
(2106, 242)
(1684, 242) (422, 242) (1684,) (422,)
(2106, 242)
2106


### Model hyperparameter selection

In [10]:
import optuna  
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [7]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    # Parameter grid
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.5, step=0.05), 
        "num_leaves": trial.suggest_int("num_leaves", 20, 1000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 500, step=100), 
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15, step=0.1),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0, step=0.1), 
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
        "random_state": 2023,
    }

    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1,random_state=999) 
    
    model = lgbm.LGBMClassifier(objective="binary", **param_grid) 
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc", 
        callbacks=[
            LightGBMPruningCallback(trial, "auc"),
            lgbm.early_stopping(20),
        ],
    )
    
    preds = model.predict_proba(X_val)
    auc = roc_auc_score(y_val, preds[:, 1])
    return auc

In [11]:
study = optuna.create_study(direction="maximize", study_name="LGBM Classifier") 
func = lambda trial: objective(trial, train_x, train_y)
study.optimize(func, n_trials=1000) 

[32m[I 2023-11-21 14:08:51,594][0m A new study created in memory with name: LGBM Classifier[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:09:04,079][0m Trial 0 finished with value: 0.8274606872451951 and parameters: {'n_estimators': 500, 'learning_rate': 0.35, 'num_leaves': 880, 'max_depth': 4, 'min_data_in_leaf': 100, 'lambda_l1': 95, 'lambda_l2': 20, 'min_gain_to_split': 3.4000000000000004, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.8274606872451951.[0m


Early stopping, best iteration is:
[9]	valid_0's auc: 0.827461	valid_0's binary_logloss: 0.576194
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:09:16,345][0m Trial 1 finished with value: 0.8353960396039604 and parameters: {'n_estimators': 1000, 'learning_rate': 0.2, 'num_leaves': 440, 'max_depth': 11, 'min_data_in_leaf': 400, 'lambda_l1': 90, 'lambda_l2': 100, 'min_gain_to_split': 10.5, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 1 with value: 0.8353960396039604.[0m


Early stopping, best iteration is:
[8]	valid_0's auc: 0.838526	valid_0's binary_logloss: 0.618482
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:09:37,941][0m Trial 2 finished with value: 0.8530139778683751 and parameters: {'n_estimators': 900, 'learning_rate': 0.30000000000000004, 'num_leaves': 440, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 70, 'lambda_l2': 65, 'min_gain_to_split': 0.4, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 2 with value: 0.8530139778683751.[0m


Early stopping, best iteration is:
[29]	valid_0's auc: 0.85396	valid_0's binary_logloss: 0.562073
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:09:47,985][0m Trial 3 finished with value: 0.8281887012230634 and parameters: {'n_estimators': 1000, 'learning_rate': 0.4, 'num_leaves': 800, 'max_depth': 10, 'min_data_in_leaf': 100, 'lambda_l1': 80, 'lambda_l2': 25, 'min_gain_to_split': 14.100000000000001, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 2 with value: 0.8530139778683751.[0m


Early stopping, best iteration is:
[4]	valid_0's auc: 0.828189	valid_0's binary_logloss: 0.563297
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:09:59,781][0m Trial 4 finished with value: 0.8273150844496215 and parameters: {'n_estimators': 700, 'learning_rate': 0.4, 'num_leaves': 440, 'max_depth': 9, 'min_data_in_leaf': 300, 'lambda_l1': 0, 'lambda_l2': 35, 'min_gain_to_split': 14.5, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 2 with value: 0.8530139778683751.[0m


Early stopping, best iteration is:
[15]	valid_0's auc: 0.827315	valid_0's binary_logloss: 0.505395


[32m[I 2023-11-21 14:10:00,678][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:10:01,539][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:10:02,667][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:10:04,061][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:10:13,609][0m Trial 9 finished with value: 0.8531595806639487 and parameters: {'n_estimators': 800, 'learning_rate': 0.4, 'num_leaves': 80, 'max_depth': 6, 'min_data_in_leaf': 200, 'lambda_l1': 65, 'lambda_l2': 65, 'min_gain_to_split': 8.6, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 9 with value: 0.8531595806639487.[0m


Early stopping, best iteration is:
[6]	valid_0's auc: 0.85316	valid_0's binary_logloss: 0.521096
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:10:22,904][0m Trial 10 finished with value: 0.8580372743156669 and parameters: {'n_estimators': 200, 'learning_rate': 0.5, 'num_leaves': 20, 'max_depth': 7, 'min_data_in_leaf': 200, 'lambda_l1': 55, 'lambda_l2': 0, 'min_gain_to_split': 5.300000000000001, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 10 with value: 0.8580372743156669.[0m


Early stopping, best iteration is:
[5]	valid_0's auc: 0.858037	valid_0's binary_logloss: 0.483937
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:10:32,150][0m Trial 11 finished with value: 0.8587652882935353 and parameters: {'n_estimators': 100, 'learning_rate': 0.5, 'num_leaves': 40, 'max_depth': 7, 'min_data_in_leaf': 200, 'lambda_l1': 50, 'lambda_l2': 0, 'min_gain_to_split': 6.0, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 11 with value: 0.8587652882935353.[0m


Early stopping, best iteration is:
[5]	valid_0's auc: 0.858765	valid_0's binary_logloss: 0.480266
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:10:41,455][0m Trial 12 finished with value: 0.8587652882935353 and parameters: {'n_estimators': 100, 'learning_rate': 0.5, 'num_leaves': 20, 'max_depth': 7, 'min_data_in_leaf': 200, 'lambda_l1': 50, 'lambda_l2': 0, 'min_gain_to_split': 5.4, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 11 with value: 0.8587652882935353.[0m


Early stopping, best iteration is:
[5]	valid_0's auc: 0.858765	valid_0's binary_logloss: 0.480266
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:10:56,160][0m Trial 13 finished with value: 0.8715783343040187 and parameters: {'n_estimators': 100, 'learning_rate': 0.5, 'num_leaves': 200, 'max_depth': 8, 'min_data_in_leaf': 200, 'lambda_l1': 35, 'lambda_l2': 0, 'min_gain_to_split': 5.1000000000000005, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 13 with value: 0.8715783343040187.[0m


Early stopping, best iteration is:
[13]	valid_0's auc: 0.871578	valid_0's binary_logloss: 0.45753


[32m[I 2023-11-21 14:10:57,156][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:11:10,232][0m Trial 15 finished with value: 0.8652446126965638 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 200, 'max_depth': 9, 'min_data_in_leaf': 200, 'lambda_l1': 25, 'lambda_l2': 10, 'min_gain_to_split': 6.5, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 13 with value: 0.8715783343040187.[0m


Early stopping, best iteration is:
[8]	valid_0's auc: 0.865245	valid_0's binary_logloss: 0.459729
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:11:23,613][0m Trial 16 finished with value: 0.8617501456027956 and parameters: {'n_estimators': 400, 'learning_rate': 0.45000000000000007, 'num_leaves': 240, 'max_depth': 9, 'min_data_in_leaf': 300, 'lambda_l1': 25, 'lambda_l2': 10, 'min_gain_to_split': 4.1000000000000005, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 13 with value: 0.8715783343040187.[0m


Early stopping, best iteration is:
[6]	valid_0's auc: 0.86175	valid_0's binary_logloss: 0.46822
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:11:38,980][0m Trial 17 finished with value: 0.8797320908561445 and parameters: {'n_estimators': 300, 'learning_rate': 0.45000000000000007, 'num_leaves': 580, 'max_depth': 12, 'min_data_in_leaf': 200, 'lambda_l1': 15, 'lambda_l2': 30, 'min_gain_to_split': 7.800000000000001, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 17 with value: 0.8797320908561445.[0m


Early stopping, best iteration is:
[11]	valid_0's auc: 0.880679	valid_0's binary_logloss: 0.442197
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:11:55,845][0m Trial 18 finished with value: 0.8708503203261504 and parameters: {'n_estimators': 200, 'learning_rate': 0.45000000000000007, 'num_leaves': 600, 'max_depth': 12, 'min_data_in_leaf': 300, 'lambda_l1': 5, 'lambda_l2': 30, 'min_gain_to_split': 10.9, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 17 with value: 0.8797320908561445.[0m


Early stopping, best iteration is:
[16]	valid_0's auc: 0.871069	valid_0's binary_logloss: 0.45048


[32m[I 2023-11-21 14:11:57,307][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:11:58,643][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:12:21,357][0m Trial 21 finished with value: 0.8915259172976122 and parameters: {'n_estimators': 200, 'learning_rate': 0.45000000000000007, 'num_leaves': 580, 'max_depth': 12, 'min_data_in_leaf': 300, 'lambda_l1': 0, 'lambda_l2': 30, 'min_gain_to_split': 10.700000000000001, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 21 with value: 0.8915259172976122.[0m


Early stopping, best iteration is:
[27]	valid_0's auc: 0.892108	valid_0's binary_logloss: 0.419215
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:12:33,907][0m Trial 22 finished with value: 0.8711415259172977 and parameters: {'n_estimators': 100, 'learning_rate': 0.45000000000000007, 'num_leaves': 580, 'max_depth': 12, 'min_data_in_leaf': 200, 'lambda_l1': 10, 'lambda_l2': 30, 'min_gain_to_split': 10.600000000000001, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 21 with value: 0.8915259172976122.[0m


Early stopping, best iteration is:
[4]	valid_0's auc: 0.871142	valid_0's binary_logloss: 0.473304
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:12:36,210][0m Trial 23 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-11-21 14:12:37,654][0m Trial 24 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:12:39,003][0m Trial 25 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:12:41,107][0m Trial 26 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-11-21 14:12:42,662][0m Trial 27 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:12:44,132][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:12:45,692][0m Trial 29 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:12:49,010][0m Trial 30 pruned. Trial was pruned at iteration 3.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:13:12,058][0m Trial 31 finished with value: 0.8814793244030286 and parameters: {'n_estimators': 100, 'learning_rate': 0.45000000000000007, 'num_leaves': 520, 'max_depth': 12, 'min_data_in_leaf': 200, 'lambda_l1': 0, 'lambda_l2': 35, 'min_gain_to_split': 10.3, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 21 with value: 0.8915259172976122.[0m


Early stopping, best iteration is:
[28]	valid_0's auc: 0.881989	valid_0's binary_logloss: 0.429446
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:13:28,748][0m Trial 32 pruned. Trial was pruned at iteration 36.[0m
[32m[I 2023-11-21 14:13:30,218][0m Trial 33 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:13:45,317][0m Trial 34 finished with value: 0.8866482236458941 and parameters: {'n_estimators': 100, 'learning_rate': 0.45000000000000007, 'num_leaves': 660, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 15, 'min_gain_to_split': 8.9, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 21 with value: 0.8915259172976122.[0m


Early stopping, best iteration is:
[7]	valid_0's auc: 0.886648	valid_0's binary_logloss: 0.432383
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:14:00,154][0m Trial 35 finished with value: 0.8835177635410599 and parameters: {'n_estimators': 300, 'learning_rate': 0.35, 'num_leaves': 840, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 20, 'min_gain_to_split': 9.9, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 21 with value: 0.8915259172976122.[0m


Early stopping, best iteration is:
[7]	valid_0's auc: 0.883518	valid_0's binary_logloss: 0.44644
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:14:31,961][0m Trial 36 finished with value: 0.8934187536400698 and parameters: {'n_estimators': 600, 'learning_rate': 0.35, 'num_leaves': 880, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 15, 'min_gain_to_split': 9.9, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 36 with value: 0.8934187536400698.[0m


Early stopping, best iteration is:
[48]	valid_0's auc: 0.894584	valid_0's binary_logloss: 0.405971


[32m[I 2023-11-21 14:14:33,187][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:14:53,014][0m Trial 38 finished with value: 0.8787128712871287 and parameters: {'n_estimators': 600, 'learning_rate': 0.25, 'num_leaves': 860, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 15, 'min_gain_to_split': 9.9, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 36 with value: 0.8934187536400698.[0m


Early stopping, best iteration is:
[13]	valid_0's auc: 0.878713	valid_0's binary_logloss: 0.453928


[32m[I 2023-11-21 14:14:54,666][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:14:56,151][0m Trial 40 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:15:11,378][0m Trial 41 finished with value: 0.8837361677344205 and parameters: {'n_estimators': 200, 'learning_rate': 0.4, 'num_leaves': 700, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 25, 'min_gain_to_split': 10.5, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 36 with value: 0.8934187536400698.[0m


Early stopping, best iteration is:
[7]	valid_0's auc: 0.883736	valid_0's binary_logloss: 0.442169
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:15:13,790][0m Trial 42 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2023-11-21 14:15:15,526][0m Trial 43 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:15:47,609][0m Trial 44 finished with value: 0.9049213744903902 and parameters: {'n_estimators': 200, 'learning_rate': 0.4, 'num_leaves': 680, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 5, 'min_gain_to_split': 8.8, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 44 with value: 0.9049213744903902.[0m


Early stopping, best iteration is:
[37]	valid_0's auc: 0.905941	valid_0's binary_logloss: 0.387826
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:16:13,341][0m Trial 45 finished with value: 0.9018637157833431 and parameters: {'n_estimators': 200, 'learning_rate': 0.4, 'num_leaves': 680, 'max_depth': 9, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 5, 'min_gain_to_split': 8.700000000000001, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 44 with value: 0.9049213744903902.[0m


Early stopping, best iteration is:
[23]	valid_0's auc: 0.902737	valid_0's binary_logloss: 0.390225


[32m[I 2023-11-21 14:16:14,923][0m Trial 46 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:16:16,350][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:16:17,794][0m Trial 48 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:16:19,165][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:16:34,695][0m Trial 50 pruned. Trial was pruned at iteration 33.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:17:01,999][0m Trial 51 pruned. Trial was pruned at iteration 49.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:17:14,572][0m Trial 52 finished with value: 0.8779848573092603 and parameters: {'n_estimators': 200, 'learning_rate': 0.45000000000000007, 'num_leaves': 720, 'max_depth': 10, 'min_data_in_leaf': 100, 'lambda_l1': 15, 'lambda_l2': 0, 'min_gain_to_split': 12.5, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 44 with value: 0.9049213744903902.[0m


Early stopping, best iteration is:
[6]	valid_0's auc: 0.877985	valid_0's binary_logloss: 0.445678
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:17:33,311][0m Trial 53 pruned. Trial was pruned at iteration 35.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:17:57,877][0m Trial 54 finished with value: 0.9095078625509609 and parameters: {'n_estimators': 100, 'learning_rate': 0.4, 'num_leaves': 680, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 15, 'min_gain_to_split': 6.1000000000000005, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 54 with value: 0.9095078625509609.[0m


Early stopping, best iteration is:
[16]	valid_0's auc: 0.909508	valid_0's binary_logloss: 0.384843
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:18:02,227][0m Trial 55 pruned. Trial was pruned at iteration 3.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:18:22,655][0m Trial 56 pruned. Trial was pruned at iteration 33.[0m
[32m[I 2023-11-21 14:18:24,104][0m Trial 57 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:18:25,674][0m Trial 58 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:18:27,110][0m Trial 59 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:18:28,860][0m Trial 60 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:18:52,343][0m Trial 61 pruned. Trial was pruned at iteration 48.[0m
[32m[I 2023-11-21 14:18:53,956][0m Trial 62 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:19:30,403][0m Trial 63 finished with value: 0.907251019219569 and parameters: {'n_estimators': 200, 'learning_rate': 0.45000000000000007, 'num_leaves': 680, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 4.6000000000000005, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 54 with value: 0.9095078625509609.[0m


Early stopping, best iteration is:
[51]	valid_0's auc: 0.907251	valid_0's binary_logloss: 0.389337
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:19:56,284][0m Trial 64 finished with value: 0.8941467676179383 and parameters: {'n_estimators': 300, 'learning_rate': 0.45000000000000007, 'num_leaves': 600, 'max_depth': 6, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 4.800000000000001, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 54 with value: 0.9095078625509609.[0m


Early stopping, best iteration is:
[25]	valid_0's auc: 0.894365	valid_0's binary_logloss: 0.404525
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:19:59,342][0m Trial 65 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2023-11-21 14:20:00,857][0m Trial 66 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2023-11-21 14:20:02,290][0m Trial 67 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:20:05,570][0m Trial 68 pruned. Trial was pruned at iteration 2.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:20:08,799][0m Trial 69 pruned. Trial was pruned at iteration 2.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:20:11,872][0m Trial 70 pruned. Trial was pruned at iteration 2.[0m
[32m[I 2023-11-21 14:20:13,502][0m Trial 71 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:21:12,293][0m Trial 72 finished with value: 0.9114735002912056 and parameters: {'n_estimators': 300, 'learning_rate': 0.5, 'num_leaves': 600, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 0.5, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 72 with value: 0.9114735002912056.[0m


Early stopping, best iteration is:
[35]	valid_0's auc: 0.911474	valid_0's binary_logloss: 0.374209
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:22:07,965][0m Trial 73 finished with value: 0.9071054164239953 and parameters: {'n_estimators': 300, 'learning_rate': 0.5, 'num_leaves': 600, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 10, 'lambda_l2': 0, 'min_gain_to_split': 0.1, 'bagging_fraction': 0.9000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 72 with value: 0.9114735002912056.[0m


Early stopping, best iteration is:
[43]	valid_0's auc: 0.908416	valid_0's binary_logloss: 0.380242
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:22:51,455][0m Trial 74 finished with value: 0.9191904484566104 and parameters: {'n_estimators': 300, 'learning_rate': 0.5, 'num_leaves': 540, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 10, 'lambda_l2': 0, 'min_gain_to_split': 0.7000000000000001, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[47]	valid_0's auc: 0.919773	valid_0's binary_logloss: 0.360624


[32m[I 2023-11-21 14:22:53,356][0m Trial 75 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:23:45,497][0m Trial 76 finished with value: 0.918899242865463 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 600, 'max_depth': 10, 'min_data_in_leaf': 100, 'lambda_l1': 10, 'lambda_l2': 0, 'min_gain_to_split': 0.30000000000000004, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[47]	valid_0's auc: 0.920501	valid_0's binary_logloss: 0.356785


[32m[I 2023-11-21 14:23:47,392][0m Trial 77 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:24:05,971][0m Trial 78 pruned. Trial was pruned at iteration 27.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:24:28,649][0m Trial 79 finished with value: 0.8936371578334305 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 560, 'max_depth': 9, 'min_data_in_leaf': 100, 'lambda_l1': 25, 'lambda_l2': 5, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[8]	valid_0's auc: 0.893637	valid_0's binary_logloss: 0.422873


[32m[I 2023-11-21 14:24:30,458][0m Trial 80 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:24:57,715][0m Trial 81 pruned. Trial was pruned at iteration 35.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:25:23,863][0m Trial 82 pruned. Trial was pruned at iteration 28.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:25:51,999][0m Trial 83 pruned. Trial was pruned at iteration 33.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:26:49,178][0m Trial 84 finished with value: 0.9146767617938265 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 640, 'max_depth': 8, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 5, 'min_gain_to_split': 0.2, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[27]	valid_0's auc: 0.914677	valid_0's binary_logloss: 0.372733
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:26:53,639][0m Trial 85 pruned. Trial was pruned at iteration 2.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:26:56,906][0m Trial 86 pruned. Trial was pruned at iteration 2.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:27:32,661][0m Trial 87 finished with value: 0.8775480489225393 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 680, 'max_depth': 8, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 0, 'min_gain_to_split': 1.2000000000000002, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[4]	valid_0's auc: 0.877548	valid_0's binary_logloss: 0.422013


[32m[I 2023-11-21 14:27:34,130][0m Trial 88 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:28:29,604][0m Trial 89 finished with value: 0.9132207338380897 and parameters: {'n_estimators': 200, 'learning_rate': 0.45000000000000007, 'num_leaves': 680, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 0, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[24]	valid_0's auc: 0.913221	valid_0's binary_logloss: 0.36033


[32m[I 2023-11-21 14:28:31,275][0m Trial 90 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:29:15,077][0m Trial 91 finished with value: 0.9028829353523588 and parameters: {'n_estimators': 200, 'learning_rate': 0.5, 'num_leaves': 680, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 5, 'min_gain_to_split': 0.5, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[18]	valid_0's auc: 0.902883	valid_0's binary_logloss: 0.376459
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:29:58,443][0m Trial 92 finished with value: 0.9050669772859639 and parameters: {'n_estimators': 200, 'learning_rate': 0.5, 'num_leaves': 620, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 0, 'min_gain_to_split': 0.5, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[18]	valid_0's auc: 0.905067	valid_0's binary_logloss: 0.374644
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:30:31,702][0m Trial 93 finished with value: 0.8998252766453116 and parameters: {'n_estimators': 200, 'learning_rate': 0.5, 'num_leaves': 620, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 0, 'min_gain_to_split': 0.0, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[10]	valid_0's auc: 0.899825	valid_0's binary_logloss: 0.39027


[32m[I 2023-11-21 14:30:33,214][0m Trial 94 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:30:36,926][0m Trial 95 pruned. Trial was pruned at iteration 2.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:31:14,698][0m Trial 96 finished with value: 0.9179528246942341 and parameters: {'n_estimators': 400, 'learning_rate': 0.5, 'num_leaves': 520, 'max_depth': 11, 'min_data_in_leaf': 100, 'lambda_l1': 0, 'lambda_l2': 5, 'min_gain_to_split': 0.9, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[9]	valid_0's auc: 0.917953	valid_0's binary_logloss: 0.369351


[32m[I 2023-11-21 14:31:16,414][0m Trial 97 pruned. Trial was pruned at iteration 0.[0m


Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:31:51,629][0m Trial 98 finished with value: 0.9004076878276064 and parameters: {'n_estimators': 500, 'learning_rate': 0.5, 'num_leaves': 500, 'max_depth': 12, 'min_data_in_leaf': 100, 'lambda_l1': 5, 'lambda_l2': 10, 'min_gain_to_split': 1.3, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 74 with value: 0.9191904484566104.[0m


Early stopping, best iteration is:
[18]	valid_0's auc: 0.900408	valid_0's binary_logloss: 0.394732
Training until validation scores don't improve for 20 rounds


[32m[I 2023-11-21 14:31:57,867][0m Trial 99 pruned. Trial was pruned at iteration 4.[0m


In [22]:
# print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest value (auc): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (auc): 0.91128
	Best params:
		n_estimators: 900
		learning_rate: 0.2
		num_leaves: 540
		max_depth: 6
		min_data_in_leaf: 100
		lambda_l1: 0
		lambda_l2: 0
		min_gain_to_split: 0.2
		bagging_fraction: 1.0
		bagging_freq: 1
		feature_fraction: 0.4


### Conduct five-fold cross-training based on the optimum hyperparameter

In [40]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score, matthews_corrcoef
import lightgbm as lgbm

# 创建 KFold 实例
kf = KFold(n_splits=5, random_state=999, shuffle=True)

train_x, test_x, train_y, test_y = train_test_split(data_df, df['label'], test_size=0.2, random_state=999)

# #实例化模型
# model = lgbm.LGBMClassifier(**study.best_params)
model = lgbm.LGBMClassifier(n_estimators=900,
                            learning_rate=0.2,
                            num_leaves=540,
                            max_depth=6,
                            min_data_in_leaf=100,
                            lambda_l1=0,
                            lambda_l2=0,
                            min_gain_to_split=0.2,
                            bagging_fraction=1.0,
                            bagging_freq=1,
                            feature_fraction=0.4,
                            random_state=2023)

acc_list = []
auc_list = []
f1_list = []
bacc_list = []
mcc_list = []
#对训练集进行五折交叉划分
for train_index,val_index in kf.split(train_x):
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(test_x)
    y_prob = model.predict_proba(test_x)

    acc_list.append(accuracy_score(test_y, y_pred))
    auc_list.append(roc_auc_score(test_y, y_prob[:,1]))
    f1_list.append(f1_score(test_y, y_pred, average='binary'))
    bacc_list.append(balanced_accuracy_score(test_y, y_pred))
    mcc_list.append(matthews_corrcoef(test_y, y_pred))

#计算五折交叉验证的平均值和方差
print('accuracy_score:', np.mean(acc_list), np.std(acc_list))
print('roc_auc_score:', np.mean(auc_list), np.std(auc_list))
print('f1_score:', np.mean(f1_list), np.std(f1_list))
print('balanced_accuracy_score:', np.mean(bacc_list), np.std(bacc_list))
print('matthews_corrcoef:', np.mean(mcc_list), np.std(mcc_list))


accuracy_score: 0.824170616113744 0.005060226659730487
roc_auc_score: 0.9115695871292887 0.007036931467870062
f1_score: 0.7694247176188724 0.005699725924404691
balanced_accuracy_score: 0.8198584997092461 0.004669808908649999
matthews_corrcoef: 0.6294302426108571 0.009432759767705818


#### Save model performance results

In [39]:

result_df_offtarget = pd.DataFrame({'accuracy_score':acc_list,
                                'roc_auc_score':auc_list,
                                'f1_score':f1_list,
                                'balanced_accuracy_score':bacc_list,
                                'matthews_corrcoef':mcc_list})
# result_df_ecfp.head(5)
result_df_offtarget.to_csv('/home/liujin/Offtarget_drugsafety/offtarget_application/Toxicity_prediction/picture_draw/result_df_offtarget.csv',index=False)