### Train a random forest (RF) classifier on toxic/non-toxic data

#### 1. Import required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

#### 2. Load data

In [2]:
# Load the two off-target profile tables and tag every row with its class
# (1 for the toxic table, 0 for the non-toxic table).
toxic_df = pd.read_csv(
    '/home/liujin/Drugsafety/toxic_case_analysis/toxic_predict_data/toxic_offtarget_profile.csv'
).assign(label=1)
not_toxic_df = pd.read_csv(
    '/home/liujin/Drugsafety/toxic_case_analysis/toxic_predict_data/nontoxic_offtarget_profile.csv'
).assign(label=0)

# Stack both classes row-wise into one table, toxic rows first.
df = pd.concat([toxic_df, not_toxic_df])
print(toxic_df.shape, not_toxic_df.shape, df.shape)

# Feature matrix: every column except 'smiles' and the target column.
data_df = df.drop(columns=['smiles', 'label'])
print(data_df.shape)

# Hold out 20% of the rows as the test set; the seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    data_df,
    df['label'],
    test_size=0.2,
    random_state=999,
)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
print(data_df.shape)
print(len(df['label']))

(877, 244) (1229, 244) (2106, 244)
(2106, 242)
(1684, 242) (422, 242) (1684,) (422,)
(2106, 242)
2106


#### 3. Hyperparameter search

In [5]:
import numpy as np
import os
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
def objective(trial,X,y):
    """Optuna objective: validation ROC AUC of a random forest for one trial.

    A fixed 90/10 split of ``X``/``y`` is made (same seed on every call, so all
    trials are scored on the same validation rows), a ``RandomForestClassifier``
    is fitted on the 90% part with the hyperparameters suggested by ``trial``,
    and the ROC AUC of its class-1 probabilities on the 10% part is returned.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw ``n_estimators``, ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features``.
    X : features accepted by ``train_test_split`` / ``RandomForestClassifier.fit``
    y : labels aligned with ``X``

    Returns
    -------
    float
        ROC AUC on the validation part (higher is better).
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=999)

    # The suggestions are drawn in the same order and under the same names as before.
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 250, step=10),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 20, 80),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 40, 120),
        'max_features': trial.suggest_int('max_features', 3, 9),
        # Fixed forest seed, so score differences between trials come from the
        # hyperparameters rather than from the forest's own randomness.
        'random_state': 2023,
    }

    rf_clf = RandomForestClassifier(**param)
    rf_clf.fit(X_train, y_train)

    # Only probabilities are needed for ROC AUC; the hard-label prediction that
    # used to be computed here was never read and has been removed.
    proba = rf_clf.predict_proba(X_val)
    return roc_auc_score(y_val, proba[:, 1])

In [7]:
# Seed the sampler so the search proposes the same sequence of trials on every
# run; without an explicit sampler the study is created with an unseeded one and
# the "best" hyperparameters change between kernel restarts.
study = optuna.create_study(
    direction="maximize",
    study_name="RF Classifier",
    sampler=optuna.samplers.TPESampler(seed=2023),
)


def func(trial):
    """Bind the training split to ``objective`` so Optuna can call it with only a trial."""
    return objective(trial, train_x, train_y)


study.optimize(func, n_trials=1000)

[I 2023-09-15 02:25:50,107] A new study created in memory with name: RF Classifier
[I 2023-09-15 02:25:50,237] Trial 0 finished with value: 0.8613861386138614 and parameters: {'n_estimators': 40, 'max_depth': 9, 'min_samples_split': 37, 'min_samples_leaf': 110, 'max_features': 3}. Best is trial 0 with value: 0.8613861386138614.
[I 2023-09-15 02:25:50,541] Trial 1 finished with value: 0.8637157833430401 and parameters: {'n_estimators': 90, 'max_depth': 9, 'min_samples_split': 72, 'min_samples_leaf': 114, 'max_features': 5}. Best is trial 1 with value: 0.8637157833430401.
[I 2023-09-15 02:25:51,697] Trial 2 finished with value: 0.8841001747233547 and parameters: {'n_estimators': 240, 'max_depth': 5, 'min_samples_split': 71, 'min_samples_leaf': 76, 'max_features': 8}. Best is trial 2 with value: 0.8841001747233547.
[I 2023-09-15 02:25:51,789] Trial 3 finished with value: 0.8724519510774607 and parameters: {'n_estimators': 20, 'max_depth': 9, 'min_samples_split': 71, 'min_samples_leaf': 51

In [8]:
# Summarise the winning trial: its objective value first, then one indented
# line per tuned hyperparameter.
report_lines = [
    f"\tBest value (auc): {study.best_value:.5f}",
    f"\tBest params:",
]
report_lines.extend(
    f"\t\t{param_name}: {param_value}"
    for param_name, param_value in study.best_params.items()
)
print("\n".join(report_lines))

	Best value (auc): 0.90128
	Best params:
		n_estimators: 170
		max_depth: 8
		min_samples_split: 22
		min_samples_leaf: 40
		max_features: 8


#### 4. Five-fold cross-training with the optimal hyperparameters, evaluated on the held-out test set

In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier


# Five shuffled folds over the training rows; each iteration trains on four of them.
kf = KFold(n_splits=5, random_state=999, shuffle=True)

# Recreates the same 80/20 split as the data-loading cell (same arguments and seed).
train_x, test_x, train_y, test_y = train_test_split(data_df, df['label'], test_size=0.2, random_state=999)

# Hyperparameters are the best values reported by the Optuna search, hard-coded so
# this cell can run without repeating the search. random_state is not part of the
# searched parameters and is fixed here explicitly.
model = RandomForestClassifier(n_estimators=170,
                                max_depth=8,
                                min_samples_split=22,
                                min_samples_leaf=40,
                                max_features=8,
                                random_state=2023)

# One entry per fold for every metric.
acc_list = []
auc_list = []
f1_list = []
bacc_list = []
mcc_list = []

for train_index, _val_index in kf.split(train_x):
    # Only the training part of each fold is used. The left-out fold is not scored:
    # every model is evaluated on the same held-out test set, so the spread reported
    # below reflects sensitivity to the training subset only.
    X_train = train_x.iloc[train_index]
    y_train = train_y.iloc[train_index]

    # fit() rebuilds the forest from scratch, so reusing one estimator object is safe.
    model.fit(X_train, y_train)
    y_pred = model.predict(test_x)
    y_prob = model.predict_proba(test_x)

    acc_list.append(accuracy_score(test_y, y_pred))
    auc_list.append(roc_auc_score(test_y, y_prob[:,1]))  # column 1 = probability of label 1
    f1_list.append(f1_score(test_y, y_pred, average='binary'))
    bacc_list.append(balanced_accuracy_score(test_y, y_pred))
    mcc_list.append(matthews_corrcoef(test_y, y_pred))


# Mean and standard deviation of each test-set metric across the five models.
print('accuracy_score:', np.mean(acc_list), np.std(acc_list))
print('roc_auc_score:', np.mean(auc_list), np.std(auc_list))
print('f1_score:', np.mean(f1_list), np.std(f1_list))
print('balanced_accuracy_score:', np.mean(bacc_list), np.std(bacc_list))
print('matthews_corrcoef:', np.mean(mcc_list), np.std(mcc_list))

accuracy_score: 0.7753554502369668 0.001773297339703314
roc_auc_score: 0.8967362861019577 0.0023234866862578843
f1_score: 0.7351785213197362 0.0032494718601906495
balanced_accuracy_score: 0.7921981004070557 0.002993833171922471
matthews_corrcoef: 0.5630097611389475 0.005597655404513538
