### Use SVM to train toxic/non-toxic data

#### 1.Import related packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC #SVM
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

#### 2.Import data

In [2]:
toxic_df = pd.read_csv('/home/liujin/Drugsafety/toxic_case_analysis/toxic_predict_data/toxic_offtarget_profile.csv')
not_toxic_df = pd.read_csv('/home/liujin/Drugsafety/toxic_case_analysis/toxic_predict_data/nontoxic_offtarget_profile.csv')
toxic_df['label'] = 1
not_toxic_df['label'] = 0

df = pd.concat([toxic_df, not_toxic_df], axis=0)
print(toxic_df.shape, not_toxic_df.shape, df.shape)

data_df = df.drop(['smiles','label'], axis=1)
print(data_df.shape)

train_x, test_x, train_y, test_y = train_test_split(data_df, df['label'], test_size=0.2, random_state=999)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
print(data_df.shape)
print(len(df['label']))

(877, 244) (1229, 244) (2106, 244)
(2106, 242)
(1684, 242) (422, 242) (1684,) (422,)
(2106, 242)
2106


#### 3.Hyperparameter search

In [7]:
import numpy as np
import os
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
import joblib
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [8]:
def objective(trial,X,y):

    X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.1,random_state=999)
    
    param = {
        'C': trial.suggest_loguniform('C', 1e-5, 1e5),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
        'degree': trial.suggest_int('degree', 1, 5),
        'probability': True,
        'random_state': 2023,
    }

    dt_clf = SVC(**param)
    
    dt_clf.fit(X_train, y_train)
    pred_dt = dt_clf.predict(X_val) 
    proba = dt_clf.predict_proba(X_val)
    score = roc_auc_score(y_val, proba[:,1])

    return score

In [16]:
study = optuna.create_study(direction="maximize", study_name="RF Classifier")
func = lambda trial: objective(trial, train_x, train_y)
study.optimize(func, n_trials=1000)

[I 2023-11-02 05:26:30,902] A new study created in memory with name: RF Classifier
  'C': trial.suggest_loguniform('C', 1e-5, 1e5),
  'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
[I 2023-11-02 05:26:32,675] Trial 0 finished with value: 0.8562900407687828 and parameters: {'C': 35.334107672432815, 'gamma': 0.7524097361345803, 'degree': 3}. Best is trial 0 with value: 0.8562900407687828.
  'C': trial.suggest_loguniform('C', 1e-5, 1e5),
  'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
[I 2023-11-02 05:26:34,555] Trial 1 finished with value: 0.7850174723354688 and parameters: {'C': 801.4175041066849, 'gamma': 18.801648028292686, 'degree': 3}. Best is trial 0 with value: 0.8562900407687828.
  'C': trial.suggest_loguniform('C', 1e-5, 1e5),
  'gamma': trial.suggest_loguniform('gamma', 1e-5, 1e5),
[I 2023-11-02 05:26:36,152] Trial 2 finished with value: 0.5987186953989517 and parameters: {'C': 4.5402671010122205e-05, 'gamma': 44326.988206845504, 'degree': 4}. Best is trial 0 w

In [1]:
print(f"\tBest value (auc): {study.best_value:.5f}")
print(f"\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

#### 4.Conduct five-fold cross-training based on the optimum hyperparameter

In [23]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.svm import SVC #SVM


kf = KFold(n_splits=5, random_state=999, shuffle=True)

train_x, test_x, train_y, test_y = train_test_split(data_df, df['label'], test_size=0.2, random_state=999)

# model = SVC(**study.best_params)
model = SVC(C=125,
		    gamma=9.56e-05,
		    degree=5, 
            random_state=5,
            probability=True)

acc_list = []
auc_list = []
f1_list = []
bacc_list = []
mcc_list = []

for train_index,val_index in kf.split(train_x):
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(test_x)
    y_prob = model.predict_proba(test_x)

    acc_list.append(accuracy_score(test_y, y_pred))
    auc_list.append(roc_auc_score(test_y, y_prob[:,1]))
    f1_list.append(f1_score(test_y, y_pred, average='binary'))
    bacc_list.append(balanced_accuracy_score(test_y, y_pred))
    mcc_list.append(matthews_corrcoef(test_y, y_pred))


print('accuracy_score:', np.mean(acc_list), np.std(acc_list))
print('roc_auc_score:', np.mean(auc_list), np.std(auc_list))
print('f1_score:', np.mean(f1_list), np.std(f1_list))
print('balanced_accuracy_score:', np.mean(bacc_list), np.std(bacc_list))
print('matthews_corrcoef:', np.mean(mcc_list), np.std(mcc_list))

accuracy_score: 0.7933649289099526 0.004594957210821155
roc_auc_score: 0.8802311494475674 0.0025564065025843108
f1_score: 0.7196999456892037 0.007978496185535283
balanced_accuracy_score: 0.7793080054274084 0.0062354442926821465
matthews_corrcoef: 0.5564497965339346 0.010291845124452815
