In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
from tpot import TPOTClassifier
import matplotlib.pyplot as plt

# Data preparation

## full 16k features

In [2]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class_mixed.parquet")
pd_train["label"] = pd_train["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)
print(pd_train["label"].value_counts())
pd_train.head()

(1221, 16095)
label
1    723
0    498
Name: count, dtype: int64


Unnamed: 0_level_0,Smiles,Liver,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
998,N1C[C@H]2C[C@H](c3c2cc2nccnc2c3)C1,NonHepatotoxicity,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,CCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)O)I,Hepatotoxicity,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
283,O[C@@]12[C@H]3[C@@H]([C@@]4([C@H](CC3)C[C@@H](...,NonHepatotoxicity,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1082,O=C1[C@@]2(C(C3C([C@@]4(C(=CC3)C[C@@H](O)CC4)C...,Hepatotoxicity,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
122,O[C@@]12[C@@]3(CCN([C@H]1Cc1c3cc(O)cc1)CC1CCC1...,NonHepatotoxicity,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class_mixed.parquet")
pd_test["label"] = pd_test["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)
print(pd_test["label"].value_counts())
pd_test.head()

(306, 16095)
label
1    181
0    125
Name: count, dtype: int64


Unnamed: 0_level_0,Smiles,Liver,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
724,Clc1c(cc([C@]2(O)CCN(CC2)CCC[C@@H](c2ccc(F)cc2...,NonHepatotoxicity,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
82,CN(C)N/N=C\1/C(=NC=N1)C(=O)N,Hepatotoxicity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
384,F[C@]12[C@H]([C@H]3[C@@]([C@](O)(CC3)C(=O)CO)(...,NonHepatotoxicity,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
627,O=C(N1[C@@H](Cc2c(C1)cc(OC)c(OC)c2)C(=O)O)[C@@...,Hepatotoxicity,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
896,Oc1cc([C@@H]([C@H](CN(C)C)C)CC)ccc1,Hepatotoxicity,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
pd_data = pd.concat([pd_train, pd_test], axis=0)
print(pd_data.shape)

(1527, 16095)


In [5]:
# get X, y
X = pd_data.drop(columns=["Liver", "label", "Smiles"])
y = pd_data["label"]

print(X.shape)
print(y.shape)

(1527, 16092)
(1527,)


## PCA

In [6]:
n_components = 50
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Model training

## TPOT classifier

In [None]:
# Initialize TPOTClassifier with 5-fold cross-validation
tpot = TPOTClassifier(
    generations=10,  # Number of iterations
    population_size=40,  # Number of pipelines to evaluate in each generation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    scoring="roc_auc",  # AUC
    verbosity=2,  # Output progress
    n_jobs=16,  # Use 16 cores
)

# Fit the TPOT classifier on the training data
tpot.fit(X, y)

Optimization Progress:   0%|          | 0/440 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7380435015536269

Generation 2 - Current best internal CV score: 0.7419779277831352

Generation 3 - Current best internal CV score: 0.744598735669131

Generation 4 - Current best internal CV score: 0.744598735669131

Generation 5 - Current best internal CV score: 0.7452544733740492

Generation 6 - Current best internal CV score: 0.7511325404478731

Generation 7 - Current best internal CV score: 0.7511325404478731

Generation 8 - Current best internal CV score: 0.7511325404478731

Generation 9 - Current best internal CV score: 0.7511325404478731

Generation 10 - Current best internal CV score: 0.7511325404478731

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.4, min_samples_leaf=3, min_samples_split=3, n_estimators=100)


In [7]:
# Export the best pipeline
tpot.export("models/tpot_best_pipeline_automl_mixed.py")

## Optuna for RandomForestClassifier

In [16]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer

In [17]:
# Objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 10, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Create the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,
    )

    # Cross-validation with AUC score
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(
        model, X, y, cv=cv, scoring=make_scorer(roc_auc_score, needs_proba=True)
    )

    # Maximize the average AUC score
    return auc_scores.mean()

In [18]:
# Create the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-16 21:22:18,588] A new study created in memory with name: no-name-d4b8bf5c-ef81-487a-a86b-40ffa7507107
[I 2024-11-16 21:22:20,999] Trial 0 finished with value: 0.7842809188300758 and parameters: {'n_estimators': 225, 'max_depth': 18, 'min_samples_split': 14, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7842809188300758.
[I 2024-11-16 21:22:23,360] Trial 1 finished with value: 0.7814939852274302 and parameters: {'n_estimators': 238, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.7842809188300758.
[I 2024-11-16 21:22:25,297] Trial 2 finished with value: 0.7607920275649024 and parameters: {'n_estimators': 237, 'max_depth': 6, 'min_samples_split': 19, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7842809188300758.
[I 2024-11-16 21:22:27,429] Trial 3 finished with value: 0.7538252282223409 and parameters: {'n_estimators': 271, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value:

In [19]:
# Output the best parameters and score
print("Best AUC Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)

Best AUC Score: 0.7969017200340601
Best Hyperparameters: {'n_estimators': 132, 'max_depth': 19, 'min_samples_split': 5, 'min_samples_leaf': 2}


# Model training PCA

## TPOT classifier

In [16]:
# Initialize TPOTClassifier with 5-fold cross-validation
tpot = TPOTClassifier(
    generations=10,  # Number of iterations
    population_size=40,  # Number of pipelines to evaluate in each generation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    scoring="roc_auc",  # AUC
    verbosity=2,  # Output progress
    n_jobs=16,  # Use 16 cores
)

# Fit the TPOT classifier on the training data
tpot.fit(X_pca, y)

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


Optimization Progress:   0%|          | 0/440 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7898838836412603

Generation 2 - Current best internal CV score: 0.7898838836412603

Generation 3 - Current best internal CV score: 0.7914172193508783

Generation 4 - Current best internal CV score: 0.7914172193508783

Generation 5 - Current best internal CV score: 0.7935651779243154

Generation 6 - Current best internal CV score: 0.7935651779243154

Generation 7 - Current best internal CV score: 0.7935651779243154

Generation 8 - Current best internal CV score: 0.7988901641616666

Generation 9 - Current best internal CV score: 0.7988901641616666

Generation 10 - Current best internal CV score: 0.7988901641616666

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.55, min_samples_leaf=3, min_samples_split=9, n_estimators=100)


In [None]:
# Export the best pipeline
tpot.export("models/tpot_best_pipeline_pca_automl_mixed.py")

## Optuna for RandomForestClassifier

In [7]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer

In [10]:
def objective(trial):
    # Hyperparameter suggestions
    n_estimators = trial.suggest_int("n_estimators", 10, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", [None, "sqrt", "log2"])

    # Create the model
    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )

    # Stratified K-Fold Cross Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Evaluate using AUC score
    auc_scores = cross_val_score(
        model, X_pca, y, cv=cv, scoring=make_scorer(roc_auc_score, needs_proba=True)
    )

    # Return the mean AUC score to maximize
    return auc_scores.mean()

In [11]:
# Create the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-16 21:20:58,452] A new study created in memory with name: no-name-a975eb17-689b-4568-8e66-6acb80446a94
[I 2024-11-16 21:20:58,684] Trial 0 finished with value: 0.7543521360819025 and parameters: {'n_estimators': 22, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7543521360819025.
[I 2024-11-16 21:20:59,935] Trial 1 finished with value: 0.7883631572902433 and parameters: {'n_estimators': 257, 'max_depth': 13, 'min_samples_split': 17, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 1 with value: 0.7883631572902433.
[I 2024-11-16 21:21:00,790] Trial 2 finished with value: 0.7864220051882215 and parameters: {'n_estimators': 190, 'max_depth': 16, 'min_samples_split': 17, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7883631572902433.
[I 2024-11-16 21:21:01,213] Trial 3 finished with value: 0.7769869351868354 and parameters: {'n_estimators': 81, 'max_depth': 12, 'mi

In [12]:
# Output the best parameters and score
print("Best AUC Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)

Best AUC Score: 0.7999981092694906
Best Hyperparameters: {'n_estimators': 171, 'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'log2'}


In [13]:
# Train the final model with the best hyperparameters
best_params = study.best_params
final_model = ExtraTreesClassifier(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X, y)