In [12]:
import pandas as pd
import numpy as np
from tpot import TPOTClassifier
# from transformers import BertTokenizer, BertModel

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
)

# Data preparation

## Load fingerprints

In [2]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class.parquet")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)
print(pd_train["Class"].value_counts())
pd_train.head()

(1241, 16094)
Class
Hepatotoxicity       683
NonHepatotoxicity    558
Name: count, dtype: int64


Unnamed: 0,Class,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
0,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class.parquet")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)
print(pd_test["Class"].value_counts())
pd_test.head()

(286, 16094)
Class
Hepatotoxicity       221
NonHepatotoxicity     65
Name: count, dtype: int64


Unnamed: 0,Class,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
0,Hepatotoxicity,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Hepatotoxicity,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,Hepatotoxicity,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Hepatotoxicity,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
X_train_fingerprints = pd_train.drop(columns=["Class", "label"])
X_test_fingerprints = pd_test.drop(columns=["Class", "label"])
print(X_train_fingerprints.shape)
print(X_test_fingerprints.shape)

(1241, 16092)
(286, 16092)


## Load embeddings

In [5]:
# Use a pre-trained tokenizer and model (e.g., ChemBERTa)
tokenizer = BertTokenizer.from_pretrained(
    "seyonec/SMILES_tokenized_PubChem_shard00_160k"
)
model = BertModel.from_pretrained("seyonec/SMILES_tokenized_PubChem_shard00_160k")

# Tokenize SMILES string
inputs = tokenizer("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", return_tensors="pt")
outputs = model(**inputs)

# Get the embeddings
smiles_embedding = outputs.last_hidden_state
print("SMILES Embedding Shape:", smiles_embedding.shape)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.

SMILES Embedding Shape: torch.Size([1, 26, 768])


In [6]:
# Tokenize and embed each SMILES string
def embed_smiles(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", max_length=514, truncation=True)
    outputs = model(**inputs)
    # Use mean pooling to create a fixed-size embedding
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.detach().cpu().numpy()

In [7]:
pd_train_smiles = pd.read_csv("data_smiles/Training_Group.csv")
pd_train_smiles["embedding"] = pd_train_smiles["Smiles"].apply(embed_smiles)
print(pd_train_smiles.shape)
pd_train_smiles.head()

(1241, 3)


Unnamed: 0,Smiles,Liver,embedding
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity,"[[0.7060221, -1.7110245, 0.07632195, 0.3369924..."
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity,"[[0.87278545, -1.546725, 0.25801018, 0.7569198..."
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity,"[[0.9185096, -1.5700381, 0.30937475, 0.7761047..."
3,O(CCO)CC,Hepatotoxicity,"[[0.71440613, -1.454202, 0.14423189, 0.3271389..."
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity,"[[0.7504034, -1.4247329, 0.04407968, 0.2881815..."


In [8]:
pd_test_smiles = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test_smiles["embedding"] = pd_test_smiles["Smiles"].apply(embed_smiles)
print(pd_test_smiles.shape)
pd_test_smiles.head()

(286, 3)


Unnamed: 0,Smiles,Liver,embedding
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity,"[[0.92375356, -1.662626, 0.4028539, 0.84775025..."
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity,"[[0.8236335, -1.7612044, 0.49285388, 0.7976179..."
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity,"[[0.8446836, -1.6429605, 0.3667821, 0.7405252,..."
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity,"[[0.8776639, -1.9113547, 0.44247052, 0.5683984..."
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity,"[[0.8453375, -1.8455155, 0.4760615, 0.6869821,..."


In [9]:
X_train_embedding = np.vstack(pd_train_smiles["embedding"].values)  # Stack embeddings into a 2D array
X_test_embedding = np.vstack(pd_test_smiles["embedding"].values)  # Stack embeddings into a 2D array
print(X_train_embedding.shape)
print(X_test_embedding.shape)

(1241, 768)
(286, 768)


## Combine features

In [10]:
# combine fingerprints and embeddings
X_train = np.hstack([X_train_fingerprints, X_train_embedding])
X_test = np.hstack([X_test_fingerprints, X_test_embedding])
print(X_train.shape)
print(X_test.shape)

(1241, 16860)
(286, 16860)


In [11]:
y_train = pd_train["label"].values
y_test = pd_test["label"].values
print(y_train.shape)
print(y_test.shape)

(1241,)
(286,)


In [12]:
# save the data
np.save("data_combined/X_train_nlp_fingerprints.npy", X_train)
np.save("data_combined/X_test_nlp_fingerprints.npy", X_test)
np.save("data_combined/y_train_nlp_fingerprints.npy", y_train)
np.save("data_combined/y_test_nlp_fingerprints.npy", y_test)

# Model training (full features NLP + fingerprints)

## TPOT classifier

In [3]:
# load the data
X_train = np.load("data_combined/X_train_nlp_fingerprints.npy")
X_test = np.load("data_combined/X_test_nlp_fingerprints.npy")
y_train = np.load("data_combined/y_train_nlp_fingerprints.npy")
y_test = np.load("data_combined/y_test_nlp_fingerprints.npy")
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1241, 16860)
(286, 16860)
(1241,)
(286,)


In [4]:
# Initialize TPOTClassifier with 5-fold cross-validation
tpot = TPOTClassifier(
    generations=10,  # Number of iterations
    population_size=40,  # Number of pipelines to evaluate in each generation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    scoring="roc_auc",  # AUC
    verbosity=2,  # Output progress
    n_jobs=16,  # Use 16 cores
)

# Fit the TPOT classifier on the training data
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/440 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7384421566762432

Generation 2 - Current best internal CV score: 0.7435618446184945

Generation 3 - Current best internal CV score: 0.7435618446184945

Generation 4 - Current best internal CV score: 0.7435618446184945

Generation 5 - Current best internal CV score: 0.7455172747733932

Generation 6 - Current best internal CV score: 0.7488648388927478

Generation 7 - Current best internal CV score: 0.750925951733031

Generation 8 - Current best internal CV score: 0.750925951733031

Generation 9 - Current best internal CV score: 0.7544638090983089

Generation 10 - Current best internal CV score: 0.7544638090983089

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.05, min_samples_leaf=5, min_samples_split=14, n_estimators=100)


In [5]:
# Extract model name and parameters
best_pipeline = tpot.fitted_pipeline_
model_name = type(
    best_pipeline.steps[-1][1]
).__name__  # Name of the final estimator
print(model_name)

ExtraTreesClassifier


In [6]:
model_params = best_pipeline.steps[-1][
    1
].get_params()  # Params of the final estimator
model_params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 0.05,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 5,
 'min_samples_split': 14,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [17]:
y_pred_proba = tpot.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score:", test_auc)

Test AUC score: 0.8987817612252001


## Optuna for ExtraTreesClassifier

In [7]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [10]:
def objective(trial):
    # Suggest values for hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    max_features = trial.suggest_categorical("max_features", [None, "sqrt", "log2"])

    # Initialize the model with suggested hyperparameters
    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = cross_val_score(
        model, X_train, y_train, scoring=make_scorer(roc_auc_score, needs_proba=True), cv=cv
    )
    return auc_scores.mean()

In [11]:
# Create Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-18 19:35:09,904] A new study created in memory with name: no-name-0876af1a-ae8b-4887-8434-422125c61b89
[I 2024-11-18 19:39:10,126] Trial 0 finished with value: 0.6949428722218813 and parameters: {'n_estimators': 200, 'max_depth': 16, 'min_samples_split': 18, 'min_samples_leaf': 19, 'max_features': None}. Best is trial 0 with value: 0.6949428722218813.
[I 2024-11-18 19:39:16,810] Trial 1 finished with value: 0.7572524434167038 and parameters: {'n_estimators': 255, 'max_depth': 8, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7572524434167038.
[I 2024-11-18 19:48:04,829] Trial 2 finished with value: 0.6896201242876319 and parameters: {'n_estimators': 454, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 11, 'max_features': None}. Best is trial 1 with value: 0.7572524434167038.
[I 2024-11-18 19:48:06,208] Trial 3 finished with value: 0.7402860431507384 and parameters: {'n_estimators': 241, 'max_depth': 14, 'min

In [15]:
# Train final model with the best parameters
best_params = study.best_params
final_model = ExtraTreesClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

In [16]:
y_pred_proba = final_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score:", test_auc)

Test AUC score: 0.9021232161503656
