In [2]:
# from rdkit import Chem
# from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer

# smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
# tokenizer = BasicSmilesTokenizer()
# tokens = tokenizer.tokenize(smiles)
# print("Tokenized SMILES:", tokens)

In [1]:
from transformers import BertTokenizer, BertModel

# Use a pre-trained tokenizer and model (e.g., ChemBERTa)
tokenizer = BertTokenizer.from_pretrained(
    "seyonec/SMILES_tokenized_PubChem_shard00_160k"
)
model = BertModel.from_pretrained("seyonec/SMILES_tokenized_PubChem_shard00_160k")

# Tokenize SMILES string
inputs = tokenizer("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", return_tensors="pt")
outputs = model(**inputs)

# Get the embeddings
smiles_embedding = outputs.last_hidden_state
print("SMILES Embedding Shape:", smiles_embedding.shape)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention

SMILES Embedding Shape: torch.Size([1, 26, 768])


In [3]:
smiles_embedding_np = smiles_embedding.detach().cpu().numpy()  # Convert to NumPy array
smiles_embedding_np

array([[[-8.7590200e-01,  3.0506086e-01,  6.9207108e-01, ...,
          5.4176462e-01,  5.5791640e-01, -5.0844544e-01],
        [-1.2149686e+00,  4.3680158e-01, -4.5811260e-01, ...,
          6.9964379e-01,  1.6057323e+00,  1.6602541e+00],
        [-1.0540334e+00,  7.6476914e-01, -1.2318478e+00, ...,
          2.3868134e+00,  2.2018609e+00,  1.3283165e-03],
        ...,
        [-1.0319257e+00, -2.1826245e-01, -8.8171393e-02, ...,
          1.7335273e+00,  1.4440529e+00, -5.0599241e-01],
        [-2.7555614e+00,  1.7899810e-01, -5.9625320e-02, ...,
         -2.3788512e-01, -4.2273989e-01,  1.8023103e-01],
        [-1.8629063e+00, -2.4388081e-01, -4.4336745e-01, ...,
         -5.4076403e-02,  1.4175699e+00, -9.9230146e-01]]], dtype=float32)

In [4]:
smiles_embedding_np.shape

(1, 26, 768)

# Data preparation

In [1]:
import pandas as pd
import numpy as np
import joblib

In [10]:
# Load pd_train
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train["label"] = pd_train["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)

(1241, 3)


In [13]:
pd_train["label"].value_counts()

label
1    683
0    558
Name: count, dtype: int64

In [14]:
# Load pd_test
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test["label"] = pd_test["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)

(286, 3)


In [15]:
pd_test["label"].value_counts()

label
1    221
0     65
Name: count, dtype: int64

In [None]:
# Tokenize and embed each SMILES string
def embed_smiles(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", max_length=514, truncation=True)
    outputs = model(**inputs)
    # Use mean pooling to create a fixed-size embedding
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.detach().cpu().numpy()


# Apply embedding to each SMILES string
pd_train["Embeddings"] = pd_train["Smiles"].apply(embed_smiles)

# Save the embeddings
joblib.dump(pd_train, "data_smiles/Training_Group_embeddings.pkl")

['data_smiles/Training_Group_embeddings.pkl']

In [31]:
# pd_test["Embeddings"] = pd_test["Smiles"].apply(embed_smiles)
joblib.dump(pd_test, "data_smiles/Testing_Group_embeddings.pkl")

['data_smiles/Testing_Group_embeddings.pkl']

In [2]:
# load the embeddings
pd_train = joblib.load("data_smiles/Training_Group_embeddings.pkl")
pd_test = joblib.load("data_smiles/Testing_Group_embeddings.pkl")

In [3]:
# Prepare features and labels for the split
X_train = np.vstack(pd_train["Embeddings"].values)  # Stack embeddings into a 2D array
y_train = pd_train["label"].values

print("X_train Shape:", X_train.shape)
print("y_train Shape:", y_train.shape)

X_train Shape: (1241, 768)
y_train Shape: (1241,)


In [4]:
X_test = np.vstack(pd_test["Embeddings"].values)  # Stack embeddings into a 2D array
y_test = pd_test["label"].values

print("X_test Shape:", X_test.shape)
print("y_test Shape:", y_test.shape)

X_test Shape: (286, 768)
y_test Shape: (286,)


# Model training

In [5]:
from tpot import TPOTClassifier

# Set up TPOT with basic configuration
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring="roc_auc",  # AUC
    random_state=42,
    n_jobs=6,  # Use 4 cores
)

# Fit the model
tpot.fit(X_train, y_train)

                                                                              
Generation 1 - Current best internal CV score: 0.631005816769892
                                                                             
Generation 2 - Current best internal CV score: 0.6410992739100967
                                                                             
Generation 3 - Current best internal CV score: 0.6411517098701781
                                                                             
Generation 4 - Current best internal CV score: 0.6411517098701781
                                                                              
Generation 5 - Current best internal CV score: 0.6411517098701781
                                                                              
Best pipeline: KNeighborsClassifier(ZeroCount(input_matrix), n_neighbors=27, p=2, weights=distance)


In [7]:
# Access the best pipeline directly after fitting
from pprint import pprint
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
)


best_pipeline = tpot.fitted_pipeline_

#
y_pred = best_pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred)

y_pred = (y_pred > 0.5).astype(int)  # Binarize predictions for threshold of 0.5
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)  # Same as sensitivity for positive class

# Calculate sensitivity and specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Extract model name and parameters
model_name = type(best_pipeline.steps[-1][1]).__name__  # Name of the final estimator
model_params = best_pipeline.steps[-1][1].get_params()  # Params of the final estimator

# Append results to list
result = {
    "Best model": best_pipeline,
    "Model Name": model_name,
    "Parameters": model_params,
    "AUC": auc,
    "Precision": precision,
    "Recall": recall,
    "Sensitivity": sensitivity,
    "Specificity": specificity,
}

pprint(result)

{'AUC': np.float64(0.5876435781413156),
 'Best model': Pipeline(steps=[('zerocount', ZeroCount()),
                ('kneighborsclassifier',
                 KNeighborsClassifier(n_neighbors=27, weights='distance'))]),
 'Model Name': 'KNeighborsClassifier',
 'Parameters': {'algorithm': 'auto',
                'leaf_size': 30,
                'metric': 'minkowski',
                'metric_params': None,
                'n_jobs': None,
                'n_neighbors': 27,
                'p': 2,
                'weights': 'distance'},
 'Precision': np.float64(0.7955555555555556),
 'Recall': np.float64(0.8099547511312217),
 'Sensitivity': np.float64(0.8099547511312217),
 'Specificity': np.float64(0.2923076923076923)}
