In [25]:
import pandas as pd
import numpy as np
from tpot import TPOTClassifier
from transformers import BertTokenizer, BertModel

# Data preparation

## Load fingerprints

In [4]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class.parquet")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)
print(pd_train["Class"].value_counts())
pd_train.head()

(1241, 16094)
Class
Hepatotoxicity       683
NonHepatotoxicity    558
Name: count, dtype: int64


Unnamed: 0,Class,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
0,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class.parquet")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)
print(pd_test["Class"].value_counts())
pd_test.head()

(286, 16094)
Class
Hepatotoxicity       221
NonHepatotoxicity     65
Name: count, dtype: int64


Unnamed: 0,Class,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,APC2D10_I_B,APC2D10_I_Si,APC2D10_I_X,APC2D10_B_B,APC2D10_B_Si,APC2D10_B_X,APC2D10_Si_Si,APC2D10_Si_X,APC2D10_X_X,label
0,Hepatotoxicity,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,Hepatotoxicity,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Hepatotoxicity,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,Hepatotoxicity,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,Hepatotoxicity,0,0,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
X_train_fingerprints = pd_train.drop(columns=["Class", "label"])
X_test_fingerprints = pd_test.drop(columns=["Class", "label"])
print(X_train_fingerprints.shape)
print(X_test_fingerprints.shape)

(1241, 16092)
(286, 16092)


## Load embeddings

In [None]:
# Use a pre-trained tokenizer and model (e.g., ChemBERTa)
tokenizer = BertTokenizer.from_pretrained(
    "seyonec/SMILES_tokenized_PubChem_shard00_160k"
)
model = BertModel.from_pretrained("seyonec/SMILES_tokenized_PubChem_shard00_160k")

# Tokenize SMILES string
inputs = tokenizer("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", return_tensors="pt")
outputs = model(**inputs)

# Get the embeddings
smiles_embedding = outputs.last_hidden_state
print("SMILES Embedding Shape:", smiles_embedding.shape)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at seyonec/SMILES_tokenized_PubChem_shard00_160k and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention

SMILES Embedding Shape: torch.Size([1, 26, 768])


In [7]:
# Tokenize and embed each SMILES string
def embed_smiles(smiles):
    inputs = tokenizer(smiles, return_tensors="pt", max_length=514, truncation=True)
    outputs = model(**inputs)
    # Use mean pooling to create a fixed-size embedding
    embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.detach().cpu().numpy()

In [10]:
pd_train_smiles = pd.read_csv("data_smiles/Training_Group.csv")
pd_train_smiles["embedding"] = pd_train_smiles["Smiles"].apply(embed_smiles)
print(pd_train_smiles.shape)
pd_train_smiles.head()

(1241, 3)


Unnamed: 0,Smiles,Liver,embedding
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity,"[[0.2722826, -0.83886486, -0.47826117, 0.20309..."
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity,"[[-0.234783, -0.81951904, -0.31077084, -0.0422..."
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity,"[[-0.2744998, -0.84486884, -0.32333818, -0.108..."
3,O(CCO)CC,Hepatotoxicity,"[[0.11608047, -0.9577132, -0.42876914, -0.1015..."
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity,"[[0.2636841, -1.0092615, -0.38923508, 0.033094..."


In [11]:
pd_test_smiles = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test_smiles["embedding"] = pd_test_smiles["Smiles"].apply(embed_smiles)
print(pd_test_smiles.shape)
pd_test_smiles.head()

(286, 3)


Unnamed: 0,Smiles,Liver,embedding
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity,"[[0.11688743, -0.9710314, -0.38742563, 0.11946..."
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity,"[[0.011577387, -0.92429733, -0.4831625, -0.147..."
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity,"[[-0.016588315, -0.84102994, -0.4183025, 0.093..."
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity,"[[-0.12550335, -0.85214424, -0.42031476, 0.070..."
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity,"[[-0.019535245, -0.87949306, -0.42566398, 0.05..."


In [19]:
X_train_embedding = np.vstack(pd_train_smiles["embedding"].values)  # Stack embeddings into a 2D array
X_test_embedding = np.vstack(pd_test_smiles["embedding"].values)  # Stack embeddings into a 2D array
print(X_train_embedding.shape)
print(X_test_embedding.shape)

(1241, 768)
(286, 768)


## Combine features

In [21]:
# combine fingerprints and embeddings
X_train = np.hstack([X_train_fingerprints, X_train_embedding])
X_test = np.hstack([X_test_fingerprints, X_test_embedding])
print(X_train.shape)
print(X_test.shape)

(1241, 16860)
(286, 16860)


In [23]:
y_train = pd_train["label"].values
y_test = pd_test["label"].values
print(y_train.shape)
print(y_test.shape)

(1241,)
(286,)


# Model training (full features NLP + fingerprints)

## TPOT classifier

In [None]:
# Initialize TPOTClassifier with 5-fold cross-validation
tpot = TPOTClassifier(
    generations=5,  # Number of iterations
    population_size=40,  # Number of pipelines to evaluate in each generation
    cv=5,  # 5-fold cross-validation
    random_state=42,
    scoring="roc_auc",  # AUC
    verbosity=2,  # Output progress
    n_jobs=16,  # Use 16 cores
)

# Fit the TPOT classifier on the training data
tpot.fit(X_train, y_train)