In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd

import torch

  from .autonotebook import tqdm as notebook_tqdm


# Step 1: Tokenize the SMILES Strings

In [2]:
# Hugging Face checkpoint used for both the tokenizer and the model
model_id = "answerdotai/ModernBERT-base"

# All computation in this notebook is pinned to the CPU
device = torch.device("cpu")

# Load the tokenizer, then the masked-LM model; the model is placed on
# `device` and switched to evaluation mode in a single chain (both calls
# return the module itself).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id).to(device).eval()

# Tokenize SMILES
def tokenize_smiles(smiles_list, max_length=None):
    """Tokenize one SMILES string or a list of SMILES strings.

    Args:
        smiles_list: A SMILES string, or a list of SMILES strings.
        max_length: Maximum number of tokens kept per sequence. When omitted,
            the model config's ``max_position_embeddings`` is used if the
            config defines it.

    Returns:
        The tokenizer output as PyTorch tensors, padded to the longest
        sequence in the call.
    """
    if max_length is None:
        # Without an explicit limit the tokenizer ignores `truncation=True`
        # and warns "no maximum length is provided"; fall back to the model's
        # context size so truncation actually takes effect.
        max_length = getattr(model.config, "max_position_embeddings", None)
    return tokenizer(
        smiles_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

In [3]:
# A handful of sample SMILES strings to sanity-check the tokenizer
smiles_data = [
    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
    "C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl",
    "CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(C=C3)Cl",
]

# Every molecule is tokenized in its own call, one result per input string
tokenized_smiles = list(map(tokenize_smiles, smiles_data))
print("Tokenized SMILES:", tokenized_smiles)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenized SMILES: [{'input_ids': tensor([[50281,  2648,     9,    36,    10,  2648,    18,    30,  2648,    30,
            36,     9,    36,    30,    36,    18,    10,    36,     9,    36,
            10,    36,     9,    30,    48,    10,    48, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}, {'input_ids': tensor([[50281,    36,    30,  2648,  3231,    18,    30,    36,     9,    36,
            30,    36,     9,    36,    30,    36,    18,    10,  2648,     9,
            30,    48,    10,    48,    10,  2019, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}, {'input_ids': tensor([[50281,  2648, 14546,     9, 39228,    10,    36,     9,    30,    48,
            10,  2648,    18,    30,    36,     9,    47,    30,    36,    19,
            47,    18,    36,    30,    36,     9,    36,    30,    36,    19,
            

In [4]:
# Read the training split from CSV
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
# Binary target: 1 where "Liver" is exactly "Hepatotoxicity", 0 for anything else
pd_train["label"] = pd_train["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_train.shape)
pd_train.head()

(1241, 3)


Unnamed: 0,Smiles,Liver,label
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity,1
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity,1
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity,1
3,O(CCO)CC,Hepatotoxicity,1
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity,1


In [5]:
# Read the held-out test split from CSV
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
# Binary target: 1 where "Liver" is exactly "Hepatotoxicity", 0 for anything else
pd_test["label"] = pd_test["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_test.shape)
pd_test.head()

(286, 3)


Unnamed: 0,Smiles,Liver,label
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity,1
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity,1
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity,1
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity,1
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity,1


In [None]:
def encode_smiles(smiles_list, batch_size=32, max_length=None):
    """Embed SMILES strings as mean-pooled encoder hidden states.

    Args:
        smiles_list: List of SMILES strings.
        batch_size: Number of strings encoded per forward pass. Batching keeps
            peak memory bounded instead of pushing the whole dataset through
            the model at once.
        max_length: Maximum number of tokens kept per sequence. When omitted,
            the model config's ``max_position_embeddings`` is used if the
            config defines it.

    Returns:
        A NumPy array with one row per input string and one column per hidden
        dimension.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(smiles_list) == 0:
        # Nothing to encode: return an empty matrix with the right width.
        return torch.empty((0, model.config.hidden_size)).numpy()
    if max_length is None:
        # Without an explicit limit the tokenizer ignores `truncation=True`.
        max_length = getattr(model.config, "max_position_embeddings", None)

    # `model` is a masked-LM wrapper whose output carries `logits`, not
    # `last_hidden_state`; the wrapped base encoder is what exposes the
    # final hidden states.
    encoder = model.base_model
    pooled_batches = []
    for start in range(0, len(smiles_list), batch_size):
        batch = smiles_list[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            hidden = encoder(**inputs).last_hidden_state

        # Average over real tokens only: padding positions are zeroed out and
        # excluded from the divisor, so an embedding does not depend on how
        # long the other sequences in its batch are.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((token_sum / token_count).cpu())

    return torch.cat(pooled_batches, dim=0).numpy()

# Targets for both splits, taken from the "label" column
y_train = pd_train["label"].to_numpy()
y_test = pd_test["label"].to_numpy()

# Features for both splits: one embedding per SMILES string
train_smiles = pd_train["Smiles"].to_list()
test_smiles = pd_test["Smiles"].to_list()
X_train = encode_smiles(train_smiles)
X_test = encode_smiles(test_smiles)

# Report the shapes of the features and targets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Fit a 100-tree random forest on the training embeddings (fixed seed for
# repeatable results); `fit` returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the test set: take the second predict_proba column as the
# positive-class probability, then threshold it at 0.5 for hard labels
y_pred_proba = clf.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# AUC is computed from the probabilities, accuracy from the hard labels
auc, accuracy = roc_auc_score(y_test, y_pred_proba), accuracy_score(y_test, y_pred)
print(f"AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")
