In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import numpy as np
import torch

from pprint import pprint
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)

from transformers import AutoConfig , AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# BERT (SMILES checkpoint) masked-LM example

In [2]:
# Load the tokenizer and the masked-LM model for the checkpoint named by model_id.
model_id = "unikei/bert-base-smiles"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [10]:
# Sanity-check the masked-LM head on a single prompt.
# NOTE(review): the prompt is English text while the checkpoint name suggests a
# SMILES model -- a masked SMILES string may be a more meaningful probe; confirm intent.
text = "The capital of France is [MASK]."
inputs = tokenizer(text, return_tensors="pt")

# Inference only: disable autograd so no computation graph is kept for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# To get predictions for the mask: locate the [MASK] position in the first
# (only) sequence and take the highest-scoring vocabulary id at that position.
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)

Predicted token: ##1


# Data preparation

In [7]:
# Read the training split and derive a binary target from the "Liver" column:
# 1 where the value equals "Hepatotoxicity", 0 otherwise.
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train["label"] = pd_train["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_train.shape)
pd_train.head()

(1241, 3)


Unnamed: 0,Smiles,Liver,label
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity,1
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity,1
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity,1
3,O(CCO)CC,Hepatotoxicity,1
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity,1


In [8]:
# Character length of each training SMILES string, followed by summary statistics.
pd_train["smiles_length"] = pd_train["Smiles"].map(len)
pd_train["smiles_length"].describe()

count    1241.000000
mean       63.667204
std        61.782302
min         8.000000
25%        36.000000
50%        48.000000
75%        68.000000
max       748.000000
Name: smiles_length, dtype: float64

In [9]:
# Read the test split and derive the same binary target as for the training split:
# 1 where "Liver" equals "Hepatotoxicity", 0 otherwise.
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test["label"] = pd_test["Liver"].eq("Hepatotoxicity").astype("int64")
print(pd_test.shape)
pd_test.head()

(286, 3)


Unnamed: 0,Smiles,Liver,label
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity,1
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity,1
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity,1
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity,1
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity,1


In [10]:
# Character length of each test SMILES string, followed by summary statistics.
pd_test["smiles_length"] = pd_test["Smiles"].map(len)
pd_test["smiles_length"].describe()

count    286.000000
mean      54.370629
std       39.480251
min       10.000000
25%       32.000000
50%       44.000000
75%       60.000000
max      284.000000
Name: smiles_length, dtype: float64

## Embedding

In [16]:
# Wrap the pandas frames as Hugging Face Datasets so they can be tokenized with .map().
pd_train_hf = Dataset.from_pandas(pd_train)
pd_test_hf = Dataset.from_pandas(pd_test)

In [17]:
# SMILES lengths in the data: max=748, min=8, avg=54/63 characters.
# Use the longest observed length, capped at the tokenizer's own maximum.
max_seq_length = min(748, tokenizer.model_max_length)
print(max_seq_length)


def preprocess_function(examples):
    """Tokenize a batch of SMILES strings and carry the labels through.

    Reads the "Smiles" and "label" entries of ``examples``. Every sequence is
    truncated and padded to ``max_seq_length`` using the module-level
    ``tokenizer``. Returns the tokenizer output with an added "label" entry.
    """
    encoded = tokenizer(
        examples["Smiles"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    encoded["label"] = examples["label"]
    return encoded

# Tokenize the training split in batches; the trailing expression displays the resulting Dataset.
pd_train_hf_processed = pd_train_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)
pd_train_hf_processed

512


Running tokenizer on dataset: 100%|██████████| 1241/1241 [00:00<00:00, 8575.77 examples/s]


Dataset({
    features: ['Smiles', 'Liver', 'label', 'smiles_length', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1241
})

In [18]:
# Tokenize the test split with the same preprocessing; the trailing expression displays the resulting Dataset.
pd_test_hf_processed = pd_test_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)
pd_test_hf_processed

Running tokenizer on dataset: 100%|██████████| 286/286 [00:00<00:00, 8654.35 examples/s]


Dataset({
    features: ['Smiles', 'Liver', 'label', 'smiles_length', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 286
})

In [19]:
# Pull the token-id sequences and the labels out of the tokenized datasets.
# Only "input_ids" is kept here; the attention mask is not extracted.
X_train = pd_train_hf_processed["input_ids"]
y_train = pd_train_hf_processed["label"]
X_test = pd_test_hf_processed["input_ids"]
y_test = pd_test_hf_processed["label"]

In [20]:
# Save data to .npy files
np.save("modern_bert_data/X_train.npy", X_train)
np.save("modern_bert_data/X_test.npy", X_test)
np.save("modern_bert_data/y_train.npy", y_train)
np.save("modern_bert_data/y_test.npy", y_test)

print("Data saved successfully!")

Data saved successfully!


# Model training

In [3]:
# Reload the arrays written to modern_bert_data/ and report their shapes.
X_train, X_test = (
    np.load("modern_bert_data/X_train.npy"),
    np.load("modern_bert_data/X_test.npy"),
)
y_train, y_test = (
    np.load("modern_bert_data/y_train.npy"),
    np.load("modern_bert_data/y_test.npy"),
)

for caption, loaded in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, loaded.shape)

X_train shape: (1241, 512)
X_test shape: (286, 512)
y_train shape: (1241,)
y_test shape: (286,)


In [11]:
def find_optimal_threshold(y_true, y_pred_proba):
    """
    Find optimal threshold based on sensitivity >= 0.7 or best F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are visited in ascending order. A
    threshold replaces the current best whenever its sensitivity is >= 0.7 or
    its F1 is strictly greater than the F1 stored for the current best, so the
    result is the last threshold in the scan that met either condition.

    Args:
        y_true: Binary ground-truth labels (0/1), any array-like.
        y_pred_proba: Scores for the positive class, any array-like
            (plain lists are accepted).

    Returns:
        Tuple ``(best_threshold, best_metrics)`` where ``best_metrics`` is a
        dict with keys "auc", "accuracy", "precision", "recall",
        "sensitivity", "specificity" and "f1" measured at ``best_threshold``.
    """
    # Convert once so the element-wise comparison below also works for lists.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # AUC does not depend on the threshold, so compute it once, outside the loop.
    auc = roc_auc_score(y_true, y_pred_proba)

    best_threshold = 0.5
    best_metrics = {
        "auc": auc,
        "accuracy": 0,
        "precision": 0,
        "recall": 0,
        "sensitivity": 0,
        "specificity": 0,
        "f1": 0,
    }

    for threshold in np.arange(0.0, 1.0, 0.01):
        y_pred = (y_pred_proba >= threshold).astype(int)

        # zero_division=0: at high thresholds nothing may be predicted
        # positive; report 0 instead of emitting an undefined-metric warning.
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        accuracy = accuracy_score(y_true, y_pred)
        # labels=[0, 1] forces a 2x2 matrix even if only one class occurs,
        # so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_true, y_pred, zero_division=0)

        if sensitivity >= 0.7 or f1 > best_metrics["f1"]:
            best_threshold = threshold
            best_metrics = {
                "auc": auc,
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "sensitivity": sensitivity,
                "specificity": specificity,
                "f1": f1,
            }

    return best_threshold, best_metrics

In [12]:
# tokenization
model_name = "unikei/bert-base-smiles"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# max=748, min=8, avg=54/63
max_seq_length = 748
max_seq_length = min(max_seq_length, tokenizer.model_max_length)
print(max_seq_length)

512


In [13]:
def preprocess_function(examples):
    """Tokenize a batch of SMILES strings and carry the labels through.

    Reads the "Smiles" and "label" entries of ``examples``. Every sequence is
    truncated and padded to ``max_seq_length`` using the module-level
    ``tokenizer``. Returns the tokenizer output with an added "label" entry.
    """
    encoded = tokenizer(
        examples["Smiles"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    encoded["label"] = examples["label"]
    return encoded

# Rebuild Hugging Face Datasets from the pandas frames.
pd_train_hf = Dataset.from_pandas(pd_train)
pd_test_hf = Dataset.from_pandas(pd_test)

# Tokenize the training split in batches with preprocess_function.
pd_train_hf_processed = pd_train_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

# Tokenize the test split the same way.
pd_test_hf_processed = pd_test_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset:   0%|          | 0/1241 [00:00<?, ? examples/s]

Running tokenizer on dataset: 100%|██████████| 1241/1241 [00:00<00:00, 8824.77 examples/s]
Running tokenizer on dataset: 100%|██████████| 286/286 [00:00<00:00, 8756.89 examples/s]


# Modeling

In [14]:
# Binary task: two output classes (label 0 / label 1).
num_labels = 2

# Load the checkpoint's configuration, overriding the number of labels and
# tagging the fine-tuning task; the trailing expression displays the config.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="text-classification"
)
config

BertConfig {
  "_name_or_path": "unikei/bert-base-smiles",
  "architectures": [
    "BertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "text-classification",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

In [15]:
# Load the checkpoint into a sequence-classification model using `config`.
# This rebinds `model`; the trailing expression prints the architecture.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unikei/bert-base-smiles and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Training

In [16]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics reads this global.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the module-level ``metric``.

    The predictions are reduced to class ids with an argmax over axis 1 before
    being compared against the reference labels. Returns whatever
    ``metric.compute`` returns.
    """
    raw_scores, references = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [17]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training finishes.
# NOTE(review): no metric_for_best_model is set, so "best" presumably falls
# back to the library default (evaluation loss) -- confirm against the docs.
training_args = TrainingArguments(
    output_dir="save_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=["tensorboard"],  # Only use tensorboard
)

# NOTE(review): the test split doubles as the evaluation set, so the "best"
# checkpoint is selected on test data; a held-out validation split would give
# an unbiased test estimate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pd_train_hf_processed,
    eval_dataset=pd_test_hf_processed,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [18]:
%%time
# Run fine-tuning with the configured Trainer; %%time reports how long the cell took.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.821928,0.307692
2,No log,0.674736,0.548951
3,No log,0.576907,0.699301
4,0.587400,0.811309,0.555944
5,0.587400,1.058315,0.590909
6,0.587400,1.068299,0.643357
7,0.247200,1.645828,0.531469
8,0.247200,1.606146,0.573427


CPU times: user 14min 58s, sys: 6.41 s, total: 15min 4s
Wall time: 15min 4s


TrainOutput(global_step=1248, training_loss=0.3555318193557935, metrics={'train_runtime': 904.4731, 'train_samples_per_second': 10.977, 'train_steps_per_second': 1.38, 'total_flos': 2612166557614080.0, 'train_loss': 0.3555318193557935, 'epoch': 8.0})

In [None]:
# Save the trained model to disk via the Trainer (this cell only saves; nothing is loaded here).
trainer.save_model("save_model_bert_base_smiles")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [24]:
# Run inference on the tokenized test split and display the metrics reported by the Trainer.
outputs = trainer.predict(pd_test_hf_processed) 
outputs.metrics

{'test_loss': 0.5769072771072388,
 'test_accuracy': 0.6993006993006993,
 'test_runtime': 8.1883,
 'test_samples_per_second': 34.928,
 'test_steps_per_second': 4.396}

In [25]:
def find_optimal_threshold(y_true, y_pred_proba):
    """
    Find optimal threshold based on sensitivity >= 0.7 or best F1 score.

    Thresholds 0.00, 0.01, ..., 0.99 are visited in ascending order. A
    threshold replaces the current best whenever its sensitivity is >= 0.7 or
    its F1 is strictly greater than the F1 stored for the current best, so the
    result is the last threshold in the scan that met either condition.

    Args:
        y_true: Binary ground-truth labels (0/1), any array-like.
        y_pred_proba: Scores for the positive class, any array-like.

    Returns:
        Tuple ``(best_threshold, best_metrics)`` where ``best_metrics`` is a
        dict with keys "accuracy", "precision", "recall", "sensitivity",
        "specificity" and "f1" measured at ``best_threshold``.
    """
    best_threshold = 0.5
    best_metrics = {
        "accuracy": 0,
        "precision": 0,
        "recall": 0,
        "sensitivity": 0,
        "specificity": 0,
        "f1": 0,
    }
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    for threshold in np.arange(0.0, 1.0, 0.01):
        y_pred = (y_pred_proba >= threshold).astype(int)

        # zero_division=0: at high thresholds nothing may be predicted
        # positive; report 0 instead of emitting an undefined-metric warning.
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        accuracy = accuracy_score(y_true, y_pred)
        # labels=[0, 1] forces a 2x2 matrix even if only one class occurs,
        # so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_true, y_pred, zero_division=0)

        if sensitivity >= 0.7 or f1 > best_metrics["f1"]:
            best_threshold = threshold
            best_metrics = {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "sensitivity": sensitivity,
                "specificity": specificity,
                "f1": f1,
            }

    return best_threshold, best_metrics

In [26]:
import numpy as np
from sklearn.metrics import (
    roc_auc_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix,
    accuracy_score
)

def compute_binary_metrics(pred_probs, pred_labels, true_labels):
    """
    Compute comprehensive binary classification metrics.

    Args:
        pred_probs: Positive-class scores, used only for the AUC.
        pred_labels: Hard 0/1 predictions.
        true_labels: Ground-truth 0/1 labels.

    Returns:
        Dict with "accuracy", "auc", "precision", "recall", "f1",
        "sensitivity", "specificity" and a nested "confusion_matrix" dict
        holding the four cell counts as plain Python ints.
    """
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    pred_probs = np.asarray(pred_probs)

    # labels=[0, 1] forces a 2x2 matrix even if only one class occurs, so the
    # unpacking cannot fail. int() turns the NumPy scalars into Python ints.
    tn, fp, fn, tp = (
        int(cell)
        for cell in confusion_matrix(true_labels, pred_labels, labels=[0, 1]).ravel()
    )

    metrics = {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "auc": roc_auc_score(true_labels, pred_probs),
        # zero_division=0: report 0 rather than warn when a ratio is undefined.
        "precision": precision_score(true_labels, pred_labels, zero_division=0),
        "recall": recall_score(true_labels, pred_labels, zero_division=0),
        "f1": f1_score(true_labels, pred_labels, zero_division=0),
        # Guard the denominators; an empty class yields 0.0.
        "sensitivity": tp / (tp + fn) if (tp + fn) > 0 else 0.0,  # Same as recall
        "specificity": tn / (tn + fp) if (tn + fp) > 0 else 0.0,
        "confusion_matrix": {
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp,
        },
    }

    return metrics

# Process trainer.predict outputs
outputs = trainer.predict(pd_test_hf_processed)
logits = torch.tensor(outputs.predictions)
true_labels = outputs.label_ids

# Softmax over the class axis, then keep the positive-class column as a NumPy
# array so the threshold search and the metric helper receive the same type.
probs = torch.softmax(logits, dim=1)
pred_probs = probs[:, 1].numpy()  # For binary classification, take probability of positive class

# Find optimal threshold
# NOTE(review): the threshold is tuned on the same test set that is reported
# below, so these numbers are optimistic; tune on a validation split if one exists.
best_threshold, best_metrics = find_optimal_threshold(true_labels, pred_probs)
pred_labels = (pred_probs >= best_threshold).astype(int)

# Compute all metrics
all_metrics = compute_binary_metrics(pred_probs, pred_labels, true_labels)

# Print results in a formatted way.
# The loop variable is deliberately NOT called `metric`: that name is the
# global metric object read by compute_metrics, and rebinding it to a string
# would break any later trainer.predict()/evaluate() call.
print("\nModel Evaluation Metrics:")
print("-" * 50)
for metric_name, value in all_metrics.items():
    if metric_name != "confusion_matrix":
        print(f"{metric_name.capitalize():15} : {value:.4f}")

print("\nConfusion Matrix:")
print("-" * 50)
for key, value in all_metrics["confusion_matrix"].items():
    print(f"{key:15} : {value}")


Model Evaluation Metrics:
--------------------------------------------------
Accuracy        : 0.6608
Auc             : 0.6721
Precision       : 0.8333
Recall          : 0.7014
F1              : 0.7617
Sensitivity     : 0.7014
Specificity     : 0.5231

Confusion Matrix:
--------------------------------------------------
true_negatives  : 34
false_positives : 31
false_negatives : 66
true_positives  : 155


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
