In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

# ModernBERT example

In [None]:
# AutoConfig, AutoModelForSequenceClassification, Trainer and TrainingArguments
# are used by later cells of this notebook, so they are imported here together
# with the masked-LM classes to keep a fresh-kernel run from raising NameError.
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Checkpoint used for the masked-LM demo in the next cell.
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

In [None]:
# Masked-LM demo: ask the model to fill in the [MASK] token of one sentence.
text = "The capital of France is [MASK]."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

# To get predictions for the mask:
# Position of the first mask token within the first (only) input sequence.
masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
# Highest-scoring token id at that position (argmax over the last logits axis).
predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
predicted_token = tokenizer.decode(predicted_token_id)
print("Predicted token:", predicted_token)

# Data preparation

In [4]:
import pandas as pd
import numpy as np
import torch

In [None]:
# Load pd_train
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train["label"] = pd_train["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)
pd_train.head()

In [4]:
# SMILES length column
pd_train["smiles_length"] = pd_train["Smiles"].apply(lambda x: len(x))
pd_train["smiles_length"].describe()

count    1241.000000
mean       63.667204
std        61.782302
min         8.000000
25%        36.000000
50%        48.000000
75%        68.000000
max       748.000000
Name: smiles_length, dtype: float64

In [5]:
# Load pd_test
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test["label"] = pd_test["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)
pd_test.head()

In [6]:
# SMILES length column
pd_test["smiles_length"] = pd_test["Smiles"].apply(lambda x: len(x))
pd_test["smiles_length"].describe()

count    286.000000
mean      54.370629
std       39.480251
min       10.000000
25%       32.000000
50%       44.000000
75%       60.000000
max      284.000000
Name: smiles_length, dtype: float64

In [None]:
# Save data to .npy files
np.save("modern_bert_data/X_train.npy", X_train)
np.save("modern_bert_data/X_test.npy", X_test)
np.save("modern_bert_data/y_train.npy", y_train)
np.save("modern_bert_data/y_test.npy", y_test)

print("Data saved successfully!")

# Model training

In [16]:
from pprint import pprint
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)

In [6]:
# load data
X_train = np.load("modern_bert_data/X_train.npy")
X_test = np.load("modern_bert_data/X_test.npy")
y_train = np.load("modern_bert_data/y_train.npy")
y_test = np.load("modern_bert_data/y_test.npy")

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1241, 50368)
X_test shape: (286, 50368)
y_train shape: (1241,)
y_test shape: (286,)


In [12]:
def find_optimal_threshold(y_true, y_pred_proba):
    """
    Sweep decision thresholds and return the selected one with its metrics.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in ascending order. A threshold
    replaces the current choice when its sensitivity is >= 0.7 OR its F1 is
    strictly higher than the F1 of the current choice. The returned threshold
    is therefore the last one (in ascending order) that met either condition.

    Args:
        y_true: Binary ground-truth labels (0/1), array-like.
        y_pred_proba: Positive-class scores, array-like aligned with y_true.

    Returns:
        (best_threshold, best_metrics). best_metrics has the keys "auc",
        "accuracy", "precision", "recall", "sensitivity", "specificity" and
        "f1". If no threshold is ever accepted, the threshold is 0.5 and all
        metrics except "auc" are 0.
    """
    # Accept lists and other array-likes, not only ndarrays.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # AUC does not depend on the threshold, so it is computed once.
    auc = roc_auc_score(y_true, y_pred_proba)

    best_threshold = 0.5
    best_metrics = {
        "auc": auc,
        "accuracy": 0,
        "precision": 0,
        "recall": 0,
        "sensitivity": 0,
        "specificity": 0,
        "f1": 0,
    }

    for threshold in np.arange(0.0, 1.0, 0.01):
        y_pred = (y_pred_proba >= threshold).astype(int)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent, so
        # the four-way unpack below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        # zero_division=0 returns the same 0.0 as the default but without
        # emitting an undefined-metric warning for every such threshold.
        f1 = f1_score(y_true, y_pred, zero_division=0)

        if sensitivity >= 0.7 or f1 > best_metrics["f1"]:
            best_threshold = threshold
            # The remaining metrics are only needed for an accepted threshold.
            best_metrics = {
                "auc": auc,
                "accuracy": accuracy_score(y_true, y_pred),
                "precision": precision_score(y_true, y_pred, zero_division=0),
                "recall": recall_score(y_true, y_pred, zero_division=0),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "f1": f1,
            }

    return best_threshold, best_metrics

In [7]:
# tokenization
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# max=748, min=8, avg=54/63
max_seq_length = 748
max_seq_length = min(max_seq_length, tokenizer.model_max_length)
print(max_seq_length)

512


In [8]:
def preprocess_function(examples):
  """Tokenize the "Smiles" entries of `examples` and attach their labels.

  Reads the notebook-level `tokenizer` and `max_seq_length`. Every sequence is
  truncated and padded to exactly `max_seq_length`, and the "label" entry of
  `examples` is copied onto the tokenizer output before it is returned.
  """
  encoded = tokenizer(
      examples["Smiles"],
      truncation=True,
      max_length=max_seq_length,
      padding="max_length",
  )
  # Keep the targets alongside the encoded inputs.
  encoded["label"] = examples["label"]
  return encoded

# 
# Wrap the pandas frames as datasets and tokenize them in batches.
# NOTE(review): `Dataset` is not imported in any visible cell; presumably it is
# `datasets.Dataset`. Confirm and add the import.
pd_train_hf = Dataset.from_pandas(pd_train)
pd_test_hf = Dataset.from_pandas(pd_test)

# batched=True hands preprocess_function a batch of rows per call.
pd_train_hf_processed = pd_train_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

pd_test_hf_processed = pd_test_hf.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)


Running tokenizer on dataset: 100%|██████████| 1241/1241 [00:00<00:00, 11605.93 examples/s]
Running tokenizer on dataset: 100%|██████████| 286/286 [00:00<00:00, 11927.48 examples/s]


# Modeling

In [9]:
# Two output classes, matching the 0/1 `label` column built during data preparation.
num_labels = 2

# Start from the checkpoint's own configuration and override the number of labels.
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    finetuning_task="text-classification"
)
# Bare expression: displays the resulting configuration as the cell output.
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "text-classification",
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.1",
  "vocab_size": 30522
}

In [10]:
# Rebinds `model` (previously the masked-LM model from the demo) to a
# sequence-classification model loaded from `model_name` with `config`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
# Bare expression: displays the module structure as the cell output.
model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Training

In [36]:
# NOTE(review): `evaluate` is not imported in any visible cell. Confirm and add
# `import evaluate`.
# The metric is kept under a dedicated name so that rebinding the generic name
# `metric` elsewhere in the notebook (e.g. as a loop variable) cannot break
# compute_metrics on a later evaluation or prediction call.
accuracy_metric = evaluate.load("accuracy")
metric = accuracy_metric  # backward-compatible alias for the original global name

def compute_metrics(eval_pred):
  """Return the accuracy of the arg-max class predictions.

  Args:
      eval_pred: Pair (predictions, labels). `predictions` holds per-class
          scores with classes on axis 1; `labels` holds the reference class ids.

  Returns:
      The result of the accuracy metric's `compute` call.
  """
  predictions, labels = eval_pred
  predicted_classes = np.argmax(predictions, axis=1)
  return accuracy_metric.compute(predictions=predicted_classes, references=labels)

In [12]:
# Evaluate and checkpoint once per epoch, then reload the best checkpoint when
# training finishes.
# NOTE(review): eval_dataset below is the test split, so with
# load_best_model_at_end=True the checkpoint is presumably chosen on the same
# data later used for reporting. Consider a separate validation split.
training_args = TrainingArguments(
    output_dir="save_model",
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=["tensorboard"],  # Only use tensorboard
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pd_train_hf_processed,
    eval_dataset=pd_test_hf_processed,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [13]:
%%time
# Run fine-tuning; the %%time cell magic reports CPU and wall-clock time for the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.585658,0.772727
2,No log,0.540117,0.741259
3,No log,0.56809,0.741259
4,No log,0.536167,0.762238
5,No log,0.587361,0.702797
6,No log,0.614004,0.695804
7,No log,0.744998,0.604895
8,0.570100,0.768538,0.608392
9,0.570100,0.681831,0.688811
10,0.570100,0.703761,0.678322


CPU times: user 9min 15s, sys: 5.09 s, total: 9min 20s
Wall time: 9min 20s


TrainOutput(global_step=630, training_loss=0.5322345612541077, metrics={'train_runtime': 560.5552, 'train_samples_per_second': 22.139, 'train_steps_per_second': 1.124, 'total_flos': 1643920417320960.0, 'train_loss': 0.5322345612541077, 'epoch': 10.0})

In [15]:
# Run inference on the tokenized test split; rebinds `outputs` (used earlier by the masked-LM demo).
outputs = trainer.predict(pd_test_hf_processed) 
# Bare expression: displays the metrics dict of the prediction run.
outputs.metrics

{'test_loss': 0.5361668467521667,
 'test_accuracy': 0.7622377622377622,
 'test_runtime': 4.2726,
 'test_samples_per_second': 66.938,
 'test_steps_per_second': 3.511}

In [25]:
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report

def evaluate_detailed_metrics(outputs, labels):
    """
    Compute AUC, precision, recall and F1 plus a per-class report from logits.

    Args:
        outputs: Object whose `.predictions` attribute holds raw logits as a
            2-D array-like with classes on axis 1 (e.g. the value returned by
            `trainer.predict(...)`).
        labels: Ground-truth class ids, array-like aligned with the logits rows.

    Returns:
        (metrics, class_report). `metrics` is a dict with the keys "auc",
        "precision", "recall" and "f1" (binary averaging for two classes,
        macro averaging otherwise). `class_report` is the dict produced by
        `classification_report(..., output_dict=True)`.
    """
    # Do the tensor maths in torch, then hand plain NumPy arrays to
    # scikit-learn instead of mixing NumPy functions with torch tensors.
    logits = torch.as_tensor(outputs.predictions)
    probs = logits.softmax(dim=1).numpy()
    pred_labels = logits.argmax(dim=1).numpy()
    labels = np.asarray(labels)

    if logits.shape[1] == 2:
        # Binary classification case: AUC uses the probability of class 1.
        auc = roc_auc_score(labels, probs[:, 1])
        average_kwargs = {}
    else:
        # Multi-class case: one-vs-rest AUC and macro-averaged scores.
        auc = roc_auc_score(labels, probs, multi_class='ovr')
        average_kwargs = {"average": "macro"}

    # zero_division=0 yields the same 0.0 as the default for undefined scores
    # but without emitting a warning.
    metrics = {
        'auc': auc,
        'precision': precision_score(labels, pred_labels, zero_division=0, **average_kwargs),
        'recall': recall_score(labels, pred_labels, zero_division=0, **average_kwargs),
        'f1': f1_score(labels, pred_labels, zero_division=0, **average_kwargs),
    }

    # Get detailed classification report
    class_report = classification_report(
        labels, pred_labels, output_dict=True, zero_division=0
    )

    return metrics, class_report

# Use with trainer outputs
outputs = trainer.predict(pd_test_hf_processed)
metrics, detailed_report = evaluate_detailed_metrics(outputs, pd_test_hf_processed['label'])

# Print results
print("\nMain Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# print("\nDetailed Classification Report:")
# for class_name, values in detailed_report.items():
#     if isinstance(values, dict):
#         print(f"\nClass {class_name}:")
#         for metric, value in values.items():
#             if isinstance(value, float):
#                 print(f"  {metric}: {value:.4f}")


Main Metrics:
auc: 0.6546
precision: 0.8283
recall: 0.8733
f1: 0.8502


In [37]:
def find_optimal_threshold(y_true, y_pred_proba):
    """
    Sweep decision thresholds and return the selected one with its metrics.

    Thresholds 0.00, 0.01, ..., 0.99 are tried in ascending order. A threshold
    replaces the current choice when its sensitivity is >= 0.7 OR its F1 is
    strictly higher than the F1 of the current choice. The returned threshold
    is therefore the last one (in ascending order) that met either condition.

    NOTE: this notebook also defines find_optimal_threshold in an earlier cell;
    once this cell runs, this definition is the one in effect. It does not
    report AUC.

    Args:
        y_true: Binary ground-truth labels (0/1), array-like.
        y_pred_proba: Positive-class scores, array-like aligned with y_true.

    Returns:
        (best_threshold, best_metrics). best_metrics has the keys "accuracy",
        "precision", "recall", "sensitivity", "specificity" and "f1". If no
        threshold is ever accepted, the threshold is 0.5 and all metrics are 0.
    """
    # Accept lists and other array-likes, not only ndarrays.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    best_threshold = 0.5
    best_metrics = {
        "accuracy": 0,
        "precision": 0,
        "recall": 0,
        "sensitivity": 0,
        "specificity": 0,
        "f1": 0,
    }

    for threshold in np.arange(0.0, 1.0, 0.01):
        y_pred = (y_pred_proba >= threshold).astype(int)

        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent, so
        # the four-way unpack below cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        # zero_division=0 returns the same 0.0 as the default but without
        # emitting an undefined-metric warning for every such threshold.
        f1 = f1_score(y_true, y_pred, zero_division=0)

        if sensitivity >= 0.7 or f1 > best_metrics["f1"]:
            best_threshold = threshold
            # The remaining metrics are only needed for an accepted threshold.
            best_metrics = {
                "accuracy": accuracy_score(y_true, y_pred),
                "precision": precision_score(y_true, y_pred, zero_division=0),
                "recall": recall_score(y_true, y_pred, zero_division=0),
                "sensitivity": sensitivity,
                "specificity": specificity,
                "f1": f1,
            }

    return best_threshold, best_metrics

In [38]:
import numpy as np
from sklearn.metrics import (
    roc_auc_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix,
    accuracy_score
)

def compute_binary_metrics(pred_probs, pred_labels, true_labels):
    """
    Compute comprehensive binary classification metrics

    Args:
        pred_probs: Positive-class scores, used only for the AUC.
        pred_labels: Predicted 0/1 labels.
        true_labels: Ground-truth 0/1 labels.

    Returns:
        Dict with the keys "accuracy", "auc", "precision", "recall", "f1",
        "sensitivity", "specificity" and "confusion_matrix". The last is a
        nested dict of the four cell counts as plain ints. Sensitivity and
        specificity are 0.0 when their denominator is zero.
    """
    # Accept lists and other array-likes, not only ndarrays.
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    pred_probs = np.asarray(pred_probs)

    # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent, so the
    # four-way unpack cannot fail.
    tn, fp, fn, tp = (
        int(count)
        for count in confusion_matrix(true_labels, pred_labels, labels=[0, 1]).ravel()
    )

    positives = tp + fn
    negatives = tn + fp

    metrics = {
        "accuracy": accuracy_score(true_labels, pred_labels),
        "auc": roc_auc_score(true_labels, pred_probs),
        # zero_division=0 gives the same 0.0 as the default, without a warning.
        "precision": precision_score(true_labels, pred_labels, zero_division=0),
        "recall": recall_score(true_labels, pred_labels, zero_division=0),
        "f1": f1_score(true_labels, pred_labels, zero_division=0),
        # Guarded so an empty class yields 0.0 instead of dividing by zero.
        "sensitivity": tp / positives if positives > 0 else 0.0,  # Same as recall
        "specificity": tn / negatives if negatives > 0 else 0.0,
        "confusion_matrix": {
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp,
        },
    }

    return metrics

# Process trainer.predict outputs
outputs = trainer.predict(pd_test_hf_processed)
logits = outputs.predictions
true_labels = outputs.label_ids

# Get probabilities and predicted labels
logits = torch.tensor(logits)
probs = logits.softmax(axis=1)
pred_probs = probs[:, 1]  # For binary classification, take probability of positive class
# pred_labels = np.argmax(logits, axis=1)

# Find optimal threshold
best_threshold, best_metrics = find_optimal_threshold(true_labels, pred_probs)
pred_probs = np.array(pred_probs)
pred_labels = (pred_probs >= best_threshold).astype(int)

# Compute all metrics
all_metrics = compute_binary_metrics(pred_probs, pred_labels, true_labels)

# Print results in a formatted way
print("\nModel Evaluation Metrics:")
print("-" * 50)
for metric, value in all_metrics.items():
    if metric != "confusion_matrix":
        print(f"{metric.capitalize():15} : {value:.4f}")

print("\nConfusion Matrix:")
print("-" * 50)
for key, value in all_metrics["confusion_matrix"].items():
    print(f"{key:15} : {value}")

  y_pred_proba = np.array(y_pred_proba)



Model Evaluation Metrics:
--------------------------------------------------
Accuracy        : 0.6608
Auc             : 0.6546
Precision       : 0.8298
Recall          : 0.7059
F1              : 0.7628
Sensitivity     : 0.7059
Specificity     : 0.5077

Confusion Matrix:
--------------------------------------------------
true_negatives  : 33
false_positives : 32
false_negatives : 65
true_positives  : 156


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  pred_probs = np.array(pred_probs)
