In [None]:
# -*- coding: utf-8 -*-
"""FinetuningBERT_BASE_SMILES_QLoRA_Unsloth.ipynb
This script demonstrates fine-tuning the bert-base-smiles model using 4‑bit quantization
and PEFT (QLoRA-style) with optional unsloth optimization, with a custom Trainer that
removes the unsupported "num_items_in_batch" argument, and manually computes MSE.
Before training, we group by activity type and normalize the activity values within each group.
After training, we evaluate overall test MSE as well as per-group accuracy.
"""

# Install required packages (comment these lines out if they are already installed)
!pip install datasets evaluate unsloth bitsandbytes --quiet
!pip install --upgrade scikit-learn --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch

from transformers import (
    BertTokenizerFast,
    BertConfig,
    TrainingArguments,
    DataCollatorWithPadding,
    EvalPrediction,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    Trainer
)
from datasets import Dataset
# We compute MSE manually, so evaluate is not used here.

# PEFT imports (ensure your version supports fine-tuning quantized models)
from peft import LoraConfig, get_peft_model

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.6/191.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 1. Mount Google Drive and load the CSV data
from google.colab import drive
drive.mount('/content/drive')

csv_path = "/content/drive/MyDrive/Mohammad/DrugDiscoveryLLMs/Alzheimer_CheMBLv35_Uniprot.csv"
df = pd.read_csv(csv_path)
print("Data shape:", df.shape)

# 2. Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
!nvidia-smi

Mounted at /content/drive


  df = pd.read_csv(csv_path)


Data shape: (1463277, 25)
Using device: cuda
GPU: NVIDIA A100-SXM4-40GB
Wed Mar 12 04:59:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   31C    P0             46W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+

In [None]:
# 3. Preprocessing data
# Select the columns of interest: 'Canonical SMILES', 'Activity Type', 'Activity Value'
df = df[['Canonical SMILES', 'Activity Type', 'Activity Value']].dropna()

# Create an input string combining SMILES and the categorical Activity Type
df["input_text"] = df["Canonical SMILES"].astype(str) + " | Activity Type: " + df["Activity Type"].astype(str)

# Group-based min-max normalization for "Activity Value".
# The per-group (min, max) pairs are kept in `group_stats` so that predictions can
# later be mapped back to the original scale.
# sort=False keeps the groups in order of first appearance in the data.
_values_by_type = df.groupby("Activity Type", sort=False)["Activity Value"]
group_stats = {
    act_type: (g_min, g_max)
    for act_type, g_min, g_max in _values_by_type.agg(["min", "max"]).itertuples(index=True, name=None)
}


def normalize_value(row):
    """Min-max scale a single row's "Activity Value" within its "Activity Type" group.

    Args:
        row: A mapping/Series with "Activity Type" and "Activity Value" entries.

    Returns:
        (value - group_min) / (group_max - group_min), or 0.0 when the group has a
        single distinct value (max == min). Group bounds are read from
        `group_stats` and computed from `df` on first use if missing.
    """
    act_type = row["Activity Type"]
    val = row["Activity Value"]
    if act_type not in group_stats:
        group = df.loc[df["Activity Type"] == act_type, "Activity Value"]
        group_stats[act_type] = (group.min(), group.max())
    g_min, g_max = group_stats[act_type]
    if g_max == g_min:
        return 0.0
    return (val - g_min) / (g_max - g_min)


# Vectorized equivalent of df.apply(normalize_value, axis=1): broadcast each group's
# min/max back onto its rows and scale whole columns at once instead of calling a
# Python function per row.
_group_min = _values_by_type.transform("min")
_group_max = _values_by_type.transform("max")
_scaled = (df["Activity Value"] - _group_min) / (_group_max - _group_min)
# Degenerate groups (max == min) divide by zero above; map them to 0.0 as normalize_value does.
df["labels"] = _scaled.where(_group_max != _group_min, 0.0).astype(float)

# NOTE(review): the recorded run shows groups spanning more than 20 orders of magnitude
# (e.g. an IC50 max near 8.9e23), so min-max scaling pushes almost every label towards 0.
# Consider outlier filtering or a log transform before scaling.
print("Group stats (min, max) per Activity Type:")
group_stats

Group stats (min, max) per Activity Type:


{'IC50': (-7000.0, 8.912509381337441e+23),
 'Inhibition': (-524.9, 500000.0),
 'Ki': (-10.0, 1e+24),
 'Relative potency': (0.0, 34000000000.0),
 'ED50': (8.4e-06, 100000.0),
 'Inhibition potency': (0.002, 200.0),
 'K inact': (0.00067, 780.0),
 'Km': (0.12, 15900000000000.0),
 'Ks app': (0.01, 0.13),
 'Reduction': (-80.0, 86.0),
 'Relative Potency': (0.16, 126.8),
 'Activity': (-399.0, 761592.0),
 'Ki app': (35.0, 472.0),
 'Ka app': (0.13, 0.28),
 'kinact': (0.00036, 7.7),
 'Activity remaning': (41.0, 97.0),
 'Ks': (1.7e-06, 785.0),
 'Ratio': (-4.24, 1000000000.0),
 'k cat': (0.0007, 1292.0),
 'Effect': (0.0, 363.4),
 'RP': (1.0, 451.2),
 'Log IC50': (0.01, 25.0),
 'Km/Ki': (0.087, 1.51),
 'Ratio LC50/IC50': (333.0, 2273.0),
 'Ratio IC50': (-8.0, 540000.0),
 'AC50': (0.1, 1000000000.0),
 'Kinact': (0.0, 10000.0),
 'INH': (-3.0, 50000.0),
 'Vmax': (0.0, 123000000000.0),
 'pIC50': (-2.176, 1.509),
 'Imax': (0.0, 661.0),
 'T1/2': (0.0005556, 995.0),
 'deltaG': (-60.65, 271.04),
 'EC50': (0

In [None]:
# 4. Split the dataset into training, validation, and test sets
# First hold out 10% as the test set, then carve 10% of the remainder off as validation.
# Both splits use the same fixed random_state so the partition is repeatable.
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Convert each pandas split into a Hugging Face Dataset (train, val, test order).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Show one record so the column layout can be checked by eye.
print("Example training sample:")
train_dataset[0]

Example training sample:


{'Canonical SMILES': 'Cl.NCc1ccc(OCc2ccccc2OC(F)(F)F)cc1',
 'Activity Type': 'Inhibition',
 'Activity Value': 5.0,
 'input_text': 'Cl.NCc1ccc(OCc2ccccc2OC(F)(F)F)cc1 | Activity Type: Inhibition',
 'labels': 0.0010586885887195622,
 '__index_level_0__': 460830}

In [None]:
# 5. Load the tokenizer from the checkpoint
# `checkpoint` is the model identifier; the same value is reused below when the
# model configuration and the model weights are loaded.
checkpoint = "unikei/bert-base-smiles"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/306k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# 6. Set up model configuration for regression
# Start from the checkpoint's own configuration, then override the head settings.
config = BertConfig.from_pretrained(checkpoint)
# One output value per example, declared as a regression target.
config.num_labels = 1
config.problem_type = "regression"

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

In [None]:
# 6. Set up model configuration for regression
# NOTE(review): this cell repeats the preceding step-6 cell line for line. Re-running it
# rebuilds an identical `config`, so it is harmless but redundant — candidate for deletion.
config = BertConfig.from_pretrained(checkpoint)
config.num_labels = 1
config.problem_type = "regression"

In [None]:
# 7. Set up BitsAndBytesConfig for 4-bit quantization (QLoRA-style)
# Quantization options are collected in one mapping and expanded into the config:
# 4-bit NF4 weights, double quantization, float16 compute dtype.
bnb_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
}
quantization_config = BitsAndBytesConfig(**bnb_settings)

# Load the model with 4-bit quantization enabled; the {"": 0} device map places
# the entire model on GPU 0.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    device_map={"": 0},
    quantization_config=quantization_config,
    config=config,
)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at unikei/bert-base-smiles and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 8. Map data using the tokenizer
def tokenize_function(example):
    """Tokenize a batch of examples from their "input_text" field.

    Sequences are truncated to at most 128 tokens but deliberately NOT padded here:
    padding is left to the DataCollatorWithPadding passed to the Trainer, which pads
    each batch only to the length of its longest member. Padding every example to
    max_length at this stage would make that collator a no-op and spend compute on
    pad tokens.

    Args:
        example: A batch dict from `Dataset.map(..., batched=True)` containing "input_text".

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["input_text"], truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch: only these three columns are returned as tensors
# when rows are fetched.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# Note: the original "Activity Type" column is still stored in the dataset and is read
# by name later for the per-group evaluation.

Map:   0%|          | 0/1185254 [00:00<?, ? examples/s]

Map:   0%|          | 0/131695 [00:00<?, ? examples/s]

Map:   0%|          | 0/146328 [00:00<?, ? examples/s]

In [None]:
# 9. Set up LoRA configuration using PEFT
# Rank-8 adapters (alpha 32, dropout 0.1) on the attention "query" and "value"
# projections, for a sequence classification/regression task.
lora_config = LoraConfig(
    target_modules=["query", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="SEQ_CLS",
)
model = get_peft_model(model, lora_config)

# Optional: keep every trainable parameter (the LoRA adapters) in FP32 for stability.
trainable_params = (p for p in model.parameters() if p.requires_grad)
for adapter_param in trainable_params:
    adapter_param.data = adapter_param.data.to(torch.float32)

In [None]:
# 10. Optionally integrate unsloth optimization if available.
# Neither `unsloth_available` nor `optimize_model` is defined anywhere else in this
# notebook, so on a fresh kernel the original cell raised NameError. Pick up values an
# earlier cell may have set; otherwise fall back to "not available" so the notebook
# survives Restart & Run All.
unsloth_available = bool(globals().get("unsloth_available", False))
optimize_model = globals().get("optimize_model")

if unsloth_available and callable(optimize_model):
    try:
        model = optimize_model(model)
        print("Unsloth optimization applied.")
    except Exception as e:
        # Best effort: the optimization is optional, so report and continue with the
        # unoptimized model.
        print("Error during unsloth optimization:", e)
else:
    print("Skipping unsloth optimization.")

Skipping unsloth optimization.


In [None]:
# 11. Print trainable parameters to verify
model.print_trainable_parameters()

# 12. Send model to GPU (if not already there)
# The trailing semicolon stops the notebook from echoing the returned module, whose
# repr is a multi-page dump of every layer and buries the parameter summary above.
model.to(device);

trainable params: 295,681 || all params: 109,377,794 || trainable%: 0.2703


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30000, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [None]:
# 13. Define a custom Trainer to override compute_loss and remove the "num_items_in_batch" argument.
class CustomTrainer(Trainer):
    """Trainer whose loss is the `loss` attribute of the model's forward output."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return its loss.

        Args:
            model: The model to call; invoked as ``model(**inputs)``.
            inputs: Mapping of keyword arguments for the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of just the loss.
            **kwargs: Extra arguments from the training loop; "num_items_in_batch"
                is discarded and nothing here is forwarded to the model.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple if `return_outputs` is true.
        """
        # Discard the argument the model's forward() does not accept.
        kwargs.pop("num_items_in_batch", None)
        outputs = model(**inputs)
        loss = outputs.loss
        if return_outputs:
            return (loss, outputs)
        return loss


In [None]:
# 14. Set training arguments
# Evaluation and checkpointing both happen every 500 steps so that each checkpoint
# has a matching validation score for best-model selection.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    #max_steps=5,  # For quick testing; remove/increase for full training.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="mse",
    # MSE is an error: lower is better. Without this flag the Trainer assumes that any
    # metric whose name does not end in "loss" should be maximized, and would reload
    # the checkpoint with the WORST validation MSE at the end of training.
    greater_is_better=False,
    fp16=True
)

In [None]:
# 15. Define regression metric (Mean Squared Error) manually.
def compute_metrics(eval_pred: "EvalPrediction"):
    """Compute mean squared error between predictions and labels.

    Args:
        eval_pred: An object that unpacks into ``(predictions, labels)``. `predictions`
            may be an array or a tuple whose first element is the array of model
            outputs; `labels` is an array with one target per example.

    Returns:
        dict: ``{"mse": <float>}``.
    """
    predictions, labels = eval_pred
    # Some models return several outputs; the regression values are the first one.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Flatten BOTH sides. If only the predictions were flattened, labels shaped (N, 1)
    # would broadcast against (N,) into an (N, N) matrix and silently yield a wrong MSE.
    predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    labels = np.asarray(labels, dtype=np.float64).reshape(-1)
    mse_val = float(np.mean((predictions - labels) ** 2))
    return {"mse": mse_val}

In [None]:
# 16. Data collator for dynamic padding
# The collator pads the examples of each batch to a common length using `tokenizer`.
# NOTE(review): if examples were already padded to a fixed length at tokenization time,
# every batch arrives equal-length and this collator has nothing left to pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# 17. Initialize the custom Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated and emits a warning on every construction.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = CustomTrainer(


In [None]:
# 18. Evaluate Base Model on Test Set Before Fine-Tuning
print("Evaluating base model performance (before fine-tuning) on test dataset...")
# A single predict() pass yields both the raw predictions and the metrics, so the test
# set is not run through the model twice (once for evaluate(), once for predict()).
# metric_key_prefix="eval" keeps the metric names ("eval_mse", ...) that the summary
# cells read.
base_preds_obj = trainer.predict(test_dataset, metric_key_prefix="eval")
base_test_metrics = base_preds_obj.metrics
print("Base model Test MSE:", base_test_metrics.get("mse", base_test_metrics.get("eval_mse")))

# Compute group-level accuracy for base model.
# Define tolerance for a prediction to be considered "accurate" (since labels are normalized 0-1).
tolerance = 0.1
base_preds = base_preds_obj.predictions.flatten()
# Labels exactly as the Trainer saw them, in the same order as the predictions.
base_labels = np.asarray(base_preds_obj.label_ids).reshape(-1)
activity_types = np.array(test_dataset["Activity Type"])  # original group info
unique_groups = np.unique(activity_types)

# Accuracy: proportion of predictions with absolute error less than tolerance.
# The hit/miss vector is computed once and then averaged per group.
base_within_tol = np.abs(base_preds - base_labels) < tolerance
group_results = {}
for group in unique_groups:
    mask = (activity_types == group)
    # Every group comes from np.unique(activity_types), so each mask selects >= 1 row.
    group_results[group] = {
        "base_accuracy": float(base_within_tol[mask].mean()),
        "count": int(mask.sum()),
    }

Evaluating base model performance (before fine-tuning) on test dataset...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmaghaeef[0m ([33mmaghaeef1994[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Base model Test MSE: 0.16128166019916534


In [None]:
# 19. Fine-Tune the Model
# Training follows the TrainingArguments from step 14, with periodic evaluation on the
# validation dataset passed to the Trainer in step 17.
print("Fine-tuning the model...")
trainer.train()

Fine-tuning the model...


Step,Training Loss,Validation Loss,Model Preparation Time,Mse
500,0.0271,0.011474,0.0018,0.011474
1000,0.0247,0.009931,0.0018,0.009931
1500,0.026,0.00941,0.0018,0.00941
2000,0.019,0.009202,0.0018,0.009202
2500,0.0222,0.008006,0.0018,0.008006
3000,0.0197,0.007142,0.0018,0.007142
3500,0.0152,0.006086,0.0018,0.006086
4000,0.0152,0.005341,0.0018,0.005341
4500,0.0119,0.005247,0.0018,0.005247
5000,0.0118,0.005135,0.0018,0.005135


Step,Training Loss,Validation Loss,Model Preparation Time,Mse
500,0.0271,0.011474,0.0018,0.011474
1000,0.0247,0.009931,0.0018,0.009931
1500,0.026,0.00941,0.0018,0.00941
2000,0.019,0.009202,0.0018,0.009202
2500,0.0222,0.008006,0.0018,0.008006
3000,0.0197,0.007142,0.0018,0.007142
3500,0.0152,0.006086,0.0018,0.006086
4000,0.0152,0.005341,0.0018,0.005341
4500,0.0119,0.005247,0.0018,0.005247
5000,0.0118,0.005135,0.0018,0.005135




In [None]:
# 20. Evaluate Fine-Tuned Model on Test Set
print("Evaluating finetuned model performance (after fine-tuning) on test dataset...")
# A single predict() pass yields both predictions and metrics, so the test set is not
# run through the model twice. metric_key_prefix="eval" keeps the "eval_mse" key that
# the summary cells read.
fine_preds_obj = trainer.predict(test_dataset, metric_key_prefix="eval")
fine_test_metrics = fine_preds_obj.metrics
print("Finetuned model Test MSE:", fine_test_metrics.get("mse", fine_test_metrics.get("eval_mse")))

fine_preds = fine_preds_obj.predictions.flatten()

# Reuses `base_labels`, `activity_types`, `unique_groups`, `tolerance` and
# `group_results` from step 18, so both models are scored on identical groups.
fine_within_tol = np.abs(fine_preds - base_labels) < tolerance
for group in unique_groups:
    mask = (activity_types == group)
    # Add the finetuned accuracy to our group results. Every group comes from
    # np.unique(activity_types), so each mask selects at least one row.
    group_results[group]["finetuned_accuracy"] = float(fine_within_tol[mask].mean())


In [None]:
# 21. Create a summary DataFrame for group-level accuracy and count.
# orient="index" makes each group a row directly and infers a dtype per column, so
# "count" stays an integer. Building the frame column-wise and transposing it mixes
# counts and accuracies in one column and coerces the counts to float.
group_df = pd.DataFrame.from_dict(group_results, orient="index")
group_df.index.name = "Activity Type"
group_df = group_df.reset_index()

# Compute weighted average accuracy over groups (weights = number of test rows per group).
total_count = group_df["count"].sum()
weighted_base = (group_df["base_accuracy"] * group_df["count"]).sum() / total_count
weighted_fine = (group_df["finetuned_accuracy"] * group_df["count"]).sum() / total_count

# Append the weighted averages as a final "Average" row.
average_row = pd.DataFrame({
    "Activity Type": ["Average"],
    "base_accuracy": [weighted_base],
    "finetuned_accuracy": [weighted_fine],
    "count": [total_count]
})
group_df = pd.concat([group_df, average_row], ignore_index=True)

# Create a summary table comparing overall test MSE.
# Each metric is looked up under "mse" first and "eval_mse" as the fallback key.
results_df = pd.DataFrame({
    "base-model": [base_test_metrics.get("mse", base_test_metrics.get("eval_mse"))],
    "finetuned-model": [fine_test_metrics.get("mse", fine_test_metrics.get("eval_mse"))]
}, index=["test_overall"])

In [None]:
# Show the one-row table comparing base and fine-tuned test MSE (built in step 21).
print("\nOverall Test MSE Comparison:")
results_df

In [None]:
# Show the per-Activity-Type accuracy table, including the weighted "Average" row (built in step 21).
print("\nGroup-level Accuracy and Data Count on Test Dataset:")
group_df

In [None]:
#
