In [1]:
!python --version

Python 3.11.11


In [2]:
%%capture
!pip install -r requirements.txt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# import dependencies
import torch
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random
from sklearn.metrics import mean_squared_error


# Load Dataset

In [5]:
# specify dataset name and model name
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  #MoLFormer model

In [6]:
# load the dataset from HuggingFace
dataset = load_dataset(DATASET_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

lipophilicity.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

In [None]:
# Explore the dataset
# For example, print the column names and display a few sample rows
print("Dataset columns:", dataset["train"].column_names)
for i in range(5):
    print("Sample", i, ":", dataset["train"][i])

Dataset columns: ['SMILES', 'label']
Sample 0 : {'SMILES': 'Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14', 'label': 3.54}
Sample 1 : {'SMILES': 'COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23', 'label': -1.18}
Sample 2 : {'SMILES': 'COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl', 'label': 3.69}
Sample 3 : {'SMILES': 'OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3', 'label': 3.37}
Sample 4 : {'SMILES': 'Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)NCC#N)c1', 'label': 3.1}


In [7]:
# getting the maximum length of input
max_length = max([len(x['SMILES']) for x in dataset['train']])
max_length

267

In [8]:
# define a PyTorch Dataset class for handling SMILES strings and targets

class SMILESDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item["SMILES"], padding="max_length", truncation=True, return_tensors="pt", max_length=self.max_length)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding["labels"] = torch.tensor(item["label"])
        return encoding

# Split Dataset

In [9]:
# tokenize the data
# load a pre-trained tokenizer from HuggingFace
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenization_molformer_fast.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

tokenization_molformer.py:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer_fast.py
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [10]:
# split the data into training and test datasets
# TODO: your code goes here
dataset_train, dataset_test = train_test_split(list(dataset["train"]), test_size=0.2, random_state=42, shuffle=True)

In [11]:
# construct Pytorch data loaders for both train and test datasets
BATCH_SIZE = 16 # adjust based on memory constraints

# constructing datasets
train_dataset = SMILESDataset(dataset_train, tokenizer)
test_dataset = SMILESDataset(dataset_test, tokenizer)

# constructing data loaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Load Model

In [12]:
# load pre-trained model from HuggingFace
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

In [13]:
# We need to add a regression head on the language model as we are doing a regression task.
# specify model with a regression head

class MoLFormerWithRegressionHeadAndAdapter(nn.Module):
    def __init__(self, base_model, adapter):
        super(MoLFormerWithRegressionHeadAndAdapter, self).__init__()
        self.base_model = adapter(base_model)
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)

In [14]:
# initialize the regression model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train and Evaluation

## BitFit

In [None]:
for name, param in model.named_parameters():
    print(name)

embeddings.word_embeddings.weight
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key.bias
encoder.layer.1.attention.self.value.weight
encoder.layer.1.attention.self.value.bias
encoder.layer.1.attention.output.dense.w

In [None]:
def bitfit_adapter(model):
    """Applies BitFit by freezing all parameters except biases."""
    for name, param in model.named_parameters():
        if "bias" in name:
            param.requires_grad = True
        else:
            param.requires_grad = False
    return model

In [None]:
regression_model = MoLFormerWithRegressionHeadAndAdapter(model, bitfit_adapter).to(device)

In [None]:
# defining optimizer and loss function
EPOCHS = 10
LR = 5e-5

optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

In [None]:
# training
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1, Loss: 2.260046707448505


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2, Loss: 1.3636182312454497


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3, Loss: 1.2643915064278104


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4, Loss: 1.2154067082064492


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5, Loss: 1.1729063955091295


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6, Loss: 1.1237142128603799


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7, Loss: 1.085082202724048


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8, Loss: 1.0678263610317593


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9, Loss: 1.044202949319567


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10, Loss: 1.0009461215564184


In [None]:
# evaluation
regression_model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.9850746047079454


## LoRA

In [None]:
for name, module in model.named_modules():
    if "attention" in name and isinstance(module, nn.Linear):
        print(name)

encoder.layer.0.attention.self.query
encoder.layer.0.attention.self.key
encoder.layer.0.attention.self.value
encoder.layer.0.attention.output.dense
encoder.layer.1.attention.self.query
encoder.layer.1.attention.self.key
encoder.layer.1.attention.self.value
encoder.layer.1.attention.output.dense
encoder.layer.2.attention.self.query
encoder.layer.2.attention.self.key
encoder.layer.2.attention.self.value
encoder.layer.2.attention.output.dense
encoder.layer.3.attention.self.query
encoder.layer.3.attention.self.key
encoder.layer.3.attention.self.value
encoder.layer.3.attention.output.dense
encoder.layer.4.attention.self.query
encoder.layer.4.attention.self.key
encoder.layer.4.attention.self.value
encoder.layer.4.attention.output.dense
encoder.layer.5.attention.self.query
encoder.layer.5.attention.self.key
encoder.layer.5.attention.self.value
encoder.layer.5.attention.output.dense
encoder.layer.6.attention.self.query
encoder.layer.6.attention.self.key
encoder.layer.6.attention.self.value
enc

In [15]:
def lora_adapter(model, r=8, alpha=16):
    """Applies LoRA to attention layers by injecting trainable low-rank matrices."""
    # freezing all other parameters in layers
    for param in model.parameters():
        param.requires_grad = False

    for name, module in model.named_modules():
        if "attention" in name and isinstance(module, nn.Linear):  # selecting only attention layers
            std_dev = 1 / torch.sqrt(torch.tensor(r).float())
            module.lora_A = nn.Parameter(torch.randn(r, module.out_features) * std_dev) # Gaussian initialization
            module.lora_B = nn.Parameter(torch.zeros(module.in_features, r)) # zero initialization
            module.lora_A.requires_grad = True
            module.lora_B.requires_grad = True

            old_forward = module.forward  # storing original forward

            def lora_forward(x, old_forward=old_forward, self=module):
                return old_forward(x) + (alpha / r) * (x @ self.lora_B @ self.lora_A)

            module.forward = lora_forward  # overriding forward with the new one
    return model

In [16]:
regression_model = MoLFormerWithRegressionHeadAndAdapter(model, lora_adapter).to(device)

In [17]:
# defining optimizer and loss function
EPOCHS = 10
LR = 5e-5

optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

In [18]:
# training
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1, Loss: 1.4550553018138521


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2, Loss: 1.0244888409262611


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3, Loss: 0.8761473559197925


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4, Loss: 0.7453808709979057


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5, Loss: 0.6900563333006132


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6, Loss: 0.6092721962503025


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7, Loss: 0.5727141860695112


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8, Loss: 0.5425653465446971


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9, Loss: 0.5004130703352746


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10, Loss: 0.4824486521737916


In [19]:
# evaluation
regression_model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.6541835710958817


## IA3

In [None]:
for name, module in model.named_modules():
    print(name, module)

 MolformerModel(
  (embeddings): MolformerEmbeddings(
    (word_embeddings): Embedding(2362, 768, padding_idx=2)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (encoder): MolformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x MolformerLayer(
        (attention): MolformerAttention(
          (self): MolformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (rotary_embeddings): MolformerRotaryEmbedding()
            (feature_map): MolformerFeatureMap(
              (kernel): ReLU()
            )
          )
          (output): MolformerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (i

In [None]:
def ia3_adapter(model):
    """Applies iA3 by introducing learned diagonal scaling factors in attention keys/values and feed-forward networks."""
    # freezing all other parameters in layers
    for param in model.parameters():
        param.requires_grad = False

    for name, module in model.named_modules():
        # targeting key and value functions in attention and the output of first non-linearity in MLP
        if "key" in name or "value" in name or "intermediate.intermediate_act_fn" in name:
            if "intermediate.intermediate_act_fn" in name:
                parent_name = name.replace("intermediate_act_fn", "dense")  # Get parent module
                parent_module = dict(model.named_modules())[parent_name]
                module.ia3_scaling = nn.Parameter(torch.ones(parent_module.out_features))
            else:
                module.ia3_scaling = nn.Parameter(torch.ones(module.out_features))

            module.ia3_scaling.requires_grad = True

            old_forward = module.forward  # storing original forward

            def ia3_forward(x, old_forward=old_forward, self=module):
                return old_forward(x) * self.ia3_scaling

            module.forward = ia3_forward  # overriding forward with the new one
    return model

In [None]:
regression_model = MoLFormerWithRegressionHeadAndAdapter(model, ia3_adapter).to(device)

In [None]:
# defining optimizer and loss function
EPOCHS = 10
LR = 5e-5

optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

In [None]:
# training
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1, Loss: 3.2926398087115514


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2, Loss: 1.540277308083716


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3, Loss: 1.3881824946119672


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4, Loss: 1.3266865745896386


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5, Loss: 1.303437308470408


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6, Loss: 1.2885721713304519


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7, Loss: 1.2580866585175197


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8, Loss: 1.2524992072866077


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9, Loss: 1.2444857576063701


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10, Loss: 1.2119212059747606


In [None]:
# evaluation
regression_model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 1.223010246349298
