In [None]:
# Record the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.11


# Task 1: Fine-tune Chemical Language Model

The goal is to fine-tune a pre-trained chemical language model on a regression task using the Lipophilicity dataset. The task involves predicting the lipophilicity value for a given molecule representation (SMILES string). You will learn how to load and tokenize a dataset from HuggingFace, how to load a pre-trained language model, and finally, how to run a model in inference mode.

Your task is to complete the missing code blocks below.

In [None]:
%%capture
!pip install -r requirements.txt

In [None]:
# Mount Google Drive at /content/drive (Colab only); the trained models are
# saved below /content/drive/MyDrive later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import dependencies
import torch
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random

# 1. Fine-tune a Chemical Language Model on Lipophilicity


## --- Step 1: Load Dataset ---

The dataset we are going to use is the [Lipophilicity](https://huggingface.co/datasets/scikit-fingerprints/MoleculeNet_Lipophilicity) dataset, part of the [MoleculeNet](https://pubs.rsc.org/en/content/articlelanding/2018/sc/c7sc02664a) benchmark.

Lipophilicity, also known as hydrophobicity, is a measure of how readily a substance dissolves in nonpolar solvents (such as oil) compared to polar solvents (such as water).

In [None]:
# specify dataset name and model name
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"  # HuggingFace Hub dataset id
# HuggingFace Hub id of the pre-trained MoLFormer checkpoint.
# NOTE: this name is rebound to a local checkpoint directory in section 3.
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  # MoLFormer model

In [None]:
# load the dataset from HuggingFace
# (only a "train" split is used from the result; it is split into train/test manually in Step 2)
dataset = load_dataset(DATASET_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

lipophilicity.csv:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

In [None]:
# Explore the dataset
# For example, print the column names and display a few sample rows
# TODO: your code goes here
train_split = dataset["train"]
print("Dataset columns:", train_split.column_names)
# show the first five records of the train split
for sample_idx in range(5):
    print("Sample", sample_idx, ":", train_split[sample_idx])

Dataset columns: ['SMILES', 'label']
Sample 0 : {'SMILES': 'Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14', 'label': 3.54}
Sample 1 : {'SMILES': 'COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23', 'label': -1.18}
Sample 2 : {'SMILES': 'COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl', 'label': 3.69}
Sample 3 : {'SMILES': 'OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3', 'label': 3.37}
Sample 4 : {'SMILES': 'Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)NCC#N)c1', 'label': 3.1}


In [None]:
# getting the maximum length of input
# (this counts characters of the raw SMILES strings, not tokenizer tokens)
max_length = max(len(record['SMILES']) for record in dataset['train'])
max_length

267

In [None]:
# define a PyTorch Dataset class for handling SMILES strings and targets

# TODO: your code goes here
class SMILESDataset(Dataset):
    """Map-style dataset pairing tokenized SMILES strings with regression targets.

    Args:
        dataset: indexable, sized collection of records; each record is a
            mapping with a "SMILES" string and a numeric "label".
        tokenizer: callable invoked on a single SMILES string with
            ``padding``, ``truncation``, ``return_tensors`` and ``max_length``
            keyword arguments; must return a mapping of tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for record ``idx`` plus a float32 "labels" scalar."""
        item = self.dataset[idx]
        encoding = self.tokenizer(item["SMILES"], padding="max_length", truncation=True, return_tensors="pt", max_length=self.max_length)
        # Drop the leading axis of every tokenizer output so that the
        # DataLoader can stack the samples into a batch itself.
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        # Pin the dtype: without it an integer-valued label would become an
        # int64 tensor, which nn.MSELoss rejects against float predictions.
        encoding["labels"] = torch.tensor(item["label"], dtype=torch.float32)
        return encoding

## --- Step 2: Split Dataset ---

As the original dataset contains only one split (the train split), we need to split the data into training and testing sets ourselves.

In [None]:
# tokenize the data
# load a pre-trained tokenizer from HuggingFace
# (trust_remote_code=True: the tokenizer implementation is custom code shipped in the model repository)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenization_molformer_fast.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

tokenization_molformer.py:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- tokenization_molformer_fast.py
- tokenization_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/54.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# split the data into training and test datasets
# TODO: your code goes here
# materialise the split as a list of record dicts, then hold out 20% for testing
all_records = list(dataset["train"])
dataset_train, dataset_test = train_test_split(
    all_records,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# construct Pytorch data loaders for both train and test datasets
BATCH_SIZE = 16 # adjust based on memory constraints

# TODO: your code goes here

# constructing datasets
train_dataset = SMILESDataset(dataset_train, tokenizer)
test_dataset = SMILESDataset(dataset_test, tokenizer)

# constructing data loaders
# Reshuffle the training samples every epoch; otherwise each epoch would
# iterate over exactly the same batches in exactly the same order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation order does not matter, so the test loader stays unshuffled.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## --- Step 3: Load Model ---

In [None]:
# load pre-trained model from HuggingFace
# (trust_remote_code=True: the MoLFormer modeling code is downloaded from the model repository)
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/187M [00:00<?, ?B/s]

In [None]:
# We need to add a regression head on the language model as we are doing a regression task.

# specify model with a regression head

class MoLFormerWithRegressionHead(nn.Module):
    """Encoder followed by a single linear layer that predicts one scalar per sequence.

    Args:
        base_model: encoder module that exposes ``config.hidden_size`` and
            whose output has a ``last_hidden_state`` attribute.
    """

    # TODO: your code goes here
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        hidden_dim = base_model.config.hidden_size
        self.regression_head = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return the regression output, one row of size 1 per input sequence."""
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # the hidden state at sequence position 0 serves as the sequence representation
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        prediction = self.regression_head(first_token_state)
        return prediction


In [None]:
# initialize the regression model

# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# wrap the pre-trained encoder with the regression head and move it to `device`
regression_model = MoLFormerWithRegressionHead(model).to(device)

## --- Step 4: Training ---

In [None]:
# TODO: your code goes here

# defining optimizer and loss function
EPOCHS = 10  # number of passes over the training set (also reused for MLM training below)
LR = 5e-5    # learning rate for AdamW

# the optimizer updates every parameter of the wrapped model (encoder and regression head)
optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
# mean squared error between predicted and reference lipophilicity
loss_fn = nn.MSELoss()

In [None]:
# training
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # The regression head emits one column per sample. Squeeze only that
        # last axis: a bare .squeeze() would also remove the batch axis when a
        # batch holds a single sample, leaving a 0-d tensor that no longer
        # lines up with the 1-d labels.
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze(-1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # report the mean per-batch loss of this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1, Loss: 1.015181790292263


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2, Loss: 0.5162702809487071


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3, Loss: 0.36181837760266805


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4, Loss: 0.2902875392919495


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5, Loss: 0.244354912851538


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6, Loss: 0.20329422262452898


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7, Loss: 0.2056249540299177


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8, Loss: 0.193525112810589


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9, Loss: 0.18806791438588075


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10, Loss: 0.1712883741018318


## --- Step 5: Evaluation ---

In [None]:
# TODO: your code goes here
from sklearn.metrics import mean_squared_error

# evaluation
regression_model.eval()
predictions, actuals = [], []
# no gradients are needed for inference
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Squeeze only the trailing output axis. With a bare .squeeze() a final
        # batch of one sample would turn into a 0-d array, and
        # predictions.extend(...) cannot iterate over a 0-d array.
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze(-1)
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

# mean squared error over the whole held-out test set
mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.8301883498034074


In [None]:
# Persist the fine-tuned baseline regression model to Google Drive.
# NOTE(review): passing the module itself to torch.save pickles the whole object rather than
# only its state_dict; loading it back presumably needs the same class definitions — confirm.
torch.save(regression_model, "/content/drive/MyDrive/NeuralNetworks-Task1_Results/supervised_regression")

# 2. Add Unsupervised Fine-tuning
In this step, you will perform unsupervised fine-tuning on the training dataset. This means the model will leverage only the SMILES strings without any corresponding labels to adapt its understanding of the data distribution. By familiarizing the model with the patterns and structure of the SMILES strings, you can potentially enhance its performance on downstream supervised tasks.

For this fine-tuning, you will use the Masked Language Modeling (MLM) objective, where the model learns to predict randomly masked tokens within the input sequence. Remember to save the fine-tuned model for later use.


In [None]:
# TODO: your code goes here

class SMILESDatasetMLM(Dataset):
    """Label-free dataset of tokenized SMILES strings for masked language modeling.

    Args:
        dataset: indexable, sized collection of records with a "SMILES" string.
        tokenizer: callable applied to one SMILES string; must return a mapping of tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset, self.tokenizer, self.max_length = dataset, tokenizer, max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for record ``idx`` with the leading axis removed."""
        smiles = self.dataset[idx]["SMILES"]
        tokenized = self.tokenizer(
            smiles,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        sample = {}
        for field_name, tensor in tokenized.items():
            # drop the leading axis so samples can be stacked into batches later
            sample[field_name] = tensor.squeeze(0)
        return sample

# Build label-free datasets for MLM from the same train/test records.
# NOTE: this rebinds the names train_dataset / test_dataset; the data loaders
# created earlier keep their references to the original regression datasets.
train_dataset = SMILESDatasetMLM(dataset_train, tokenizer)
test_dataset = SMILESDatasetMLM(dataset_test, tokenizer)

In [None]:
# load pre-trained model for MLM
# trust_remote_code=True: the checkpoint ships its own modeling code; without
# the flag, loading stops at an interactive "run the custom code? [y/N]" prompt,
# which blocks an unattended "Run All".
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

# defining data collator for MLM (to do the masking)
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # fraction of tokens selected for masking
)

The repository for ibm/MoLFormer-XL-both-10pct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/ibm/MoLFormer-XL-both-10pct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for ibm/MoLFormer-XL-both-10pct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/ibm/MoLFormer-XL-both-10pct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# Configuration of the MLM fine-tuning run: evaluate and checkpoint once per
# epoch, keep at most two checkpoints on disk.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NeuralNetworks-Task1_Results/Task1_MLM",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    logging_steps=100,
    save_total_limit=2,
    # Disable external experiment trackers: with the default setting an
    # installed Weights & Biases client asks for an API key interactively,
    # which stalls an unattended run of the notebook.
    report_to="none",
)



In [None]:
# defining Trainer for MLM
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # label-free MLM dataset built from the training records
    eval_dataset=test_dataset,     # label-free MLM dataset built from the held-out records
    data_collator=collator,        # applies the random masking to each batch
)

# train the model for MLM
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmahsa-ama1391[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.2416,0.208566
2,0.1967,0.178598
3,0.1845,0.174235
4,0.1679,0.146007
5,0.1417,0.146865
6,0.1454,0.158068
7,0.1254,0.133262
8,0.1315,0.138017
9,0.1221,0.134311
10,0.1273,0.130243


TrainOutput(global_step=2100, training_loss=0.1612333075205485, metrics={'train_runtime': 980.8195, 'train_samples_per_second': 34.257, 'train_steps_per_second': 2.141, 'total_flos': 2320737553612800.0, 'train_loss': 0.1612333075205485, 'epoch': 10.0})

In [None]:
# save the fine-tuned model
# (this directory is loaded again in section 3 as the starting point for regression fine-tuning)
model.save_pretrained("/content/drive/MyDrive/NeuralNetworks-Task1_Results/unsupervised_mlm")

# 3. Fine-Tune for Comparison
After performing unsupervised fine-tuning on the training data, we now fine-tune the model on the regression task with the regression head. By comparing the performance of the model before and after unsupervised fine-tuning, you can evaluate how the unsupervised fine-tuning impacts the model's performance on our target task.


In [None]:
# TODO: your code goes here

# loading the fine-tuned MLM model
# NOTE: MODEL_NAME now points at the local checkpoint written in section 2
# instead of the HuggingFace Hub id.
MODEL_NAME = "/content/drive/MyDrive/NeuralNetworks-Task1_Results/unsupervised_mlm"

# loading fine-tuned MLM model and add regression head
# - trust_remote_code=True: the checkpoint contains custom modeling code; without
#   the flag, loading stops at an interactive confirmation prompt.
# - deterministic_eval=True: same setting as the baseline encoder in section 1,
#   so that both regression models are compared under identical conditions.
base_model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)
regression_model = MoLFormerWithRegressionHead(base_model)

The repository for /content/drive/MyDrive/NeuralNetworks-Task1_Results/unsupervised_mlm contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co//content/drive/MyDrive/NeuralNetworks-Task1_Results/unsupervised_mlm.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# A fresh optimizer is required because regression_model is a new object with new parameters.
optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

In [None]:
regression_model.to(device)
# training
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # The regression head emits one column per sample. Squeeze only that
        # last axis: a bare .squeeze() would also remove the batch axis when a
        # batch holds a single sample, leaving a 0-d tensor that no longer
        # lines up with the 1-d labels.
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze(-1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # report the mean per-batch loss of this epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 1, Loss: 0.9393182836118199


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 2, Loss: 0.4801456727442287


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 3, Loss: 0.33931783657698406


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 4, Loss: 0.2544165210709685


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 5, Loss: 0.23508642170400845


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 6, Loss: 0.19803828834777787


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 7, Loss: 0.1837322650920777


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 8, Loss: 0.17415381682415804


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 9, Loss: 0.1681044571456455


  0%|          | 0/210 [00:00<?, ?it/s]

Epoch 10, Loss: 0.16580968387424946


In [None]:
# evaluation
regression_model.eval()
predictions, actuals = [], []
# no gradients are needed for inference
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Squeeze only the trailing output axis. With a bare .squeeze() a final
        # batch of one sample would turn into a 0-d array, and
        # predictions.extend(...) cannot iterate over a 0-d array.
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze(-1)
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

# mean squared error over the whole held-out test set
mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.5661676761509575


In [None]:
# Persist the regression model that was fine-tuned on top of the MLM-adapted encoder.
# NOTE(review): passing the module itself to torch.save pickles the whole object rather than
# only its state_dict; loading it back presumably needs the same class definitions — confirm.
torch.save(regression_model, "/content/drive/MyDrive/NeuralNetworks-Task1_Results/supervised_regression_after_unsupervised_mlm")