In [None]:
import torch
! pip install datasets
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random
!pip install gdown
import gdown

# 1. Loading previously trained model

In [2]:
class MoLFormerWithRegressionHead(nn.Module):
    def __init__(self, base_model):
        super(MoLFormerWithRegressionHead, self).__init__()
        self.base_model = base_model
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)
# loading the fine-tuned MLM model
MODEL_URL = "https://drive.google.com/drive/folders/155k0aND9BfMUZT5tCHjvfPYkY7IuFMRm"
output = './unsupervised_model.pth'
gdown.download_folder(MODEL_URL, output=output, quiet=False)

gdown.download(MODEL_URL, output, quiet=False)
base_model = AutoModel.from_pretrained(output)
regression_model = MoLFormerWithRegressionHead(base_model)

Retrieving folder contents


Processing file 15CIJ2uwO3u1FlzWzVwrfHypE0xDPr2Kk config.json
Processing file 15BqT0pUcvke9yiE1-KslSMOf-BVQe-et model.safetensors


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=15CIJ2uwO3u1FlzWzVwrfHypE0xDPr2Kk
To: /content/unsupervised_model.pth/config.json
100%|██████████| 1.18k/1.18k [00:00<00:00, 2.76MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=15BqT0pUcvke9yiE1-KslSMOf-BVQe-et
From (redirected): https://drive.google.com/uc?id=15BqT0pUcvke9yiE1-KslSMOf-BVQe-et&confirm=t&uuid=d2174316-5b40-4b99-8fd0-aad37877b8f5
To: /content/unsupervised_model.pth/model.safetensors
100%|██████████| 187M/187M [00:03<00:00, 53.5MB/s]
Download completed
Downloading...
From: https://drive.google.com/drive/folders/155k0aND9BfMUZT5tCHjvfPYkY7IuFMRm
To: /content/unsupervised_model.pth
1.23MB [00:00, 7.80MB/s]


The repository for ./unsupervised_model.pth contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/./unsupervised_model.pth.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for ./unsupervised_model.pth contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/./unsupervised_model.pth.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [3]:
url = 'https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8'
output = 'regression_model.pth'
gdown.download(url, output, quiet=False)
regression_model = torch.load(output)

Downloading...
From (original): https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8
From (redirected): https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8&confirm=t&uuid=6151ba06-370f-40d3-8495-933bc8379147
To: /content/regression_model.pth
100%|██████████| 179M/179M [00:03<00:00, 47.4MB/s]
  regression_model = torch.load(output)


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd


class SMILESDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item["SMILES"], padding="max_length", truncation=True, return_tensors="pt", max_length=self.max_length)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding["labels"] = torch.tensor(item["label"])
        return encoding


# 2. Compute leverage Scores

In [31]:
def compute_leverage_scores(X):
    """
    Given data matrix X (shape: [N, d]), compute the leverage scores:
    leverage = diag(H), where H = X(X^T X)^{-1}X^T
    using a stable pseudo-inverse if X^T X is nearly singular.
    """
    X = X.double()
    A = X.t().matmul(X)
    A_inv = torch.linalg.pinv(A)
    H = X.matmul(A_inv).matmul(X.t())
    leverage = torch.diagonal(H, 0)
    return leverage


def sample_coreset(X, y, coreset_size):
    """
    Samples a subset of size `coreset_size` from X,y with probabilities
    proportional to each point's leverage score.
    """
    with torch.no_grad():
        leverage = compute_leverage_scores(X)
        probs = leverage / leverage.sum()
        probs_np = probs.cpu().numpy()
    indices = np.random.choice(
        np.arange(X.shape[0]),
        size=coreset_size,
        replace=False,
        p=probs_np
    )
    return indices


In [None]:
def main():
    external_dataset = pd.read_csv("./External-Dataset_for_Task2.csv")
    external_dataset = external_dataset.to_dict(orient="records")
    external_dataset = SMILESDataset(external_dataset, tokenizer=tokenizer)
    external_dataloader = DataLoader(external_dataset, batch_size=1, shuffle=False)


    indices = sample_coreset(
        torch.cat([x["input_ids"] for x in external_dataloader], dim=0).to(device),
        torch.cat([x["labels"] for x in external_dataloader], dim=0).to(device),
        coreset_size=100
    )
    return indices

top_k = main()


# 3. Train model on new dataset

In [None]:
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
BATCH_SIZE = 16
dataset = load_dataset(DATASET_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset_train, dataset_test = train_test_split(list(dataset["train"]), test_size=0.2, random_state=42, shuffle=True)
train_dataset = SMILESDataset(dataset_train, tokenizer=tokenizer)
test_dataset = SMILESDataset(dataset_test, tokenizer=tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
external_dataset = pd.read_csv("./External-Dataset_for_Task2.csv")
external_dataset = external_dataset.to_dict(orient="records")
external_dataset = SMILESDataset(external_dataset, tokenizer=tokenizer)


top_k_samples = torch.utils.data.Subset(external_dataset, top_k)

combined_dataset = torch.utils.data.ConcatDataset([train_dataset, top_k_samples])
combined_dataset = torch.utils.data.ConcatDataset([train_dataset, external_dataset])

In [37]:
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)

EPOCHS = 10
LR = 5e-5

optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(combined_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(combined_loader)}")



  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 1, Loss: 0.17091300545105767


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 2, Loss: 0.14214620841707726


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 3, Loss: 0.12302938895910068


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 4, Loss: 0.11106694839471813


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 5, Loss: 0.10499466500167763


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 6, Loss: 0.0958512217522188


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 7, Loss: 0.08748462349664436


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 8, Loss: 0.08141799103549995


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 9, Loss: 0.07594509156803117


  0%|          | 0/229 [00:00<?, ?it/s]

Epoch 10, Loss: 0.0748696852735687


# Test Model on Test Dataset

In [38]:
from sklearn.metrics import mean_squared_error

regression_model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.40115468621181705
