In [2]:
import torch
! pip install datasets
from datasets import load_dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
import random
!pip install gdown  # if not already installed
import gdown



In [3]:
class MoLFormerWithRegressionHead(nn.Module):
    # TODO: your code goes here
    def __init__(self, base_model):
        super(MoLFormerWithRegressionHead, self).__init__()
        self.base_model = base_model
        self.regression_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(pooled_output)
# loading the fine-tuned MLM model
MODEL_URL = "https://drive.google.com/drive/folders/155k0aND9BfMUZT5tCHjvfPYkY7IuFMRm"
output = './unsupervised_model.pth'
gdown.download_folder(MODEL_URL, output=output, quiet=False)

gdown.download(MODEL_URL, output, quiet=False)
base_model = AutoModel.from_pretrained(output)
regression_model = MoLFormerWithRegressionHead(base_model)

Retrieving folder contents


Processing file 15CIJ2uwO3u1FlzWzVwrfHypE0xDPr2Kk config.json
Processing file 15BqT0pUcvke9yiE1-KslSMOf-BVQe-et model.safetensors


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=15CIJ2uwO3u1FlzWzVwrfHypE0xDPr2Kk
To: /content/unsupervised_model.pth/config.json
100%|██████████| 1.18k/1.18k [00:00<00:00, 2.51MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=15BqT0pUcvke9yiE1-KslSMOf-BVQe-et
From (redirected): https://drive.google.com/uc?id=15BqT0pUcvke9yiE1-KslSMOf-BVQe-et&confirm=t&uuid=03467ad0-1553-4d41-8edc-4c232cf80270
To: /content/unsupervised_model.pth/model.safetensors
100%|██████████| 187M/187M [00:02<00:00, 74.9MB/s]
Download completed
Downloading...
From: https://drive.google.com/drive/folders/155k0aND9BfMUZT5tCHjvfPYkY7IuFMRm
To: /content/unsupervised_model.pth
1.23MB [00:00, 31.6MB/s]


The repository for ./unsupervised_model.pth contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/./unsupervised_model.pth.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


configuration_molformer.py:   0%|          | 0.00/7.60k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- configuration_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for ./unsupervised_model.pth contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/./unsupervised_model.pth.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling_molformer.py:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ibm/MoLFormer-XL-both-10pct:
- modeling_molformer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [4]:
url = 'https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8'
output = 'regression_model.pth'
gdown.download(url, output, quiet=False)
regression_model = torch.load(output)


Downloading...
From (original): https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8
From (redirected): https://drive.google.com/uc?id=15Flh6v2fHlyz4ruJ8pi-wMHuAFV81OB8&confirm=t&uuid=67170425-ee5c-43f5-8d74-eebc89d0673b
To: /content/regression_model.pth
100%|██████████| 179M/179M [00:02<00:00, 63.3MB/s]
  regression_model = torch.load(output)


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd

# =====================================
#   1) Example Model & Dataset Classes
# =====================================
class SMILESDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        encoding = self.tokenizer(item["SMILES"], padding="max_length", truncation=True, return_tensors="pt", max_length=self.max_length)
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding["labels"] = torch.tensor(item["label"])
        return encoding

# =====================================
#   2) Load Trained Model from Task1
# =====================================

# =======================================================
#   3) Compute Gradient & Hessian-Vector Product (HVP)
# =======================================================
def compute_gradient(model, loss_fn, inputs, labels, create_graph=False):
    """
    Returns the gradient of L wrt model.parameters() as a list of Tensors.
    If create_graph=True, it retains graph for higher-order derivatives.
    """
    model.zero_grad()
    preds = model(inputs)
    loss = loss_fn(preds, labels)
    loss.backward(create_graph=create_graph)
    grads = [p.grad.clone() for p in model.parameters() if p.requires_grad]
    return grads

def hvp(model, loss_fn, train_loader, vector, damp=0.01, device='cuda'):
    """
    Hessian-vector product:
      H * v = sum over training set ( ∂^2 L / ∂θ^2 ) v
    We'll do a mini-batch approximation:
      1) sample a batch from 'train_loader'
      2) compute gradient
      3) do a second backward pass with dot-product of gradient and v
    This is a standard approach for HVP (Pearlmutter trick).

    'vector' is a list of parameter-shaped tensors.
    """
    # 1) pick one batch to approximate
    batch = next(iter(train_loader))  # for demonstration
    inputs = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)
    inputs, labels = inputs.to(device), labels.to(device)

    # 2) first grad (with create_graph = True) so we can do second derivative
    model.zero_grad()
    preds = model(inputs)
    loss = loss_fn(preds, labels)
    grads = torch.autograd.grad(loss, model.parameters(), create_graph=True)

    # 3) compute dot(grads, vector)
    flat_grad = flatten_tensors(grads)
    flat_vec = flatten_tensors(vector)
    # dot product:
    dot_val = (flat_grad * flat_vec).sum()

    # 4) second backward pass to get Hessian-vector product
    hvp_grad = torch.autograd.grad(dot_val, model.parameters(), retain_graph=False)

    # Dampening: (H + λI) * v
    # hvp_grad[i] += damp * vector[i]
    hvp_plus_damp = []
    for hvg, v in zip(hvp_grad, vector):
        # hvg is partial derivative
        hvp_plus_damp.append(hvg + damp * v)

    return hvp_plus_damp

# =======================================
#   4) LiSSA to approximate iHVP = H^-1 v
# =======================================
def lissa_iHVP(
    model,
    loss_fn,
    train_loader,
    test_grad,         # This is our 'v' (often test_grad = ∇θ L_test)
    damp=0.01,
    scale=1.0,
    recursion_depth=10,
    device='cuda'
):
    """
    LiSSA approximation for iHVP = H^-1 * test_grad.
    We'll do the recursive "power series" style approach:

      x_0 = 0
      x_{t+1} = test_grad + (I - H)*x_t

    But we get H*x_t via hvp(...) each time, plus damping.
    Then we scale the updates to accelerate convergence.

    Pseudocode style:
      cur_estimate = 0
      for i in range(recursion_depth):
          hvp_est = hvp(model, loss_fn, train_loader, cur_estimate, damp=damp)
          cur_estimate = test_grad + scale * (cur_estimate - hvp_est)

      return cur_estimate
    """

    model.eval()
    cur_estimate = [torch.zeros_like(p) for p in test_grad]  # x_0 = 0

    for _ in range(recursion_depth):
        # HVP on cur_estimate
        hvp_est = hvp(model, loss_fn, train_loader, cur_estimate, damp=damp, device=device)

        # update: x_{t+1} = test_grad + (cur_estimate - hvp_est)*scale
        new_estimate = []
        for ce, hvp_e, tg in zip(cur_estimate, hvp_est, test_grad):
            # (cur_estimate - hvp_est)
            tmp = ce - hvp_e
            # multiply by scale
            tmp = scale * tmp
            # add test_grad
            tmp = tg + tmp
            new_estimate.append(tmp)

        cur_estimate = new_estimate

    return cur_estimate

# ================================================
#   5) Compute Influence Scores for External Data
# ================================================
def compute_influence_scores(
    model,
    loss_fn,
    train_loader,  # used for HVP approximation
    test_loader,   # the test set to define L_test
    external_loader,  # external set to rank
    recursion_depth=10,
    damp=0.01,
    scale=1.0,
    device='cuda'
):
    """
    For each external sample, compute:
      Influence(z_ext) = - ∇θ L_test(θ)^T H^-1 ∇θ L_external(θ).
    We do the following steps:
      1) Compute test_grad = ∇θ L_test(θ)  (one pass or entire test set)
      2) Approx iHVP = H^-1 test_grad  via LiSSA
      3) For each external sample z_ext:
         - compute ext_grad = ∇θ L_ext(θ)
         - influence = - dot(ext_grad, iHVP)
    Return a list of (sample_index, influence_score).
    """

    model.to(device)

    # 1) Compute test_grad by summing/averaging over entire test set
    test_grad = approximate_gradient_on_loader(model, loss_fn, test_loader, device=device)

    # 2) Approx iHVP = H^-1 * test_grad
    iHVP = lissa_iHVP(
        model=model,
        loss_fn=loss_fn,
        train_loader=train_loader,
        test_grad=test_grad,
        recursion_depth=recursion_depth,
        damp=damp,
        scale=scale,
        device=device
    )

    # 3) For each external sample, compute influence
    influence_scores = []
    for idx, (batch) in enumerate(external_loader):
        ext_x = batch["input_ids"].to(device)
        ext_y = batch["labels"].to(device)
        ext_x, ext_y = ext_x.to(device), ext_y.to(device)
        # compute gradient wrt this single sample
        ext_grad = compute_gradient(model, loss_fn, ext_x, ext_y, create_graph=False)

        # dot(iHVP, ext_grad)
        dot_val = dot_tensors(iHVP, ext_grad)

        # influence = - dot_val
        influence = -dot_val.item()
        influence_scores.append((idx, influence))

    return influence_scores

# Helper: Sum or average gradient over entire loader
def approximate_gradient_on_loader(model, loss_fn, data_loader, device='cuda'):
    """
    Compute the gradient of L wrt the model parameters,
    where L is the sum (or mean) over the entire data_loader.
    We'll just sum them and then optionally scale.
    """
    model.eval()
    total_grad = None
    count = 0
    for batch in data_loader:
        x = batch["input_ids"].to(device)
        y = batch["labels"].to(device)
        grads = compute_gradient(model, loss_fn, x, y, create_graph=False)
        if total_grad is None:
            total_grad = grads
        else:
            for i in range(len(grads)):
                total_grad[i] += grads[i]
        count += 1

    # (Optionally) average them
    for i in range(len(total_grad)):
        total_grad[i] /= float(count)
    return total_grad

# Flatten/unflatten utilities
def flatten_tensors(tensor_list):
    return torch.cat([t.view(-1) for t in tensor_list], dim=0)

def dot_tensors(list1, list2):
    return torch.sum(flatten_tensors(list1) * flatten_tensors(list2))

# ================
#  Main Example
# ================
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Load your model from Task1
    model = regression_model
    model.to(device)
    loss_fn = nn.MSELoss()

    # 2) Create Datasets / Loaders
    #    (You must replace these with your real CSV file paths)
    DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
    MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
    BATCH_SIZE = 16
    dataset = load_dataset(DATASET_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    dataset_train, dataset_test = train_test_split(list(dataset["train"]), test_size=0.2, random_state=42, shuffle=True)
    train_dataset = SMILESDataset(dataset_train, tokenizer=tokenizer)
    test_dataset = SMILESDataset(dataset_test, tokenizer=tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


    external_dataset = pd.read_csv("./External-Dataset_for_Task2.csv")
    external_dataset = external_dataset.to_dict(orient="records")
    external_dataset = SMILESDataset(external_dataset, tokenizer=tokenizer)
    external_dataloader = DataLoader(external_dataset, batch_size=1, shuffle=False)


    # 3) Compute influence scores
    influence_scores = compute_influence_scores(
        model=model,
        loss_fn=loss_fn,
        train_loader=train_dataloader,
        test_loader=test_dataloader,
        external_loader=external_dataloader,
        recursion_depth=10,
        damp=0.01,
        scale=1.0,
        device=device
    )

    # 4) Sort and find top-k
    influence_scores.sort(key=lambda x: x[1], reverse=True)
    top_k = 100
    top_k_indices = [x[0] for x in influence_scores[:top_k]]
    return top_k_indices

    # (Optional) You can then combine those top-k external samples
    # with your training set and fine-tune the model as described
    # in the original instructions.

tok_k = main()


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [17]:
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"
BATCH_SIZE = 16
dataset = load_dataset(DATASET_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
dataset_train, dataset_test = train_test_split(list(dataset["train"]), test_size=0.2, random_state=42, shuffle=True)
train_dataset = SMILESDataset(dataset_train, tokenizer=tokenizer)
test_dataset = SMILESDataset(dataset_test, tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


external_dataset = pd.read_csv("./External-Dataset_for_Task2.csv")
external_dataset = external_dataset.to_dict(orient="records")
external_dataset = SMILESDataset(external_dataset, tokenizer=tokenizer)
external_dataloader = DataLoader(external_dataset, batch_size=1, shuffle=False)

top_k_samples = torch.utils.data.Subset(external_dataset, tok_k)

combined_dataset = torch.utils.data.ConcatDataset([train_dataset, top_k_samples])
print(len(combined_dataset))
print(len(train_dataset))
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)

EPOCHS = 10
LR = 5e-5

optimizer = torch.optim.AdamW(regression_model.parameters(), lr=LR)
loss_fn = nn.MSELoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for epoch in range(EPOCHS):
    regression_model.train()
    total_loss = 0
    for batch in tqdm(combined_loader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")



3460
3360


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 1, Loss: 0.15803397723606655


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 2, Loss: 0.12962844416144348


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 3, Loss: 0.1139337760351953


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 4, Loss: 0.10859477464996634


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 5, Loss: 0.0968896527198099


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 6, Loss: 0.08967452918489774


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 7, Loss: 0.08941337072423526


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 8, Loss: 0.0822117507635128


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 9, Loss: 0.0765110279477778


  0%|          | 0/217 [00:00<?, ?it/s]

Epoch 10, Loss: 0.07733576855666581


In [18]:
from sklearn.metrics import mean_squared_error

regression_model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = regression_model(input_ids, attention_mask=attention_mask).squeeze()
        predictions.extend(outputs.cpu().numpy())
        actuals.extend(labels.cpu().numpy())

mse = mean_squared_error(actuals, predictions)
print("MSE:", mse)

MSE: 0.4242777294304062
