In [1]:
# ! pip install PyTDC
# ! pip install transformers==4.46.3

# ! pip uninstall -y torchvision
# ! pip cache purge
# ! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import copy
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from rdkit import Chem
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from torch.nn import MSELoss, BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup

# Project-local helpers live one directory up; extend the path before importing.
sys.path.append('../')
import utils

In [None]:
from tdc.benchmark_group import admet_group
from tdc.single_pred import ADME, Tox

In [None]:
def to_canonical(smiles):
    """Round-trip a SMILES string through RDKit and return the re-serialised form.

    Args:
        smiles: SMILES string to parse with ``Chem.MolFromSmiles``.

    Returns:
        The string produced by ``Chem.MolToSmiles(..., isomericSmiles=True)``,
        or ``None`` when RDKit could not parse the input.
    """
    parsed = Chem.MolFromSmiles(smiles)
    # Guard clause: MolFromSmiles yields None for input it cannot parse.
    if parsed is None:
        return None
    return Chem.MolToSmiles(parsed, isomericSmiles=True)

class CustomDataset(Dataset):
    """Dataset yielding ``(smiles, target)`` pairs from a DataFrame.

    Args:
        df: DataFrame holding one molecule per row.
        tokenizer: Stored on the instance; not used by this class itself
            (tokenization happens in the collate function).
        smiles_col: Name of the column containing the SMILES strings.
        target_col: Name of the label column, or ``None`` for unlabeled data.
    """

    def __init__(self, df, tokenizer, smiles_col, target_col=None):
        self.df = df
        self.tokenizer = tokenizer
        self.smiles_col = smiles_col
        self.target_col = target_col

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(smiles, target)`` for positional row ``idx``.

        ``target`` is ``None`` when the dataset was built without a
        ``target_col``. Previously that case raised ``UnboundLocalError``
        because ``target`` was never assigned.
        """
        row = self.df.iloc[idx]
        smiles = row[self.smiles_col]
        target = row[self.target_col] if self.target_col is not None else None
        return smiles, target

def create_collator(tokenizer):
    """Build a ``collate_fn`` that tokenizes a batch of ``(smiles, target)`` pairs.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(smiles, padding=True, truncation=True,
            return_tensors='pt', max_length=512)`` and expected to return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` entries.

    Returns:
        A function mapping a list of ``(smiles, target)`` tuples to
        ``(input_ids, attention_mask, targets)``. ``targets`` is a tensor built
        from the batch labels, or ``None`` when every label in the batch is
        ``None`` (unlabeled data).

    Raises:
        Any exception raised while building the batch is logged and re-raised.
        Returning ``None`` instead would only defer the failure to the
        consumer, which unpacks the batch into three values.
    """
    def collator(batch):
        try:
            smiles, targets = zip(*batch)
            encodings = tokenizer(list(smiles), padding=True, truncation=True, return_tensors='pt', max_length=512)
            input_ids = encodings['input_ids']
            attention_mask = encodings['attention_mask']
            # Unlabeled batches carry None for every target; there is nothing to tensorize.
            if all(t is None for t in targets):
                target_tensor = None
            else:
                target_tensor = torch.tensor(np.array(targets))
            return input_ids, attention_mask, target_tensor
        except Exception as e:
            print('Error in collator:')
            print(e)
            raise
    return collator

In [None]:
# Path to the TDC ADMET group data (not referenced by the code visible in this notebook section).
base_dir = '../input_data/tdcommons/admet_group'

# Tokenizer for the MoLFormer checkpoint; trust_remote_code is passed because
# the checkpoint is loaded with custom code from the hub.
tokenizer = AutoTokenizer.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
)

# Batch-building function shared by every DataLoader below.
collator = create_collator(tokenizer)

In [None]:
class Transformer(nn.Module):
    """Pretrained encoder plus an MLP head on mean-pooled hidden states.

    Args:
        reg_drop_rate: Dropout probability used before each linear layer of the head.
        intermediate_size: Width of the head's hidden layer.
        num_targets: Number of output values per molecule.
        model_name: Hugging Face checkpoint to load as the encoder. Defaults to
            the MoLFormer checkpoint this notebook uses.
    """

    def __init__(self,
                reg_drop_rate=0.1,
                intermediate_size=256,
                num_targets=1,
                model_name="ibm/MoLFormer-XL-both-10pct"):

        super(Transformer, self).__init__()
        self.reg_drop_rate = reg_drop_rate
        self.num_targets = num_targets
        self.intermediate_size = intermediate_size

        self.transformer = AutoModel.from_pretrained(model_name, deterministic_eval=True, trust_remote_code=True)
        # Take the embedding width from the loaded checkpoint; fall back to the
        # previously hard-coded 768 if the config does not expose it.
        self.hidden_size = getattr(self.transformer.config, "hidden_size", 768)
        self.regressor = nn.Sequential(
            nn.Dropout(self.reg_drop_rate),
            nn.Linear(self.hidden_size, self.intermediate_size),
            nn.SiLU(),
            nn.Dropout(self.reg_drop_rate),
            nn.Linear(self.intermediate_size, self.num_targets)
        )

    def forward(self, input_ids, attention_mask, layer_idx=-1):
        """Predict from the attention-masked mean of one hidden-state layer.

        Args:
            input_ids: Token ids for the batch.
            attention_mask: Mask with 1 for real tokens and 0 for padding; it is
                indexed as (batch, sequence) by the pooling below.
            layer_idx: Index into the encoder's ``hidden_states`` tuple
                (``-1`` selects the last entry).

        Returns:
            Output of the head, one row per batch element with ``num_targets`` columns.
        """
        model_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden = model_out.hidden_states[layer_idx]
        # Sum only the unmasked token vectors, then divide by the token count.
        summed = (attention_mask.unsqueeze(2) * hidden).sum(dim=1)
        # clamp(min=1) keeps a fully masked row from dividing by zero (NaN output).
        token_counts = attention_mask.sum(dim=1).unsqueeze(1).clamp(min=1)
        embeddings = summed / token_counts
        output = self.regressor(embeddings)
        return output

In [None]:
# Dataset names passed to the TDC loaders in the sweep below.
tasks = [
    'pgp_broccatelli',
    'bioavailability_ma',
]

In [None]:
# Per-layer fine-tuning sweep.
#
# For every task and every encoder hidden-state layer, a fresh model is trained
# once per learning rate; the checkpoint with the best validation score across
# ALL learning rates and epochs is evaluated on the test split. One test score
# per layer is written to a per-task CSV.


def _compute_metric(metric, targets, preds):
    """Score ``preds`` against ``targets`` using the metric named by ``metric``."""
    if metric == 'mae':
        return np.mean(np.abs(np.array(preds) - np.array(targets)))
    if metric == 'spearman':
        return spearmanr(targets, preds)[0]
    if metric == 'pearson':
        return pearsonr(targets, preds)[0]
    if metric == 'auc':
        return roc_auc_score(targets, preds)
    if metric == 'aucpr':
        return average_precision_score(targets, preds)
    raise ValueError(f"Unknown metric: {metric}")


def _predict(model, dataloader, layer_idx):
    """Run ``model`` in eval mode over ``dataloader``; return (preds, targets) as lists."""
    model.eval()
    all_preds, all_targs = [], []
    with torch.no_grad():
        for input_ids, attention_mask, targets in dataloader:
            input_ids = input_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')
            preds = model(input_ids, attention_mask, layer_idx) \
                        .squeeze(-1) \
                        .cpu() \
                        .numpy()
            all_preds.extend(preds.tolist())
            all_targs.extend(targets.numpy().tolist())
    return all_preds, all_targs


dfs = []

for task in tasks:

    if task.startswith('.'):
        continue

    print(task)

    # Pick the evaluation metric from the task lists defined in utils.
    prefix = 'tdcommons/'
    if prefix+task in utils.tdc_mae_tasks:
        metric = 'mae'
    elif prefix+task in utils.tdc_spearman_task:
        metric = 'spearman'
    elif prefix+task in utils.polaris_pearson_tasks:
        metric = 'pearson'
    elif prefix+task in utils.tdc_auroc_tasks:
        metric = 'auc'
    elif prefix+task in utils.tdc_aucpr_tasks:
        metric = 'aucpr'
    elif prefix+task in utils.tdc_aucpr2_tasks:
        metric = 'aucpr'
    elif prefix+task in utils.polaris_aucpr_tasks:
        metric = 'aucpr'
    else:
        raise ValueError(f"Task {task} not found in any known task list.")

    # Regression metrics train with MSE on standardized targets; the rest are
    # treated as binary classification.
    is_regression = metric in ('mae', 'spearman', 'pearson')

    # Try the ADME loader first and fall back to Tox. Catch Exception rather
    # than using a bare except so KeyboardInterrupt/SystemExit still propagate.
    try:
        data = ADME(name = task)
    except Exception:
        data = Tox(name = task)

    split = data.get_split(method = 'scaffold')

    train_df = split['train'].rename({'Drug': 'smiles', 'Y': 'target'}, axis=1).drop('Drug_ID', axis=1)
    val_df = split['valid'].rename({'Drug': 'smiles', 'Y': 'target'}, axis=1).drop('Drug_ID', axis=1)
    test_df = split['test'].rename({'Drug': 'smiles', 'Y': 'target'}, axis=1).drop('Drug_ID', axis=1)

    train_df['smiles'] = train_df['smiles'].apply(to_canonical)
    val_df['smiles'] = val_df['smiles'].apply(to_canonical)
    test_df['smiles'] = test_df['smiles'].apply(to_canonical)

    if is_regression:
        scaler = StandardScaler()
        # fit only on train targets
        train_vals = train_df[['target']].values
        scaler.fit(train_vals)
        # add scaled targets
        train_df['target_scaled'] = scaler.transform(train_vals)
        val_df['target_scaled']   = scaler.transform(val_df[['target']].values)
        # keep original test targets aside for final metrics
        test_targets_orig = test_df['target'].values
        test_df['target_scaled']  = scaler.transform(test_df[['target']].values)
        target_col = 'target_scaled'
    else:
        target_col = 'target'

    # Loaded here only to read the number of hidden layers from its config.
    transformer = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)

    train_dataset = CustomDataset(train_df, tokenizer, 'smiles', target_col)
    train_dataloader = DataLoader(train_dataset, batch_size=64, collate_fn=collator, shuffle=True, drop_last=True)

    val_dataset = CustomDataset(val_df, tokenizer, 'smiles',target_col)
    val_dataloader = DataLoader(val_dataset, batch_size=512, collate_fn=collator, shuffle=False, drop_last=False)

    test_dataset = CustomDataset(test_df, tokenizer, 'smiles', target_col)
    test_dataloader = DataLoader(test_dataset, batch_size=512, collate_fn=collator, shuffle=False, drop_last=False)

    # Index 0 of hidden_states is skipped; indices 1..num_hidden_layers are swept.
    num_layers = transformer.config.num_hidden_layers + 1
    layer_indices = list(range(1, num_layers))

    test_metrics = []
    for layer_idx in layer_indices:
        print(f'Layer: {layer_idx}')

        # Best-checkpoint tracking lives at the layer level so that it spans the
        # whole learning-rate sweep. Resetting it inside the LR loop would make
        # the test evaluation use only the last learning rate tried.
        # For MAE we want to minimize; for others maximize
        if metric == 'mae':
            best_val = float('inf')
        else:
            best_val = -float('inf')
        best_state = None
        best_epoch = -1
        best_lr = None

        # hyperparameter sweep over learning rates
        for lr in [1e-5, 2e-5, 5e-5, 1e-4, 2e-4]:
            print(f'LR={lr}')
            model = Transformer().to('cuda')
            optimizer = AdamW(model.parameters(), lr=lr)
            epochs = 50
            num_training_steps = len(train_dataloader) * epochs
            num_warmup_steps = int(0.05 * num_training_steps)
            scheduler_warmup = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
            # choose loss fn by task type
            if is_regression:
                loss_fn = MSELoss()
            else:
                loss_fn = BCEWithLogitsLoss()

            for epoch in range(epochs):
                model.train()
                for input_ids, attention_mask, targets in train_dataloader:
                    input_ids = input_ids.to('cuda')
                    attention_mask = attention_mask.to('cuda')
                    targets = targets.to('cuda').float()
                    optimizer.zero_grad()
                    # squeeze(-1) drops only the output dimension, matching the
                    # evaluation path; a bare squeeze() would also collapse a
                    # batch of size 1.
                    preds = model(input_ids, attention_mask, layer_idx).squeeze(-1)
                    loss = loss_fn(preds, targets)
                    loss.backward()
                    optimizer.step()
                    scheduler_warmup.step()

                # evaluate on validation set
                val_preds, val_targs = _predict(model, val_dataloader, layer_idx)
                val_score = _compute_metric(metric, val_targs, val_preds)

                # A NaN score (e.g. correlation of constant predictions) compares
                # False both ways and therefore never counts as an improvement.
                improved = (metric == 'mae' and val_score < best_val) or (metric != 'mae' and val_score > best_val)
                if improved:
                    best_val   = val_score
                    best_state = copy.deepcopy(model.state_dict())
                    best_epoch = epoch
                    best_lr    = lr

        if best_state is None:
            # No epoch produced a comparable validation score; record NaN for
            # this layer instead of failing inside load_state_dict(None).
            print(f"Layer {layer_idx}: no valid validation score; recording NaN")
            test_metrics.append(float('nan'))
            continue

        print(f"  selected lr={best_lr}, epoch={best_epoch}, val_{metric}={best_val}")

        # now evaluate that best model on the test set
        model = Transformer().to('cuda')
        model.load_state_dict(best_state)
        test_preds, test_targs = _predict(model, test_dataloader, layer_idx)

        if is_regression:
            # bring preds back to original units
            test_preds = scaler.inverse_transform(
                np.array(test_preds).reshape(-1, 1)
            ).flatten()
            # use original test targets (unscaled)
            test_targs = test_targets_orig

        test_score = _compute_metric(metric, test_targs, test_preds)

        print(f"Layer {layer_idx}: {test_score}")
        test_metrics.append(test_score)

    # save a DataFrame with one column per task and rows = layer indices
    results_df = pd.DataFrame({task: test_metrics}, index=layer_indices)
    dfs.append(results_df)
    # Make sure the output directory exists so a long run is not lost at save time.
    os.makedirs("tmp", exist_ok=True)
    # NOTE(review): index=False omits the layer index from the CSV; rows are in
    # layer_indices order. Confirm whether downstream readers rely on that.
    results_df.to_csv(f"tmp/molf_{task}_layer_results.csv", index=False)

# dfs = pd.concat(dfs, axis=1)
# dfs.to_csv('./results_molf_finetune.csv', index=False)