In [1]:
!pip install pandas numpy spacecutter torch datamol skorch

Collecting spacecutter
  Downloading spacecutter-0.2.1-py3-none-any.whl.metadata (3.3 kB)
Collecting datamol
  Downloading datamol-0.12.5-py3-none-any.whl.metadata (8.0 kB)
Collecting skorch
  Downloading skorch-1.1.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metad

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; force_remount=True requests a fresh mount on every run.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [3]:
import os

# Root folder of the ADMET challenge data.
# The default is an absolute path under the Colab Drive mount point (/content/drive), so it
# does not depend on the kernel's current working directory.
# For a run outside Colab, set the POLARIS_ADMET_DIR environment variable to the local copy
# of this folder instead of editing the notebook.
proj_dir = os.environ.get(
    'POLARIS_ADMET_DIR',
    '/content/drive/MyDrive/Polaris_ASAP_competition/polaris_challenge/admet',
)


In [None]:
import pandas as pd
import numpy as np
from spacecutter.models import OrdinalLogisticModel
import torch
from torch import nn
import datamol as dm
import matplotlib.pyplot as plt

from skorch import NeuralNet
from skorch.dataset import Dataset
from skorch.helper import SkorchDoctor
from skorch.callbacks import EarlyStopping

from spacecutter.callbacks import AscensionCallback
from spacecutter.losses import CumulativeLinkLoss
from sklearn.metrics import mean_absolute_error
from scipy.stats import kendalltau

from sklearn.preprocessing import RobustScaler

# from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer
from utils import *

In [5]:
# Imputed training data
df_imp = pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_log_pmm_imputed.csv')
# Non-imputed validation data
df_val = pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_features.csv')
# change names
df_val.rename(columns={'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII':'LogMDR1.MDCKII'}, inplace=True)
df_imp.rename(columns={'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII':'LogMDR1.MDCKII'}, inplace=True)

# Smiles columns because they were removed (for some unknown reason)
df_smiles = pd.read_csv(f'{proj_dir}/data/train_admet_all.csv')
df_smiles.rename(columns={'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII':'LogMDR1.MDCKII'}, inplace=True)


df_imp = df_imp.merge(df_smiles.loc[:, ['Molecule.Name', 'CXSMILES']], on='Molecule.Name', how='left')
df_val = df_val.merge(df_smiles.loc[:, ['Molecule.Name', 'CXSMILES']], on='Molecule.Name', how='left')

In [17]:
def _build_ordinal_net(num_features, num_classes, train_ix, val_ix):
  """Build an untrained skorch wrapper around a spacecutter ordinal model.

  Args:
    num_features: width of the input feature matrix.
    num_classes: number of ordinal classes handed to ``OrdinalLogisticModel``.
    train_ix: row positions (in the stacked array passed to ``fit``) used for training.
    val_ix: row positions (same array) used as the skorch validation split.

  Returns:
    A ``NeuralNet`` configured for full-batch training (batch size = number of
    training rows), at most 500 epochs, with early stopping (patience 100,
    ``load_best=True``).
  """
  # Two hidden layers as wide as the input, followed by a single latent score.
  predictor = nn.Sequential(
      nn.Linear(num_features, num_features),
      nn.ReLU(),
      nn.Linear(num_features, num_features),
      nn.ReLU(),
      nn.Linear(num_features, 1),
  )

  return NeuralNet(
      module=OrdinalLogisticModel,
      module__predictor=predictor,
      module__num_classes=num_classes,
      criterion=CumulativeLinkLoss,
      optimizer=torch.optim.Adam,
      # Fixed split: the indices are bound as function arguments, not loop variables.
      train_split=lambda ds, y: (torch.utils.data.Subset(ds, train_ix),
                                 torch.utils.data.Subset(ds, val_ix)),
      callbacks=[
          ('ascension', AscensionCallback()),
          ('early_stopping', EarlyStopping(threshold=0.0001, load_best=True,
                                           patience=100)),
      ],
      verbose=0,
      batch_size=len(train_ix),
      max_epochs=500,
  )


def _best_epoch(history):
  """Return the epoch of the last history row flagged ``valid_loss_best``, or None."""
  for row_ix in range(len(history) - 1, -1, -1):
    row = history[row_ix]
    if row['valid_loss_best']:
      return row['epoch']
  return None


# Seed torch so weight initialisation (and hence the reported metrics) is repeatable.
SEED = 0
torch.manual_seed(SEED)

train, val = train_data(df_train=df_imp,
                        imp_ix=1,
                        df_val=df_val,
                        n_cuts=None, features=['chemberta', 'chem_prop', 'rdkit_simple'])

targets = list(train[1].keys())

# NOTE(review): never populated in this cell; kept because other cells may reference it.
training_by_target = {}

# Row indices of each imputed dataset (None when there is a single, un-indexed dataset).
ix_by_imp = train[2]
num_ds = len(ix_by_imp) if ix_by_imp is not None else 1

# Only keep features which vary across the training rows. This depends on the
# feature matrix alone, so it is computed once rather than once per target.
keep_ix = np.std(train[0], axis=0) > 0
all_X = train[0][:, keep_ix].astype(np.float32)

for target in targets:
  # accumulators over imputed datasets
  all_y_hat = []
  all_epochs = []

  all_y = train[1][target]['values'].reshape(-1, 1)
  bins = train[1][target]['bins']

  # Validation data (not imputed, so prepared once per target). Rows are selected
  # with `missing_ix`; presumably these are the rows where the target is observed
  # and `values` is already restricted to them -- TODO confirm against utils.train_data.
  missing_ix = val[1][target]['missing_ix']
  Xval = val[0][missing_ix, :].astype(np.float32)[:, keep_ix]
  # validation y values have been digitized using all the training y values,
  # so should be consistent.
  yval = val[1][target]['values'].reshape(-1, 1)

  print(f'{target}')
  for ds_ix in range(num_ds):
    if ds_ix % 10 == 0:
      print(f'\t{ds_ix}/{num_ds}', end=',')

    # Get imputed training dataset. The zeroth imputed dataset is the original data,
    # hence the +1 offset.
    # NOTE(review): with num_ds = len(ix_by_imp) this assumes ix_by_imp is keyed 1..num_ds;
    # if key 0 is included, the last lookup would be out of range -- confirm.
    if ix_by_imp is not None:
      imp_ix = ix_by_imp[ds_ix + 1]
    else:
      imp_ix = np.arange(all_X.shape[0])
    X = all_X[imp_ix, :]
    y = all_y[imp_ix]

    # Stack train and validation rows so skorch can split them by position.
    train_v_X = np.vstack([X, Xval])
    train_v_y = np.vstack([y, yval])
    train_ix = np.arange(X.shape[0])
    val_ix = np.arange(X.shape[0], train_v_X.shape[0])

    # NOTE(review): classes are counted from the training labels only; if a class is
    # absent from this subset, num_classes will be smaller than the label range -- confirm.
    num_classes = len(np.unique(y))

    # NOTE(review): early stopping monitors the same validation rows that are scored
    # below, so the reported MAE is likely optimistic.
    skorch_model = _build_ordinal_net(X.shape[1], num_classes, train_ix, val_ix)
    skorch_model.fit(train_v_X, train_v_y)

    # Most probable ordinal class per validation row, mapped back through `bins`.
    y_hat = np.argmax(skorch_model.predict(Xval), axis=1)
    all_y_hat.append(bins[y_hat])

    best_epoch = _best_epoch(skorch_model.history)
    if best_epoch is not None:
      all_epochs.append(best_epoch)

  # mean prediction over imputed datasets.
  mean_y_hat = np.mean(np.vstack(all_y_hat), axis=0)
  y_val_cont = bins[yval.reshape(-1)]

  print(f"\tmae: {mean_absolute_error(y_val_cont, mean_y_hat):4.2f}")
  print(f"\tmean epoches: {np.mean(all_epochs)}")

training data
using chemberta
	creating new scaler
using chemprop features
	creating new scaler
using rdkit simple
	creating new scaler
validation data
using chemberta
	using existing scaler
using chemprop features
	using existing scaler
using rdkit simple
	using existing scaler
LogD
	0/1,	mae: 0.36
	mean epoches: 29.0
LogMLM
	0/1,	mae: 0.46
	mean epoches: 1.0
LogHLM
	0/1,	mae: 0.67
	mean epoches: 30.0
LogKSOL
	0/1,	mae: 0.53
	mean epoches: 2.0
LogMDR1.MDCKII
	0/1,	mae: 0.64
	mean epoches: 2.0


In [None]:
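# DISABLED (kept for reference): pooled variant that calls train_data with imp_ix=None,
# shuffles all training rows and fits a single model per target.
# NOTE(review): `batch_size` computed below is never passed to NeuralNet, which uses X.shape[0].
# train, val = train_data(df_train=df_imp,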
#                         imp_ix=None,
#                         df_val=df_val,
#                         n_cuts=None, features=['rdkit_simple', 'chemp_prop'])

# targets = list(train[1].keys())

# training_by_target = {}
# n_imp_ds = df_imp['.imp'].max()

# for target in targets:
#   all_X = train[0]
#   all_y = train[1][target]['values'].reshape(-1, 1)

#   # only keep features which are different
#   keep_ix = np.std(all_X, axis=0)>0
#   all_X = all_X[:, keep_ix].astype(np.float32)

#   # Get imputed training dataset.
#   shuffle_ix = np.random.choice(np.arange(all_X.shape[0]), size=all_X.shape[0], replace=False)
#   X = all_X[shuffle_ix, :]
#   y = all_y[shuffle_ix]

#   # Get validation data (not imputed so only done once. contains missing values)
#   missing_ix = val[1][target]['missing_ix']
#   Xval = val[0]
#   Xval = Xval[missing_ix, :].astype(np.float32)
#   Xval = Xval[:, keep_ix]
#   # validation y values have been digitized using all the training y values.
#   # so should be consistent.
#   yval = val[1][target]['values'].reshape(-1, 1)
#   bins = train[1][target]['bins']

#   # batch size
#   batch_size = all_X.shape[0]//n_imp_ds

#   # Stack all data for convenience.
#   train_v_X = np.vstack([X, Xval])
#   train_v_y = np.vstack([y, yval])
#   train_ix = np.arange(X.shape[0])
#   val_ix = np.arange(X.shape[0], train_v_X.shape[0])

#   # Model dimensions
#   num_features = X.shape[1]
#   num_classes = len(np.unique(y))

#   # Simple predictor
#   predictor = nn.Sequential(
#     nn.Linear(num_features, num_features),
#     nn.ReLU(),
#     nn.Linear(num_features, num_features),
#     nn.ReLU(),
#     nn.Linear(num_features, 1),
#   )

#   # Model
#   skorch_model = NeuralNet(
#     module=OrdinalLogisticModel,
#     module__predictor=predictor,
#     module__num_classes=num_classes,
#     criterion=CumulativeLinkLoss,
#     optimizer=torch.optim.Adam,
#     train_split=lambda ds, y: (torch.utils.data.Subset(ds, train_ix),
#                               torch.utils.data.Subset(ds, val_ix)),
#     callbacks=[
#         ('ascension', AscensionCallback()),
#         ('early_stopping', EarlyStopping(threshold=0.0001, load_best=True,
#                                         patience=100))
#     ],
#     verbose=0,
#     batch_size=X.shape[0],
#     max_epochs=500,

#   )
#   # Fit
#   skorch_model.fit(train_v_X, train_v_y)

#   # predict on validation
#   y_hat_ord = np.argmax(skorch_model.predict(Xval), axis=1)
#   y_hat = bins[y_hat_ord]
#   y_val = bins[yval.reshape(-1)]

#   print(f"\tmae: {mean_absolute_error(y_val, y_hat):4.2f}")

#   # Find best epoch
#   for i in range(len(skorch_model.history)-1, -1, -1):
#     batch = skorch_model.history[i]

#     if batch['valid_loss_best']:
#       print(f"\tepochs: {batch['epoch']}/{len(skorch_model.history)}")
#       break



using rdkit simple
using chemprop features
using rdkit simple
using chemprop features
	mae: 0.61
	epochs: 12/112
	mae: 0.45
	epochs: 6/106
	mae: 0.51
	epochs: 33/133
	mae: 0.59
	epochs: 20/120
	mae: 0.87
	epochs: 23/123
