In [1]:
# Load IPython's autoreload extension so the %autoreload magic becomes available.
%load_ext autoreload

In [2]:
# Mode 2: reload imported modules before each cell executes, so edits to source files are picked up live.
%autoreload 2

In [3]:
import logging
import warnings

# Install a blanket "ignore" filter so Python warnings do not clutter cell outputs.
# Be aware that this also hides warnings that point at real problems.
warnings.simplefilter("ignore")

# Configure the root logger at ERROR level.
logging.basicConfig(level=logging.ERROR)

In [4]:
from kinoml.datasets.chembl import ChEMBLDatasetProvider

# Number of measurements requested from the ChEMBL source via the `sample` argument.
# NOTE(review): if `sample` draws at random, the counts shown further down will change
# between runs unless a seed is fixed — confirm against `from_source`.
CHEMBL_SAMPLE_SIZE = 20000
chembl = ChEMBLDatasetProvider.from_source(sample=CHEMBL_SAMPLE_SIZE)

100%|██████████| 20000/20000 [00:00<00:00, 28127.34it/s]


In [5]:
# Display the provider's repr, which summarizes how many measurements and systems were loaded.
chembl

<ChEMBLDatasetProvider with 20000 IC50Measurement measurements and 19221 systems>

In [6]:
# Export the provider to a tabular view for inspection. Columns in the rendered output are
# Systems, n_components, Measurement and MeasurementType.
df = chembl.to_dataframe()
# Bare last expression: rendered with the frame's rich (truncated head/tail) display.
df

Unnamed: 0,Systems,n_components,Measurement,MeasurementType
0,P22607 & COc1cc(OC)c(Cl)c(N2Cc3cnc(Nc4cccc(C)c...,2,7.12,IC50Measurement
1,P00533 & COCCn1c(=O)oc2cc3ncnc(Nc4ccc(OC)c(OC)...,2,5500.00,IC50Measurement
2,P07949 & Cc1cc(CCN(C)C)ccc1Nc1nccc(-c2c[nH]c3n...,2,1117.00,IC50Measurement
3,P43250 & Cc1ccc(-c2nnc(SCC(=O)O)[nH]2)c(O)c1,2,6880.00,IC50Measurement
4,O14965 & O=Nc1c(-c2c(O)[nH]c3c(F)cccc23)[nH]c2...,2,2000.00,IC50Measurement
...,...,...,...,...
19995,O43283 & C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c...,2,340.00,KdMeasurement
19996,Q08345 & CCN(CC)CCNC(=O)c1c(C)[nH]c(/C=C2\C(=O...,2,2000.00,KdMeasurement
19997,P09619 & COCCOc1cc2ncnc(N3CCN(c4ncc(Cc5ccccc5)...,2,8.00,KdMeasurement
19998,P23458-P23458 & CCN(CC)CCNC(=O)c1c(C)[nH]c(/C=...,2,6000.00,KdMeasurement


In [7]:
# Raw sizes of the provider's collections.
print("Measurements:", len(chembl.measurements))
print("Systems:", len(chembl.systems))

# Distinct proteins across all systems (de-duplicated through a set).
unique_proteins = {system.protein for system in chembl.systems}
print("Proteins:", len(unique_proteins))

# Distinct ligands, identified by their `name` attribute.
unique_ligand_names = {system.ligand.name for system in chembl.systems}
print("Ligands:", len(unique_ligand_names))

Measurements: 20000
Systems: 19221
Proteins: 389
Ligands: 16592


Because this dataset contains many more ligands than PKIS2, it needs much more memory and takes longer (~10 minutes) to initialize!

In [8]:
from kinoml.features.ligand import SmilesToLigandFeaturizer, MorganFingerprintFeaturizer
from kinoml.features.core import HashFeaturizer, Concatenated, Pipeline

# Ligand branch: a two-step pipeline that first runs SmilesToLigandFeaturizer and then
# computes a Morgan fingerprint (1024 bits, radius 2).
morgan_featurizer = Pipeline(
    [
        SmilesToLigandFeaturizer(),
        MorganFingerprintFeaturizer(nbits=1024, radius=2),
    ]
)

# Protein branch: HashFeaturizer addressed with the ("protein", "sequence") path
# (presumably hashes each system's protein sequence — confirm in kinoml.features.core).
hashed_sequence_featurizer = HashFeaturizer(("protein", "sequence"))

# Join both branches along axis 0 into one featurizer.
concat_featurizers = Concatenated(
    [morgan_featurizer, hashed_sequence_featurizer],
    axis=0,
)

You can pre-featurize every system up front, before the training loop, by running this cell:

In [9]:
# %%time
# chembl.featurize(concat_featurizers)

Alternatively, delay the featurization until the systems are actually needed by passing the featurizer to the `to_pytorch` method. We will use this strategy!

In [10]:
# Build the PyTorch datasets, handing over the featurizer instead of pre-featurizing.
# The result is a list of TorchDataset objects (three in the rendered output).
datasets = chembl.to_pytorch(featurizer=concat_featurizers)
# Bare last expression: show the list of datasets.
datasets

[<kinoml.datasets.torch_datasets.TorchDataset at 0x7fc9321fcfd0>,
 <kinoml.datasets.torch_datasets.TorchDataset at 0x7fc9321fc3d0>,
 <kinoml.datasets.torch_datasets.TorchDataset at 0x7fc932587110>]

In [11]:
# Fetch the PyTorch observation-model functions; the rendered output lists one each for
# IC50Measurement, KiMeasurement and KdMeasurement.
# NOTE(review): this variable is only displayed here — the training cell reads the
# observation model from each dataset instead.
observation_models = chembl.observation_models(backend="pytorch")
# Bare last expression: show the list of functions and their signatures.
observation_models

[<function kinoml.core.measurements.IC50Measurement._observation_model_pytorch(values, substrate_conc=1e-06, michaelis_constant=1, inhibitor_conc=1e-06, **kwargs)>,
 <function kinoml.core.measurements.KiMeasurement._observation_model_pytorch(values, inhibitor_conc=1e-06, **kwargs)>,
 <function kinoml.core.measurements.KdMeasurement._observation_model_pytorch(values, inhibitor_conc=1e-06, **kwargs)>]

In [None]:
import torch
from kinoml.ml.torch_models import NeuralNetworkRegression
from tqdm.auto import trange, tqdm

# --- Model, optimizer and loss ---------------------------------------------
# The network's input width comes from the first dataset's size estimate
# (first element of the value returned by `estimate_input_size`).
input_size = datasets[0].estimate_input_size()
model = NeuralNetworkRegression(input_size=input_size[0])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = torch.nn.MSELoss()  # Mean squared error

# --- Data --------------------------------------------------------------------
# Rebuild the datasets with the same call used earlier in the notebook, then wrap
# each one in a DataLoader so training runs on minibatches of 512.
datasets = chembl.to_pytorch(featurizer=concat_featurizers)
loaders = [dataset.as_dataloader(batch_size=512) for dataset in datasets]

# --- Training ----------------------------------------------------------------
# One shared model is trained against every loader; each dataset supplies the
# observation model that is applied to the raw model output before computing the loss.
#
# NOTE(review): Python warnings are silenced at the top of this notebook, so a
# shape mismatch between `prediction` and `y` (which MSELoss would broadcast and
# warn about) would go unnoticed — confirm both have the same shape.
nb_epoch = 100
loss_timeseries = []  # one plain float per epoch: sum of per-batch losses over all loaders
for epoch in trange(nb_epoch, desc="Epochs"):
    # Accumulate as a Python float so the history holds numbers, not tensor views.
    cumulative_loss = 0.0
    for loader in loaders:
        observation_model = loader.dataset.observation_model
        # The first component of the qualified name is the measurement class,
        # e.g. "IC50Measurement"; it labels the inner progress bar.
        measurement_label = observation_model.__qualname__.split('.')[0]
        for x, y in tqdm(loader, desc=f"Batches for {measurement_label}", leave=False):
            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Raw model output for this minibatch
            delta_g = model(x)

            # Pass the model output through this dataset's observation model
            prediction = observation_model(delta_g)

            # Loss between the observation-model prediction and the measured values
            loss = loss_function(prediction, y)

            # Track the epoch total (detached from the graph via .item())
            cumulative_loss += loss.item()

            # Gradients w.r.t. parameters
            loss.backward()

            # Parameter update
            optimizer.step()
    loss_timeseries.append(cumulative_loss)
    if epoch % 5 == 0:
        print(f"epoch {epoch} : loss {loss_timeseries[-1]:.4E}")
print("Done!")