In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import warnings
# Silence every Python warning for the rest of the session to keep cell output
# clean. NOTE(review): this also hides deprecation/runtime warnings that may
# matter when debugging.
warnings.simplefilter("ignore") 
import logging
# Configure the root logger so that only records at ERROR level or above are shown.
logging.basicConfig(level=logging.ERROR)

In [41]:
from kinoml.datasets.chembl import ChEMBLDatasetProvider

# Number of measurements requested from the ChEMBL source for this demo
# (the progress bar and the provider repr both report 5000).
N_SAMPLES = 5000
chembl = ChEMBLDatasetProvider.from_source(sample=N_SAMPLES)

100%|██████████| 5000/5000 [02:15<00:00, 36.81it/s]


In [42]:
# Show the provider's repr: a one-line summary of measurement and system counts.
# NOTE(review): the repr names only IC50Measurement, while later cells yield
# three datasets / observation models (IC50, Ki, Kd) — presumably the repr
# reports just one of the measurement types; confirm.
chembl

<ChEMBLDatasetProvider with 5000 IC50Measurement measurements and 4920 systems>

In [43]:
# Tabular view of the dataset. The columns shown in the output are Systems,
# n_components, Measurement and MeasurementType (one row per measurement).
df = chembl.to_dataframe()
df

Unnamed: 0,Systems,n_components,Measurement,MeasurementType
0,P00519 & O=C(Nc1ccc(OC(F)(F)Cl)cc1)c1cnc(NCC[C...,2,7.8,IC50Measurement
1,P15056 & CNC(=O)c1cc(Oc2ccc3[nH]c(Nc4ccc(C(F)(...,2,11.0,IC50Measurement
2,Q9NZJ5 & Fc1ccc(Nc2c(-c3ncccn3)oc3cnccc23)c2cn...,2,3500.0,IC50Measurement
3,P15056 & COc1cc(Nc2cncc(Oc3ccc4c(c3)CCC4=O)n2)...,2,70000.0,IC50Measurement
4,Q15118 & C[C@H]1CCc2cc(-c3nc(Cl)nc4ccsc34)ccc2...,2,198.0,IC50Measurement
...,...,...,...,...
4995,P36888 & CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3...,2,1000.0,KdMeasurement
4996,Q13163 & C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc...,2,60.0,KdMeasurement
4997,P31152 & CSc1cccc(Nc2ncc3cc(-c4c(Cl)cccc4Cl)c(...,2,1100.0,KdMeasurement
4998,P29376 & CC(C)NC(=O)[C@H]1CC[C@@H](n2/c(=N/C(=...,2,8.2,KdMeasurement


In [44]:
# Headline counts for the sampled dataset; proteins and ligand names are
# de-duplicated with set comprehensions before counting.
print("Measurements:", len(chembl.measurements))
print("Systems:", len(chembl.systems))
print("Proteins:", len({system.protein for system in chembl.systems}))
print("Ligands:", len({system.ligand.name for system in chembl.systems}))

Measurements: 5000
Systems: 4920
Proteins: 301
Ligands: 4516


Because it contains many more ligands than PKIS2, this dataset uses much more memory and takes longer (~10 mins) to initialize!

In [45]:
from kinoml.features.core import Concatenated, HashFeaturizer
from kinoml.features.ligand import MorganFingerprintFeaturizer

# Ligand side: Morgan fingerprint configured with 1024 bits and radius 2.
morgan_featurizer = MorganFingerprintFeaturizer(nbits=1024, radius=2)
# Protein side: presumably hashes the attribute reached via ("protein", "sequence")
# on each system — confirm against HashFeaturizer.
hashed_sequence_featurizer = HashFeaturizer(("protein", "sequence"))
# Join the outputs of both featurizers along axis 0.
concat_featurizers = Concatenated(
    [morgan_featurizer, hashed_sequence_featurizer],
    axis=0,
)

In [46]:
%%time
# Run the concatenated featurizer over every system in the provider (the
# progress bar reports 4920 systems). The return value is discarded, so the
# features are presumably stored on the systems in place — confirm against
# ChEMBLDatasetProvider.featurize.
chembl.featurize(concat_featurizers)

Featurizing systems...: 100%|██████████| 4920/4920 [00:24<00:00, 203.75it/s]

CPU times: user 24.2 s, sys: 234 ms, total: 24.4 s
Wall time: 24.2 s





So far, the workflow is the same... differences arise when multiple measurement *types* are considered!

In [47]:
# Convert the featurized provider into PyTorch datasets. The output shows a list
# of three TorchDataset objects — presumably one per measurement type; confirm.
datasets = chembl.to_pytorch()
datasets

[<kinoml.datasets.torch_datasets.TorchDataset at 0x7f9044424c50>,
 <kinoml.datasets.torch_datasets.TorchDataset at 0x7f9044424850>,
 <kinoml.datasets.torch_datasets.TorchDataset at 0x7f9024518d50>]

In [48]:
# Fetch the PyTorch observation-model functions. The output lists three of them,
# for IC50Measurement, KiMeasurement and KdMeasurement, in that order.
# NOTE(review): the training cell zips these with the data loaders, so it relies
# on this order matching the order returned by chembl.to_pytorch() — confirm.
observation_models = chembl.observation_models(backend="pytorch")
observation_models

[<function kinoml.core.measurements.IC50Measurement._observation_model_pytorch(values, substrate_conc=1e-06, michaelis_constant=1, inhibitor_conc=1e-06, **kwargs)>,
 <function kinoml.core.measurements.KiMeasurement._observation_model_pytorch(values, inhibitor_conc=1e-06, **kwargs)>,
 <function kinoml.core.measurements.KdMeasurement._observation_model_pytorch(values, inhibitor_conc=1e-06, **kwargs)>]

In [49]:
# Wrap each dataset in its own DataLoader so training iterates over minibatches.
BATCH_SIZE = 512
loaders = [ds.as_dataloader(batch_size=BATCH_SIZE) for ds in datasets]
loaders

[<torch.utils.data.dataloader.DataLoader at 0x7f9044434d90>,
 <torch.utils.data.dataloader.DataLoader at 0x7f9044434a50>,
 <torch.utils.data.dataloader.DataLoader at 0x7f9010ce7710>]

In [51]:
import torch
from kinoml.ml.models import NeuralNetworkRegression

# Train a single regression network on all datasets at once: the network output
# (named delta_g here) is passed through the observation model paired with each
# loader before being compared with the measured values.
#
# NOTE(review): the logged loss stays at roughly 9.21e12 for all 100 epochs, so
# the model is effectively not learning. Presumably the raw measurement scale
# (the dataframe shows values from 7.8 up to 70000.0) dominates the squared
# error; consider transforming the targets (e.g. log scale) — confirm.
# NOTE(review): no random seed is set, so runs are not reproducible.

# The input width is read from the first featurized system of the first dataset.
model = NeuralNetworkRegression(input_size=datasets[0].systems[0].shape[0])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = torch.nn.MSELoss() # Mean squared error (averaged over each batch by default)

nb_epoch = 100
loss_timeseries = []  # one entry per epoch: sum of the per-batch losses
for epoch in range(nb_epoch):
    cumulative_loss = 0
    # Each loader is paired positionally with an observation model, so both
    # lists must be in the same order.
    for loader, obs_model in zip(loaders, observation_models):
        for x, y in loader:
            # Reset gradients accumulated by the previous step
            optimizer.zero_grad()
            # Forward pass: network output for this minibatch
            delta_g = model(x)

            # Map the network output through this loader's observation model,
            # then compute the loss against the measured values
            prediction = obs_model(delta_g)
            loss = loss_function(prediction, y)
            
            # Accumulate the scalar batch loss for the per-epoch log
            cumulative_loss += loss.item()

            # Backpropagate: gradients of the loss w.r.t. the model parameters
            loss.backward()

            # Update the parameters with Adam
            optimizer.step()
    loss_timeseries.append(cumulative_loss)
    # Report progress every 5 epochs
    if epoch % 5 == 0:
        print(f"epoch {epoch} : loss {loss_timeseries[-1]}")
print("Done!")

epoch 0 : loss 9214469804856.0
epoch 5 : loss 9214469789680.0
epoch 10 : loss 9214458851884.0
epoch 15 : loss 9214317160940.0
epoch 20 : loss 9214415900048.0
epoch 25 : loss 9214434641688.0
epoch 30 : loss 9214424554296.0
epoch 35 : loss 9214306476340.0
epoch 40 : loss 9214453552688.0
epoch 45 : loss 9214431674332.0
epoch 50 : loss 9214100963544.0
epoch 55 : loss 9213855979568.0
epoch 60 : loss 9213615402148.0
epoch 65 : loss 9213388358108.0
epoch 70 : loss 9214343776396.0
epoch 75 : loss 9213184039960.0
epoch 80 : loss 9213436373308.0
epoch 85 : loss 9214313355040.0
epoch 90 : loss 9214431227568.0
epoch 95 : loss 9214413024500.0
Done!
