In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import warnings
# Silence every Python warning for the rest of the session. This keeps the
# output compact, but it also hides warnings that could help when debugging.
warnings.simplefilter("ignore") 
import logging
# Only log records at ERROR level or above are emitted by the root logger.
logging.basicConfig(level=logging.ERROR)

In [4]:
from kinoml.datasets.chembl import ChEMBLDatasetProvider
# Build the dataset provider with from_source() called without arguments.
# NOTE(review): the trailing comment looks like the remains of a call that
# passed `sample=20000`, presumably to load a subset for quick runs - confirm
# against the from_source() signature before re-enabling it.
chembl = ChEMBLDatasetProvider.from_source() #sample=20000)



HBox(children=(FloatProgress(value=0.0, max=203730.0), HTML(value='')))




In [5]:
chembl

<ChEMBLDatasetProvider with 203730 pIC50Measurement measurements and 162819 systems>

In [6]:
# Tabular view of the provider, displayed as the cell output.
# NOTE(review): the MeasurementType column of the displayed table contains both
# pIC50Measurement and pKdMeasurement rows, whereas the provider repr only
# mentions pIC50Measurement - worth checking which one is accurate.
df = chembl.to_dataframe()
df

Unnamed: 0,Systems,n_components,Measurement,MeasurementType
0,P00533 & Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F...,2,7.387216,pIC50Measurement
1,P35968 & Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F...,2,4.782516,pIC50Measurement
2,P00533 & Cc1cc(C(=O)N2CCOCC2)[nH]c1/C=C1\C(=O)...,2,6.769551,pIC50Measurement
3,P06239 & Nc1ncnc2c1c(-c1cccc(Oc3ccccc3)c1)cn2C...,2,6.853872,pIC50Measurement
4,P06239 & Nc1ncnc2c1c(-c1cccc(Oc3ccccc3)c1)cn2C...,2,5.928118,pIC50Measurement
...,...,...,...,...
203725,P42345 & CC(C)n1nc(-c2cc3cc(O)ccc3[nH]2)c2c(N)...,2,8.522879,pKdMeasurement
203726,P42345 & CO[C@H]1CC[C@H](N2C(=O)CNc3ncc(-c4ccc...,2,7.552842,pKdMeasurement
203727,P42345 & CNC(=O)c1cccc(-c2ccc3c(N4CCOC[C@@H]4C...,2,9.853872,pKdMeasurement
203728,P42345 & CO[C@H]1C[C@@H]2CC[C@@H](C)[C@@](O)(O...,2,9.221849,pKdMeasurement


In [7]:
# Headline sizes of the dataset: raw counts first, then the number of distinct
# proteins and distinct ligand names found across all systems.
print("Measurements:", len(chembl.measurements))
print("Systems:", len(chembl.systems))
print("Proteins:", len({system.protein for system in chembl.systems}))
print("Ligands:", len({system.ligand.name for system in chembl.systems}))

Measurements: 203730
Systems: 162819
Proteins: 422
Ligands: 103296


With this many ligands (far more than in PKIS2), this dataset needs much more memory and takes much longer (~10 min) to initialize!

In [8]:
from kinoml.features.ligand import SmilesToLigandFeaturizer, MorganFingerprintFeaturizer
from kinoml.features.protein import AminoAcidCompositionFeaturizer
from kinoml.features.core import HashFeaturizer, Concatenated, Pipeline

# Ligand branch: a two-step pipeline ending in a Morgan fingerprint
# (nbits=1024, radius=2).
morgan_featurizer = Pipeline(
    [
        SmilesToLigandFeaturizer(),
        MorganFingerprintFeaturizer(nbits=1024, radius=2),
    ]
)

# Protein branch: amino acid composition of the sequence.
composition_featurizer = AminoAcidCompositionFeaturizer()

# Both branches are joined along axis 0 into one feature vector per system.
concat_featurizers = Concatenated(
    [morgan_featurizer, composition_featurizer],
    axis=0,
)

You can prefeaturize everything before the loop with this cell:

In [9]:
# %%time
# chembl.featurize(concat_featurizers)

Or delay the featurization until the systems are needed by passing the featurizer to the `to_pytorch` constructor. We will use this strategy!

## Temporary workaround

To debug the loss problem, we will only consider one protein for now: the one with the most measurements.

In [10]:
from collections import Counter

# Number of measurements per protein name.
counts = Counter(
    measurement.system.protein.name for measurement in chembl.measurements
)
# The five most frequent proteins as (name, count) pairs, largest count first.
sorted_counts = counts.most_common(5)
sorted_counts

[('P35968', 10034),
 ('P00533', 8581),
 ('O60674-O60674', 5659),
 ('P15056', 5248),
 ('Q16539', 5057)]

In [11]:
from kinoml.core.measurements import pIC50Measurement

# Keep only the pIC50 measurements of the most frequently measured protein
# (first entry of `sorted_counts`).
measurements = [
    candidate
    for candidate in chembl.measurements
    if candidate.system.protein.name == sorted_counts[0][0]
    and isinstance(candidate, pIC50Measurement)
]
len(measurements)

9628

In [12]:
# NOTE: the leading underscore marks _SingleTypeChEMBLDatasetProvider as private
# by convention, so this import may break if the library reorganises its internals.
from kinoml.datasets.chembl import _SingleTypeChEMBLDatasetProvider
# Wrap the filtered measurements in a single-type provider and hand that to
# ChEMBLDatasetProvider, so the subset is used through the same class as `chembl`.
subchembl = ChEMBLDatasetProvider([_SingleTypeChEMBLDatasetProvider(measurements)])

In [13]:
subchembl

<ChEMBLDatasetProvider with 9628 pIC50Measurement measurements and 7426 systems>

### End of temporary workaround: `subchembl` is used from now on instead of `chembl`

In [14]:
# Featurization is delayed: the featurizer is passed to to_pytorch() rather
# than calling featurize() on the provider beforehand.
# The call returns a list of datasets (a single TorchDataset for this subset).
# NOTE: the same call is repeated in the DataLoader cell further down, which
# rebinds `datasets`.
datasets = subchembl.to_pytorch(featurizer=concat_featurizers)
datasets

[<kinoml.datasets.torch_datasets.TorchDataset at 0x7fd94d5f33d0>]

In [15]:
# Observation-model functions of the provider for the PyTorch backend,
# displayed for inspection.
# NOTE(review): `observation_models` is not referenced again in the cells shown
# here; the training loop reads `loader.dataset.observation_model` instead.
observation_models = subchembl.observation_models(backend="pytorch")
observation_models

[<function kinoml.core.measurements.pIC50Measurement._observation_model_pytorch(dG_over_KT, substrate_conc=1e-06, michaelis_constant=1, inhibitor_conc=1e-06, **kwargs)>]

In [16]:
import torch
from kinoml.ml.torch_models import NeuralNetworkRegression
from tqdm.auto import trange, tqdm
from kinoml.core.measurements import null_observation_model

# Use DataLoader for minibatches
# NOTE: this repeats the to_pytorch() call made in an earlier cell and rebinds
# `datasets`; one of the two calls is redundant.
datasets = subchembl.to_pytorch(featurizer=concat_featurizers)
# One loader per dataset, each created with batch_size=256.
loaders = [dataset.as_dataloader(batch_size=256) for dataset in datasets]

In [17]:
# --- Model, optimizer and loss ------------------------------------------------
# The input width is taken from the first dataset instead of being hard-coded.
input_size = datasets[0].estimate_input_size()
model = NeuralNetworkRegression(input_size=input_size[0])
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_function = torch.nn.MSELoss()  # Mean squared error

# --- Training -----------------------------------------------------------------
nb_epoch = 100
# One entry per epoch: the SUM of the per-batch losses of that epoch
# (not normalised by the number of batches).
loss_timeseries = []
# ys[epoch] holds one (delta_g, prediction, y) tuple per minibatch. The plotting
# cells below read these tensors, including `prediction.grad`.
ys = []
range_epochs = trange(nb_epoch, desc="Epochs (+ featurization...)")
for epoch in range_epochs:
    # TODO: decide between a single cumulative loss and one loss per loader.
    cumulative_loss = 0
    ys.append([])
    for loader in loaders:  # FIXME: make sure we are balancing across measurement types
        for x, y in loader:
            # Debugging aid - uncomment to catch non-finite inputs or targets:
            # assert not (torch.isnan(x).any() or torch.isinf(x).any()), f"x has nan and/or infs!"
            # assert not (torch.isnan(y).any() or torch.isinf(y).any()), f"y has nan and/or infs!"

            # Forward pass: the network output is passed through the dataset's
            # observation model to obtain the quantity compared with y.
            delta_g = model(x)
            prediction = loader.dataset.observation_model(delta_g)
            # Keep prediction.grad available after backward(); the gradient
            # histogram cell reads it.
            prediction.retain_grad()

            ys[-1].append((delta_g, prediction, y))

            # NOTE(review): MSELoss broadcasts when `prediction` and `y` differ
            # in shape (e.g. (N, 1) vs (N,)). PyTorch only reports this through
            # a warning, and warnings are silenced at the top of this notebook -
            # confirm both shapes match while debugging the loss.
            loss = loss_function(prediction, y)
            cumulative_loss += loss.item()

            # Standard update: clear old gradients, backpropagate, step.
            # Zeroing right before backward() follows
            # https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    loss_timeseries.append(cumulative_loss)
    range_epochs.set_description(f"Epochs (loss={loss_timeseries[-1]:.2e})")

HBox(children=(FloatProgress(value=0.0, description='Epochs (+ featurization...)', style=ProgressStyle(descrip…




In [24]:
import numpy as np
from ipywidgets import interact
from matplotlib import pyplot as plt

def predicted_vs_true(i=100):
    """Scatter the predictions of one epoch against the true values.

    Reads the global ``ys`` filled by the training loop, where ``ys[epoch]``
    is a list of ``(delta_g, prediction, y)`` tuples, one per minibatch.

    Parameters
    ----------
    i : int
        Epoch index into ``ys``. Indices past the last recorded epoch are
        clamped to it, which is what the slider already does with the
        default, so a direct call with the default shows the final epoch
        instead of raising ``IndexError``.
    """
    i = min(i, len(ys) - 1)
    fig, ax = plt.subplots()
    predicted = np.concatenate([batch[1].detach().numpy() for batch in ys[i]])
    true = np.concatenate([batch[2].detach().numpy() for batch in ys[i]])
    ax.scatter(predicted, true)
    ax.set(xlim=(0, 15), ylim=(0, 15))
    ax.set_title(f"Predicted vs. true (epoch {i})")
    ax.set_xlabel("Predicted y")
    ax.set_ylabel("True y")
    plt.show()


# Slider over every recorded epoch; the trailing `;` hides the repr of the call.
interact(predicted_vs_true, i=(0, len(ys)-1));

interactive(children=(IntSlider(value=99, description='i', max=99), Output()), _dom_classes=('widget-interact'…

In [21]:
def predicted_grad_vs_true(i=100):
    """Histogram of the loss gradients w.r.t. the predictions of one epoch.

    Reads the global ``ys`` filled by the training loop and uses the ``.grad``
    attribute of each stored prediction tensor (kept there by the
    ``retain_grad()`` call in the training loop).

    Parameters
    ----------
    i : int
        Epoch index into ``ys``. Indices past the last recorded epoch are
        clamped to it, which is what the slider already does with the
        default, so a direct call with the default shows the final epoch
        instead of raising ``IndexError``.
    """
    i = min(i, len(ys) - 1)
    fig, ax = plt.subplots()
    gradients = np.concatenate([batch[1].grad.detach().numpy() for batch in ys[i]])
    ax.hist(gradients)
    ax.set(xlim=(-1, 1), ylim=(0, 3000))
    ax.set_title(f"Gradients of the loss w.r.t. predictions (epoch {i})")
    ax.set_xlabel("Gradients")
    ax.set_ylabel("Count")
    plt.show()


# Slider over every recorded epoch; the trailing `;` hides the repr of the call.
interact(predicted_grad_vs_true, i=(0, len(ys)-1));

interactive(children=(IntSlider(value=99, description='i', max=99), Output()), _dom_classes=('widget-interact'…