<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [10]</a>'.</span>

# Train a model with PyTorch

This notebook trains a single model using only PyTorch.

1. Tensors are loaded from Parquet files generated in the `features/` pipeline. Each Parquet file is loaded into a Torch Dataset subclass.
2. Random splits are applied for train/test/(val).
3. It will train a single model for a number of epochs across all datasets: epoch > dataloader > minibatch.
4. The loss is computed through the `loss_adapter` method in each measurement_type.
5. If validation is enabled, early stopping and LR schedulers are applied.

## How to use

Run `python run_notebook.py --help` for more information.

In [1]:
# If this is the template file (and not a copy) and you are introducing changes,
# update VERSION with the current date (YYYY.MM.DD).
# VERSION is an all-caps name, so it is also included in the `_hparams` snapshot
# taken further below and saved with the run.
VERSION = "2021.05.19"

## ✏ Define hyper parameters

In [2]:
# TEMPLATE VALUES -- these are overridden (see the next cell, if executed) by papermill using a YAML or Python file as input

# DATA -- Glob paths must be relative to the features store: REPO / features
PARQUET_LOADER_CLS = "kinoml.datasets.torch_datasets.AwkwardArrayDataset"
PARQUET_FILES = [
    "path/to/*.parquet",
]

# Model -- specified with the full import path to the class object
## Machine learning model that will be trained. Pass it as importable string.
MODEL_CLS = "kinoml.ml.torch_models.NeuralNetworkRegression"
## Keyword arguments for the model initialization
MODEL_KWARGS = {"hidden_size": 350}  # input_shape is added dynamically before the model is built, if the model class asks for it

# OPTIMIZER
## Optimizer class. Pass it as an importable string.
OPTIMIZER = "torch.optim.Adam"
## Keyword arguments for the optimizer
OPTIMIZER_KWARGS = {"lr": 0.001, "eps": 1e-7, "betas": [0.9, 0.999]}

# LOSS FUNCTION
## Loss function class. Pass it as an importable string.
LOSS = "torch.nn.MSELoss"
## Keyword arguments for the loss function, if applicable
LOSS_KWARGS = {}

# TRAINING
## Maximum number of epochs the training will run. In practice it might be less due to early stopping
MAX_EPOCHS = 50
## Enable real-time validation: this will split the test set into two halves: test and validation.
## It will also enable LR scheduling and early stopping, based on the validation loss.
VALIDATION = True
## Options for the builtin early stopper (kinoml.ml.torch_loops.EarlyStopping)
EARLY_STOPPING_KWARGS = {}

# DATALOADER
DATALOADER_CLS = "torch.utils.data.DataLoader"  # you can also use torch_geometric.data.DataLoader
## Minibatch size
BATCH_SIZE = 64
## Proportion of the dataset that will be split into a test set. If VALIDATION=True,
## this will also cover the validation set. So, 0.2 will mean: 0.8 training, 0.1 test, 0.1 valid.
TRAIN_TEST_SPLIT = 0.2
## Whether to shuffle the indices before splitting
SHUFFLE_SPLITS = True
## Read https://pytorch.org/docs/stable/data.html#dataloader-collate-fn
## IMPORTANT: This will be needed if your X tensors have different shapes across systems!
## Accepts an import string or an eval-able lambda expression (as a string).
COLLATE_FN = None

# Plot bootstrapping
## Bootstrapping iterations for the performance plots
N_BOOTSTRAPS = 1
## Proportion of the data that is sampled in each iteration
BOOTSTRAP_SAMPLE_RATIO = 1

# Output
## Enable some extra output, like plots and logging statements.
VERBOSE = False

## IGNORE THIS ONE
## `_dh` is IPython's directory history; its last entry is taken as the directory of this notebook.
## HERE is converted to a Path further below and used to locate REPO and the output directory.
HERE = _dh[-1]

In [3]:
# Parameters
# Values for this particular run; they replace the template values defined in the previous cell.
PARQUET_FILES = [
    "ligand-only-graph-subsample/_output/ligand__SmilesToLigandFeaturizer__GraphLigandFeaturizer/ChEMBLDatasetProvider/*.parquet"
]
MODEL_CLS = "kinoml.ml.torch_geometric_models.GraphConvolutionNeuralNetwork"
MODEL_KWARGS = {}
OPTIMIZER = "torch.optim.Adam"
OPTIMIZER_KWARGS = {"lr": 0.001, "eps": 1e-07, "betas": [0.9, 0.999]}
LOSS = "torch.nn.MSELoss"
LOSS_KWARGS = {}
MAX_EPOCHS = 50
VALIDATION = True
EARLY_STOPPING_KWARGS = {}
DATALOADER_CLS = "torch_geometric.data.DataLoader"
BATCH_SIZE = 64
TRAIN_TEST_SPLIT = 0.2
SHUFFLE_SPLITS = True
# NOTE(review): no collate function is configured for this run, although the template warns
# that one is needed when X tensors have different shapes across systems.
COLLATE_FN = None
N_BOOTSTRAPS = 1
BOOTSTRAP_SAMPLE_RATIO = 1
VERBOSE = False
# NOTE(review): absolute path of the machine that executed this run; not portable.
HERE = "/Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/001_example-ligand-only-graph-subsample"


⚠ From here on, you should _not_ need to modify anything else 🤞

---

Define key paths for data and outputs:

In [4]:
from pathlib import Path
from datetime import datetime

# HERE arrives as a plain string (template default or injected parameter); work with a Path from now on
HERE = Path(HERE)

# The repository root is the closest ancestor of HERE that contains a `.github/` entry
for parent in HERE.parents:
    if next(parent.glob(".github/"), None):
        REPO = parent
        break
else:
    # Without this branch, a failed search left REPO unbound (or stale from a previous
    # execution of this cell) and only surfaced as a confusing error further below.
    raise FileNotFoundError(
        f"Could not locate the repository root: no parent directory of {HERE} contains `.github/`"
    )

# Featurized datasets are looked up relative to this directory
FEATURES_STORE = REPO / "features"

# Every run writes its artifacts into its own timestamped directory
OUT = HERE / "_output" / datetime.now().strftime("%Y%m%d-%H%M%S")
OUT.mkdir(parents=True, exist_ok=True)

print(f"This notebook:           HERE = {HERE}")
print(f"This repo:               REPO = {REPO}")
print(f"Features:      FEATURES_STORE = {FEATURES_STORE}")
print(f"Outputs in:               OUT = {OUT}")

This notebook:           HERE = /Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/001_example-ligand-only-graph-subsample
This repo:               REPO = /Users/taliakimber/Documents/github/experiments-binding-affinity
Features:      FEATURES_STORE = /Users/taliakimber/Documents/github/experiments-binding-affinity/features
Outputs in:               OUT = /Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/001_example-ligand-only-graph-subsample/_output/20210809-152611


In [5]:
# Nasty trick: snapshot every all-caps variable defined so far (the CONSTANTS working as
# hyperparameters) into a dict, so the configuration can be displayed and saved later.
# Names starting with "_" or "OE_" are left out.
_hparams = {
    name: setting
    for name, setting in locals().items()
    if name == name.upper() and not name.startswith(("_", "OE_"))
}

In [6]:
from collections import defaultdict
from warnings import warn
import sys
import shutil

from IPython.display import Markdown
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
from tqdm.auto import trange, tqdm

from kinoml.ml.torch_loops import LRScheduler, EarlyStopping
from kinoml.utils import seed_everything, import_object
from kinoml.core import measurements as measurement_types
from kinoml.core.measurements import null_observation_model
from kinoml.analysis.metrics import performance
from kinoml.analysis.plots import predicted_vs_observed

# Fix the seed for reproducible random splits -- otherwise we get mixed train/test groups every time, biasing the model evaluation
# NOTE(review): seed_everything is called with its defaults; the split below shuffles with
# np.random, so confirm that NumPy's global RNG is among the generators it seeds.
seed_everything();
# Timestamp the start of the run
print("Run started at", datetime.now())



Run started at 2021-08-09 15:26:13.399298


## Load featurized data and create observation models

We assume this path structure: `$REPO/features/_output/<FEATURIZATION>/<DATASET>/<MEASUREMENT_TYPE>.parquet`

In [7]:
# Load one dataset object per Parquet file matched by the configured globs.
# The file stem names the measurement type; the parent directory names the dataset.
DATASETS = []
MEASUREMENT_TYPES = set()
ParquetLoaderCls = import_object(PARQUET_LOADER_CLS)
for pattern in PARQUET_FILES:
    matched_files = list(FEATURES_STORE.glob(pattern))
    if len(matched_files) == 0:
        # A glob without matches is reported but does not abort the run
        warn(f"⚠ Parquet glob `{pattern}` did not match any files!")
        continue

    for parquet_path in matched_files:
        mtype_name = parquet_path.stem
        loaded = ParquetLoaderCls.from_parquet(parquet_path)
        # Attach provenance so later cells can pick the right measurement type
        loaded.metadata = {
            "dataset": parquet_path.parent.name,
            "measurement_type": mtype_name,
        }
        DATASETS.append(loaded)
        MEASUREMENT_TYPES.add(mtype_name)

# Nothing to train on: stop right away
if len(DATASETS) == 0:
    raise ValueError("Provided `PARQUET_FILES` did not result in any valid datasets!")

Now that we have all the data-dependent objects, we can start with the model-specific definitions.

In [8]:
# Report how much data was loaded across all datasets
n_measurements = sum(len(d) for d in DATASETS)
print(f"Loaded {len(DATASETS)} datasets with a total of {n_measurements} measurements.")

Loaded 18 datasets with a total of 100 measurements.


### Prepare splits and dataloaders

Create train / test / validation subsets. Here we implement a random split, but it can take external indices if needed.

In [9]:
# Build, for every dataset, one dataloader per subset: "train", "test" and (optionally) "val".
# The result is `dataloaders[measurement_type][subset]`.

# The dataloader class and the optional collate function do not depend on the dataset,
# so they are resolved once instead of on every iteration.
DataLoaderCls = import_object(DATALOADER_CLS)

collate_fn = None
if COLLATE_FN:
    # IMPORTANT: This will be needed if your X tensors have different shapes across systems!
    # COLLATE_FN can be an import string, or a eval-able lambda
    # Read https://pytorch.org/docs/stable/data.html#dataloader-collate-fn
    try:
        collate_fn = import_object(COLLATE_FN)
    except ImportError:
        # NOTE(review): eval() executes arbitrary code -- only run trusted parameter files
        collate_fn = eval(COLLATE_FN)

dataloaders = {}
for dataset in DATASETS:
    key = dataset.metadata["measurement_type"]

    # Generate random indices in situ
    # If you need to provide indices from another source,
    # replace this block to provide train_indices, test_indices
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(TRAIN_TEST_SPLIT * dataset_size))

    if SHUFFLE_SPLITS:
        np.random.shuffle(indices)
    # The first `split` indices are held out and the remainder is used for training.
    # (Both subsets used to be `indices[split:]`, so the model was tested and
    # validated on data it had been trained on.)
    test_indices, train_indices = indices[:split], indices[split:]
    subset_indices = {"train": train_indices, "test": test_indices}
    if VALIDATION:
        # Half of the held-out indices become the validation set
        split2 = int(np.floor(len(test_indices) / 2))
        test_indices, val_indices = test_indices[:split2], test_indices[split2:]
        subset_indices["test"] = test_indices
        subset_indices["val"] = val_indices
    # End of indices creation

    # Creating PT data samplers and loaders: every subset draws from the same
    # dataset object, restricted to its own indices
    dataloaders[key] = {
        subset: DataLoaderCls(
            dataset,
            batch_size=BATCH_SIZE,
            sampler=SubsetRandomSampler(chosen_indices),
            collate_fn=collate_fn,
        )
        for subset, chosen_indices in subset_indices.items()
    }

### Training loop

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [10]:
# Train a single model over all datasets: epoch > dataset (measurement type) > minibatch.
ModelCls = import_object(MODEL_CLS)

# Note that we assume all dataloaders will provide the
# same kind of input shape, so we only test on one
if ModelCls.needs_input_shape:
    a_dataloader = dataloaders[next(iter(dataloaders.keys()))]["train"]
    x_sample, _ = next(iter(a_dataloader))
    MODEL_KWARGS["input_shape"] = ModelCls.estimate_input_shape(x_sample)

nn_model = ModelCls(**MODEL_KWARGS)

optimizer = import_object(OPTIMIZER)(nn_model.parameters(), **OPTIMIZER_KWARGS)
# Forward the configured keyword arguments; LOSS_KWARGS used to be silently ignored
loss_function = import_object(LOSS)(**LOSS_KWARGS)

if VALIDATION:
    lr_scheduler = LRScheduler(optimizer)
    early_stopping = EarlyStopping(**EARLY_STOPPING_KWARGS)

# One loss adapter per measurement type, resolved once instead of once per epoch
loss_adapters = {
    key: import_object(f"kinoml.core.measurements.{key}").loss_adapter(backend="pytorch")
    for key in dataloaders
}

# Summed minibatch losses, one entry per epoch
train_loss_timeseries = []
val_loss_timeseries = []

range_epochs = trange(MAX_EPOCHS, desc="Epochs")
for epoch in range_epochs:
    train_loss = 0.0
    val_loss = 0.0
    n_val_batches = 0
    for key, loader in tqdm(dataloaders.items(), desc="Datasets", leave=False):
        loss_adapter = loss_adapters[key]

        # TRAIN
        nn_model.train()
        for x, y in tqdm(loader["train"], desc="Minibatches", leave=False):
            # Clear gradients
            optimizer.zero_grad()
            # Obtain model prediction given model input
            prediction = nn_model(x)
            # apply observation model
            loss = loss_adapter(prediction.view_as(y), y, loss_function)
            # Pred. must match y shape!    ^^^^^^^^^^
            # Obtain loss for the predicted output
            train_loss += loss.item()
            # Gradients w.r.t. parameters
            loss.backward()
            # Optimize
            optimizer.step()

        # VALIDATE
        if VALIDATION:
            nn_model.eval()
            with torch.no_grad():
                for x, y in tqdm(loader["val"], desc="Minibatches", leave=False):
                    prediction = nn_model(x).view_as(y)
                    loss = loss_adapter(prediction, y, loss_function)
                    val_loss += loss.item()
                    n_val_batches += 1
                    # Average over the validation minibatches seen so far in this epoch.
                    # (It used to be divided by the epoch number, which is not an average.)
                    range_epochs.set_description(f"Epochs (Avg. val. loss={val_loss / n_val_batches:.2e})")

    # LOG LOSSES
    train_loss_timeseries.append(train_loss)

    if VALIDATION:
        val_loss_timeseries.append(val_loss)

        # Adjust training if needed: both helpers are fed the summed validation loss of this epoch
        lr_scheduler(val_loss)
        early_stopping(val_loss)
        if early_stopping.early_stop:
            break

RuntimeError: stack expects each tensor to be equal size, but got [2, 50] at entry 0 and [2, 52] at entry 1

Save model to disk

In [None]:
# Persist the trained model object itself (not only its weights) in this run's output directory
model_path = OUT / "nn_model.pt"
torch.save(nn_model, model_path)

### Evaluate model

In [None]:
# Collect performance metrics for every measurement type and every subset (train / test / val).
# Result layout: metrics[measurement_type][subset][metric] = {"mean": ..., "std": ...}
metrics = {}
nn_model.eval()  # equivalent to .train(False): evaluation mode
for mtype_name, subset_loaders in dataloaders.items():
    metrics[mtype_name] = {}
    display(Markdown(f"#### {mtype_name}"))
    for subset_name, subset_loader in subset_loaders.items():
        display(Markdown(f"##### {subset_name}"))
        mtype = import_object(f"kinoml.core.measurements.{mtype_name}")
        obs_model = mtype.observation_model(backend="pytorch")
        # Fetch the whole subset in one go by indexing the dataset with the sampler's indices
        chosen_indices = subset_loader.batch_sampler.sampler.indices
        x, y = subset_loader.dataset[chosen_indices]
        raw_prediction = nn_model(x).view_as(y).detach().numpy()
        prediction = obs_model(raw_prediction)

        # NOTE(review): N_BOOTSTRAPS and BOOTSTRAP_SAMPLE_RATIO are not forwarded here, so
        # performance() runs with its own defaults -- confirm this is intended.
        perf_data = performance(prediction, y, verbose=False)
        metrics[mtype_name][subset_name] = {
            perfkey: {"mean": values[0], "std": values[1]}
            for perfkey, values in perf_data.items()
        }
        display(predicted_vs_observed(prediction, y, mtype))

### Summary

`metrics` is a nested dictionary with these dimensions:

- measurement type
- subset (train / test / val)
- metric
- mean & standard deviation

In [None]:
import json

# Always show the hyperparameters this run was configured with
hparams_json = json.dumps(_hparams, default=str, indent=2)
display(Markdown(f"\n### Configuration \n\n```json\n{hparams_json}\n```\n"))

# The full metrics dump is only shown when extra output was requested
if VERBOSE:
    metrics_json = json.dumps(metrics, default=str, indent=2)
    display(Markdown(f"\n### Kinase metrics\n\n```json\n{metrics_json}\n```\n"))

In [None]:
# Render one summary table per measurement type: one row per subset, one column per metric statistic
for mtype_name in MEASUREMENT_TYPES:
    mtype_metrics = metrics.get(mtype_name)
    # Measurement types without recorded metrics are not reported
    if not mtype_metrics:
        continue

    display(Markdown(f"#### {mtype_name}"))

    # flatten dict a bit: from dict["test"]["r2"]["mean"] to dict["test"]["r2_mean"]
    flattened = {
        ttype: {
            f"{score}_{stat}": value
            for score, stats in scores.items()
            for stat, value in stats.items()
        }
        for ttype, scores in mtype_metrics.items()
    }

    scores_table = pd.DataFrame.from_dict(flattened, orient="index")
    with pd.option_context("display.float_format", "{:.3f}".format, "display.max_rows", len(scores_table)):
        # Highlight the r2 column: colour gradient plus bold text
        styled = scores_table.style.background_gradient(subset=["r2_mean"], low=0, high=1, vmin=0, vmax=1)
        styled = styled.apply(lambda column: ["font-weight: bold"] * len(column), subset=["r2_mean"])
        display(styled)

In [None]:
# Timestamp the end of the run (the counterpart of the "Run started at" message printed earlier)
print("Run finished at", datetime.now())

### Save reports to disk

In [None]:
from kinoml.utils import watermark
# First call: its output is shown inline in the notebook.
# NOTE(review): presumably reports environment/version details -- confirm in kinoml.utils.
w = watermark()

In [None]:
%%capture cap --no-stderr
# Second call, with stdout captured into `cap`, so the text can be written to watermark.txt below
w = watermark()

In [None]:
import json

# Performance metrics of every measurement type and subset
with (OUT / "performance.json").open("w") as performance_file:
    performance_file.write(json.dumps(metrics, default=str, indent=2))

# Text captured from the watermark call
with (OUT / "watermark.txt").open("w") as watermark_file:
    watermark_file.write(cap.stdout)

# Hyperparameter snapshot taken near the top of the notebook
with (OUT / "hparams.json").open("w") as hparams_file:
    hparams_file.write(json.dumps(_hparams, default=str, indent=2))