<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [12]</a>'.</span>

# Train a model with PyTorch

> This notebook is a simplified version of our workflow. It exposes the basic details of the traning and evaluation loop more explicitly, but does not offer advanced features like early stopping, mini-batches or validation. Use the `*-lightning` version for those.

## How to use

Run `python run_notebook.py --help` for more information.

In [1]:
# If this is the template file (and not a copy) and you are introducing changes,
# update VERSION with the current date (YYYY.MM.DD)
VERSION = "2021.05.06"

## ✏ Define hyper parameters

In [2]:
# TEMPLATE VALUES -- these are overriden (see below if executed) by papermill using a YAML or Python file as input

# DATA -- Glob paths must be relative to the root of the repository: REPO / features
NPZ_FILES = [
    "path/to/*.npz",
]

# Model -- specified with the full import path to the class object
MODEL_CLS = "kinoml.ml.torch_models.NeuralNetworkRegression"
MODEL_KWARGS = {"hidden_size": 350}  # input_shape is defined dynamically during training
WITH_OBSERVATION_MODEL = True

# Adam
LEARNING_RATE = 0.001
EPSILON = 1e-7
BETAS = 0.9, 0.999

# Trainer
MAX_EPOCHS = 50
N_SPLITS = 5
SHUFFLE_FOLDS = False
VALIDATION = False  # TODO: VALIDATION=True is not implemented yet!
MIN_ITEMS_PER_DATASET = 50  # skip datasets if len(data) < N

# Bootstrapping
N_BOOTSTRAPS = 1
BOOTSTRAP_SAMPLE_RATIO = 1

# Output
VERBOSE = False

## IGNORE THIS ONE
HERE = _dh[-1]

In [3]:
# Parameters
NPZ_FILES = [
    "example-chembl28-morgan512-hash-1k-subsample/_output/ligand__SmilesToLigandFeaturizer__MorganFingerprintFeaturizer_nbits=512_radius=2__kinase__HashFeaturizer/ChEMBLDatasetProvider/*.npz"
]
MODEL_CLS = "kinoml.ml.torch_models.NeuralNetworkRegression"
MODEL_KWARGS = {"hidden_shape": 350}
WITH_OBSERVATION_MODEL = True
LEARNING_RATE = 0.001
EPSILON = 1e-07
BETAS = [0.9, 0.999]
MAX_EPOCHS = 50
N_SPLITS = 5
SHUFFLE_FOLDS = False
VALIDATION = False
MIN_ITEMS_PER_DATASET = 10
N_BOOTSTRAPS = 1
BOOTSTRAP_SAMPLE_RATIO = 1
VERBOSE = False
HERE = "/Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/ligand-morgan1024-kinase-hash-chembl28-subset"


⚠ From here on, you should _not_ need to modify anything else 🤞

---

Define key paths for data and outputs:

In [4]:
from pathlib import Path
from datetime import datetime

HERE = Path(HERE)

for parent in HERE.parents:
    if next(parent.glob(".github/"), None):
        REPO = parent
        break

FEATURES_STORE = REPO / "features"
        
OUT = HERE / "_output" / datetime.now().strftime("%Y%m%d-%H%M%S")
OUT.mkdir(parents=True, exist_ok=True)

print(f"This notebook:           HERE = {HERE}")
print(f"This repo:               REPO = {REPO}")
print(f"Features:      FEATURES_STORE = {FEATURES_STORE}")
print(f"Outputs in:               OUT = {OUT}")

This notebook:           HERE = /Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/ligand-morgan1024-kinase-hash-chembl28-subset
This repo:               REPO = /Users/taliakimber/Documents/github/experiments-binding-affinity
Features:      FEATURES_STORE = /Users/taliakimber/Documents/github/experiments-binding-affinity/features
Outputs in:               OUT = /Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/ligand-morgan1024-kinase-hash-chembl28-subset/_output/20210510-114111


In [5]:
# Nasty trick: save all-caps local variables (CONSTANTS working as hyperparameters) so far in a dict to save it later
_hparams = {key: value for key, value in locals().items() if key.upper() == key and not key.startswith(("_", "OE_"))}

In [6]:
from collections import defaultdict
from warnings import warn
import sys
import shutil

from IPython.display import Markdown
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import pytorch_lightning as pl

from kinoml.utils import seed_everything, import_object
from kinoml.core import measurements as measurement_types
from kinoml.datasets.torch_datasets import MultiXTorchDataset
from kinoml.core.measurements import null_observation_model

# Fix the seed for reproducible random splits -- otherwise we get mixed train/test groups every time, biasing the model evaluation
seed_everything();
print("Run started at", datetime.now())



Run started at 2021-05-10 11:41:12.980207


## Load featurized data and create observation models

We assume this path structure: `$REPO/features/_output/<FEATURIZATION>/<DATASET>/<GROUP>.npz`

In [7]:
DATASETS = []
MEASUREMENT_TYPES = set()
FEATURIZATIONS = set()
for glob in NPZ_FILES:
    npzs = list(FEATURES_STORE.glob(glob))
    if not npzs:
        warn(f"⚠ NPZ glob `{glob}` did not match any files!")
        continue
        
    for npz in npzs:
        measurement_type = npz.stem.split(".")[0]
        print(measurement_type)
        dataset = npz.parent.name
        featurization = npz.parents[1].name
        print(featurization)
        MEASUREMENT_TYPES.add(measurement_type)
        FEATURIZATIONS.add(featurization)
        
        ds = MultiXTorchDataset.from_npz(npz)
        ds.metadata = {
            "measurement_type": measurement_type,
            "dataset": dataset,
            "featurization": featurization
        }
        DATASETS.append(ds)
        if not VALIDATION:
            ds.indices["test"] = np.concatenate([ds.indices["test"], ds.indices["val"]])
            ds.indices["val"] = np.array([])

if not DATASETS:
    raise ValueError("Provided `NPZ_FILES` did not result in any valid datasets!")

pIC50Measurement
ligand__SmilesToLigandFeaturizer__MorganFingerprintFeaturizer_nbits=512_radius=2__kinase__HashFeaturizer
pKdMeasurement
ligand__SmilesToLigandFeaturizer__MorganFingerprintFeaturizer_nbits=512_radius=2__kinase__HashFeaturizer
pKiMeasurement
ligand__SmilesToLigandFeaturizer__MorganFingerprintFeaturizer_nbits=512_radius=2__kinase__HashFeaturizer


In [8]:
print("Observed...")
print(" - Measurement types:", len(MEASUREMENT_TYPES), "-->", *MEASUREMENT_TYPES)

Observed...
 - Measurement types: 3 --> pIC50Measurement pKdMeasurement pKiMeasurement


Now that we have all the data-dependent objects, we can start with the model-specific definitions.

### Training loop

In [9]:
from kinoml.ml.lightning_modules import KFold3Way, KFold
from IPython.display import Markdown
from tqdm.auto import trange, tqdm
from kinoml.ml.torch_models import NeuralNetworkRegression
from ipywidgets import HBox, VBox, Output, HTML
from kinoml.analysis.plots import predicted_vs_observed, performance
from kinoml.utils import fill_until_next_multiple
import pandas as pd
import torch.nn as nn

if VALIDATION:
    kfold = KFold3Way(n_splits=N_SPLITS, shuffle=SHUFFLE_FOLDS)
    ttypes = ["train", "val", "test"]
else:
    kfold = KFold(n_splits=N_SPLITS, shuffle=SHUFFLE_FOLDS)
    ttypes = ["train", "test"]

ModelCls = import_object(MODEL_CLS)
    
for dataset in tqdm(DATASETS):
    mtype = dataset.metadata["measurement_type"]
    if dataset.shape_X[0] < MIN_ITEMS_PER_DATASET:
        warn(f"Ignoring data because it has less than {MIN_ITEMS_PER_DATASET} entries for type {mtype}")
        continue
            
    if VERBOSE:
        display(Markdown(f"#### {mtype}"))

    mtype_class = getattr(measurement_types, mtype)
    obs_model = mtype_class.observation_model(backend="pytorch")
    metrics = defaultdict(list)
    
    ds_size = list(range(dataset.shape_y[0]))
    for fold_index, splits in enumerate(kfold.split(ds_size, ds_size)):
        if VALIDATION:
            train_indices, val_indices, test_indices = splits
        else:
            train_indices, test_indices = splits

        if VERBOSE:
            display(Markdown(f"##### Fold {fold_index}"))

        ####
        # TRAIN
        ####
        
        x_train, y_train = dataset[train_indices]
        x_test, y_test = dataset[test_indices]

        if VALIDATION:
            x_val, y_val = dataset[val_indices]
        
        if ModelCls.needs_input_shape:
            MODEL_KWARGS["input_shape"] = ModelCls.estimate_input_shape(x_train)
        nn_model = ModelCls(**MODEL_KWARGS)
        nn_model.train(True)

        optimizer = torch.optim.Adam(nn_model.parameters(), lr=LEARNING_RATE, eps=EPSILON, betas=BETAS)
        loss_function = torch.nn.MSELoss()

        if VERBOSE:
            range_epochs = trange(MAX_EPOCHS, desc="Epochs (+ featurization...)")
        else:
            range_epochs = range(MAX_EPOCHS)
        for epoch in range_epochs:
            optimizer.zero_grad()

            prediction = nn_model(x_train)
            if WITH_OBSERVATION_MODEL:
                prediction = obs_model(prediction)

            prediction = prediction.view_as(y_train)

            loss = loss_function(prediction, y_train)
            if VERBOSE:
                range_epochs.set_description(f"Epochs (loss={loss.item():.2e})")

            if VALIDATION:
                warn("Validation step not implemented yet")


            # Gradients w.r.t. parameters
            loss.backward()

            # Optimizer
            optimizer.step()
        
        ###
        # Save model's state -- you will still need to instantiate the model class!
        # Possibly using something like:
        # model = import_object(MODEL_CLS)(**MODEL_KWARGS)
        # model.load_state_dict(torch.load("state_dict.pt"))
        ###
        torch.save(nn_model.state_dict(), OUT / f"state_dict_{mtype}_fold{fold_index}.pt")
        
        ####
        # EVAL
        ####
        nn_model.eval()
        outputs = []
        for ttype in ttypes:
            output = Output()
            with output:
                title = f"fold={fold_index}, {ttype}={locals()[f'{ttype}_indices'].shape[0]}"
                print(title)
                print("-"*(len(title)))

                observed = locals()[f"y_{ttype}"]

                with torch.no_grad():
                    predicted = nn_model(locals()[f"x_{ttype}"])
                    if WITH_OBSERVATION_MODEL:
                        predicted = obs_model(predicted)

                predicted = predicted.view_as(observed).detach().numpy()
                observed = observed.detach().numpy()
                these_metrics = performance(predicted, observed, n_boot=N_BOOTSTRAPS, sample_ratio=BOOTSTRAP_SAMPLE_RATIO)
                metrics[ttype].append(these_metrics)
                if VERBOSE:
                    display(predicted_vs_observed(predicted, observed, mtype_class, with_metrics=False))

            outputs.append(output)
        if VERBOSE:
            display(HBox(outputs))

    # Average performances

    average = defaultdict(dict)
    for key in metrics["test"][0]:
        for label in ttypes:
            # this zero here ---v is super important! we only want the mean of the means!
            values =  [fold[key][0] for fold in metrics[label]]
            average[label][key] = {
                "mean": np.mean(values),
                "std": np.std(values)
            }
    if VERBOSE:
        for label in ttypes:    
            display(HTML(f"Bootstrapped average across folds ({label}):"))
            display(pd.DataFrame.from_dict(average[label]))

  0%|          | 0/3 [00:00<?, ?it/s]

  warn(f"Ignoring data because it has less than {MIN_ITEMS_PER_DATASET} entries for type {mtype}")


### Summary

`metrics` is a nested dictionary with these dimensions:

- measurement type
- metric
- mean & standard deviation

In [10]:
import json

display(Markdown(f"""
### Configuration 

```json
{json.dumps(_hparams, default=str, indent=2)}
```
"""))

if VERBOSE:
    display(Markdown(f"""

    ### Metrics

    ```json
    {json.dumps(default=str, indent=2)}
    ```
    """))


### Configuration 

```json
{
  "VERSION": "2021.05.06",
  "NPZ_FILES": [
    "example-chembl28-morgan512-hash-1k-subsample/_output/ligand__SmilesToLigandFeaturizer__MorganFingerprintFeaturizer_nbits=512_radius=2__kinase__HashFeaturizer/ChEMBLDatasetProvider/*.npz"
  ],
  "MODEL_CLS": "kinoml.ml.torch_models.NeuralNetworkRegression",
  "MODEL_KWARGS": {
    "hidden_shape": 350,
    "input_shape": 513
  },
  "WITH_OBSERVATION_MODEL": true,
  "LEARNING_RATE": 0.001,
  "EPSILON": 1e-07,
  "BETAS": [
    0.9,
    0.999
  ],
  "MAX_EPOCHS": 50,
  "N_SPLITS": 5,
  "SHUFFLE_FOLDS": false,
  "VALIDATION": false,
  "MIN_ITEMS_PER_DATASET": 10,
  "N_BOOTSTRAPS": 1,
  "BOOTSTRAP_SAMPLE_RATIO": 1,
  "VERBOSE": false,
  "HERE": "/Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/ligand-morgan1024-kinase-hash-chembl28-subset",
  "REPO": "/Users/taliakimber/Documents/github/experiments-binding-affinity",
  "FEATURES_STORE": "/Users/taliakimber/Documents/github/experiments-binding-affinity/features",
  "OUT": "/Users/taliakimber/Documents/github/experiments-binding-affinity/experiments/ligand-morgan1024-kinase-hash-chembl28-subset/_output/20210510-114111"
}
```


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [11]:
for mtype in MEASUREMENT_TYPES:
    display(Markdown(f"#### {mtype}"))

    dict_of_flattened_metrics = {}
    for _, measurement_type_dict in sorted(metrics.items(), key=lambda kv: kv[0].lower()):
        flattened_metrics = {}
        for train_test_key, train_test_dict in measurement_type_dict[0].get(mtype, {}).items():
            for metric_key, mean_std_dict in train_test_dict.items():
                for mean_std_key, value in mean_std_dict.items():
                    flattened_metrics[f"{train_test_key}_{metric_key}_{mean_std_key}"] = (value,)
        if flattened_metrics:
            dict_of_flattened_metrics[kinase_name] = pd.DataFrame.from_dict(flattened_metrics)
    
    if not dict_of_flattened_metrics:
        continue
    
    print(dict_of_flattened_metrics)
    df = pd.concat(dict_of_flattened_metrics)
    df.index = [index[0] for index in df.index]
    with pd.option_context("display.float_format", "{:.3f}".format, "display.max_rows", len(df)):
        display(df.style.background_gradient(subset=["train_r2_mean", "test_r2_mean"], low=0, high=1, vmin=0, vmax=1))
        display(df.describe()[["train_r2_mean", "train_r2_std", "test_r2_mean", "test_r2_std"]].describe().style.apply(lambda x: ['font-weight: bold' for v in x], subset=pd.IndexSlice[["mean", "std"], :]))

#### pIC50Measurement

#### pKdMeasurement

#### pKiMeasurement

In [12]:
print("Run finished at", datetime.now())

Run finished at 2021-05-10 11:41:14.541511


### Save reports to disk

In [13]:
from kinoml.utils import watermark
w = watermark()

Watermark
---------
Last updated: 2021-05-10T11:41:14.556203+02:00

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.23.1

Compiler    : Clang 11.0.1 
OS          : Darwin
Release     : 20.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

Hostname: Talias-MBP.fritz.box

Git hash: 10dca3084f0da24483a186493d45277fbcbdb397

torch            : 1.7.1
sys              : 3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:12:38) 
[Clang 11.0.1 ]
json             : 2.0.9
numpy            : 1.20.2
pytorch_lightning: 1.3.0
pandas           : 1.2.4
kinoml           : 0+untagged.439.g5e8ee2e.dirty

Watermark: 2.2.0


conda info
----------
sys.version: 3.7.7 (default, Mar 23 2020, 17:31:31) 
...
sys.prefix: /opt/miniconda3
sys.executable: /opt/miniconda3/bin/python
conda location: /opt/miniconda3/lib/python3.7/site-packages/conda
conda-build: None
conda-env: /opt/miniconda3/bin/conda-env
user site dirs: 

CIO_TEST: <not set>

In [14]:
%%capture cap --no-stderr
w = watermark()

In [15]:
import json

df.to_csv(OUT / "performance.csv")

with open(OUT / "performance.json", "w") as f:
    json.dump(metrics, f, default=str, indent=2)
    
with open(OUT/ "watermark.txt", "w") as f:
    f.write(cap.stdout)

with open(OUT / "hparams.json", "w") as f:
    json.dump(_hparams, f, default=str, indent=2)

NameError: name 'df' is not defined