# Train a model with PyTorch

> This notebook is a simplified version of our workflow. It exposes the basic details of the training and evaluation loop more explicitly, but does not offer advanced features like early stopping, mini-batches or validation. Use the `*-lightning` version for those.

## How to use

Run `python run_notebook.py --help` for more information.

In [None]:
# Version stamp of this template. If this is the template file (and not a copy)
# and you are introducing changes, update VERSION with the current date (YYYY.MM.DD).
VERSION = "2021.05.18"

## ✏ Define hyper parameters

In [None]:
# TEMPLATE VALUES -- defaults only. When the notebook is executed through papermill,
# they are overridden (see below if executed) with values taken from a YAML or Python file.

# DATA -- glob patterns; they must be relative to the root of the repository: REPO / features
PARQUET_FILES = ["path/to/*.parquet"]

# Model -- given as the full import path to the class object
MODEL_CLS = "kinoml.ml.torch_models.NeuralNetworkRegression"
MODEL_KWARGS = dict(hidden_size=350)  # input_shape is filled in dynamically during training
WITH_OBSERVATION_MODEL = True

# Adam optimizer
LEARNING_RATE = 1e-3
EPSILON = 1e-7
BETAS = (0.9, 0.999)

# Trainer
MAX_EPOCHS = 50
N_SPLITS = 5
SHUFFLE_FOLDS = False
VALIDATION = False  # TODO: VALIDATION=True is not implemented yet!
MIN_ITEMS_PER_DATASET = 50  # datasets with len(data) < N are skipped

# Bootstrapping
N_BOOTSTRAPS = 1
BOOTSTRAP_SAMPLE_RATIO = 1

# Output
VERBOSE = False

## IGNORE THIS ONE -- last entry of IPython's directory history (`_dh`)
HERE = _dh[-1]

⚠ From here on, you should _not_ need to modify anything else 🤞

---

Define key paths for data and outputs:

In [None]:
from pathlib import Path
from datetime import datetime

# HERE comes from the parameters cell as a plain value; normalize it to a Path.
HERE = Path(HERE)

# Walk up from the notebook directory until the repository root is found.
# The root is recognized by the presence of a `.github` directory.
for parent in HERE.parents:
    if (parent / ".github").is_dir():
        REPO = parent
        break
else:
    # Without this guard REPO would stay undefined and the next statement would
    # fail with an opaque NameError.
    raise RuntimeError(
        f"Could not locate the repository root: no parent of {HERE} contains a `.github` directory."
    )

FEATURES_STORE = REPO / "features"

# Every run writes into its own timestamped directory next to the notebook.
OUT = HERE / "_output" / datetime.now().strftime("%Y%m%d-%H%M%S")
OUT.mkdir(parents=True, exist_ok=True)

print(f"This notebook:           HERE = {HERE}")
print(f"This repo:               REPO = {REPO}")
print(f"Features:      FEATURES_STORE = {FEATURES_STORE}")
print(f"Outputs in:               OUT = {OUT}")

In [None]:
# Nasty trick: snapshot the all-caps variables defined so far (the CONSTANTS acting as
# hyperparameters) into a dict, so they can be reported and written to disk later on.
def _is_hparam_name(name):
    """Tell whether `name` is all-caps and carries neither the `_` nor the `OE_` prefix."""
    return name == name.upper() and not name.startswith(("_", "OE_"))


_hparams = {name: value for name, value in locals().items() if _is_hparam_name(name)}

In [None]:
from collections import defaultdict
from warnings import warn
import sys
import shutil

from IPython.display import Markdown
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import pytorch_lightning as pl

from kinoml.utils import seed_everything, import_object
from kinoml.core import measurements as measurement_types
from kinoml.datasets.torch_datasets import AwkwardArrayDataset
from kinoml.core.measurements import null_observation_model

# Seed the random number generators before anything else so the random splits are
# reproducible -- otherwise train/test groups get mixed between runs, biasing the model evaluation.
seed_everything()
print("Run started at", datetime.now())

## Load featurized data and create observation models

We assume this path structure: `$REPO/features/_output/<FEATURIZATION>/<DATASET>/<MEASUREMENT_TYPE>.parquet`

In [None]:
# Collect one dataset per parquet file matched by the configured glob patterns.
# The measurement type is taken from the file name (stem) and the dataset name
# from the parent directory.
DATASETS = []
MEASUREMENT_TYPES = set()
for pattern in PARQUET_FILES:
    # Sort the matches: glob order depends on the filesystem, and a stable dataset
    # order keeps seeded runs reproducible across machines.
    parquets = sorted(FEATURES_STORE.glob(pattern))
    if not parquets:
        warn(f"⚠ Parquet glob `{pattern}` did not match any files!")
        continue

    for parquet in parquets:
        measurement_type = parquet.stem
        dataset = parquet.parent.name

        ds = AwkwardArrayDataset.from_parquet(parquet)
        ds.metadata = {
            "dataset": dataset,
            "measurement_type": measurement_type,
        }
        DATASETS.append(ds)
        MEASUREMENT_TYPES.add(measurement_type)
        # NOTE: no pre-computed split indices are attached here; the folds are
        # generated by the k-fold splitter in the training loop.

if not DATASETS:
    raise ValueError("Provided `PARQUET_FILES` did not result in any valid datasets!")

Now that we have all the data-dependent objects, we can start with the model-specific definitions.

In [None]:
# One-line recap of what was loaded: dataset count plus the sorted measurement type names.
measurement_type_names = ", ".join(sorted(MEASUREMENT_TYPES))
print(
    f"Loaded {len(DATASETS)} datasets with a total of {len(MEASUREMENT_TYPES)} measurement types:",
    measurement_type_names,
)

### Training loop

In [None]:
from kinoml.ml.lightning_modules import KFold3Way, KFold
from IPython.display import Markdown
from tqdm.auto import trange, tqdm
from kinoml.ml.torch_models import NeuralNetworkRegression
from ipywidgets import HBox, VBox, Output, HTML
from kinoml.analysis.plots import predicted_vs_observed, performance
from kinoml.utils import fill_until_next_multiple
import pandas as pd
import torch.nn as nn

# Choose the splitter and the split labels that will be evaluated and reported.
if VALIDATION:
    kfold = KFold3Way(n_splits=N_SPLITS, shuffle=SHUFFLE_FOLDS)
    ttypes = ["train", "val", "test"]
else:
    kfold = KFold(n_splits=N_SPLITS, shuffle=SHUFFLE_FOLDS)
    ttypes = ["train", "test"]

ModelCls = import_object(MODEL_CLS)

# Nested results: measurement type name -> split label -> metric -> {"mean", "std"}.
# NOTE(review): entries are keyed by measurement type only, so two datasets sharing a
# measurement type would overwrite each other here (and in the saved state dicts) -- confirm.
kinase_metrics = defaultdict(dict)
for dataset in tqdm(DATASETS):
    name = dataset.metadata["measurement_type"]
    mtype = import_object(f"kinoml.core.measurements.{name}")
    if len(dataset) < MIN_ITEMS_PER_DATASET:
        warn(f"Ignoring {name} because it has less than {MIN_ITEMS_PER_DATASET}")
        continue

    if VERBOSE:
        display(Markdown(f"#### {name}"))

    # Per-split list of metric dicts, one entry per fold.
    metrics = defaultdict(list)

    ds_size = list(range(len(dataset)))
    for fold_index, splits in enumerate(kfold.split(ds_size, ds_size)):
        if VALIDATION:
            train_indices, val_indices, test_indices = splits
        else:
            train_indices, test_indices = splits

        if VERBOSE:
            display(Markdown(f"##### Fold {fold_index}"))

        ####
        # TRAIN
        ####

        x_train, y_train = dataset[train_indices]
        x_test, y_test = dataset[test_indices]

        # Explicit lookup tables for the evaluation step below, instead of
        # fishing variables out of locals() by name.
        split_indices = {"train": train_indices, "test": test_indices}
        split_data = {"train": (x_train, y_train), "test": (x_test, y_test)}

        if VALIDATION:
            x_val, y_val = dataset[val_indices]
            split_indices["val"] = val_indices
            split_data["val"] = (x_val, y_val)

        if ModelCls.needs_input_shape:
            # This mutates the shared MODEL_KWARGS dict in place, so the last computed
            # input_shape is also visible to anything else holding a reference to it.
            MODEL_KWARGS["input_shape"] = ModelCls.estimate_input_shape(x_train)
        nn_model = ModelCls(**MODEL_KWARGS)
        nn_model.train(True)

        optimizer = torch.optim.Adam(nn_model.parameters(), lr=LEARNING_RATE, eps=EPSILON, betas=BETAS)
        loss_function = torch.nn.MSELoss()
        loss_adapter = mtype.loss_adapter(backend="pytorch")

        if VERBOSE:
            range_epochs = trange(MAX_EPOCHS, desc="Epochs (+ featurization...)")
        else:
            range_epochs = range(MAX_EPOCHS)

        # Full-batch training: every epoch is a single optimizer step on the whole training split.
        for epoch in range_epochs:
            optimizer.zero_grad()

            prediction = nn_model(x_train).view_as(y_train)
            loss = loss_adapter(prediction, y_train, loss_func=loss_function)

            if VERBOSE:
                range_epochs.set_description(f"Epochs (loss={loss.item():.2e})")

            if VALIDATION:
                warn("Validation step not implemented yet")

            # Gradients w.r.t. parameters
            loss.backward()

            # Optimizer
            optimizer.step()

        ###
        # Save model's state -- you will still need to instantiate the model class!
        # Possibly using something like:
        # model = import_object(MODEL_CLS)(**MODEL_KWARGS)
        # model.load_state_dict(torch.load("state_dict.pt"))
        ###
        torch.save(nn_model.state_dict(), OUT / f"state_dict_{name}_fold{fold_index}.pt")

        ####
        # EVAL
        ####
        nn_model.eval()
        outputs = []
        obs_model = mtype.observation_model(backend="pytorch")
        for ttype in ttypes:
            output = Output()
            with output:
                x_split, observed = split_data[ttype]

                title = f"fold={fold_index}, {ttype}={split_indices[ttype].shape[0]}"
                print(title)
                print("-" * len(title))

                with torch.no_grad():
                    predicted = nn_model(x_split)
                    predicted = obs_model(predicted)

                predicted = predicted.view_as(observed).detach().numpy()
                observed = observed.detach().numpy()
                these_metrics = performance(predicted, observed, n_boot=N_BOOTSTRAPS, sample_ratio=BOOTSTRAP_SAMPLE_RATIO)
                metrics[ttype].append(these_metrics)
                if VERBOSE:
                    # Pass the measurement type class loaded above; the name previously
                    # used here (`mtype_class`) was never defined.
                    display(predicted_vs_observed(predicted, observed, mtype, with_metrics=False))

            outputs.append(output)
        if VERBOSE:
            display(HBox(outputs))

    # Average performances across folds

    average = defaultdict(dict)
    for key in metrics["test"][0]:
        for label in ttypes:
            # this zero here ---v is super important! we only want the mean of the means!
            values = [fold[key][0] for fold in metrics[label]]
            average[label][key] = {
                "mean": np.mean(values),
                "std": np.std(values)
            }
    if VERBOSE:
        for label in ttypes:
            display(HTML(f"Bootstrapped average across folds ({label}):"))
            display(pd.DataFrame.from_dict(average[label]))
    kinase_metrics[name] = average

### Summary

`kinase_metrics` is a nested dictionary with these dimensions:

- measurement type
- data split (`train`, `test`)
- metric
- mean & standard deviation

In [None]:
import json


def _json_section(title, payload):
    """Build a Markdown snippet: a level-3 heading followed by `payload` as a fenced JSON block.

    The snippet is assembled without any leading indentation. Markdown treats lines
    indented by four spaces as a code block, which would stop the heading and the
    fence from rendering.
    """
    body = json.dumps(payload, default=str, indent=2)
    return f"\n### {title}\n\n```json\n{body}\n```\n"


# The hyperparameter snapshot is always shown; the full metrics dump only in verbose mode.
display(Markdown(_json_section("Configuration", _hparams)))

if VERBOSE:
    display(Markdown(_json_section("Kinase metrics", kinase_metrics)))

In [None]:
# Render one styled table per measurement type, in a stable (sorted) order.
for mtype_name in sorted(MEASUREMENT_TYPES):
    # Look the results up in `kinase_metrics` (keyed by measurement type name).
    # `metrics` is only the per-fold accumulator of the last trained dataset and is
    # keyed by split label, so it never contains a measurement type.
    mtype_metrics = kinase_metrics.get(mtype_name)
    if not mtype_metrics:
        # Nothing was trained for this measurement type (e.g. the dataset was skipped).
        continue

    display(Markdown(f"#### {mtype_name}"))

    # flatten dict a bit: from dict["test"]["r2"]["mean"] to dict["test"]["r2_mean"]
    flattened = {}
    for ttype, scores in mtype_metrics.items():
        flattened[ttype] = {}
        for score, stats in scores.items():
            for stat, value in stats.items():
                flattened[ttype][f"{score}_{stat}"] = value

    # One row per split, one column per "<metric>_<stat>".
    df = pd.DataFrame.from_dict(flattened, orient="index")
    with pd.option_context("display.float_format", "{:.3f}".format, "display.max_rows", len(df)):
        display(
            df.style.background_gradient(subset=["r2_mean"], low=0, high=1, vmin=0, vmax=1)
              .apply(lambda x: ['font-weight: bold'] * len(x), subset=["r2_mean", "r2_std"])
        )

In [None]:
# Timestamp the end of the run; the matching start timestamp is printed right after seeding.
print("Run finished at", datetime.now())

### Save reports to disk

In [None]:
from kinoml.utils import watermark
# Un-captured call, so whatever it emits is shown in the notebook itself.
# NOTE(review): presumably this prints environment/version information -- confirm against kinoml.utils.watermark.
w = watermark()

In [None]:
%%capture cap --no-stderr
# Same call again, this time with stdout captured into `cap` (stderr is left alone)
# so that the text can be written to disk via `cap.stdout`.
w = watermark()

In [None]:
import json


def _write_json(path, payload):
    """Dump `payload` into `path` as 2-space indented JSON, stringifying non-JSON-native values."""
    with open(path, "w") as handle:
        json.dump(payload, handle, default=str, indent=2)


# Persist the reports of this run in the timestamped output directory:
# the aggregated metrics, the captured watermark text and the hyperparameter snapshot.
_write_json(OUT / "performance.json", kinase_metrics)

(OUT / "watermark.txt").write_text(cap.stdout)

_write_json(OUT / "hparams.json", _hparams)