# Run the LNN model

First, we have to create the PyTorch objects out of the NPZ files. NPZ files behave like dictionaries of arrays. In our case, they contain two keys:

- `X`: the featurized systems
- `y`: the associated measurements

We can pass those dict-like objects to an adapter class for Torch Datasets, which will then be ingested by the DataLoaders. We also need the corresponding observation models.

In [None]:
# --- Experiment selection ---------------------------------------------------
# DATASET selects the entry used in the per-dataset lookup tables further down.
DATASET = "PKIS2"
WITH_OBSERVATION_MODEL = True
WITH_VALIDATION = False

# --- Adam optimizer ---------------------------------------------------------
LEARNING_RATE = 1e-3
EPSILON = 1e-7
BETAS = (0.9, 0.999)

# --- Dataloader -------------------------------------------------------------
BATCH_SIZE = 128
NUM_WORKERS = 1

# --- Trainer / cross-validation ---------------------------------------------
MAX_EPOCHS = 100
N_SPLITS = 5
SHUFFLE_FOLDS = False

# --- Early stopping ---------------------------------------------------------
MIN_DELTA = 1e-5
PATIENCE = 10

# --- Bootstrapping (forwarded to the performance metrics) --------------------
N_BOOTSTRAPS = 1
BOOTSTRAP_SAMPLE_RATIO = 1

In [None]:
# Names of the measurement classes available for each dataset. They are later
# resolved with getattr() on kinoml.core.measurements.
_MEASUREMENT_TYPES_BY_DATASET = {
    "ChEMBL": ["pKiMeasurement", "pIC50Measurement", "pKdMeasurement"],
    "PKIS2": ["PercentageDisplacementMeasurement"],
}

# The single kinase whose data is modelled, per dataset. Used as a key into
# the loaded `datasets` mapping.
_ONE_KINASE_BY_DATASET = {
    "ChEMBL": "P35968",
    "PKIS2": "ABL2",
}

# Select the entries for the configured dataset (KeyError if DATASET is unknown).
MEASUREMENT_TYPES = _MEASUREMENT_TYPES_BY_DATASET[DATASET]
ONE_KINASE = _ONE_KINASE_BY_DATASET[DATASET]

In [None]:
from pathlib import Path
from collections import defaultdict
import numpy as np
import shutil
import time

import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import pytorch_lightning as pl

from kinoml.utils import seed_everything
from kinoml.core import measurements as measurement_types
from kinoml.datasets.torch_datasets import XyNpzTorchDataset
from kinoml.core.measurements import null_observation_model

# `_dh` is provided by IPython (directory history); its last entry is taken as
# the notebook's working directory.
HERE = Path(_dh[-1])
_trial = 0

# Every run reports into its own folder, named after the current Unix time
# rounded to whole seconds.
_run_stamp = f"{time.time():.0f}"
OUT = HERE.joinpath("_output", DATASET, _run_stamp)
OUT.mkdir(parents=True, exist_ok=True)
print("Reporting results at path:", OUT)

# Fix the seed so that random splits are reproducible -- otherwise the
# train/test groups would be mixed differently on every run, biasing the
# model evaluation.
seed_everything()

## Load featurized data and create observation models

In [None]:
# Nested mapping: datasets[kinase][measurement_type] -> XyNpzTorchDataset.
datasets = defaultdict(dict)
for npz in HERE.glob(f"../_output/{DATASET}__*.npz"):
    # File stems look like "<dataset>__<kinase>__<measurement type>".
    _, kinase, measurement_type = npz.stem.split("__")
    ds = XyNpzTorchDataset(npz)
    datasets[kinase][measurement_type] = ds

    if WITH_VALIDATION:
        continue

    # No validation requested: fold the validation indices into the test set
    # and leave the validation split empty.
    merged_test = np.concatenate([ds.indices["test"], ds.indices["val"]])
    ds.indices["test"] = merged_test
    # NOTE(review): np.array([]) has a float dtype; confirm nothing downstream
    # uses this empty array for integer indexing.
    ds.indices["val"] = np.array([])

In [None]:
# Choose which observation-model backend to request from each measurement class.
if WITH_OBSERVATION_MODEL:
    backend = "pytorch"
else:
    backend = "null"

# One observation model per measurement type, keyed by the class name.
obs_models = {}
for _name in MEASUREMENT_TYPES:
    _measurement_class = getattr(measurement_types, _name)
    obs_models[_name] = _measurement_class.observation_model(backend=backend)

# Bare expression so the notebook displays the mapping.
obs_models

Now that we have all the data-dependent objects, we can start with the model-specific definitions.

In [None]:
# First a little patch (TODO): copy the RANGE of the first configured
# measurement type onto the BaseMeasurement class.

from kinoml.core import measurements

_first_measurement_class = getattr(measurements, MEASUREMENT_TYPES[0])
measurements.BaseMeasurement.RANGE = _first_measurement_class.RANGE

## Manual style

### Training loop

In [None]:
from kinoml.ml.lightning_modules import KFold3Way, KFold
from IPython.display import Markdown
from tqdm.auto import trange
from kinoml.ml.torch_models import NeuralNetworkRegression
from ipywidgets import HBox, VBox, Output, HTML
from kinoml.analysis.plots import predicted_vs_observed, performance
from kinoml.utils import fill_until_next_multiple
import pandas as pd
import torch.nn as nn

# Pick the cross-validation splitter and the matching split labels:
# three-way (train/val/test) when validation is enabled, two-way otherwise.
_splitter_class = KFold3Way if WITH_VALIDATION else KFold
kfold = _splitter_class(n_splits=N_SPLITS, shuffle=SHUFFLE_FOLDS)
ttypes = ["train", "val", "test"] if WITH_VALIDATION else ["train", "test"]

# Manual k-fold training/evaluation: for every measurement type, train a fresh
# two-layer network on each fold (full-batch Adam + MSE), report per-split
# performance, and finally average the metrics across folds.
for mtype in MEASUREMENT_TYPES:
    display(Markdown(f"#### {mtype}"))
    dataset = datasets[ONE_KINASE][mtype]
    obs_model = obs_models[mtype]
    mtype_class = getattr(measurement_types, mtype)
    # metrics[split label] -> list with one performance report per fold
    metrics = defaultdict(list)

    for fold_index, splits in enumerate(kfold.split(dataset.data_X, dataset.data_y)):
        # Keep the indices in a dict keyed by split label so that every label
        # in `ttypes` (including "val") can be looked up explicitly below,
        # instead of relying on locals() and variables that may not exist.
        if WITH_VALIDATION:
            train_indices, val_indices, test_indices = splits
            indices_by_ttype = {
                "train": train_indices,
                "val": val_indices,
                "test": test_indices,
            }
        else:
            train_indices, test_indices = splits
            indices_by_ttype = {"train": train_indices, "test": test_indices}

        display(Markdown(f"##### Fold {fold_index}"))

        # Features and targets for every split, cast to float64 to match the
        # .double() model below.
        x_by_ttype = {
            ttype: dataset.data_X[indices].double().requires_grad_(True)
            for ttype, indices in indices_by_ttype.items()
        }
        y_by_ttype = {
            ttype: dataset.data_y[indices].double()
            for ttype, indices in indices_by_ttype.items()
        }
        x_train, y_train = x_by_ttype["train"], y_by_ttype["train"]
        x_test, y_test = x_by_ttype["test"], y_by_ttype["test"]

        # Input width is taken from the data rather than hard-coded, so other
        # featurizations (not only 512-wide ones) work as well.
        nn_model = nn.Sequential(
            nn.Linear(x_train.shape[-1], 350, bias=True),
            nn.ReLU(),
            nn.Linear(350, 1, bias=True),
        ).double()

        nn_model.train(True)

        optimizer = torch.optim.Adam(
            nn_model.parameters(), lr=LEARNING_RATE, eps=EPSILON, betas=BETAS
        )
        loss_function = torch.nn.MSELoss()

        # Full-batch training: one optimizer step per epoch on the whole
        # training split.
        range_epochs = trange(MAX_EPOCHS, desc="Epochs (+ featurization...)")
        for epoch in range_epochs:
            optimizer.zero_grad()

            prediction = nn_model(x_train)
            if WITH_OBSERVATION_MODEL:
                prediction = obs_model(prediction)

            # Align the prediction's shape with the targets before the loss.
            prediction = prediction.view_as(y_train)

            loss = loss_function(prediction, y_train)
            range_epochs.set_description(f"Epochs (loss={loss.item():.2e})")
            # Gradients w.r.t. parameters
            loss.backward()

            # Optimizer
            optimizer.step()

        # Evaluate the trained model on every split; each split renders into
        # its own Output widget so the reports can sit side by side.
        nn_model.eval()
        outputs = []
        for ttype in ttypes:
            output = Output()
            with output:
                title = f"fold={fold_index}, {ttype}={indices_by_ttype[ttype].shape[0]}"
                print(title)
                print("-" * len(title))

                observed = y_by_ttype[ttype]

                with torch.no_grad():
                    predicted = nn_model(x_by_ttype[ttype])
                    print("Before:", predicted.shape, predicted.dtype, observed.shape, observed.dtype)
                    if WITH_OBSERVATION_MODEL:
                        predicted = obs_model(predicted)

                predicted = predicted.view_as(observed).detach().numpy()
                observed = observed.detach().numpy()

                print("After", predicted.shape, predicted.dtype, observed.shape, observed.dtype)

                these_metrics = performance(
                    predicted, observed, n_boot=N_BOOTSTRAPS, sample_ratio=BOOTSTRAP_SAMPLE_RATIO
                )
                metrics[ttype].append(these_metrics)
                display(predicted_vs_observed(predicted, observed, mtype_class, with_metrics=False))

            outputs.append(output)
        display(HBox(outputs))

    # Average performances across folds, per split label and per metric.
    average = defaultdict(dict)
    for key in metrics["test"][0]:
        for label in ttypes:
            # The [0] index is essential: we only want the mean of the means.
            # NOTE(review): this assumes each metric entry returned by
            # `performance` stores its mean at position 0 -- confirm.
            values = [fold[key][0] for fold in metrics[label]]
            average[label][key] = {
                "mean": np.mean(values),
                "std": np.std(values),
            }
    for label in ttypes:
        display(HTML(f"Bootstrapped average across folds ({label}):"))
        display(pd.DataFrame.from_dict(average[label]))

## Lightning style

### Training loop

### Performance on the test set

Save the best run under an easy-to-remember path for the next section.

### Analysis

In [None]:
# Final cell: call kinoml's watermark helper.
# NOTE(review): presumably this reports the execution environment (package
# versions, system info) for reproducibility -- confirm against
# kinoml.utils.watermark.
from kinoml.utils import watermark
watermark()