# Run the XGBoost model

First, we have to create the XGBoost objects out of the NPZ files. NPZ files behave like dictionaries of arrays. In our case, they contain two keys:

- `X`: the featurized systems
- `y`: the associated measurements

We can pass those arrays directly to the XGBoost regressor; no intermediate dataset or data-loader objects are needed. We also need the corresponding observation models.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook parameters -- edit this cell to configure a run.
DATASET = "PKIS2"              # dataset whose featurized NPZ files are loaded
N_ESTIMATORS = 100             # number of boosting rounds, forwarded to XGBRegressor below
N_SPLITS = 5                   # number of K-fold cross-validation splits
ITEMS_PER_ROW = 2              # number of plots per row in the results grid
WITH_OBSERVATION_MODEL = True  # train with the custom objective and map predictions through the observation model
SEED = 42                      # seeds the global RNGs, the K-fold shuffling and XGBoost; None skips the global seeding
XGBOOST_REGRESSOR_KWARGS = dict(
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    # Use the constant above instead of a second hard-coded literal, so that
    # changing N_ESTIMATORS actually changes the model.
    n_estimators=N_ESTIMATORS,
    random_state=SEED,
)
N_BOOTSTRAPS = 1               # passed to `performance` as `n_boot`
BOOTSTRAP_SAMPLE_RATIO = 1     # passed to `performance` as `sample_ratio`

In [3]:
# Names of the measurement classes (looked up on `kinoml.core.measurements`)
# that are modelled for the selected DATASET.
MEASUREMENT_TYPES = dict(
    ChEMBL=["pKiMeasurement", "pIC50Measurement", "pKdMeasurement"],
    PKIS2=["PercentageDisplacementMeasurement"],
)[DATASET]

# Identifier of the single kinase this notebook trains on; it is used as the
# first-level key into the loaded `datasets` mapping.
ONE_KINASE = dict(ChEMBL="P35968", PKIS2="ABL2")[DATASET]

In [4]:
from pathlib import Path
from collections import defaultdict
import numpy as np
import shutil
import time

from kinoml.utils import seed_everything
from kinoml.core import measurements as measurement_types

# Directory this notebook is running from: `_dh` is IPython's directory history
# and its last entry is the most recent working directory.
HERE = Path(_dh[-1])
# NOTE(review): `_trial` is not referenced anywhere else in the visible notebook -- confirm before removing.
_trial = 0
# Each run reports into its own folder, named after the current Unix time rounded to whole seconds.
OUT = HERE / "_output" / DATASET / f"{time.time():.0f}"
OUT.mkdir(parents=True, exist_ok=True)
print("Reporting results at path:", OUT)
# Fix the seed for reproducible random splits -- otherwise we get mixed train/test groups every time, biasing the model evaluation
# Setting SEED to None skips this global seeding step.
if SEED is not None:
    seed_everything(SEED)



Reporting results at path: /home/jaime/devel/py/openkinome/experiments-binding-affinity/ligand-based/MorganFingerprint/XGB/_output/PKIS2/1605261690


## Load featurized data and create observation models

In [5]:
# Index every featurized NPZ file of the selected dataset as
# datasets[kinase][measurement_type] -> dict-like NPZ archive.
datasets = defaultdict(dict)
# Path.glob yields matches in arbitrary order; sorting keeps the layout of
# `datasets` stable from run to run.
for npz in sorted(HERE.glob(f"../_output/{DATASET}__*.npz")):
    # File stems are expected to look like <dataset>__<kinase>__<measurement type>
    _, kinase, measurement_type = npz.stem.split("__")
    datasets[kinase][measurement_type] = np.load(npz)

# Fail early with an explicit message; otherwise the problem only surfaces as a
# bare KeyError once the training cell indexes into `datasets`.
if not datasets:
    raise FileNotFoundError(
        f"No featurized '{DATASET}__*.npz' files found in {HERE / '../_output'}"
    )

In [6]:
# Observation models (numpy backend), keyed by measurement type name. The
# training cell applies them to the raw model predictions when
# WITH_OBSERVATION_MODEL is enabled.
obs_models = {
    name: getattr(measurement_types, name).observation_model(backend="numpy")
    for name in MEASUREMENT_TYPES
}
# Matching MSE loss adapters for the XGBoost backend, keyed the same way; they
# are handed to XGBRegressor as custom objectives.
objectives = {
    name: getattr(measurement_types, name).loss_adapter(backend="xgboost", loss="mse")
    for name in MEASUREMENT_TYPES
}
# Last expression of the cell: show the objectives for inspection.
objectives

{'PercentageDisplacementMeasurement': <function kinoml.core.measurements.PercentageDisplacementMeasurement._loss_adapter_xgboost__mse(labels, dG_over_KT, inhibitor_conc=1, standard_conc=1, **kwargs)>}

Now that we have all the data-dependent objects, we can start with the model-specific definitions.

## Train the model

In [7]:
from xgboost import XGBRegressor
import pandas as pd
from sklearn.model_selection import KFold
from kinoml.analysis.plots import predicted_vs_observed
from kinoml.analysis.metrics import performance
from ipywidgets import HBox, VBox, Output, HTML
from matplotlib import pyplot as plt
from collections import defaultdict

In [8]:
# Cross-validate one XGBoost regressor per measurement type on the ONE_KINASE
# data. For every fold the model is fitted, scored on its train and test
# indices, plotted (predicted vs. observed) and saved under OUT; finally the
# per-fold metrics are averaged and displayed.
for mtype in MEASUREMENT_TYPES:
    display(HTML(f"<h3>{mtype}</h3>"))

    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    mtype_class = getattr(measurement_types, mtype)
    X = datasets[ONE_KINASE][mtype]["X"].astype("float64")
    y = datasets[ONE_KINASE][mtype]["y"].astype("float64")

    # The objective does not change between folds, so pick it once: the custom
    # loss adapter of this measurement type, or XGBoost's built-in squared error.
    objective = objectives[mtype] if WITH_OBSERVATION_MODEL else "reg:squarederror"

    plots, metrics = [], defaultdict(list)
    for fold, (train, test) in enumerate(kfold.split(X)):
        model = XGBRegressor(objective=objective, **XGBOOST_REGRESSOR_KWARGS)
        model.fit(X[train], y[train])

        for label, indices in {"train": train, "test": test}.items():
            # Capture text and figure in a dedicated widget so they can be laid out in a grid later
            output = Output()
            with output:
                title = f"fold={fold}, {label}={indices.shape[0]}"
                print(title)
                print("-" * len(title))

                predicted = model.predict(X[indices])
                if WITH_OBSERVATION_MODEL:
                    # Map the raw predictions through the observation model before
                    # comparing them with `y` (presumably the custom objective makes the
                    # model predict an underlying quantity rather than the measurement
                    # itself -- TODO confirm against kinoml).
                    predicted = obs_models[mtype](predicted)
                these_metrics = performance(predicted, y[indices], n_boot=N_BOOTSTRAPS, sample_ratio=BOOTSTRAP_SAMPLE_RATIO)

                metrics[label].append(these_metrics)
                display(predicted_vs_observed(predicted, y[indices], mtype_class, with_metrics=False))
            plots.append(output)

        # Include the measurement type in the file name: OUT is shared by all
        # measurement types, so without it each type would overwrite the fold
        # models saved for the previous one.
        model.save_model(OUT / f"XGBRegressor__{mtype}__fold{fold}.model")

    # Fill with empty objects until the next multiple of ITEMS_PER_ROW
    n_missing = -len(plots) % ITEMS_PER_ROW
    plots.extend(Output() for _ in range(n_missing))

    # Plot in ITEMS_PER_ROW-column table
    rows = [plots[start:start + ITEMS_PER_ROW] for start in range(0, len(plots), ITEMS_PER_ROW)]
    display(VBox([HBox(row) for row in rows]))

    # Average performances
    average = defaultdict(dict)
    for key in metrics["train"][0]:
        for label in ("train", "test"):
            # The [0] below is essential: only the first element of each metric entry
            # (the mean) goes into the average, so this is a mean of the per-fold means.
            values = [fold_metrics[key][0] for fold_metrics in metrics[label]]
            average[label][key] = {
                "mean": np.mean(values),
                "std": np.std(values)
            }
    for label in ("train", "test"):
        display(HTML(f"Bootstrapped average across folds ({label}):"))
        display(pd.DataFrame.from_dict(average[label]))

HTML(value='<h3>PercentageDisplacementMeasurement</h3>')



VBox(children=(HBox(children=(Output(), Output())), HBox(children=(Output(), Output())), HBox(children=(Output…

HTML(value='Bootstrapped average across folds (train):')

Unnamed: 0,mae,mse,r2,rmse
mean,5.610846,91.434114,0.856434,9.54738
std,0.258818,10.216354,0.01853,0.530699


HTML(value='Bootstrapped average across folds (test):')

Unnamed: 0,mae,mse,r2,rmse
mean,14.421723,499.012243,0.220874,21.817669
std,2.892669,236.268462,0.128348,4.795996


In [9]:
from kinoml.utils import watermark
watermark()

Watermark
---------
numpy  1.19.2
pandas 1.1.3
last updated: 2020-11-13 11:01:34 CET 2020-11-13T11:01:34+01:00

CPython 3.7.8
IPython 7.18.1

compiler   : GCC 7.5.0
system     : Linux
release    : 4.19.128-microsoft-standard
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
host name  : jrodriguez
Git hash   : 973da29a357364934e6ab0fa702baa82a245dcd5
watermark 2.0.2

conda
-----
sys.version: 3.7.6 | packaged by conda-forge | (defau...
sys.prefix: /opt/miniconda
sys.executable: /opt/miniconda/bin/python
conda location: /opt/miniconda/lib/python3.7/site-packages/conda
conda-build: /opt/miniconda/bin/conda-build
conda-convert: /opt/miniconda/bin/conda-convert
conda-debug: /opt/miniconda/bin/conda-debug
conda-develop: /opt/miniconda/bin/conda-develop
conda-env: /opt/miniconda/bin/conda-env
conda-index: /opt/miniconda/bin/conda-index
conda-inspect: /opt/miniconda/bin/conda-inspect
conda-metapackage: /opt/miniconda/bin/conda-metapackage
conda-render: /opt/miniconda/bi