# Run the XGBoost model

First, we have to create the XGBoost objects out of the NPZ files. NPZ files behave like dictionaries of arrays. In our case, they contain two keys:

- `X`: the featurized systems
- `y`: the associated measurements

We can pass those dict-like arrays to an adapter class for Torch Datasets, which will be ingested by the DataLoaders. We also need the corresponding observation models.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
DATASET = "PKIS2"
N_ESTIMATORS = 100
N_SPLITS = 5
ITEMS_PER_ROW = 2

In [None]:
MEASUREMENT_TYPES = {
    "ChEMBL": ["pKiMeasurement", "pIC50Measurement", "pKdMeasurement"],
    "PKIS2": ["PercentageDisplacementMeasurement"]
}[DATASET]

ONE_KINASE = {
    "ChEMBL": "P35968",
    "PKIS2": "ABL2",
}[DATASET]

In [4]:
from pathlib import Path
from collections import defaultdict
import numpy as np
import shutil
import time

from kinoml.utils import seed_everything
from kinoml.core import measurements as measurement_types

HERE = Path(_dh[-1])
_trial = 0
OUT = HERE / "_output" / DATASET / f"{time.time():.0f}"
OUT.mkdir(parents=True, exist_ok=True)
print("Reporting results at path:", OUT)
# Fix the seed for reproducible random splits -- otherwise we get mixed train/test groups every time, biasing the model evaluation
seed_everything()



Reporting results at path: /home/jaime/devel/py/openkinome/experiments-binding-affinity/ligand-based/MorganFingerprint/XGB/_output/PKIS2/1605030797


## Load featurized data and create observation models

In [5]:
datasets = defaultdict(dict)
for npz in HERE.glob(f"../_output/{DATASET}__*.npz"):
    _, kinase, measurement_type = str(npz.stem).split("__")
    datasets[kinase][measurement_type] = ds = np.load(npz)

In [6]:
obs_models = {k: getattr(measurement_types, k).observation_model(backend="numpy") for k in MEASUREMENT_TYPES}
objectives = {k: getattr(measurement_types, k).loss_adapter(backend="xgboost", loss="mse") for k in MEASUREMENT_TYPES}
objectives

{'PercentageDisplacementMeasurement': <function kinoml.core.measurements.PercentageDisplacementMeasurement._loss_adapter_xgboost__mse(labels, dG_over_KT, inhibitor_conc=1, standard_conc=1, **kwargs)>}

Now that we have all the data-dependent objects, we can start with the model-specific definitions.

## Train the model

In [7]:
from xgboost import XGBRegressor
import pandas as pd
from sklearn.model_selection import KFold
from kinoml.analysis.plots import predicted_vs_observed
from kinoml.analysis.metrics import performance
from ipywidgets import HBox, VBox, Output, HTML
from matplotlib import pyplot as plt

In [8]:
_with_observation_model = True
for mtype in MEASUREMENT_TYPES:
    display(HTML(f"<h3>{mtype}</h3>"))
    
    kfold = KFold(n_splits=N_SPLITS)
    mtype_class = getattr(measurement_types, mtype)
    X = datasets[ONE_KINASE][mtype]["X"].astype("float64")
    y = datasets[ONE_KINASE][mtype]["y"].astype("float64")

    plots, metrics = [], []
    for fold, (train, test) in enumerate(kfold.split(X)):
        if _with_observation_model:
            model = XGBRegressor(n_estimators=N_ESTIMATORS, objective=objectives[mtype])
        else:
            model = XGBRegressor(n_estimators=N_ESTIMATORS)
        
        model.fit(X[train], y[train])
        
        # TRAIN PERFORMANCE
        output_train = Output()
        with output_train:
            title = f"{mtype} (train={train.shape[0]}), fold={fold}"
            print(title)
            print("-"*(len(title)))
            if _with_observation_model:
                trained = obs_models[mtype](model.predict(X[train]))
            else:
                trained = model.predict(X[train])
            display(predicted_vs_observed(trained, y[train], mtype_class, n_boot=100, sample_ratio=0.75))
            metrics.append(performance(trained, y[train]))
        plots.append(output_train)
        
        # TEST PERFORMANCE
        output_test = Output()
        with output_test:
            title = f"{mtype} (test={test.shape[0]}), fold={fold}"
            print(title)
            print("-"*(len(title)))
            
            delta_g_over_kt = model.predict(X[test])
            if _with_observation_model:
                predicted = obs_models[mtype](delta_g_over_kt)
            else:
                predicted = delta_g_over_kt
            display(predicted_vs_observed(predicted, y[test], mtype_class, n_boot=100, sample_ratio=0.75))
            metrics.append(performance(predicted, y[test]))
        plots.append(output_test)
        
        model.save_model(OUT / f"XGBRegressor__fold{fold}.model")

    # Fill with empty objects until the next multiple of ITEMS_PER_ROW
    for _ in range((ITEMS_PER_ROW - (len(plots) % ITEMS_PER_ROW)) % ITEMS_PER_ROW):
        plots.append(Output())

    # Plot in ITEMS_PER_ROW-column table
    display(VBox([HBox(row) for row in np.reshape(plots, (-1, ITEMS_PER_ROW)).tolist()]))

    # Average performances
    average = {}
    for key in metrics[0]:
        values =  [fold[key] for fold in metrics]
        average[key] = {
            "mean": np.mean(values),
            "std": np.std(values)
        }
    display(HTML("Average across folds:"))
    display(pd.DataFrame.from_dict(average))

HTML(value='<h3>PercentageDisplacementMeasurement</h3>')



VBox(children=(HBox(children=(Output(), Output())), HBox(children=(Output(), Output())), HBox(children=(Output…

HTML(value='Average across folds:')

Unnamed: 0,mae,mse,r2,rmse
mean,7.729395,341.740942,0.281785,14.33772
std,7.469318,372.046492,0.563114,10.562434


In [9]:
from kinoml.utils import watermark
watermark()

Watermark
---------
numpy  1.19.2
pandas 1.1.3
last updated: 2020-11-10 18:53:22 CET 2020-11-10T18:53:22+01:00

CPython 3.7.8
IPython 7.18.1

compiler   : GCC 7.5.0
system     : Linux
release    : 4.19.128-microsoft-standard
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
host name  : jrodriguez
Git hash   : 789f970086b8117fd53f4367393bfc6d041aaeba
watermark 2.0.2

conda
-----
sys.version: 3.7.6 | packaged by conda-forge | (defau...
sys.prefix: /opt/miniconda
sys.executable: /opt/miniconda/bin/python
conda location: /opt/miniconda/lib/python3.7/site-packages/conda
conda-build: /opt/miniconda/bin/conda-build
conda-convert: /opt/miniconda/bin/conda-convert
conda-debug: /opt/miniconda/bin/conda-debug
conda-develop: /opt/miniconda/bin/conda-develop
conda-env: /opt/miniconda/bin/conda-env
conda-index: /opt/miniconda/bin/conda-index
conda-inspect: /opt/miniconda/bin/conda-inspect
conda-metapackage: /opt/miniconda/bin/conda-metapackage
conda-render: /opt/miniconda/bi