In [None]:
# Execute the helper script clone_git_on_colab.py in this kernel.
# NOTE(review): judging by its name it fetches the course repository when running on Colab -- confirm in the script.
%run clone_git_on_colab.py

In [None]:
# Call the course helper set_tf_nthreads with the argument 1.
# NOTE(review): by its name this configures TensorFlow threading, but no cell
# visible in this notebook uses TensorFlow -- confirm that it is still needed.
from course_settings import set_tf_nthreads
set_tf_nthreads(1)

# Higgs Challenge Example using HistogramBDT

## Load the data and preprocessing

In [None]:
# the usual setup: 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the gzip-compressed challenge CSV into a DataFrame
# (pandas infers the compression from the .gz suffix).
df = pd.read_csv(
    'data/atlas-higgs-challenge-2014-v2.csv.gz',
)

In [None]:
# Encode the class labels as integers: background 'b' -> 0, signal 's' -> 1.
# Values that are not keys of the mapping (e.g. labels that were already
# encoded) are passed through unchanged, so re-running this cell is harmless;
# a plain dict-based .map would turn the already-encoded column into NaN.
label_codes = {'b': 0, 's': 1}
df['Label'] = df['Label'].map(lambda label: label_codes.get(label, label))

In [None]:
# Pull plain NumPy arrays out of the DataFrame: the feature matrix (all columns
# from 'DER_mass_MMC' through 'PRI_jet_all_pt'), the labels and the per-event
# weights. The feature names are kept in `columns`.
X = df.loc[:, 'DER_mass_MMC':'PRI_jet_all_pt']
columns = X.columns.tolist()
X, y, weight = (
    X.to_numpy(),
    df['Label'].to_numpy(),
    df['Weight'].to_numpy(),
)

In [None]:
# Hold out 33% of the events for testing. Features, labels and event weights
# are split in one call so that their rows stay aligned; the fixed seed makes
# the split repeatable.
from sklearn.model_selection import train_test_split

(X_train, X_test,
 y_train, y_test,
 weight_train, weight_test) = train_test_split(
    X,
    y,
    weight,
    test_size=0.33,
    random_state=42,
)

We will again use the [approximate median significance][1] from the Kaggle competition to determine how good a solution is. Note that if you do not use the full data set (i.e. you split into training and testing samples), you have to reweight the inputs so that the subsample yield matches the total yield, which we will do below.

[1]: AMS.ipynb

In [None]:
# load function to compute approximate median significance (AMS)
# %pycat shows the source of ams.py for reading; %run then executes the file in
# this kernel (presumably this is what defines the `ams` function used below --
# confirm in ams.py).
%pycat ams.py
%run ams.py

In [None]:
# Total weighted yields over the full (unsplit) sample:
# signal events have y == 1, background events have y == 0.
sigall = np.dot(weight, y)
backall = np.dot(weight, y == 0)

# BDT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

In [None]:
# Alternative (non-histogram) implementation, kept for reference:
#bdt = GradientBoostingClassifier(n_estimators=1000, verbose=True)
bdt = HistGradientBoostingClassifier(
    max_iter=1000,            # upper bound on the number of boosting iterations
    validation_fraction=0.3,  # share of the training data held out for early stopping
    max_depth=10,
    min_samples_leaf=200,
    # Fix the seed so the internal train/validation split (and the subsample
    # used for binning) is the same on every run; without it the early-stopping
    # point and hence all numbers below change from run to run.
    random_state=42,
    verbose=True,
)

### Introducing Weights

Another innovation we're introducing here is reweighting of the events. We are doing three things here:
1. Applying event-based weights, which are stored in `weight_train` (and `weight_test`). This gives more weight (in the computation of the loss function) to background events that have larger cross sections and are therefore more important to suppress than others.
1. Reweighting signal and background such that their total weights are again about the same. Note that the unweighted sample has a ratio of about 1:2 for signal:background events, and we had seen that after applying the weights this ratio was reduced to about 1:500. Such a drastic difference in the weights can cause problems in the training; therefore we restore a roughly equal total weight by multiplying with the two (global) per-class weights for signal and background that we compute in `class_weight`.
1. Normalizing the weights, such that the mean weight is 1. This avoids producing an overall shift in the loss value which would mean we also have to shift optimization parameters (like learning rate).

In [None]:
# One global factor per class (index 0 = background, index 1 = signal):
# the number of training events divided by the summed event weight of that class.
class_weight = np.array(
    [len(y_train) / weight_train[y_train == cls].sum() for cls in (0, 1)]
)
class_weight

In [None]:
# Combine the event weights with the per-class factor (looked up by label),
# then rescale each sample so that its mean weight is 1.
weight_train_tot = weight_train * class_weight[y_train.astype(int)]
weight_train_tot = weight_train_tot / weight_train_tot.mean()

weight_test_tot = weight_test * class_weight[y_test.astype(int)]
weight_test_tot = weight_test_tot / weight_test_tot.mean()

In [None]:
# Sanity check: total training weight carried by background events
# (by construction half of len(y_train)).
np.sum(weight_train_tot[y_train == 0])

In [None]:
# Sanity check: total training weight carried by signal events
# (by construction half of len(y_train)).
np.sum(weight_train_tot[y_train == 1])

In [None]:
# Number of training events, for comparison with the two weight sums.
y_train.shape[0]

In [None]:
# Train the BDT on the training sample using the balanced, normalised weights.
bdt.fit(
    X_train,
    y_train,
    sample_weight=weight_train_tot,
)

In [None]:
# Predicted probability of the signal class (column 1) for both samples.
y_train_prob, y_test_prob = (
    bdt.predict_proba(sample)[:, 1] for sample in (X_train, X_test)
)

In [None]:
# Weighted distributions of the classifier output on the test sample.
# Both histograms use the same 100 bins over [0, 1]; without an explicit range
# each call would derive its own bin edges from its own data, making the two
# overlaid shapes not directly comparable.
plt.hist(y_test_prob[y_test==0], bins=100, range=(0, 1),
         weights=weight_test_tot[y_test==0], alpha=0.5, label="Background")
plt.hist(y_test_prob[y_test==1], bins=100, range=(0, 1),
         weights=weight_test_tot[y_test==1], alpha=0.5, label="Signal")
plt.xlabel("Predicted signal probability")
plt.ylabel("Weighted events")
plt.legend();

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# Run the AMS scan
def ams_scan(y, y_prob, weights, label):
    """Scan the AMS as a function of the cut on the classifier output.

    Parameters
    ----------
    y : array-like
        True class labels (1 = signal, 0 = background).
    y_prob : array-like
        Classifier score per event; events are selected by cutting on it.
    weights : array-like
        Per-event weights, passed to ``roc_curve`` as ``sample_weight``.
    label : str
        Name of the sample, used only in the printed summary line.

    Returns
    -------
    thr : ndarray
        Thresholds returned by ``roc_curve``.
    ams_vals : ndarray
        AMS value for each threshold.

    Notes
    -----
    Uses the notebook-level names ``roc_curve``, ``ams``, ``sigall`` and
    ``backall``; the cells defining them must have been run first.
    """
    fpr, tpr, thr = roc_curve(y, y_prob, sample_weight=weights)
    # tpr / fpr are the weighted selection efficiencies within this subsample;
    # multiplying by the full-sample yields rescales them to the total yield.
    ams_vals = ams(tpr * sigall, fpr * backall)
    best = np.argmax(ams_vals)
    print("{}: Maximum AMS {:.3f} for pcut {:.3f}".format(label, ams_vals[best], thr[best]))
    return thr, ams_vals

In [None]:
# AMS versus probability cut for the training and test samples. The scans are
# given the original event weights (weight_train / weight_test), not the
# balanced training weights. Only the high-probability region is shown.
plt.plot(*ams_scan(y_train, y_train_prob, weight_train, "Train"), label="Train")
plt.plot(*ams_scan(y_test, y_test_prob, weight_test, "Test"), label="Test")
plt.xlim(0.8, 1.)
plt.xlabel("Probability cut")
plt.ylabel("AMS")
plt.legend();