In [None]:
# Configure TensorFlow threading through the course helper, before any model is built.
# NOTE(review): presumably this restricts TensorFlow to a single thread -- confirm in course_settings.
from course_settings import set_tf_nthreads
set_tf_nthreads(1)

# Higgs Challenge Example using Neural Networks -- some extra tricks


This is essentially the same as what we have done in the notebook 
on the [Higgs Challenge Example using Neural Networks](HiggsChallenge-NN_DL.ipynb)
but here we're going to use a neural network with a more complex (deeper) structure (deeper = more layers) and we are optimizing the usage of event weights
to squeeze out even a bit more performance.

## Load the data and preprocessing

In [None]:
# the usual setup: 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# load the full data set (it is split into training and testing samples further below)
df = pd.read_csv('data/atlas-higgs-challenge-2014-v2.csv.gz')

In [None]:
# map y values to integers (background 'b' -> 0, signal 's' -> 1)
# The guard makes this cell safe to re-run: applying the mapping to labels that
# are already numeric would turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

In [None]:
# Split the frame into plain NumPy arrays: features, labels and event weights.
feature_frame = df.loc[:, 'DER_mass_MMC':'PRI_jet_all_pt']
columns = feature_frame.columns.tolist()  # feature names
X = feature_frame.to_numpy()
y = df['Label'].to_numpy()
weight = df['Weight'].to_numpy()

In [None]:
# now split into testing and training samples:
# 33% of the events are held out for testing (test_size=0.33) and the split uses a fixed random_state.
# Labels and event weights are passed along with the features, so all three are split the same way.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
    X, y, weight, test_size=0.33, random_state=42)

We will again use the [approximate median significance][1] from the Kaggle competition to determine how good a solution is. Note that if you do not use the full data set (i.e. you split into training and testing samples) you have to reweight the inputs so that the subsample yield matches the total yield, which we will do below.

[1]: AMS.ipynb

In [None]:
from mltools import ams
# IPython introspection: `??` displays the source code of `ams`
ams??

In [None]:
# Total weighted yields of the full data set (computed before any train/test split):
# signal events carry label 1, background events label 0.
is_background = (y == 0)
sigall = np.dot(weight, y)
backall = np.dot(weight, is_background)

## Rescaling
Neural networks are quite sensitive to feature scaling, so let's try to scale the features. Also, let's set the -999 values to 0.

In [None]:
from sklearn.preprocessing import RobustScaler

# Set the -999. entries to 0 (in place) in both samples before scaling.
for sample in (X_train, X_test):
    sample[sample == -999.] = 0.

# Fit the scaler on the training sample only, then apply it to both samples.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Neural networks with Keras
SciKit Learn has simple NNs, but if you want to do deep NNs, or train on GPUs, you probably want to use something like Keras instead. 

Example for a deeper NN using Keras

In [None]:
# For reproducibility: seed Python's `random`, NumPy and TensorFlow.
# Seeding NumPy alone is not sufficient, because Keras draws random numbers
# (e.g. for the initial layer weights) from the backend's own generators.
SEED = 1337
np.random.seed(SEED)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, BatchNormalization

tf.keras.utils.set_random_seed(SEED)  # seeds random, numpy and tensorflow (TensorFlow >= 2.7)

In [None]:
# Three identical hidden blocks (Dense -> BatchNormalization -> ReLU) followed
# by a single sigmoid output unit.
model = Sequential()

# The first hidden block additionally declares the input shape (number of features).
model.add(Dense(units=128, input_shape=X_train.shape[1:]))
model.add(BatchNormalization())
model.add(Activation("relu"))

# Second and third hidden block.
for _ in range(2):
    model.add(Dense(units=128))
    model.add(BatchNormalization())
    model.add(Activation("relu"))

# Output layer.
model.add(Dense(units=1, activation='sigmoid'))

* `Dense`: "Just your regular densely-connected NN layer."
  * implements the operation: output = activation(dot(input, kernel) + bias)
    * kernel is a weights matrix created by the layer
    * bias is a bias vector created by the layer (only applicable if `use_bias` is True)
  * `units`: dimensionality of the output array
  * `input_shape`: expected shape of the input arrays (only needed for first layer)
  * `activation`: element-wise activation function
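  * `kernel_regularizer`: regularizer function applied to the kernel weights matrix, i.e. a penalty term added to the loss (not to be confused with `kernel_constraint`, which restricts the weights directly; see [constraints][1])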
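* `BatchNormalization`: normalizes the outputs of the previous layer over each mini-batch (followed by a learned rescaling and shift). This technical trick stabilizes and speeds up the training, and it also acts as mild regularization (see [BatchNormalization][2])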
* `Activation`: Specify activation function (see [activation discussion](NN_Activation.ipynb))
  
  
[1]: https://keras.io/constraints/
[2]: https://www.dlology.com/blog/one-simple-trick-to-train-keras-model-faster-with-batch-normalization/

Note: There is a common practice to use powers of 2 (or multiples of powers of 2) for parameters that describe shapes of vectors in NN computations. Most of the time this does not matter, but sometimes it does, e.g. see this [tweet from Andrej Karpathy](https://twitter.com/karpathy/status/1621578354024677377)

In [None]:
# print an overview of the layers of the model defined above
model.summary()

In [None]:
# compile model
# NOTE(review): the training below passes per-event sample weights; `metrics` are presumably
# evaluated without them, while Keras' `weighted_metrics=[...]` applies them -- confirm in the Keras docs.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # or weighted metrics

* `optimizer`: name of optimizer or optimizer instance. See [optimizers][1].
  * _Adam_: an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments ([paper][2], a short [summary][4])
* `loss`: name of objective function or objective function. See [losses][3].
  * _binary crossentropy_: a measure of dissimilarity, used here to define the loss function that should be minimized 
    $$H_p(q) = -\frac{1}{N}\sum_{i=1}^N [{y_i} \log(\hat{y}_i)+(1-y_i) \log(1-\hat{y}_i)]$$
    * also called *log loss*: we *maximize the predicted probability* of the true label by *minimizing the negative logarithm of it* (compare e.g. [Maximum Likelihood method](https://en.wikipedia.org/wiki/Maximum_likelihood_estimation))
       * here the true labels are $y_i=1$ for the positive class and $y_i=0$ for the negative class
       * the estimated probabilities are $\hat y_{i}$
       * $N$ runs over all samples
    *  "the cross entropy between two probability distributions p and q over the same underlying set of events measures the average number of bits needed to identify an event drawn from the set if a coding scheme used for the set is optimized for an estimated probability distribution q, rather than the true distribution p."    
    * see [this video by Aurélien Géron](https://www.youtube.com/watch?v=ErfnhcEV1O8) for an explanation of the information theory interpretation of this loss
* `metrics`: list of metrics to be evaluated by the model during training and testing (typically accuracy)

[1]: https://keras.io/optimizers/
[2]: https://arxiv.org/abs/1412.6980v8
[3]: https://keras.io/losses/
[4]: https://medium.com/@nishantnikhil/adam-optimizer-notes-ddac4fd7218

### Introducing Weights

Another innovation we're introducing here is reweighting of the events. We are doing three things here:
1. Applying event-based weights which are stored in `weight_train` (and `weight_test`). This helps to give more weight (in the computation of the loss function) to background events that have larger cross sections and are therefore more important to suppress than others.
1. Reweighting signal and background such that their total weights are again about the same. Note that the unweighted sample has a ratio of about 1:2 for signal:background events, and we have seen that applying the event weights changes this ratio to about 1:500. Such a drastic imbalance in the weights can cause problems in the training, therefore we restore a roughly equal total weight by multiplying with the two (global) weights for signal and background that we compute in `class_weight`.
1. Normalizing the weights, such that the mean weight is 1. This avoids producing an overall shift in the loss value which would mean we also have to shift optimization parameters (like learning rate).

In [None]:
# Global per-class weights, indexed by label (0 = background, 1 = signal):
# number of training events divided by the summed event weight of the class.
n_train = len(y_train)
bkg_weight_sum = weight_train[y_train == 0].sum()
sig_weight_sum = weight_train[y_train == 1].sum()
class_weight = np.array([n_train / bkg_weight_sum, n_train / sig_weight_sum])
class_weight

In [None]:
# Total per-event weight = event weight x global weight of the event's class,
# afterwards normalized such that the mean weight of each sample is 1.
# The class weights derived from the training sample are applied to both samples.
class_index_train = y_train.astype(int)
class_index_test = y_test.astype(int)

weight_train_tot = weight_train * class_weight[class_index_train]
weight_train_tot /= weight_train_tot.mean()

weight_test_tot = weight_test * class_weight[class_index_test]
weight_test_tot /= weight_test_tot.mean()

In [None]:
# sanity check: total background weight, should be len(y_train) / 2 (up to rounding)
weight_train_tot[y_train==0].sum()

In [None]:
# sanity check: total signal weight, should be len(y_train) / 2 (up to rounding)
weight_train_tot[y_train==1].sum()

In [None]:
# number of training events, for comparison with the two weight sums above
len(y_train)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Train with the rebalanced, normalized per-event weights.
# EarlyStopping ends the training once the monitored quantity (Keras default,
# presumably the validation loss -- confirm) has not improved for 5 epochs, and
# restores the weights of the best epoch.
# NOTE(review): the test sample doubles as validation sample here, so it steers
# the early stopping and is not a fully independent test set afterwards.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=256,
    sample_weight=weight_train_tot,
    validation_data=(X_test_scaled, y_test, weight_test_tot),
    callbacks=[EarlyStopping(verbose=True, patience=5, restore_best_weights=True)]
)

* `batch_size`: number of samples per gradient update
* `epochs`: number of epochs to train the model. An epoch is an iteration over the entire x and y data provided. 


In [None]:
# Visualize the training history returned by model.fit: one figure per
# quantity, each showing the training and the validation ("Test") curve.
history_panels = [
    # (training key, validation key, title, y-axis label)
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
]

for train_key, val_key, panel_title, y_label in history_panels:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# Network output for both samples. The output layer has a single sigmoid unit,
# so [:, 0] picks that one column and yields one value per event.
y_train_prob_keras = model.predict(X_train_scaled)[:, 0]
y_test_prob_keras = model.predict(X_test_scaled)[:, 0]

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# Run the AMS scan
from sklearn.metrics import roc_curve
def ams_scan(y, y_prob, weights, label):
    """Scan the AMS as a function of the cut on the classifier output.

    The weighted ROC curve provides, for every threshold, the fraction of
    signal weight (tpr) and background weight (fpr) passing the cut. These
    fractions are scaled to the module-level total yields ``sigall`` and
    ``backall`` and handed to ``ams``.

    Parameters
    ----------
    y : true labels, passed to ``roc_curve``
    y_prob : classifier scores, passed to ``roc_curve``
    weights : per-event weights, passed to ``roc_curve`` as ``sample_weight``
    label : name of the sample, used in the printed summary

    Returns
    -------
    tuple ``(thresholds, ams_values)``; in addition the maximum AMS and the
    threshold at which it occurs are printed.
    """
    bkg_frac, sig_frac, cuts = roc_curve(y, y_prob, sample_weight=weights)
    significance = ams(sig_frac * sigall, bkg_frac * backall)
    best = np.argmax(significance)
    summary = "{}: Maximum AMS {:.3f} for pcut {:.3f}".format(
        label, significance.max(), cuts[best])
    print(summary)
    return cuts, significance

In [None]:
# AMS as a function of the cut on the network output, for both samples.
# The original event weights are used here (not the rebalanced training weights):
# ams_scan scales the selected signal/background fractions to the total yields.
plt.plot(*ams_scan(y_train, y_train_prob_keras, weight_train, "Train"), label="Train")
plt.plot(*ams_scan(y_test, y_test_prob_keras, weight_test, "Test"), label="Test")
plt.xlim(0.8, 1.)  # only show cut values between 0.8 and 1
plt.xlabel("Cut on NN output (pcut)")
plt.ylabel("AMS")
plt.title("AMS scan")
plt.legend()
plt.show()