In [None]:
# Course-local helper (defined in course_settings, not shown in this notebook);
# judging by its name it limits the number of threads TensorFlow uses — confirm there.
from course_settings import set_tf_nthreads
set_tf_nthreads(1) # best setting for this tutorial at CIP

In [None]:
# the usual setup: 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# load training data (gzip-compressed CSV; the path is relative to the
# directory the notebook is run from)
df = pd.read_csv('data/atlas-higgs-challenge-2014-v2.csv.gz')

In [None]:
# map y values to integers: background 'b' -> 0, signal 's' -> 1.
# The dtype guard makes the cell safe to re-run: applying the mapping to labels
# that are already numeric would match none of the keys and turn them all into NaN.
if not pd.api.types.is_numeric_dtype(df['Label']):
    df['Label'] = df['Label'].map({'b': 0, 's': 1})

In [None]:
# list all column names; the feature groups below are selected by name/prefix
df.columns

In [None]:
# inspect the per-jet columns (label-based .loc slice, both end labels included)
df.loc[:, 'PRI_jet_leading_pt' : 'PRI_jet_subleading_phi']

In [None]:
# Column names of the per-jet features, ordered jet by jet:
# (leading: pt, eta, phi), then (subleading: pt, eta, phi).
jet_cols = [
    f"PRI_{obj}_{field}"
    for obj in ("jet_leading", "jet_subleading")
    for field in ("pt", "eta", "phi")
]

In [None]:
# check the generated per-jet column names
jet_cols

In [None]:
# All remaining feature columns: everything prefixed "PRI" or "DER" that is not
# one of the per-jet columns. (Checking for the "PRI" prefix alone would leave
# out the "DER" columns.)
other_cols = [
    name
    for name in df.columns
    if name.startswith(("PRI", "DER")) and name not in jet_cols
]

In [None]:
# let's create separate arrays: event id, target label and event weight ...
eventID, y, weight = (df[name] for name in ("EventId", "Label", "Weight"))
# ... and the feature block from 'PRI_tau_pt' through 'PRI_jet_all_pt'
# (start the slice at 'DER_mass_MMC' instead to widen the selection)
X = df.loc[:, "PRI_tau_pt":"PRI_jet_all_pt"]

In [None]:
# flat (non-jet) features as a 2D NumPy array: one row per event, one column per entry of other_cols
X_other = df[other_cols].to_numpy()

In [None]:
# quick look at the flat feature array
X_other

In [None]:
# Per-jet features as a 3D array with axes (event, jet, feature) = (-1, 2, 3).
# The reshape is valid because jet_cols lists the three features of the leading
# jet first, followed by the same three features of the subleading jet.
X_jet = df[jet_cols].to_numpy().reshape(-1, 2, 3)

In [None]:
# quick look at the (event, jet, feature) array
X_jet

In [None]:
#now split into testing and training samples
from sklearn.model_selection import train_test_split
# Split every per-event array with the same shuffled indices, so row i of each
# *_train (or *_test) array refers to the same event. random_state is fixed to
# make the split repeatable; one third of the events is held out.
(
    X_train, X_test,
    X_jet_train, X_jet_test,
    X_other_train, X_other_test,
    y_train, y_test,
    eventID_train, eventID_test,
    weight_train, weight_test,
) = train_test_split(
    X, X_jet, X_other, y, eventID, weight, test_size=0.33, random_state=42
)
# Alias for the earlier, inconsistently spelled name (event_ID_test), kept so
# that any code still using the old spelling continues to work.
event_ID_test = eventID_test

We will again use the [approximate median significance][1] (AMS) from the Kaggle competition to determine how good a solution is. Note that if you do not use the full data set (i.e. you split it into training and testing samples), you have to reweight the inputs so that the subsample yield matches the total yield, which we will do below.

[1]: AMS.ipynb

In [None]:
# load function to compute approximate median significance (AMS):
# %pycat displays the source of ams.py, %run then executes the file so that the
# names it defines (ams() is called further down) are available in this notebook
%pycat ams.py
%run ams.py

In [None]:
# calculate the total weights (yields)
def _yields(event_weights, labels):
    """Return (signal, background) summed weights; labels are 1 for signal, 0 for background."""
    return event_weights.dot(labels), event_weights.dot(labels == 0)


# full data set, training subsample and testing subsample
sigall, backall = _yields(weight, y)
sigtrain, backtrain = _yields(weight_train, y_train)
sigtest, backtest = _yields(weight_test, y_test)



## Custom scaling

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# peek at the training feature table as a plain NumPy array
X_train.to_numpy()

In [None]:
# feature 0 (the first entry per jet in jet_cols, i.e. pt) of both jets,
# flattened to 1D — the same per-feature view that JetScaler.fit operates on
X_jet[:, :, 0].ravel()

In [None]:
class JetScaler:
    """Per-feature robust scaling for jet arrays of shape (n_events, n_jets, n_features).

    One RobustScaler is fitted per feature (last axis), pooling the values of
    all jets. Entries equal to ``missing_value`` (the data set's sentinel for
    "jet not present") are ignored during fitting and passed through unchanged
    by ``transform``.

    Parameters
    ----------
    missing_value : scalar, default -999
        Sentinel marking missing entries.
    """

    def __init__(self, missing_value=-999):
        self.missing_value = missing_value
        self.scalers = []

    def fit(self, X):
        """Fit one scaler per feature on the non-missing entries of ``X``.

        Calling ``fit`` again replaces the previously fitted scalers.
        Returns ``self`` so that calls can be chained.
        """
        X = np.asarray(X)
        # Start from scratch: appending to scalers from an earlier fit would
        # leave stale scalers behind and make transform() emit extra features.
        self.scalers = []
        for i in range(X.shape[2]):
            column = X[:, :, i].ravel()
            valid = column[column != self.missing_value].reshape(-1, 1)
            scaler = RobustScaler()
            scaler.fit(valid)
            self.scalers.append(scaler)
        return self

    def transform(self, X):
        """Return a scaled copy of ``X``; missing entries keep the sentinel value.

        The input is not modified. The result has one feature per fitted scaler.
        """
        X = np.asarray(X)
        # Work in a floating dtype so scaled values are not truncated when the
        # input happens to be an integer array (float inputs keep their dtype).
        out_dtype = np.result_type(X.dtype, np.float32)
        outputs = []
        for i, scaler in enumerate(self.scalers):
            column = X[:, :, i].ravel().astype(out_dtype)  # astype -> fresh copy
            valid = column != self.missing_value
            # Skip the scaler when nothing is valid: scikit-learn transformers
            # reject empty input.
            if valid.any():
                column[valid] = scaler.transform(column[valid].reshape(-1, 1)).ravel()
            outputs.append(column.reshape(-1, X.shape[1]))
        return np.stack(outputs, axis=2)

In [None]:
# --- flat ("other") features -------------------------------------------------
# Replace the missing-value sentinel (-999) by 0 in the arrays that are actually
# scaled, and do so *before* fitting. Patching `X_other` here (as was done
# before) has no effect on the result: train_test_split returned independent
# copies, so the train/test arrays kept their -999 entries and the scaler was
# fitted on sentinel-polluted data.
X_other_train_filled = np.where(X_other_train == -999, 0, X_other_train)
X_other_test_filled = np.where(X_other_test == -999, 0, X_other_test)

# Fit on the training sample only, then apply the same scaling to both samples.
scaler_other = RobustScaler()
scaler_other.fit(X_other_train_filled)
X_other_scaled = scaler_other.transform(X_other_train_filled)
X_other_test_scaled = scaler_other.transform(X_other_test_filled)

# --- per-jet features --------------------------------------------------------
# JetScaler skips -999 entries itself and leaves them in place, so the model can
# still recognise missing jets from its input.
scaler_jet = JetScaler()
scaler_jet.fit(X_jet_train)
X_jet_scaled = scaler_jet.transform(X_jet_train)
X_jet_test_scaled = scaler_jet.transform(X_jet_test)

## Model with permutation-invariant jet embedding

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, BatchNormalization
import tensorflow as tf

# for reproducibility: seed NumPy *and* TensorFlow. Keras draws its initial
# layer weights from TensorFlow's random generator, which np.random.seed alone
# does not control.
np.random.seed(1337)
tf.random.set_seed(1337)


In [None]:
def DenseB(*args, **kwargs):
    """Return a callable that applies a Dense layer followed by BatchNormalization.

    All arguments are forwarded unchanged to ``Dense``. Because an activation
    given via ``activation=...`` belongs to the Dense layer, it is applied
    before the normalization. The layers are created when the returned
    callable is invoked, not when ``DenseB`` itself is called.
    """
    def apply(tensor):
        dense_out = Dense(*args, **kwargs)(tensor)
        return BatchNormalization()(dense_out)
    return apply

In [None]:
def make_model():
    """Build the two-input binary classifier.

    Inputs
    ------
    "jets"  : shape (2, 3), the per-jet features.
    "other" : flat feature vector; its width is read from the global
              ``X_other_train``.

    The jets pass through two DenseB blocks acting on the last axis, i.e. the
    same weights are used for every jet. Jets with any feature equal to -999
    are zeroed out and the result is averaged over the jet axis (the average
    always runs over both jet slots, including zeroed ones). The pooled jet
    embedding is concatenated with the embedding of the flat features and fed
    to one more DenseB block and a single sigmoid output unit.
    """
    layers = tf.keras.layers

    def valid_jet_mask(x):
        # 1.0 where none of a jet's features equals -999, else 0.0; the
        # trailing axis of size 1 lets the mask broadcast over the embedding.
        is_valid = tf.reduce_all(x != -999, axis=2)
        return tf.expand_dims(tf.cast(is_valid, tf.float32), axis=2)

    jets_in = layers.Input(shape=(2, 3), name="jets")
    other_in = layers.Input(shape=(X_other_train.shape[1],), name="other")

    # embedding of the flat features
    other_branch = other_in
    for _ in range(2):
        other_branch = DenseB(100, activation="relu")(other_branch)

    # per-jet embedding
    jet_branch = jets_in
    for _ in range(2):
        jet_branch = DenseB(100, activation="relu")(jet_branch)

    # the mask is computed from the raw jet input, then the embedding is pooled
    mask = layers.Lambda(valid_jet_mask)(jets_in)
    jet_branch = layers.multiply([mask, jet_branch])
    jet_branch = layers.GlobalAveragePooling1D()(jet_branch)

    merged = layers.concatenate([jet_branch, other_branch])
    merged = DenseB(100, activation="relu")(merged)
    prediction = Dense(1, activation="sigmoid")(merged)
    return tf.keras.Model(inputs=[jets_in, other_in], outputs=[prediction])

In [None]:
# build a fresh (untrained) model; re-run this cell to start training from scratch
model = make_model()

In [None]:
#!pip install pydot

In [None]:
import pydot

In [None]:
# visualize model as a graph of layers (this is what the pydot import in the
# previous cell is for; a Graphviz installation is presumably needed too — confirm)
from tensorflow.keras.utils import plot_model
plot_model(model)

In [None]:
# compile model: binary cross-entropy loss and Adam with an explicitly chosen
# learning rate (pass optimizer='adam' instead to use the optimizer defaults).
# 'accuracy' is listed under `metrics`; `weighted_metrics` would be the
# alternative place for it.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

In [None]:
# Balancing factor per class: number of training events divided by the class's
# summed training weight (0 = background, 1 = signal).
n_train = len(y_train)
class_weight = {0: n_train / backtrain, 1: n_train / sigtrain}
class_weight

In [None]:
# Per-event weights for training/validation: the event weight multiplied by the
# balancing factor of the event's class, then normalised to a mean of 1.
class_factors = np.array([class_weight[0], class_weight[1]])  # indexed by label

weight_train_tot = np.array(weight_train * class_factors[y_train.astype(int)])
weight_train_tot = weight_train_tot / weight_train_tot.mean()

weight_test_tot = np.array(weight_test * class_factors[y_test.astype(int)])
weight_test_tot = weight_test_tot / weight_test_tot.mean()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Train with per-event sample weights. Validation runs on the explicitly passed
# held-out sample; the former `validation_split=0.2` was dropped because Keras
# ignores it whenever `validation_data` is given.
# NOTE: early stopping (and restore_best_weights) therefore selects the model on
# the test sample, so scores quoted for that sample are optimistically biased.
history = model.fit(
    {"jets": X_jet_scaled, "other": X_other_scaled},
    y_train,
    epochs=100,
    batch_size=64,
    sample_weight=weight_train_tot,
    validation_data=(
        {"jets": X_jet_test_scaled, "other": X_other_test_scaled},
        y_test,
        weight_test_tot,
    ),
    callbacks=[EarlyStopping(verbose=True, patience=20, restore_best_weights=True)],
)

In [None]:
# visualize training history returned by model.fit:
# one figure per metric, training curve first, validation curve second
for metric, ylabel in (("accuracy", "Accuracy"), ("loss", "Loss")):
    plt.plot(history.history[metric])
    plt.plot(history.history["val_" + metric])
    plt.title("Model " + metric)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
# predicted signal probability per event; [:, 0] drops the trailing axis of
# the model's single output unit
train_inputs = {"jets": X_jet_scaled, "other": X_other_scaled}
test_inputs = {"jets": X_jet_test_scaled, "other": X_other_test_scaled}
y_train_prob_keras = model.predict(train_inputs)[:, 0]
y_test_prob_keras = model.predict(test_inputs)[:, 0]

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# Run the AMS scan
from sklearn.metrics import roc_curve
def ams_scan(y, y_prob, weights, label):
    """Scan the AMS over all probability cuts and report the best one.

    The weighted ROC curve gives, for every threshold, the fraction of signal
    and background passing the cut. These fractions are multiplied by the
    total yields (globals ``sigall`` and ``backall``) before being passed to
    ``ams``.

    Prints the maximum AMS and the threshold where it occurs, prefixed by
    ``label``, and returns ``(thresholds, ams_values)``.
    """
    false_pos_rate, true_pos_rate, cuts = roc_curve(y, y_prob, sample_weight=weights)
    significance = ams(true_pos_rate * sigall, false_pos_rate * backall)
    best = np.argmax(significance)
    print("{}: Maximum AMS {:.3f} for pcut {:.3f}".format(label, significance.max(), cuts[best]))
    return cuts, significance

In [None]:
# AMS as a function of the probability cut, for both samples; only the region
# of high cut values is shown
cuts_train, ams_train = ams_scan(y_train, y_train_prob_keras, weight_train, "Train")
plt.plot(cuts_train, ams_train, label="Train")
cuts_test, ams_test = ams_scan(y_test, y_test_prob_keras, weight_test, "Test")
plt.plot(cuts_test, ams_test, label="Test")
plt.xlim(0.8, 1.)
plt.legend()