In [None]:
%run clone_git_on_colab.py

In [None]:
from course_settings import set_tf_nthreads
set_tf_nthreads(4)

# Deep sets and graph networks

The ML models we have looked at so far make the assumption that we have a fixed-dimensional vector of input features. In reality that might not always be the case. Some examples:

* Sequences (text, audio, video)
* Point clouds (e.g. points in 3D space)
* Lists of objects (e.g. particles in a collision)
* Graphs with different numbers of connections for each node

For sequences, one approach is recurrent neural networks (RNNs), which maintain a state that gets updated as the input is processed iteratively. However, these still need a defined ordering of the inputs and they have certain disadvantages (most prominently, difficulty modelling "long-range" correlations between inputs and difficulty parallelizing, since they are sequential in nature).

Another approach is to use models that apply **permutation invariant** transformations to the inputs. Both deep sets and graph networks make use of this.

## Deep sets

The simplest approach for a permutation invariant transformation is a **per-point transformation** ($\phi$) followed by a **permutation invariant aggregation** (typically the sum/mean or min/max), whose output can then be transformed ($\rho$) by any means, e.g. another MLP.

![](figures/deep_set_transformation.png)

See [arXiv:1703.06114](https://arxiv.org/abs/1703.06114) for a detailed discussion.


### Application to jets in Higgs dataset

Remember the missing values in the dataset for the [HiggsChallenge](HiggsChallenge.ipynb)? Those occurred since we had a non-fixed length list of jets in each event (0, 1 or 2). Maybe we can embed the jets into a fixed length vector using a permutation invariant transformation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D

In [None]:
# Load the ATLAS Higgs challenge dataset (gzip-compressed CSV).
df = pd.read_csv('data/atlas-higgs-challenge-2014-v2.csv.gz')
# Total signal / background sum of weights. These are computed on the FULL
# dataset, i.e. before the subsampling below, because they are used later to
# turn efficiencies (tpr / fpr) into expected event counts.
n_sig_tot = df.loc[df.Label == "s", "Weight"].sum()
n_bkg_tot = df.loc[df.Label == "b", "Weight"].sum()
# comment this out if you want to run on the full dataset
# (the fixed random_state makes "Restart & Run All" pick the same 10% subsample every time)
df = df.sample(frac=0.1, random_state=42)

First, we separate the jet features and other features:

In [None]:
# Names of the per-jet columns, grouped jet by jet (leading first) with the
# features in the order pt, eta, phi. The ordering matters: the columns are
# later reshaped into (nevents, 2 jets, 3 features).
jet_cols = [
    f"PRI_{jet}_{feature}"
    for jet in ("jet_leading", "jet_subleading")
    for feature in ("pt", "eta", "phi")
]
jet_cols

We also exclude variables that are derived from the jets:

In [None]:
# Derived quantities that are computed from the jets; they are left out so the
# jet information enters the model only through the jet inputs.
excluded_cols = [
    'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet',
    'DER_prodeta_jet_jet',
    'DER_lep_eta_centrality',
]

In [None]:
# All primitive (PRI*) and derived (DER*) feature columns that are neither
# per-jet columns nor explicitly excluded.
other_cols = [
    col
    for col in df.columns
    if col.startswith(("PRI", "DER"))
    and col not in jet_cols
    and col not in excluded_cols
]
other_cols

We will make the jet features a 3-D array of shape `(nevents, max_njets, n_jet_features)`

In [None]:
# The 6 jet columns of each event (jet-major order, see jet_cols) are folded
# into a (2 jets, 3 features) block -> overall shape (nevents, 2, 3).
X_jet = np.reshape(df[jet_cols].to_numpy(), (-1, 2, 3))
X_jet

The rest of the features just stay a 2-D array as usual:

In [None]:
X_other = df[other_cols].to_numpy()
X_other

We still need to replace missing values (encoded as `-999`), which can occur for the quantity `DER_mass_MMC`, by 0:

In [None]:
X_other[X_other == -999] = 0

In [None]:
y = (df.Label == "s").to_numpy()
weight = df['Weight'].to_numpy()

In [None]:
# Split all four arrays with the same random permutation so that rows stay
# aligned across jets, other features, labels and weights.
# The fixed random_state makes the split (and everything fitted on it)
# reproducible under "Restart & Run All".
(
    X_jet_train, X_jet_test,
    X_other_train, X_other_test,
    y_train, y_test,
    weight_train, weight_test,
) = train_test_split(X_jet, X_other, y, weight, random_state=42)

Now, let's scale the features. For the jets we have to be a bit careful only to consider non-missing values in the scaling. Also the scikit-learn scalers can only deal with 2D arrays - so let's define a custom scaler:

In [None]:
class JetScaler:
    """Scale per-jet features stored in an array of shape ``(..., n_jet_features)``.

    Entries equal to ``missing_value`` mark absent jets. They are ignored when
    fitting the scaler and are set to 0 in the transformed output.

    The wrapped scaler only ever sees 2D input: all leading axes are flattened
    into one, and the original shape is restored after transforming.

    Parameters
    ----------
    scaler : object, optional
        Scaler with scikit-learn style ``fit(X)`` / ``transform(X)`` methods.
        It must ignore NaN in ``fit`` and pass NaN through in ``transform``
        (this class relies on that behaviour of ``RobustScaler``).
        Defaults to a fresh ``RobustScaler()``.
    missing_value : float, optional
        Sentinel marking missing entries. Defaults to ``-999``.
    """

    def __init__(self, scaler=None, missing_value=-999):
        self.scaler = RobustScaler() if scaler is None else scaler
        self.missing_value = missing_value

    def _flatten_with_nan(self, X):
        """Return ``(X_2d, original_shape)``: a float copy of ``X`` with missing values as NaN."""
        X = np.array(X)  # copy, so the caller's array is never modified
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(float)  # NaN cannot be stored in integer arrays
        X[X == self.missing_value] = np.nan
        return X.reshape(-1, X.shape[-1]), X.shape

    def fit(self, X):
        """Fit the wrapped scaler on the non-missing entries of ``X``; returns ``self``."""
        X_2d, _ = self._flatten_with_nan(X)
        self.scaler.fit(X_2d)
        return self

    def transform(self, X):
        """Scale ``X`` and return a new array of the same shape with missing entries set to 0."""
        X_2d, orig_shape = self._flatten_with_nan(X)
        X_2d = self.scaler.transform(X_2d)
        # `nan` must be passed by keyword: the second positional argument of
        # np.nan_to_num is `copy`, not the fill value.
        X_2d = np.nan_to_num(X_2d, nan=0.0)
        return X_2d.reshape(orig_shape)

    def fit_transform(self, X):
        """Fit on ``X`` and return the transformed ``X``."""
        return self.fit(X).transform(X)

In [None]:
jet_scaler = JetScaler()
jet_scaler.fit(X_jet_train)

In [None]:
X_jet_train_scaled = jet_scaler.transform(X_jet_train)

In [None]:
other_scaler = RobustScaler()
other_scaler.fit(X_other_train)

In [None]:
X_other_train_scaled = other_scaler.transform(X_other_train)

Also we again balance the weights to have the same sum of weights for signal and background and average weight 1

In [None]:
# Per-class normalisation factors: the inverse of the summed training weight of
# each class, so that after rescaling both classes carry the same total weight.
class_weight_signal = 1 / np.sum(weight_train[y_train == 1])
class_weight_background = 1 / np.sum(weight_train[y_train == 0])

In [None]:
def transform_weight(weight, y):
    """Return class-balanced copies of the event weights with mean 1.

    Background (``y == 0``) and signal (``y == 1``) weights are multiplied by
    the notebook-level factors ``class_weight_background`` and
    ``class_weight_signal``; the result is then divided by its mean.
    The input array is not modified.
    """
    balanced = np.array(weight)  # work on a copy
    is_signal = y == 1
    is_background = y == 0
    balanced[is_background] *= class_weight_background
    balanced[is_signal] *= class_weight_signal
    mean_weight = balanced.mean()
    return balanced / mean_weight

In [None]:
weight_train_scaled = transform_weight(weight_train, y_train)

Now the model - we use the functional API of keras

**Note:** When the keras `Dense` layer is applied to a 3D array, it acts on the last axis, i.e. it is applied independently (with shared weights) to each element along the second dimension. This is precisely what we want for our per-point transformation $\phi$.

In [None]:
def make_model():
    """Build the two-input deep-set classifier.

    Inputs
    ------
    "jets"  : shape (2, 3) per event, the per-jet features.
    "other" : shape (X_other_train.shape[1],) per event, all remaining features.

    The jets pass through three shared Dense layers (the per-point
    transformation phi) and are averaged over the jet axis. The other features
    pass through their own three Dense layers. Both are concatenated and fed
    through one hidden layer and a sigmoid output.

    Returns
    -------
    tf.keras.Model (not compiled)
    """
    n_hidden_layers = 3
    n_units = 100

    input_jets = Input(shape=(2, 3), name="jets")
    input_other = Input(shape=(X_other_train.shape[1],), name="other")

    # phi: Dense acts on the last axis, so the same weights are applied to each jet
    jets = input_jets
    for _ in range(n_hidden_layers):
        jets = Dense(n_units, activation="relu")(jets)
    # Permutation invariant aggregation: mean over the jet axis.
    # NOTE(review): no mask is passed, so both jet slots enter the mean,
    # including slots of missing jets (which are all-zero after scaling).
    jets = GlobalAveragePooling1D()(jets)

    # plain MLP for the per-event features
    other = input_other
    for _ in range(n_hidden_layers):
        other = Dense(n_units, activation="relu")(other)

    # rho: combine the jet embedding with the other features
    merged = tf.keras.layers.concatenate([jets, other])
    merged = Dense(n_units, activation="relu")(merged)
    prediction = Dense(1, activation="sigmoid")(merged)

    return tf.keras.Model(inputs=[input_jets, input_other], outputs=[prediction])

model = make_model()

In [None]:
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
model.compile(loss="binary_crossentropy", optimizer="Adam")

In [None]:
history = model.fit(
    {"jets": X_jet_train_scaled, "other": X_other_train_scaled},
    y_train,
    sample_weight=weight_train_scaled,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
X_jet_test_scaled = jet_scaler.transform(X_jet_test)
X_other_test_scaled = other_scaler.transform(X_other_test)
weight_test_scaled = transform_weight(weight_test, y_test)

In [None]:
y_pred_train = model.predict({"jets": X_jet_train_scaled, "other": X_other_train_scaled}, verbose=True)[:, 0]
y_pred_test = model.predict({"jets": X_jet_test_scaled, "other": X_other_test_scaled}, verbose=True)[:, 0]

In [None]:
from sklearn.metrics import roc_curve

In [None]:
from ams import ams

In [None]:
ams??

In [None]:
def ams_scan(y, y_prob, weights, label):
    """Scan the AMS over all thresholds of the weighted ROC curve.

    The weighted true/false positive rates are scaled by the notebook-level
    totals ``n_sig_tot`` / ``n_bkg_tot`` and passed to ``ams``. The maximum AMS
    and the threshold at which it occurs are printed, prefixed by ``label``.

    Returns
    -------
    (thresholds, ams_values) : arrays as returned by ``roc_curve`` / ``ams``
    """
    fpr, tpr, thr = roc_curve(y, y_prob, sample_weight=weights)
    n_sig = tpr * n_sig_tot
    n_bkg = fpr * n_bkg_tot
    ams_vals = ams(n_sig, n_bkg)
    best_index = np.argmax(ams_vals)
    summary = "{}: Maximum AMS {:.3f} for pcut {:.3f}".format(label, ams_vals.max(), thr[best_index])
    print(summary)
    return thr, ams_vals

In [None]:
# AMS versus the cut on the classifier output, for the training and test sets
# (the original, unbalanced event weights are used here).
for sample_label, y_true, y_prob, sample_weight in [
    ("Train", y_train, y_pred_train, weight_train),
    ("Test", y_test, y_pred_test, weight_test),
]:
    thresholds, ams_values = ams_scan(y_true, y_prob, sample_weight, sample_label)
    plt.plot(thresholds, ams_values, label=sample_label)
# zoom into the high-purity region
plt.xlim(0.8, 1.)
plt.legend()