In [1]:
from dataloader import load_data
import torch
import torch.nn as nn
import utils

### Let's open the training and validation files containing examples for top quarks (signal) and QCD jets (background)

In [2]:
input_folder = '/pscratch/sd/v/vmikuni/datasets'
train_data = load_data('top',input_folder,batch=128,dataset_type='train',num_evt = 100_000)
val_data = load_data('top',input_folder,batch=128,dataset_type='val')

In [3]:
print (f"Loading {len(train_data)} batches of events for training and {len(val_data)} for validation")

Loading 781 batches of events for training and 3148 for validation


### We Now need to create a model that will take the data as input and predict a label for each data entry. Let's create a config file with the network parameters

In [4]:
config = {
    'num_layers': 2,
    'hidden_dim': 64,
}

In [5]:
class Attention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, x, mask):
        B, L, C = x.shape
        
        q = self.q(x)*mask
        k = self.k(x)*mask
        v = self.v(x)*mask

        attn = (q @ k.transpose(-2, -1)) #Matrix multiplication: (B, L, C) x (B, C, L) = (B, L, L) shape
        attn = attn.softmax(dim=-1) #Normalization
        x = (attn @ v) # Matrix multiplication: (B, L, L) x (B, L, C) = (B, L, C)

        return x*mask

In [6]:
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, config, num_classes=2):
        super().__init__()
        self.input_layer = nn.Linear(input_dim, config["hidden_dim"])
        
        layers = []
        for _ in range(config["num_layers"]):
            layers.append(Attention(config["hidden_dim"]))
        self.hidden_layers = nn.ModuleList(layers)

        self.output_layer = nn.Linear(config["hidden_dim"], num_classes)

    def forward(self, inputs):
        zero_pad_mask = (inputs[:, :, 2] != 0).unsqueeze(-1).float()
        x = self.input_layer(inputs) * zero_pad_mask
        for layer in self.hidden_layers:
            x = layer(x,zero_pad_mask)
        x = x.mean(1)  # aggregate over particles
        return self.output_layer(x)

In [7]:
model = SimpleTransformer(input_dim=4,config=config) #remember the inputs are delta eta, delta phi, log(pT), log(E)

### Now we are going to create the training class that will train the model, but first, let's set up the learning rate and the optimizer

In [8]:
optimizer = torch.optim.Adam
lr = 5e-4
epochs = 100
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve

In [9]:
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

### Let's train the model!

In [10]:
trainer.train(epochs)

Epoch 1: train loss=0.5040, validation loss=0.4716
Epoch 2: train loss=0.4326, validation loss=0.4017
Epoch 3: train loss=0.4013, validation loss=0.4350
Epoch 4: train loss=0.3907, validation loss=0.3786
Epoch 5: train loss=0.3861, validation loss=0.3783
Epoch 6: train loss=0.3806, validation loss=0.3761
Epoch 7: train loss=0.3759, validation loss=0.3762
Epoch 8: train loss=0.3732, validation loss=0.3622
Epoch 9: train loss=0.3681, validation loss=0.3753
Epoch 10: train loss=0.3669, validation loss=0.3672
Epoch 11: train loss=0.3662, validation loss=0.3643
Epoch 12: train loss=0.3656, validation loss=0.3565
Epoch 13: train loss=0.3635, validation loss=0.3577
Epoch 14: train loss=0.3607, validation loss=0.3628
Epoch 15: train loss=0.3595, validation loss=0.3741
Epoch 16: train loss=0.3587, validation loss=0.3536
Epoch 17: train loss=0.3571, validation loss=0.3560
Epoch 18: train loss=0.3581, validation loss=0.3597
Epoch 19: train loss=0.3592, validation loss=0.3697
Epoch 20: train loss=

### Now let's evaluate the model

In [11]:
test_data = load_data('top',input_folder,batch=128,dataset_type='test')
predictions, labels = trainer.evaluate(test_data)

In [12]:
utils.print_metrics(predictions,labels)

AUC: 0.9195

ACC: 0.8705

Signal class 1 vs Background class 0:
Class 1 effS at 0.30001682568589416 1.0/effB = 28.1074759849645
Class 1 effS at 0.5000049487311453 1.0/effB = 16.49746690635725


<table>
<tr>
<td style="vertical-align: top; padding-right: 20px;">

### Wait, why is it worse than the DeepSets model?
A: Although attention is essential to capture correlations, the standard Transformer Architecture we use also combines additional ingredients such as linear transformations, normalization, and skip connections. These additional operations make the architecture more expressive and more stable.

</td>
<td>
<img src="transformer.webp" alt="Transformer Block" width="300">
</td>
</tr>
</table>


In [13]:
class TransformerBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.att = Attention(dim)
        self.proj1 = nn.Linear(dim, dim)
        self.proj2 = nn.Linear(dim, dim)
        self.activation = nn.GELU()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x, mask):
        x = x + self.att(self.norm1(x),mask) # Attention on normalized inputs is added to the inputs
        x = self.activation(self.proj1(x))*mask #Add a linear layer + non-linear activation        
        x = x + self.proj2(self.norm2(x))*mask #Add another linear layer on normalized inputs
        return x

In [14]:
class Transformer(nn.Module):
    def __init__(self, input_dim, config, num_classes=2):
        super().__init__()
        self.input_layer = nn.Linear(input_dim, config["hidden_dim"])
        
        layers = []
        for _ in range(config["num_layers"]):
            layers.append(TransformerBlock(config["hidden_dim"]))
        self.hidden_layers = nn.ModuleList(layers)

        self.output_layer = nn.Linear(config["hidden_dim"], num_classes)

    def forward(self, inputs):
        zero_pad_mask = (inputs[:, :, 2] != 0).unsqueeze(-1).float()
        x = self.input_layer(inputs) * zero_pad_mask
        for layer in self.hidden_layers:
            x = layer(x,zero_pad_mask)
        x = x.mean(1)  # aggregate over particles
        return self.output_layer(x)

In [15]:
model = Transformer(input_dim=4,config=config) 
optimizer = torch.optim.Adam
lr = 5e-4
epochs = 100
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

In [16]:
trainer.train(epochs)

Epoch 1: train loss=0.3955, validation loss=0.2966
Epoch 2: train loss=0.2931, validation loss=0.2758
Epoch 3: train loss=0.2770, validation loss=0.2835
Epoch 4: train loss=0.2589, validation loss=0.2429
Epoch 5: train loss=0.2483, validation loss=0.2440
Epoch 6: train loss=0.2392, validation loss=0.2304
Epoch 7: train loss=0.2311, validation loss=0.2458
Epoch 8: train loss=0.2279, validation loss=0.2371
Epoch 9: train loss=0.2232, validation loss=0.2238
Epoch 10: train loss=0.2210, validation loss=0.2144
Epoch 11: train loss=0.2181, validation loss=0.2212
Epoch 12: train loss=0.2160, validation loss=0.2685
Epoch 13: train loss=0.2144, validation loss=0.2161
Epoch 14: train loss=0.2128, validation loss=0.2119
Epoch 15: train loss=0.2093, validation loss=0.2086
Epoch 16: train loss=0.2085, validation loss=0.2221
Epoch 17: train loss=0.2055, validation loss=0.2133
Epoch 18: train loss=0.2047, validation loss=0.2090
Epoch 19: train loss=0.2026, validation loss=0.2021
Epoch 20: train loss=

In [17]:
predictions, labels = trainer.evaluate(test_data)

In [18]:
utils.print_metrics(predictions,labels)

AUC: 0.9758

ACC: 0.9228

Signal class 1 vs Background class 0:
Class 1 effS at 0.30002127985984983 1.0/effB = 337.623745819398
Class 1 effS at 0.5000371160346219 1.0/effB = 117.58823529411764


### Try changing the hyperparameters of the model to see if you can improve the results!