# Transformers Tutorial: Part II

In [1]:
from dataloader import load_data
import torch
import torch.nn as nn
import utils

### Let's open the training and validation files containing examples for top quarks (signal) and QCD jets (background)

In [2]:
input_folder = '/global/cfs/cdirs/trn016/transformer'
train_data = load_data('top',input_folder,batch=256,dataset_type='train',num_evt = 100_000)
val_data = load_data('top',input_folder,batch=256,dataset_type='val')

In [3]:
print (f"Loading {len(train_data)} batches of events for training and {len(val_data)} for validation")

Loading 390 batches of events for training and 1574 for validation


### We Now need to create a model that will take the data as input and predict a label for each data entry. Let's create a config file with the network parameters

In [4]:
config = {
    'num_layers': 2,
    'hidden_dim': 64,
}

In [5]:
class Attention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, x, mask):
        B, L, C = x.shape
        
        q = self.q(x)*mask
        k = self.k(x)*mask
        v = self.v(x)*mask

        attn = (q @ k.transpose(-2, -1)) #Matrix multiplication: (B, L, C) x (B, C, L) = (B, L, L) shape
        attn = attn.softmax(dim=-1) #Normalization
        x = (attn @ v) # Matrix multiplication: (B, L, L) x (B, L, C) = (B, L, C)

        return x*mask

In [6]:
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, config, num_classes=2):
        super().__init__()
        self.input_layer = nn.Linear(input_dim, config["hidden_dim"])
        
        layers = []
        for _ in range(config["num_layers"]):
            layers.append(Attention(config["hidden_dim"]))
        self.hidden_layers = nn.ModuleList(layers)

        self.output_layer = nn.Linear(config["hidden_dim"], num_classes)

    def forward(self, inputs):
        zero_pad_mask = (inputs[:, :, 2] != 0).unsqueeze(-1).float()
        x = self.input_layer(inputs) * zero_pad_mask
        for layer in self.hidden_layers:
            x = layer(x,zero_pad_mask)
        x = x.mean(1)  # aggregate over particles
        return self.output_layer(x)

In [7]:
model = SimpleTransformer(input_dim=4,config=config) #remember the inputs are delta eta, delta phi, log(pT), log(E)

### Now we are going to create the training class that will train the model, but first, let's set up the learning rate and the optimizer

In [8]:
optimizer = torch.optim.Adam
lr = 5e-4
epochs = 100
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve

In [9]:
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

### Let's train the model!

In [10]:
trainer.train(epochs)

Epoch 1: train loss=0.5360, validation loss=0.4765
Epoch 2: train loss=0.4676, validation loss=0.4667
Epoch 3: train loss=0.4336, validation loss=0.4090
Epoch 4: train loss=0.4068, validation loss=0.4209
Epoch 5: train loss=0.3908, validation loss=0.3808
Epoch 6: train loss=0.3879, validation loss=0.4014
Epoch 7: train loss=0.3893, validation loss=0.3859
Epoch 8: train loss=0.3823, validation loss=0.3878
Epoch 9: train loss=0.3828, validation loss=0.3865
Epoch 10: train loss=0.3766, validation loss=0.3895
Epoch 11: train loss=0.3727, validation loss=0.3722
Epoch 12: train loss=0.3710, validation loss=0.3659
Epoch 13: train loss=0.3691, validation loss=0.3668
Epoch 14: train loss=0.3654, validation loss=0.3804
Epoch 15: train loss=0.3650, validation loss=0.3642
Epoch 16: train loss=0.3655, validation loss=0.3646
Epoch 17: train loss=0.3650, validation loss=0.3720
Epoch 18: train loss=0.3668, validation loss=0.3596
Epoch 19: train loss=0.3658, validation loss=0.3715
Epoch 20: train loss=

### Now let's evaluate the model

In [11]:
test_data = load_data('top',input_folder,batch=128,dataset_type='test')
predictions, labels = trainer.evaluate(test_data)

In [12]:
utils.print_metrics(predictions,labels)

AUC: 0.9200

ACC: 0.8716

Signal class 1 vs Background class 0:
Class 1 effS at 0.30002820790712176 1.0/effB = 26.440151911995812
Class 1 effS at 0.5000222694003593 1.0/effB = 16.520497504295882


<table>
<tr>
<td style="vertical-align: top; padding-right: 20px;">

### Wait, why is it just as good as the DeepSets model?
A: Although attention is essential to capture correlations, the standard Transformer Architecture we use also combines additional ingredients such as linear transformations, normalization, and skip connections. These additional operations make the architecture more expressive and more stable.

</td>
<td>
<img src="transformer.webp" alt="Transformer Block" width="300">
</td>
</tr>
</table>


In [13]:
class TransformerBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.att = Attention(dim)
        self.proj1 = nn.Linear(dim, dim)
        self.proj2 = nn.Linear(dim, dim)
        self.activation = nn.GELU()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x, mask):
        x = x + self.att(self.norm1(x),mask) # Attention on normalized inputs is added to the inputs
        x = self.activation(self.proj1(x))*mask #Add a linear layer + non-linear activation        
        x = x + self.proj2(self.norm2(x))*mask #Add another linear layer on normalized inputs
        return x

In [14]:
class Transformer(nn.Module):
    def __init__(self, input_dim, config, num_classes=2):
        super().__init__()
        self.input_layer = nn.Linear(input_dim, config["hidden_dim"])
        
        layers = []
        for _ in range(config["num_layers"]):
            layers.append(TransformerBlock(config["hidden_dim"]))
        self.hidden_layers = nn.ModuleList(layers)

        self.output_layer = nn.Linear(config["hidden_dim"], num_classes)

    def forward(self, inputs):
        zero_pad_mask = (inputs[:, :, 2] != 0).unsqueeze(-1).float()
        x = self.input_layer(inputs) * zero_pad_mask
        for layer in self.hidden_layers:
            x = layer(x,zero_pad_mask)
        x = x.mean(1)  # aggregate over particles
        return self.output_layer(x)

In [15]:
model = Transformer(input_dim=4,config=config) 
optimizer = torch.optim.Adam
lr = 5e-4
epochs = 100
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

In [16]:
trainer.train(epochs)

Epoch 1: train loss=0.4656, validation loss=0.3403
Epoch 2: train loss=0.3122, validation loss=0.2855
Epoch 3: train loss=0.2916, validation loss=0.3015
Epoch 4: train loss=0.2788, validation loss=0.2961
Epoch 5: train loss=0.2652, validation loss=0.2552
Epoch 6: train loss=0.2527, validation loss=0.2430
Epoch 7: train loss=0.2471, validation loss=0.2506
Epoch 8: train loss=0.2395, validation loss=0.2334
Epoch 9: train loss=0.2334, validation loss=0.2260
Epoch 10: train loss=0.2298, validation loss=0.2256
Epoch 11: train loss=0.2253, validation loss=0.2235
Epoch 12: train loss=0.2195, validation loss=0.2122
Epoch 13: train loss=0.2202, validation loss=0.2139
Epoch 14: train loss=0.2162, validation loss=0.2355
Epoch 15: train loss=0.2131, validation loss=0.2173
Epoch 16: train loss=0.2134, validation loss=0.2339
Epoch 17: train loss=0.2108, validation loss=0.2257
Epoch 18: train loss=0.2088, validation loss=0.2120
Epoch 19: train loss=0.2105, validation loss=0.2072
Epoch 20: train loss=

In [17]:
predictions, labels = trainer.evaluate(test_data)

In [18]:
utils.print_metrics(predictions,labels)

AUC: 0.9755

ACC: 0.9222

Signal class 1 vs Background class 0:
Class 1 effS at 0.3000024743288383 1.0/effB = 363.1169064748201
Class 1 effS at 0.5001113447977236 1.0/effB = 123.70894607843137


### Try changing the hyperparameters of the model to see if you can improve the results!