# Transformers Tutorial: Part III

In [1]:
from dataloader import load_data
import torch
import torch.nn as nn
import utils
from network import PET2

### Let's open the training and validation files containing examples for top quarks (signal) and QCD jets (background)

In [2]:
input_folder = '/global/cfs/cdirs/trn016/transformer'
train_data = load_data('top',input_folder,batch=256,dataset_type='train',num_evt = 100_000)
val_data = load_data('top',input_folder,batch=256,dataset_type='val')

In [3]:
print (f"Loading {len(train_data)} batches of events for training and {len(val_data)} for validation")

Loading 390 batches of events for training and 1574 for validation


### Let's now load the PET Model

In [4]:
config = {
    'input_dim':4,
    'hidden_size': 128,
    'num_transformers': 8, #number of transformer blocks used
    'num_transformers_head':2, #number of transformer blocks used in the task-specific block
    'num_heads':8, #number of heads for multi-head attention
    'K':10, #number of neighbors considered for the kNN
}

In [5]:
model = PET2(**config) #remember the inputs are delta eta, delta phi, log(pT), log(E)

### Now we are going to create the training class that will train the model, but first, let's set up the learning rate and the optimizer

In [6]:
optimizer = torch.optim.Adam
lr = 5e-4
epochs = 10
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve

In [7]:
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

### Let's train the model!

In [8]:
trainer.train(epochs)

Epoch 1: train loss=0.2838, validation loss=0.2392
Epoch 2: train loss=0.2291, validation loss=0.2176
Epoch 3: train loss=0.2098, validation loss=0.1964
Epoch 4: train loss=0.1964, validation loss=0.1890
Epoch 5: train loss=0.1908, validation loss=0.1853
Epoch 6: train loss=0.1857, validation loss=0.1848
Epoch 7: train loss=0.1834, validation loss=0.1899
Epoch 8: train loss=0.1831, validation loss=0.1856
Epoch 9: train loss=0.1797, validation loss=0.1865
Epoch 10: train loss=0.1781, validation loss=0.1794
Training complete. Total time: 700.1s.


### Now let's evaluate the model

In [9]:
test_data = load_data('top',input_folder,batch=128,dataset_type='test')
predictions, labels = trainer.evaluate(test_data)

In [10]:
utils.print_metrics(predictions,labels)

AUC: 0.9798

ACC: 0.9295

Signal class 1 vs Background class 0:
Class 1 effS at 0.30000989756025137 1.0/effB = 532.712401055409
Class 1 effS at 0.500049487801257 1.0/effB = 166.30807248764415


### Now let's load the pre-trained weights

In [11]:
utils.restore_checkpoint(model,input_folder,'best_model_pretrain_s.pt')
#These messages are all fine and related to model layers that are not relevant for classiciation tasks

Skipping MPFourier.freqs: shape mismatch (checkpoint: torch.Size([128]), model: missing)
Skipping MPFourier.phases: shape mismatch (checkpoint: torch.Size([128]), model: missing)
Skipping time_embed.fc1.weight: shape mismatch (checkpoint: torch.Size([256, 128]), model: missing)
Skipping time_embed.fc2.weight: shape mismatch (checkpoint: torch.Size([128, 256]), model: missing)
Skipping time_embed.norm.alpha: shape mismatch (checkpoint: torch.Size([1]), model: missing)
Skipping time_embed.norm.weight: shape mismatch (checkpoint: torch.Size([256]), model: missing)
Skipping add_embed.0.fc1.weight: shape mismatch (checkpoint: torch.Size([256, 4]), model: missing)
Skipping add_embed.0.fc2.weight: shape mismatch (checkpoint: torch.Size([128, 256]), model: missing)
Skipping add_embed.0.norm.alpha: shape mismatch (checkpoint: torch.Size([1]), model: missing)
Skipping add_embed.0.norm.weight: shape mismatch (checkpoint: torch.Size([256]), model: missing)
Skipping pid_embed.0.weight: shape mismat

In [12]:
optimizer = torch.optim.Adam
lr = 5e-5 #Notice the learning rate is much smaller than before
epochs = 10
patience = 10 # Number of consecutive epochs to stop the training if the validation loss does not improve
trainer = utils.Trainer(train_data,val_data,model,lr,optimizer)

In [13]:
trainer.train(epochs)

Epoch 1: train loss=0.1902, validation loss=0.1661
Epoch 2: train loss=0.1626, validation loss=0.1645
Epoch 3: train loss=0.1587, validation loss=0.1676
Epoch 4: train loss=0.1544, validation loss=0.1635
Epoch 5: train loss=0.1501, validation loss=0.1655
Epoch 6: train loss=0.1464, validation loss=0.1637
Epoch 7: train loss=0.1418, validation loss=0.1675
Epoch 8: train loss=0.1369, validation loss=0.1691
Epoch 9: train loss=0.1298, validation loss=0.1751
Epoch 10: train loss=0.1220, validation loss=0.1823
Training complete. Total time: 706.3s.


### Because the pre-trained model already starts from useful weights, they are quicker to overtrain

In [14]:
predictions, labels = trainer.evaluate(test_data)

In [15]:
utils.print_metrics(predictions,labels)

AUC: 0.9834

ACC: 0.9354

Signal class 1 vs Background class 0:
Class 1 effS at 0.3000148463403771 1.0/effB = 1030.091836734694
Class 1 effS at 0.50003464146088 1.0/effB = 275.0653950953678


### Try changing the hyperparameters of the model to see if you can improve the results!