In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 

import torch
from torch.utils.data import Dataset,DataLoader
from torch import nn

from torchvision import transforms

from tqdm import tqdm

In [3]:
particle_df_path = '../data/particle_df.csv'
particle_preproc_df_path = '../data/particle_df_preprocessed.csv'

In [4]:
par_pre_df = pd.read_csv(particle_preproc_df_path)
par_df = pd.read_csv(particle_df_path)
par_df.head()

Unnamed: 0,eventID,jetID,particleType,particleVx,particleVy,particleVz,particlePx,particlePy,particlePz,particleE,particlePolarPx,particlePolarPy,particlePolarPz,particlePolarE,particlePhi,particleTheta
0,0,0,0,0.0,0.0,0.0,-115.595071,5.513218,107.093643,157.675996,115.726471,0.82763,3.093935,0.2347607,3.093935,0.824122
1,0,0,0,0.0,0.0,0.0,-83.072377,4.831796,75.798599,112.561324,83.212776,0.816948,3.083494,0.5078805,3.083494,0.831991
2,0,0,-211,-0.981025,1.422285,-33.456345,-11.168506,-8.774579,9.043395,16.838385,14.203125,0.600055,-2.475661,0.1395264,-2.475661,1.003814
3,0,0,130,0.073932,0.089866,-2.399344,-8.233158,-1.087632,6.64721,10.637351,8.304688,0.732994,-3.010249,-1.192093e-07,-3.010249,0.895801
4,0,0,-211,0.073905,0.089409,-2.399101,-8.048296,0.478376,6.0979,10.109785,8.0625,0.698202,3.082224,0.1395264,3.082224,0.923257


In [5]:
class ParticleDataset(Dataset):
    # The ParticleDataset class inherits the Dataset class and implements the __init__, __len__, and __getitem__ methods

    def __init__(self, path, transform=None):
        # Initializing the ParticleDataset object.
        # "path" is the path to the csv file containing the particle data.
        # "transform" is an optional argument that specifies the transformations to be applied to the data.

        # Read the csv file into a Pandas DataFrame.
        self.x = pd.read_csv(path)

        # Put the coordinates eta and phi as the first two features
        self.x = self.x.reindex(
            columns=["particlePolarPy", "particlePhi"]
            + [
                col
                for col in self.x.columns
                if col not in ["particlePolarPy", "particlePhi"]
            ]
        )

        # Store the "transform" argument.
        self.transform = transform

    def __len__(self):
        """
        Returns the number of particles in the dataset.
        """
        # Return the number of rows in the DataFrame (i.e., the number of particles).
        return self.x.shape[0]

    def __getitem__(self, idx):
        """
        Returns the particles with jetID = idx.
        """
        # Get the rows in the DataFrame that have a "jetID" column equal to "idx".
        x = self.x[self.x.jetID==idx].to_numpy()
        
        # If "transform" was specified, apply it to the data.
        if self.transform:
            x = self.transform(x)
        
        # Return the transformed data.
        return x

In [6]:
# Create a Compose object that applies the "ToTensor" transformation.
train_transform = transforms.Compose([
    transforms.ToTensor(),
])


# Create a ParticleDataset object using the csv file located at "particle_df_path" and the "train_transform" transformations.
train_data = ParticleDataset(particle_df_path, train_transform)

# Access the first element in the dataset to get its shape.
train_data[0].shape

torch.Size([1, 23, 16])

## Testing the Dataset on the KNN function

In [7]:
def kNN(x,k=10):

    # expand the input tensor s.t. x_knn.shape = [B, n, n, d]
    x_knn = x.unsqueeze(1).expand(-1, x.shape[1], -1, -1)

    # calculate both delta_phi and delta_eta, with the transpose we get every pair
    delta_phieta = x_knn[:, :, :, :2] - x_knn[:, :, :, :2].transpose(1, 2)

    # calculate distances and sort them in ascending order, keep only the indeces
    _, indeces = torch.sqrt(torch.sum(delta_phieta**2, 3)**0.5).sort(dim=2, stable=True)

    # keep the indeces of k nearest neighbours and use them to sort and cut the initial tensor
    knn = indeces[:,:,:k]
    x_knn = torch.gather(x_knn, 2, knn.unsqueeze(-1).expand(-1, -1, -1, x_knn.shape[-1]))

    del delta_phieta, indeces, knn, _

    return x_knn

kNN(train_data[0]).shape

torch.Size([1, 23, 10, 16])

In [8]:
def custom_collate(batch):
    """
    A custom collate function that can handle different shape tensors.
    The default collate function provided by PyTorch's DataLoader assumes that all tensors in a batch have the same shape. 
    However, in our case, each "datum" is a set of particles that compose a jet and the number of particles composing a jet is not fixed. 
    Therefore, each tensor representing a jet has a different shape.

    To handle this scenario, we need to override the collate function to be able to stack the tensors into a batch. 
    This function first determines the maximum number of particles among all jets in the batch. 
    Then, it pads all tensors with zeros to make sure they have the same shape. 
    Finally, it stacks the tensors along the batch dimension to return the padded data and original lengths.

    """
    
    # Get the max number of particles among all the jets in the batch
    n_part_max = max(x.shape[1] for x in batch)

    # Pad all the tensors with zeros so they have the same shape
    data = []
    lengths = []
    for x in batch:
        n_part = x.shape[1]
        data.append(torch.cat([x, torch.zeros(1, n_part_max - n_part, 16)], dim=1))
        lengths.append(n_part)

    # Stack the tensors along the batch dimension
    data = torch.stack(data)

    # Return the padded data, original lengths, and target labels
    return data, lengths

In [9]:
batch_size       = 10
train_dataloader = DataLoader(train_data, batch_size=batch_size, collate_fn=custom_collate)

In [10]:
# loop over the dataloader to get the data in batches
i=0
for batch, original_length in train_dataloader:
    print(batch.shape, original_length)
    i+=1
    if i==2:
        break

torch.Size([10, 1, 62, 16]) [23, 41, 26, 20, 62, 35, 9, 4, 46, 48]
torch.Size([10, 1, 66, 16]) [41, 22, 2, 40, 21, 26, 25, 66, 28, 2]


In the forward pass of your model, you can use the original lengths to process the data correctly, for example, by masking out the padded zeros.

testing nested tensors

In [11]:
tensors = []

for i, original_shape in enumerate(original_length):
    # Slice the tensor along the third dimension to get the desired shape
    a = batch[i, :, :original_shape, :]
    tensors.append(a)

In [12]:
for t in tensors:
    print(t.shape)

torch.Size([1, 41, 16])
torch.Size([1, 22, 16])
torch.Size([1, 2, 16])
torch.Size([1, 40, 16])
torch.Size([1, 21, 16])
torch.Size([1, 26, 16])
torch.Size([1, 25, 16])
torch.Size([1, 66, 16])
torch.Size([1, 28, 16])
torch.Size([1, 2, 16])


In [13]:
nested = torch.nested.nested_tensor(tensors)

AttributeError: module 'torch' has no attribute 'nested'

In [None]:
nested[2].shape

torch.Size([1, 2, 16])

## testing the edge conv block functions

In [14]:
x = train_data[0]
x.shape

torch.Size([1, 23, 16])

In [15]:
def kNN(x, k=10):
    """input: single jet data
        output: tensor with shape [B, n, k, d] where d are the features of the knn particles"""
    # expand the input tensor s.t. x_knn.shape = [B, n, n, d]
    x_knn = x.unsqueeze(1).expand(-1, x.shape[1], -1, -1)

    # calculate both delta_phi and delta_eta
    delta_phieta = x_knn[:, :, :, :2] - x_knn[:, :, :, :2].transpose(1, 2)

    # calculate distances and sort them in ascending order, keep only the indeces
    _, indeces = torch.sqrt(torch.sum(delta_phieta**2, 3)**0.5).sort(dim=2, stable=True)

    # keep the indeces of k nearest neighbours and use them to sort and cut the initial tensor
    knn = indeces[:,:,:k]
    x_knn = torch.gather(x_knn, 2, knn.unsqueeze(-1).expand(-1, -1, -1, x_knn.shape[-1]))

    del delta_phieta, indeces, knn, _

    return x_knn 

In [16]:
x_knn = kNN(x)
x_knn.shape

x_knn_batch = x_knn[0]
x_knn_batch.shape

torch.Size([23, 10, 16])

In [17]:
def linear_aggregate(x):

    '''Applico ad un solo batch!! --> shape finale aspettata  [n,d] '''

    # Here we want to define the operation which applies the mlp,
    # i.e. the linear part, to each couple of n.n. and then
    # aggregates the results.
    # Expected output shape is [B, n, d] (aggregating we collapsed the k dimension)

    '''Concateniamo le Features di due nearest neighbours, 
        passiamo questo Array alla parte lineare, 
        ripetiamo l'operazione per tutte le coppie di particella iesima e suo nearest neighbours,
        facciamo max o softmax su queste
    '''
    # for b in range(x.shape[0]): # scorro sui batch
    for p in range(x.shape[0]): # fisso una particella
        p_feat   = x[p, :, :] # features dei nn di p

        p_feat   = x[b, p, 0, :].unsqueeze(0)
        knn_feat = x[b, p, :, :]
        pairs = torch.stack(p_feat, knn_feat, dim = 0)
        
        print(p_feat.shape)
        break
        # pairs = torch.stack([torch.stack([p_feat[0, :], p_feat[i, :]]) for i in range(0, x.shape[1])])
        difference = p_feat - p_feat[0,:]
        difference = difference[1:,:]

        # print("pairs of the particle ",p, "is",pairs.shape) #expected [k, 2, d]
        print("shape of the differences between p and its nn: (expected [k,d])")

        mlp_pair_list = []
        
        # ciclo per accedere alle coppie
        for i in range(pairs.shape[0]):
            y = (pairs[i, :, :])**2 # give to the MLP part a 2D tensor of shape [2,d]
            mlp_pair_list.append(y) # list of the i-esim particle with every its pairs
        
        print("len of pair list",len(mlp_pair_list)) # expected = k perchè sono il numero di coppie

        mlp_pair = torch.stack(mlp_pair_list) # aggregate all the pairs of the i-esim particle p
        print("shape of all pairs output of layers",mlp_pair.shape) # expected [k, 2, d]

    particles_per_jet = torch.stack(mlp_pair) # aggregate all the particles of the jet
    
    '''
    QUA CI SAREBBE IL SOFTMAX

    '''
    print(particles_per_jet.shape) # in this example (fixing the batch), we expect a shape [n,d]
    '''each particle have its features'''

    return None
    

In [18]:
linear_aggregate(x_knn_batch)

NameError: name 'b' is not defined

In [36]:
def linear_aggregate(x):

    '''Applico ad un solo batch!! --> shape finale aspettata  [n,d] '''

    # for b in range(x.shape[0]): # scorro sui batch
    
    edge_of_p  = [] # lista per le edges della particella

    for p in range(x.shape[0]): # fisso una particella
        
        # p_feat   = x[p, :, :] # features dei nn di p

        p_feat   = x[p, 0, :].unsqueeze(0).expand(x.shape[1], -1)

        #DIFFERENCES 

        knn_feat = x[p, :, :] - p_feat

        pairs = torch.concat([p_feat, knn_feat], dim=1)


        print("shape of the knn_feats between p and its nn: (expected [k,2*d])", pairs.shape)

        mlp_list = []
        
        # ciclo per accedere alla singola riga 
        # -> passo una singola riga alla volta al MLP e salvo in una lista

        for i in range(pairs.shape[0]):
            y = (pairs[i, :])**2 # give to the MLP part a 2D tensor of shape [1,d] 
                                    # -> output expected [1,channels]
            mlp_list.append(y) # list of the i-esim particle with every its pairs
        
        # print("len of pair list",len(mlp_list)) # expected = k perchè sono il numero di coppie
    
        mlp_result = torch.stack(mlp_list) # aggregate all the pairs of the i-esim particle p
        max, _ = torch.max(mlp_result, dim=0) # take the maximum row 

        ##### possiamo prendere anche avg o sum #####
        
        # print("shape of all pairs output of layers",mlp_result.shape) # expected [k, d] or [k, ch]
        # print("shape of all pairs output of layers",max.shape) # expected [1, d] or [1, ch]
        
        edge_of_p.append(max) # append the edges of the p particle

        del mlp_list
        
    print(len(edge_of_p), "expected n =",x.shape[0])

    edges = torch.stack(edge_of_p)
    print("shape of edges:", edges.shape, "expected [",x.shape[0],", ch]")
    
    return None
    

In [37]:
linear_aggregate(x_knn_batch)

shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) torch.Size([10, 32])
shape of the knn_feats between p and its nn: (expected [k,2*d]) t

In [None]:
# Tensor di shape [10,16]
tensor = torch.tensor([[1,2,3],[4,5,6]], dtype=torch.float32)

# Calcola la differenza tra la prima riga e tutte le altre righe
differences = tensor - tensor[0,:]

# Rimuovi la prima riga (che è uguale alla prima riga originale)
result = differences[1:,:]

print(result)
print(differences[1,:].shape)
print(tensor)


tensor([[3., 3., 3.]])
torch.Size([3])
tensor([[1., 2., 3.],
        [4., 5., 6.]])


In [None]:
# Tensor di shape [10,16]
tensor = torch.tensor([[1,2,3],[4,5,6]], dtype=torch.float32)
print(tensor.shape)
# Trova la riga con i valori massimi
max_index = tensor.argmax(dim=0)
max_row = tensor[max_index,:]
max, _ = torch.max(tensor, dim=0)
# Riduci la riga a [1,16]
result = max_row.unsqueeze(0)

print(result)
print(max_row)
print(max_index)
print("\n")
print(max.shape)


torch.Size([2, 3])
tensor([[[4., 5., 6.],
         [4., 5., 6.],
         [4., 5., 6.]]])
tensor([[4., 5., 6.],
        [4., 5., 6.],
        [4., 5., 6.]])
tensor([1, 1, 1])


torch.Size([3])


In [None]:
l