In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 

import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from torch import nn

from tqdm import tqdm

In [16]:
particle_df_path = '../data/particle_df.csv'

In [17]:
class ParticleDataset(Dataset):
    # The ParticleDataset class inherits the Dataset class and implements the __init__, __len__, and __getitem__ methods

    def __init__(self, path, transform=None):
        # Initializing the ParticleDataset object.
        # "path" is the path to the csv file containing the particle data.
        # "transform" is an optional argument that specifies the transformations to be applied to the data.
        
        # Read the csv file into a Pandas DataFrame.
        self.x = pd.read_csv(path)
        
        # Store the "transform" argument.
        self.transform = transform

    def __len__(self):
        """
        Returns the number of particles in the dataset.
        """
        # Return the number of rows in the DataFrame (i.e., the number of particles).
        return self.x.shape[0]

    def __getitem__(self, idx):
        """
        Returns the particles with jetID = idx.
        """
        # Get the rows in the DataFrame that have a "jetID" column equal to "idx".
        x = self.x[self.x.jetID==idx].to_numpy()
        
        # If "transform" was specified, apply it to the data.
        if self.transform:
            x = self.transform(x)
        
        # Return the transformed data.
        return x

In [18]:
# Create a Compose object that applies the "ToTensor" transformation.
train_transform = transforms.Compose([
    transforms.ToTensor(),
])


# Create a ParticleDataset object using the csv file located at "particle_df_path" and the "train_transform" transformations.
train_data = ParticleDataset(particle_df_path, train_transform)

# Access the first element in the dataset to get its shape.
train_data[0].shape

torch.Size([1, 23, 16])

In [19]:
def custom_collate(batch):
    """
    A custom collate function that can handle different shape tensors.
    The default collate function provided by PyTorch's DataLoader assumes that all tensors in a batch have the same shape. 
    However, in our case, each "datum" is a set of particles that compose a jet and the number of particles composing a jet is not fixed. 
    Therefore, each tensor representing a jet has a different shape.

    To handle this scenario, we need to override the collate function to be able to stack the tensors into a batch. 
    This function first determines the maximum number of particles among all jets in the batch. 
    Then, it pads all tensors with zeros to make sure they have the same shape. 
    Finally, it stacks the tensors along the batch dimension to return the padded data and original lengths.

    """
    
    # Get the max number of particles among all the jets in the batch
    n_part_max = max(x.shape[1] for x in batch)

    # Pad all the tensors with zeros so they have the same shape
    data = []
    lengths = []
    for x in batch:
        n_part = x.shape[1]
        data.append(torch.cat([x, torch.zeros(1, n_part_max - n_part, 16)], dim=1))
        lengths.append(n_part)

    # Stack the tensors along the batch dimension
    data = torch.stack(data)

    # Return the padded data, original lengths, and target labels
    return data, lengths

In [20]:
batch_size       = 10
train_dataloader = DataLoader(train_data, batch_size=batch_size, collate_fn=custom_collate)

In [21]:
# loop over the dataloader to get the data in batches
i=0
for batch, original_length in train_dataloader:
    print(batch.shape, original_length)
    i+=1
    if i==2:
        break

torch.Size([10, 1, 62, 16]) [23, 41, 26, 20, 62, 35, 9, 4, 46, 48]
torch.Size([10, 1, 66, 16]) [41, 22, 2, 40, 21, 26, 25, 66, 28, 2]


In the forward pass of your model, you can use the original lengths to process the data correctly, for example, by masking out the padded zeros.

testing nested tensors

In [11]:
tensors = []

for i, original_shape in enumerate(original_length):
    # Slice the tensor along the third dimension to get the desired shape
    a = batch[i, :, :original_shape, :]
    tensors.append(a)

In [12]:
for t in tensors:
    print(t.shape)

torch.Size([1, 41, 16])
torch.Size([1, 22, 16])
torch.Size([1, 2, 16])
torch.Size([1, 40, 16])
torch.Size([1, 21, 16])
torch.Size([1, 26, 16])
torch.Size([1, 25, 16])
torch.Size([1, 66, 16])
torch.Size([1, 28, 16])
torch.Size([1, 2, 16])


In [13]:
nested = torch.nested_tensor(tensors, dim=1)

AttributeError: module 'torch' has no attribute 'nested_tensor'

In [75]:
nested.shape

torch.Size([1, 273, 16])