In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from tqdm import tqdm

from DataReader import DataReader
import h5py

import dgl

from compute_edges import compute_edge_features

In [2]:
# Directory holding the training files, relative to the notebook location
data_path = "../data/train/"

# Read two files only, which keeps this exploratory run small
data_reader = DataReader(data_path)
data_reader.read_files(n_files=2)


# `data` is later sliced as data[:5, :, :] (jet, particle, feature);
# `labels` holds the matching labels returned by the reader
data   = data_reader.get_features()
labels = data_reader.get_labels()

Reading file jetImage_3_100p_70000_80000.h5
Reading file jetImage_6_100p_20000_30000.h5


# Node features

The particle features we consider for our analysis are the following:

| Dataset Index | Feature | Description |
|:-------------:|:-------:|:-----------:|
| 0 | $\Delta\eta$ | $\eta$ difference between the particle and the jet |
| 1 | $\Delta\phi$ | $\phi$ difference between the particle and the jet |
| 6 | $\Delta R$ | $\sqrt{(\Delta\eta)^2 + (\Delta\phi)^2}$ |
| 4 | $\log p_T$ | $\log(p_T)$ of the particle (log to be computed) |
| 2 | $\log E$ | $\log(E)$ of the particle (log to be computed)|
| 5 | $\log\frac{p_T}{p_T\,\text{(jet)}}$ | $\log\frac{p_T}{p_T\,\text{(jet)}}$ of the particle (log to be computed)|
| 3 | $\log\frac{E}{E\,\text{(jet)}}$ | $\log\frac{E}{E\,\text{(jet)}}$ of the particle (log to be computed)|

# Edge features

The edge features $\boldsymbol{e}_{ij}$ are 3-dimensional vectors, i.e., we have 3 edge features for each edge connecting the nodes $i$ and $j$. Let us first define the distance between the nodes $i$ and $j$ to be

$$
d_{ij} = \min(p_{\text{T}\,i}^{2\alpha}, p_{\text{T}\,j}^{2\alpha})\,\frac{R_{ij}^2}{R^2}
$$

Then, the edge features are defined as $\boldsymbol{e}_{ij}=(d_{ij}(\alpha=0),\,\log d_{ij}(\alpha=1),\, \log m_{ij})$ where $m_{ij}$ is the invariant mass of the two particles, i.e., $m_{ij}=\sqrt{E_iE_j - p_{\text{T}\,i}p_{\text{T}\,j}\cos(\Delta\phi_{ij})}$.

We need to compute $\Delta\phi_{ij} = \Delta\phi_i - \Delta\phi_j$, where $\Delta\phi_i$ is the $\phi$ difference between the particle $i$ and the jet, and $\Delta\eta_{ij} = \Delta\eta_i - \Delta\eta_j$, where $\Delta\eta_i$ is the $\eta$ difference between the particle $i$ and the jet. We also need the distance $R_{ij}$ between the two particles, which is given by $R_{ij} = \sqrt{(\Delta\eta_{ij})^2 + (\Delta\phi_{ij})^2}$.

## Compute Edges

In [20]:
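# DEPRECATED - use the functions in the compute_edges.py file instead.
# NOTE: running this cell redefines compute_edge_features and therefore shadows
# the version imported from compute_edges in the imports cell.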


def compute_dPhi_ij(dPhi_i, dPhi_j):
    """Return the signed azimuthal separation of two particles.

    Parameters
    ----------
    dPhi_i, dPhi_j : scalar or array-like
        Phi offsets of particle i and particle j.

    Returns
    -------
    The plain difference ``dPhi_i - dPhi_j``; no wrapping into a fixed
    angular interval is applied.
    """
    separation = dPhi_i - dPhi_j
    return separation

def compute_dEta_ij(dEta_i, dEta_j):
    """Return the signed pseudorapidity separation of two particles.

    Parameters
    ----------
    dEta_i, dEta_j : scalar or array-like
        Eta offsets of particle i and particle j.

    Returns
    -------
    The plain difference ``dEta_i - dEta_j``.
    """
    separation = dEta_i - dEta_j
    return separation

def compute_R_ij(dEta_ij, dPhi_ij):
    """Return the distance between two particles in the (eta, phi) plane.

    Parameters
    ----------
    dEta_ij : scalar or array-like
        Pseudorapidity separation of the pair.
    dPhi_ij : scalar or array-like
        Azimuthal separation of the pair.

    Returns
    -------
    ``sqrt(dEta_ij**2 + dPhi_ij**2)``.
    """
    eta_term = dEta_ij**2
    phi_term = dPhi_ij**2
    return np.sqrt(eta_term + phi_term)

def compute_m_ij(e_i, e_j, pt_i, pt_j, dPhi_ij):
    """Return the pair-mass estimate used for the third edge feature.

    Parameters
    ----------
    e_i, e_j : scalar or array-like
        Energies of the two particles.
    pt_i, pt_j : scalar or array-like
        Transverse momenta of the two particles. They are accepted for
        interface compatibility but do not enter the computation.
    dPhi_ij : scalar or array-like
        Azimuthal separation of the pair, used as the opening angle.

    Returns
    -------
    ``sqrt(2 * e_i * e_j * (1 - cos(dPhi_ij)))``; this is 0 when
    ``dPhi_ij`` is 0.

    NOTE(review): the original author flagged this formula as unverified, and
    it differs from the m_ij expression given in the notebook text (which also
    involves the transverse momenta) - confirm which one is intended.
    """
    angular_factor = 1 - np.cos(dPhi_ij)
    mass_squared = 2 * e_i * e_j * angular_factor
    return np.sqrt(mass_squared)

def node_distance(pt_i, pt_j, r, r_ij, alpha):
    """Compute the distance d_ij between two nodes of the graph.

    Implements

        d_ij = min(pt_i**(2*alpha), pt_j**(2*alpha)) * r_ij**2 / r**2

    The previous implementation raised ``r_ij`` to the power ``r`` instead of
    forming the ratio ``r_ij**2 / r**2``.

    Parameters
    ----------
    pt_i, pt_j : scalar or array-like
        Transverse momenta of the two particles.
    r : scalar or array-like
        Radius R used to normalise the pair separation.
    r_ij : scalar or array-like
        Separation R_ij between the two particles.
    alpha : int or float
        Exponent parameter; ``alpha=0`` removes the momentum dependence.

    Returns
    -------
    The distance d_ij, broadcast over array inputs.

    Notes
    -----
    ``r == 0`` is not special-cased: numpy inputs yield inf/nan together with a
    RuntimeWarning, plain Python floats raise ZeroDivisionError.
    """
    # element-wise minimum so that array inputs broadcast instead of collapsing
    pt_weight = np.minimum(pt_i ** (2 * alpha), pt_j ** (2 * alpha))
    return pt_weight * r_ij**2 / r**2

def compute_one_edge_feature(jet, i, j):
    """Build the 3-component feature vector of the edge from particle i to j.

    Parameters
    ----------
    jet : 2-D array indexed as ``jet[particle, feature]``
        Columns 0, 1, 2, 4 and 6 are read here.
    i, j : int
        Row indices of the two particles.

    Returns
    -------
    ``np.array([d_ij(alpha=0), log(d_ij(alpha=1)), log(m_ij)])``.

    NOTE(review): the ``r`` argument of ``node_distance`` is taken from column 6
    of particle ``i``, which the notebook's feature table lists as that
    particle's Delta R - confirm this is the intended R.
    """
    # pairwise separations from the per-particle offsets in columns 0 and 1
    delta_eta = compute_dEta_ij(jet[i, 0], jet[j, 0])
    delta_phi = compute_dPhi_ij(jet[i, 1], jet[j, 1])
    pair_separation = compute_R_ij(delta_eta, delta_phi)

    # column 2 is passed as the energy, column 4 as the transverse momentum
    pair_mass = compute_m_ij(jet[i, 2], jet[j, 2], jet[i, 4], jet[j, 4], delta_phi)

    # both distance variants share every argument except alpha
    shared_args = dict(pt_i=jet[i, 4], pt_j=jet[j, 4], r=jet[i, 6], r_ij=pair_separation)
    distance_alpha0 = node_distance(alpha=0, **shared_args)
    distance_alpha1 = node_distance(alpha=1, **shared_args)

    return np.array([distance_alpha0, np.log(distance_alpha1), np.log(pair_mass)])
    

def compute_edge_features(data):
    """Compute the edge features of every ordered particle pair in every jet.

    Parameters
    ----------
    data : 3-D array indexed as ``data[jet, particle, feature]``

    Returns
    -------
    Array of shape ``(n_jets, n_particles * n_particles, 3)``. Within a jet the
    pairs are ordered with ``i`` as the slow index and ``j`` as the fast index.
    Self-pairs (``i == j``) get a vector of ones instead of computed features.
    """

    def _edges_of_jet(jet):
        # every ordered pair (src, dst), including the self-pairs
        n_particles = jet.shape[0]
        return np.array([
            np.ones(3) if src == dst else compute_one_edge_feature(jet, src, dst)
            for src in range(n_particles)
            for dst in range(n_particles)
        ])

    per_jet = [_edges_of_jet(data[jet_idx, :, :]) for jet_idx in range(data.shape[0])]
    return np.array(per_jet)

### Test

In [3]:
# slice only the first 5 jets
# Slice only the first 5 jets so the nested Python loops finish quickly
jets = data[:5, :, :]
edges = compute_edge_features(jets)
# Displayed as the cell output: (n_jets, n_particles * n_particles, 3)
edges.shape

  e_2 = np.log(m_ij)
  e_1 = np.log(node_distance(pt_i=jet[i, 4], pt_j=jet[j, 4], r=jet[i, 6], r_ij=dR_ij, alpha=1))


(5, 10000, 3)

---

## PyTorch Dataset

In [7]:
class ParticleDataset(Dataset):
    """Dataset yielding ``(node features, edge features, label)`` per jet.

    Parameters
    ----------
    data_reader : object exposing ``read_files``, ``get_features`` and ``get_labels``
        Source of the jets; ``read_files`` is called during construction.
    n_files : int or None
        Forwarded unchanged to ``data_reader.read_files``.
    transform : callable or None
        Optional callable applied to the node features of a sample. The edge
        features and the label are returned untransformed.
    """

    def __init__(self, data_reader, n_files=None, transform=None):
        # load the requested files, then take features and labels from the reader
        data_reader.read_files(n_files=n_files)
        self.x = data_reader.get_features()
        self.y = data_reader.get_labels()

        self.transform = transform

        # edge features are computed once, up front, for the whole feature array
        self.edges = compute_edge_features(self.x)

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of ``self.x``."""
        n_samples = self.x.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return ``(x, edges, y)`` for sample ``idx``; only ``x`` is transformed."""
        node_features = self.x[idx]
        label = self.y[idx]
        edge_features = self.edges[idx]

        if self.transform:
            node_features = self.transform(node_features)

        return node_features, edge_features, label

In [8]:
# No feature transform is applied for now
train_transform = None

# Fresh reader for the training directory (rebinds the earlier `data_reader`)
data_reader = DataReader("../data/train/")

# The constructor reads one file and computes the edge features for all of its
# jets up front, so this cell takes a while to run
train_data = ParticleDataset(data_reader=data_reader, n_files=1, transform=train_transform)

Reading file jetImage_3_100p_70000_80000.h5


  e_2 = np.log(m_ij)
  e_1 = np.log(node_distance(pt_i=jet[i, 4], pt_j=jet[j, 4], r=jet[i, 6], r_ij=dR_ij, alpha=1))


### Test

In [9]:
# One jet per batch; the default DataLoader settings are used otherwise
batch_size       = 1
train_dataloader = DataLoader(train_data, batch_size=batch_size)

In [12]:
# Fetch the first batch only and print the shape of each component as a
# sanity check. The unused `i` counter of the previous version was removed.
# Note that the loop variable `edges` rebinds the notebook-level `edges` name.
for features, edges, y in train_dataloader:
    print(features.shape)
    print(edges.shape)
    print(y.shape)
    break

torch.Size([1, 100, 7])
torch.Size([1, 10000, 3])
torch.Size([1, 6])
