# Timing Comparison Between Massless GravNet and Edge-based Networks

In [1]:
%load_ext autoreload
%autoreload 2

# System imports
import os
import sys
import yaml

# External imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch_geometric.data import Data
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
import torch.utils.benchmark as benchmark
import scipy as sp
import wandb

import warnings

warnings.filterwarnings("ignore")
sys.path.append("../../")
device = "cuda" if torch.cuda.is_available() else "cpu"

from lightning_modules.jetGNN.submodels.interaction_gnn import InteractionGNN
from lightning_modules.jetGNN.submodels.gravnet import GravNet
from lightning_modules.jetGNN.submodels.particlenet import ParticleNet
from lightning_modules.jetGNN.utils import build_edges

**TOP LINE TAKE-AWAYS**
1. PyG radius graph appears to be the fastest option for large-batch, small-graph datasets
2. GravNet is around 3-6x faster than ParticleNet (with 3 graph iterations, 64 hidden channels)
3. GravNet takes around 10x less memory than ParticleNet

## GravNet Experiments

In [104]:
# Load the config file
config_file = "jet_tag_config_2.yaml"
with open(config_file, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [105]:
model = GravNet(config)

### Load Dataset

In [4]:
model.setup(stage="fit")

Loading torch files
Fri Sep 23 05:38:28 2022


100%|██████████| 1/1 [00:44<00:00, 44.79s/it]
100%|██████████| 1/1 [00:49<00:00, 49.82s/it]
100%|██████████| 1/1 [00:52<00:00, 52.55s/it]


Building events
Fri Sep 23 05:40:56 2022
Testing sample quality


100%|██████████| 90000/90000 [00:09<00:00, 9621.01it/s] 


Testing sample quality


100%|██████████| 20000/20000 [00:01<00:00, 10658.53it/s]


Testing sample quality


100%|██████████| 20000/20000 [00:01<00:00, 10814.74it/s]


Defining figures of merit
Fri Sep 23 05:41:11 2022


In [8]:
# For quick debugging
trainset, valset, testset = model.trainset, model.valset, model.testset

In [106]:
model.trainset, model.valset, model.testset = trainset, valset, testset

### Test FRNN (archived; see "FRNN: Archived" at the end of this notebook)

### Explore Time Benchmarking

In [107]:
# Reset the CUDA peak-memory counters so the next `max_memory_allocated()`
# reading reflects only what happens after this cell.
# `reset_max_memory_allocated()` is deprecated in favour of
# `reset_peak_memory_stats()`; the guard keeps the cell runnable when `device`
# has fallen back to "cpu".
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

In [108]:
print(torch.cuda.max_memory_allocated() / 1024 ** 3, "GB") 

0.5189099311828613 GB


In [109]:
# Move the model to the benchmark device and pull exactly one training batch.
# `next(iter(...))` raises StopIteration on an empty loader instead of silently
# leaving a stale `sample` from an earlier cell in the namespace.
model = model.to(device)
batch = next(iter(model.train_dataloader()))
sample = batch.to(device)

In [110]:
sample

DataBatch(y=[8000], pE=[390341], px=[390341], py=[390341], pz=[390341], log_pt=[390341], log_E=[390341], delta_pt=[390341], log_delta_pt=[390341], delta_E=[390341], log_delta_E=[390341], delta_R=[390341], delta_eta=[390341], delta_phi=[390341], jet_pt=[8000], jet_pE=[8000], jet_px=[8000], jet_py=[8000], jet_pz=[8000], jet_mass=[8000], jet_eta=[8000], jet_phi=[8000], x=[390341], batch=[390341], ptr=[8001])

In [111]:
with torch.no_grad():
    output = model(sample)

In [112]:
print(torch.cuda.max_memory_allocated() / 1024 ** 3, "GB") 

1.265493392944336 GB


In [113]:
# Timer for a gradient-free forward pass of the current `model` on the cached
# `sample` batch; the names in `globals` are what `stmt` can see when it runs.
# NOTE(review): `device` is exposed through `globals` but `stmt` never
# references it.
t0 = benchmark.Timer(
    stmt="with torch.no_grad(): output = model(sample)",
    globals={"model": model, "sample": sample, "device": device},
    label="Initial_Run",
    sub_label=""
)

In [114]:
bench0 = t0.timeit(100)

In [58]:
print(f"Total time: {bench0.mean * 1e3}ms, giving {bench0.mean / config['train_batch'] * 1e3}ms per graph")

Total time: 20.848515960387886ms, giving 0.026060644950484855ms per graph


In [115]:
print(f"Total time (with 8 spatial dims): {bench0.mean * 1e3}ms, giving {bench0.mean / config['train_batch'] * 1e3}ms per graph")

Total time (with 8 spatial dims): 109.29274569964036ms, giving 0.013661593212455046ms per graph


In [76]:
print(f"Total time: {bench0.mean * 1e3}ms, giving {bench0.mean / config['train_batch'] * 1e3}ms per graph")

Total time: 1685.99712559022ms, giving 0.021074964069877754ms per graph


## ParticleNet Experiments

In [90]:
# Load the config file
config_file = "particlenet_config_1.yaml"
with open(config_file, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [91]:
model = ParticleNet(config)

In [92]:
model.trainset, model.valset, model.testset = trainset, valset, testset

In [93]:
# Reset the CUDA peak-memory counters so the next `max_memory_allocated()`
# reading reflects only what happens after this cell.
# `reset_max_memory_allocated()` is deprecated in favour of
# `reset_peak_memory_stats()`; the guard keeps the cell runnable when `device`
# has fallen back to "cpu".
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

In [94]:
print(torch.cuda.max_memory_allocated() / 1024 ** 3, "GB") 

0.49274444580078125 GB


In [95]:
# Move the model to the benchmark device and pull exactly one training batch.
# `next(iter(...))` raises StopIteration on an empty loader instead of silently
# reusing the `sample` left behind by an earlier section of the notebook.
model = model.to(device)
batch = next(iter(model.train_dataloader()))
sample = batch.to(device)

In [96]:
sample

DataBatch(y=[8000], pE=[393546], px=[393546], py=[393546], pz=[393546], log_pt=[393546], log_E=[393546], delta_pt=[393546], log_delta_pt=[393546], delta_E=[393546], log_delta_E=[393546], delta_R=[393546], delta_eta=[393546], delta_phi=[393546], jet_pt=[8000], jet_pE=[8000], jet_px=[8000], jet_py=[8000], jet_pz=[8000], jet_mass=[8000], jet_eta=[8000], jet_phi=[8000], x=[393546], batch=[393546], ptr=[8001])

In [97]:
with torch.no_grad():
    output = model(sample)

In [98]:
print(torch.cuda.max_memory_allocated() / 1024 ** 3, "GB") 

9.880566120147705 GB


In [159]:
# Timer for a gradient-free forward pass of the current `model` on the cached
# `sample` batch; the names in `globals` are what `stmt` can see when it runs.
# NOTE(review): `device` is exposed through `globals` but `stmt` never
# references it.
t1 = benchmark.Timer(
    stmt="with torch.no_grad(): output = model(sample)",
    globals={"model": model, "sample": sample, "device": device},
    label="Initial_Run",
)

In [163]:
bench1 = t1.timeit(1000)

In [164]:
# The second figure is the batch time divided by the number of graphs per
# batch (`config['train_batch']`), so it is a per-graph time, matching the
# wording of the GravNet timing cells.
print(f"Total time: {bench1.mean * 1e3}ms, giving {bench1.mean / config['train_batch'] * 1e3}ms per graph")

Total time: 58.271132829017006ms, giving 0.07283891603627127ms per batch


## FRNN: Archived

Timing and tests with FRNN were not successful: FRNN is not well suited to large batches of small point clouds.

In [None]:
import frnn
from torch_geometric.nn import radius_graph, knn_graph

In [None]:
batch_embed = torch.stack([sample.px, sample.py, sample.pz]).T

In [None]:
from torch_geometric import utils as pyg_utils

In [None]:
grid_embed, grid_mask = pyg_utils.to_dense_batch(batch_embed, batch = sample.batch)

In [None]:
N_lengths = sample.ptr[1:] - sample.ptr[:-1]

In [None]:
r_max, k_max = 0.01, 10
def frnn_batch(batch_embed, batch, ptr, r_max, k_max, grid=None, remove_self_loops=True):
    """Build a fixed-radius neighbour edge list for a batch of point clouds via FRNN.

    The flat node embeddings are padded into a dense per-graph layout with
    ``to_dense_batch``, queried through ``get_neighbors``, and the per-graph
    neighbour indices are shifted back to indices into the flat node list.

    Args:
        batch_embed: Node coordinates, one row per node across all graphs.
        batch: Graph index of each node (forwarded to ``to_dense_batch``).
        ptr: Per-graph node offsets; ``ptr[1:] - ptr[:-1]`` is used as the
            node count of each graph and ``ptr[batch]`` as each node's offset.
        r_max: Search radius forwarded to FRNN.
        k_max: Maximum number of neighbours per node forwarded to FRNN.
        grid: Optional FRNN grid from a previous call, forwarded unchanged.
        remove_self_loops: When true, drop edges whose endpoints are equal.

    Returns:
        ``(edges, grid)``: ``edges`` stacks the source and neighbour node
        indices along dim 0; ``grid`` is whatever ``get_neighbors`` returned.
    """
    grid_embed, grid_mask = pyg_utils.to_dense_batch(batch_embed, batch=batch)
    N_lengths = ptr[1:] - ptr[:-1]

    _, idxs, _, grid = get_neighbors(grid_embed, N_lengths, r_max, k_max, grid=grid)

    # Keep only the rows of real (non-padded) nodes.
    # Assumes the result is (num_nodes, k_max) -- TODO confirm against FRNN.
    local_idxs = idxs[grid_mask]

    # Decide which neighbour slots are filled BEFORE shifting to global ids:
    # once the per-graph offset is added, a negative "no neighbour" marker in
    # any graph but the first would no longer be negative.
    # Assumes FRNN marks empty slots with a negative index (-1) -- TODO confirm.
    has_neighbor = local_idxs >= 0

    # Use the function's own `ptr`/`batch` arguments (not notebook globals) to
    # turn per-graph indices into indices over the flat node list.
    global_idxs = local_idxs + ptr[batch].unsqueeze(1)

    source = (
        torch.arange(global_idxs.shape[0], device=global_idxs.device)
        .unsqueeze(1)
        .expand_as(global_idxs)
    )
    edges = torch.stack([source[has_neighbor], global_idxs[has_neighbor]], dim=0)

    if remove_self_loops:
        edges = edges[:, edges[0] != edges[1]]
    return edges, grid


def get_neighbors(grid_embed, N_lengths, r_max, k_max, grid=None):
    """Run an FRNN self-query: the same points serve as both query and reference set.

    Args:
        grid_embed: Points passed as both ``points1`` and ``points2``.
        N_lengths: Lengths passed as both ``lengths1`` and ``lengths2``.
        r_max: Forwarded as the search radius ``r``.
        k_max: Forwarded as the neighbour cap ``K``.
        grid: Optional grid forwarded unchanged to FRNN.

    Returns:
        The four values produced by ``frnn.frnn_grid_points``, in order:
        ``(dists, idxs, nn, grid)``.
    """
    query_kwargs = dict(
        points1=grid_embed,
        points2=grid_embed,
        lengths1=N_lengths,
        lengths2=N_lengths,
        K=k_max,
        r=r_max,
        grid=grid,
    )
    distances, neighbor_idxs, neighbors, search_grid = frnn.frnn_grid_points(**query_kwargs)
    return distances, neighbor_idxs, neighbors, search_grid

In [None]:
frnn_edges, grid = frnn_batch(batch_embed, sample.batch, sample.ptr, r_max, k_max)

In [None]:
frnn_edges, _ = frnn_batch(batch_embed, sample.batch, sample.ptr, r_max, k_max, grid)

In [None]:
pyg_edges = radius_graph(batch_embed, r=r_max, max_num_neighbors=k_max, batch=sample.batch)

In [None]:
frnn_whole_function = benchmark.Timer(
    stmt="frnn_batch(batch_embed, sample.batch, sample.ptr, r_max, k_max, grid)",
    globals=globals(),
    label="frnn_batch",
)

frnn_neighbors = benchmark.Timer(
    stmt="get_neighbors(grid_embed, N_lengths, r_max, k_max, grid)",
    globals=globals(),
    label="get_neighbors",
)


In [None]:
frnn_bench = frnn_whole_function.timeit(number=10)
frnn_neighbors_bench = frnn_neighbors.timeit(number=10)

In [None]:
# Timer for PyG's `radius_graph` on the same embeddings used for the FRNN runs.
# NOTE(review): `max_num_neighbors` is hard-coded to 16 here, whereas the FRNN
# timers and the earlier `radius_graph` call use `k_max` (set to 10 above);
# confirm whether the mismatch is intentional (16 matches the `knn_graph` timer).
pyg_t = benchmark.Timer(
    stmt="radius_graph(batch_embed, r=r_max, max_num_neighbors=16, batch=sample.batch)",
    globals=globals(),
    label="pyg_radius_graph",
)

In [None]:
pyg_knn = benchmark.Timer(
    stmt="knn_graph(batch_embed, 16, batch=sample.batch)",
    globals=globals(),
    label="pyg_knn",
)

In [None]:
pyg_bench = pyg_t.timeit(1000)

In [None]:
pygknn_bench = pyg_knn.timeit(1000)

PyG radius graph is *1000x* faster than the FRNN implementation (and ~2x faster than PyG knn graph)