In [1]:
import gc
from time import time

from tqdm import trange
import torch
import numpy as np
from torch import Tensor
from torch_geometric import EdgeIndex
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import SAGEConv
from torch_geometric.transforms import LineGraph
from torch_geometric.utils import cumsum, coalesce, degree

import pathpyG as pp

In [2]:
def lift_order_edge_index(edge_index: EdgeIndex | torch.Tensor, num_nodes: int | None = None) -> torch.Tensor:
    """Lift a first-order edge index to the edge index of its directed line graph.

    Every edge of the input becomes a node of the output, indexed by its position in the
    row-sorted input. Two such nodes `(u, v)` and `(v, w)` are connected by a higher-order
    edge whenever the destination of the first equals the source of the second.

    Args:
        edge_index: Edge index of shape `(2, num_edges)`. Plain tensors are wrapped into an
            `EdgeIndex`; inputs that are not flagged as sorted by row are sorted first.
        num_nodes: Number of nodes of the input graph. If omitted, it is derived from the
            largest node index in `edge_index` (0 for an edge-less input).

    Returns:
        A tensor of shape `(2, num_higher_order_edges)` whose entries index the edges of the
        row-sorted input edge index.
    """
    # Since this is a complicated function, we will use the following example to explain the steps:
    # Example:
    #   edge_index = [[0, 0, 1, 1, 1, 3, 4, 5, 6],
    #                 [1, 3, 2, 3, 6, 4, 5, 7, 5]]

    if num_nodes is None:
        # `max()` raises on an empty tensor, so an edge-less graph is treated as having no nodes
        num_nodes = int(edge_index.max().item()) + 1 if edge_index.numel() > 0 else 0
    # `EdgeIndex` is itself a tensor subclass, so only plain tensors need to be wrapped
    if not isinstance(edge_index, EdgeIndex):
        edge_index = EdgeIndex(edge_index, sparse_size=(num_nodes, num_nodes))
    if not edge_index.is_sorted_by_row:
        edge_index = edge_index.sort_by("row")[0]

    # Compute the outdegree of each node which we will use to get all the edge combinations that lead to a higher order edge
    # Example:
    #   outdegree = [2, 3, 0, 1, 1, 1, 1, 0]
    outdegree = degree(edge_index[0], dtype=torch.long, num_nodes=num_nodes)

    # For each center node, we need to combine each outgoing edge with each incoming edge
    # We achieve this by creating `outdegree` number of edges for each destination node of the old edge index
    # Example:
    #   outdegree_per_dst = [3, 1, 0, 1, 1, 1, 1, 0, 1]
    outdegree_per_dst = outdegree[edge_index[1]]

    # We use each edge from the edge index as new node and assign the new indices in the order of the original edge index
    # Each higher order node has one outgoing edge for each outgoing edge of the original destination node
    # Since we keep the ordering, we can just repeat each node using the outdegree_per_dst tensor
    # Example:
    #   ho_edge_srcs = [0, 0, 0, 1, 3, 4, 5, 6, 8]
    ho_edge_srcs = torch.repeat_interleave(outdegree_per_dst)

    # The number of higher order edges equals the sum of `outdegree_per_dst`, which is exactly the length of `ho_edge_srcs`
    # Example:
    #   num_new_edges = 9
    num_new_edges = ho_edge_srcs.size(0)

    # For each node, we calculate pointers of shape (num_nodes,) that indicate the start of the original edges (new higher order nodes) that have the node as source node
    # (Note we use PyG's cumsum function because it adds a 0 at the beginning of the tensor and we want the `left` boundaries of the intervals, so we also remove the last element of the result with [:-1])
    # Example:
    #   ptrs = [0, 2, 5, 5, 6, 7, 8, 9]
    ptrs = cumsum(outdegree, dim=0)[:-1]

    # Use these pointers to get the start of the edges for each higher order source node and repeat it `outdegree` times
    # Since we keep the ordering, all new higher order edges that have the same source node are indexed consecutively
    # Example:
    #   ho_edge_dsts = [2, 2, 2, 5, 5, 8, 6, 7, 7]
    ho_edge_dsts = torch.repeat_interleave(ptrs[edge_index[1]], outdegree_per_dst)

    # Since the above only repeats the start of the edges, we need to add (0, 1, 2, 3, ...) for all `outdegree` number of edges consecutively to get the correct destination nodes
    # We can achieve this by starting with a range from (0, 1, ..., num_new_edges)
    # Example:
    #   idx_correction    = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    idx_correction = torch.arange(num_new_edges, dtype=torch.long, device=edge_index.device)
    # Then, we subtract the cumulative sum of the outdegree for each destination node to get a tensor.
    # Example:
    #   idx_correction    = [0, 1, 2, 0, 0, 0, 0, 0, 0]
    idx_correction -= cumsum(outdegree_per_dst, dim=0)[ho_edge_srcs]
    # Finally, we add this tensor to the destination nodes to get the correct destination nodes for each higher order edge
    # Example:
    #   ho_edge_dsts = [2, 3, 4, 5, 5, 8, 6, 7, 7]
    ho_edge_dsts += idx_correction

    # Example result:
    #   tensor([[0, 0, 0, 1, 3, 4, 5, 6, 8],
    #           [2, 3, 4, 5, 5, 8, 6, 7, 7]])
    return torch.stack([ho_edge_srcs, ho_edge_dsts], dim=0)

## General evaluation of correctness and runtime

The PyG version on the GPU is omitted since it takes even longer than the CPU version.

In [3]:
from torch_geometric.testing import get_random_edge_index

# Benchmark on random graphs: times a SAGEConv forward pass (baseline), PyG's LineGraph transform
# and `lift_order_edge_index` on CPU and GPU, and checks that both line graph results agree.
gnn_conv = SAGEConv(128, 16)
cuda_gnn_conv = SAGEConv(128, 16).to("cuda")
line_graph_transform = LineGraph(force_directed=True)

GNN_baseline_times, cuda_GNN_baseline_times = [], []
PyG_times, MP_times, indexing_times = [], [], []
cuda_PyG_times, cuda_MP_times, cuda_indexing_times = [], [], []
# Set when the two implementations disagree so that both loops are left, not only the inner one
mismatch_found = False
for i in range(50, 51, 50):
    num_nodes = 1000*i
    x = torch.randn(num_nodes, 128)
    cuda_x = x.to("cuda")
    for j in range(10, 21, 10):
        gc.collect()
        torch.cuda.empty_cache()

        num_edges = num_nodes*j
        print(f"Nodes: {num_nodes}, Edges: {num_edges}")
        edge_index = get_random_edge_index(num_nodes, num_nodes, num_edges)
        edge_index = EdgeIndex(edge_index, sparse_size=(num_nodes, num_nodes))
        edge_index = coalesce(edge_index, num_nodes=num_nodes)

        # GNN baseline

        t = time()
        gnn_conv(x, edge_index)
        GNN_baseline_times.append(time() - t)
        print(f"\tGNN baseline: {GNN_baseline_times[-1]}s")

        t = time()
        PyG_line_graph_data = line_graph_transform(Data(edge_index=edge_index, num_nodes=num_nodes))
        PyG_times.append(time() - t)
        PyG_line_graph = PyG_line_graph_data.edge_index
        print(f"\tPyG: {PyG_times[-1]}s")

        t = time()
        indexing_line_graph = lift_order_edge_index(edge_index, num_nodes)
        indexing_times.append(time() - t)
        print(f"\tIndexing: {indexing_times[-1]}s")

        cuda_edge_index = edge_index.to("cuda")

        # CUDA kernels are launched asynchronously, so we synchronize before starting and before
        # stopping the clock; otherwise the measurement only covers the kernel launches (or
        # includes pending work such as the host-to-device copy above)
        torch.cuda.synchronize()
        t = time()
        cuda_gnn_conv(cuda_x, cuda_edge_index)
        torch.cuda.synchronize()
        cuda_GNN_baseline_times.append(time() - t)
        print(f"\tCUDA GNN baseline: {cuda_GNN_baseline_times[-1]}s")

        # t = time()
        # cuda_PyG_line_graph_data = line_graph_transform(Data(edge_index=cuda_edge_index, num_nodes=num_nodes))
        # cuda_PyG_times.append(time() - t)
        # cuda_PyG_line_graph = cuda_PyG_line_graph_data.edge_index
        # print(f"\tCUDA PyG: {cuda_PyG_times[-1]}s")

        torch.cuda.synchronize()
        t = time()
        cuda_indexing_line_graph = lift_order_edge_index(cuda_edge_index, num_nodes)
        torch.cuda.synchronize()
        cuda_indexing_times.append(time() - t)
        print(f"\tCUDA Indexing: {cuda_indexing_times[-1]}s")

        # An element-wise comparison raises if the number of higher-order edges differs,
        # so the shapes are compared first
        same_shape = PyG_line_graph.shape == indexing_line_graph.shape
        if not same_shape or not (PyG_line_graph == indexing_line_graph).all():
            print(f"Iteration {i}: Indexing and PyG are not equal")
            print(PyG_line_graph)
            print(indexing_line_graph)
            if same_shape:
                print((PyG_line_graph != indexing_line_graph).nonzero())
            print(edge_index)
            mismatch_found = True
            break
    if mismatch_found:
        break

print(f"Avg PyG time: {np.mean(PyG_times)}s +/- {np.std(PyG_times)}s")
print(f"Avg Indexing time: {np.mean(indexing_times)}s +/- {np.std(indexing_times)}s")
# print(f"Avg CUDA PyG time: {np.mean(cuda_PyG_times)}s +/- {np.std(cuda_PyG_times)}s")
print(f"Avg CUDA Indexing time: {np.mean(cuda_indexing_times)}s +/- {np.std(cuda_indexing_times)}s")

Nodes: 50000, Edges: 500000
	GNN baseline: 0.030921459197998047s
	PyG: 5.057637929916382s
	Indexing: 0.0286409854888916s
	CUDA GNN baseline: 0.21027898788452148s
	CUDA Indexing: 0.021584033966064453s
Nodes: 50000, Edges: 1000000
	GNN baseline: 0.05266857147216797s
	PyG: 10.635715246200562s
	Indexing: 0.11666655540466309s
	CUDA GNN baseline: 0.0031747817993164062s
	CUDA Indexing: 0.34000468254089355s
Avg PyG time: 7.846676588058472s +/- 2.78903865814209s
Avg Indexing time: 0.07265377044677734s +/- 0.04401278495788574s
Avg CUDA Indexing time: 0.180794358253479s +/- 0.15921032428741455s


## Toy Example

In [4]:
# Same edge set as the worked example in the comments of `lift_order_edge_index`,
# but listed in an order that is not sorted by source node
edge_index = torch.tensor([[0, 0, 1, 1, 3, 4, 1, 6, 5],
                           [1, 3, 2, 3, 4, 5, 6, 5, 7]])

In [5]:
edge_index_2 = pp.DAGData.lift_order_dag(edge_index.unsqueeze(-1))
print(edge_index_2)

tensor([[[0, 1],
         [0, 1],
         [0, 1],
         [0, 3],
         [1, 3],
         [3, 4],
         [4, 5],
         [6, 5],
         [1, 6]],

        [[1, 2],
         [1, 3],
         [1, 6],
         [3, 4],
         [3, 4],
         [4, 5],
         [5, 7],
         [5, 7],
         [6, 5]]])


In [6]:
edge_index_new = lift_order_edge_index(EdgeIndex(edge_index))
print(edge_index_new)

tensor([[0, 0, 0, 1, 3, 4, 5, 6, 8],
        [2, 3, 4, 5, 5, 8, 6, 7, 7]])


In [7]:
edge_index_pyg = LineGraph()(Data(edge_index=edge_index, num_nodes=8)).edge_index
print(edge_index_pyg)

tensor([[0, 0, 0, 1, 3, 4, 5, 6, 8],
        [2, 3, 4, 5, 5, 8, 6, 7, 7]])


In [8]:
print((edge_index_pyg == edge_index_new).all())

tensor(True)


### With Edge Weights (TODO)

Depending on how you count each walk, you will get different statistics. We can choose the aggregation via `freq_aggr` to be either "propagation", i.e. each walk counts with its weight, or "diffusion", i.e. each walk is counted with the probability of a random walker starting at the first node to end up in the last. 

In [9]:
edge_index = torch.tensor([[0, 0, 1, 1, 3, 4, 1, 6, 5],
                           [1, 3, 2, 3, 4, 5, 6, 5, 7]])
edge_attr = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.float32)

# Exponentially Large DAG

In [10]:
# Depth of the tree-shaped DAG and number of children attached to every node
layers = 5
branches = 15

# Grow the DAG layer by layer: every node of the previous layer receives `branches`
# fresh children that are numbered consecutively, starting at 1 below the root 0
edges = []
prev_layer_nodes = [0]
j = 1
for _ in trange(layers):
    layer_nodes = []
    for node in prev_layer_nodes:
        children = list(range(j, j + branches))
        edges.extend((f"{node}", f"{child}") for child in children)
        layer_nodes.extend(children)
        j += branches
    prev_layer_nodes = layer_nodes

dag = pp.Graph.from_edge_list(edges)
# Variant with a trailing singleton dimension, as passed to `pp.DAGData.lift_order_dag` below
dag_edge_index = dag.data.edge_index.unsqueeze(-1)

# Copy of the plain edge index on the GPU for the indexing-based implementation
dag_edge_index_gpu = dag.data.edge_index.cuda()

100%|██████████| 5/5 [00:00<00:00, 20.25it/s]


### Current implementation

In [11]:
%timeit pp.DAGData.lift_order_dag(dag_edge_index)

1min 14s ± 1.2 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Tensor Indexing based implementation (CPU)

In [12]:
%timeit lift_order_edge_index(dag.data.edge_index)

12.6 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Tensor Indexing based implementation (GPU)

In [13]:
%timeit lift_order_edge_index(dag_edge_index_gpu)

1.5 ms ± 43.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### PyG implementation (CPU)

In [14]:
%timeit line_graph_transform(dag.data)

8.04 s ± 634 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### PyG implementation (GPU)

In [15]:
%timeit line_graph_transform(dag.data.cuda())

2min 55s ± 43 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Many Walks

In [16]:
# Number of walks and number of nodes per walk used throughout this section
n_walks = 10000
walk_length = 1000

# Every walk is the identical node sequence 0, 1, ..., walk_length - 1
walks = [list(range(walk_length)) for _ in range(n_walks)]
orig_walk = pp.WalkData()
for walk in walks:
    orig_walk.add_walk_seq(walk)

# Build the nested-tensor variant from the paths and path frequencies stored in `orig_walk`
path_list = list(orig_walk.paths.values())
path_freq_tensor = torch.tensor(list(orig_walk.path_freq.values()))
mapping = pp.IndexMap()
nested_walk = pp.WalkDataNested(path_list, path_freq=path_freq_tensor, mapping=mapping)

  self.paths = nested_tensor(paths, dtype=torch.long)


### Original Walk Implementation

In [17]:
%timeit orig_walk.edge_index_k_weighted(2)

41.3 s ± 9.45 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Nested Tensor Implementation (CPU)

In [18]:
pp.config['torch']["device"] = "cpu"
%timeit nested_walk.edge_index_k_weighted(2)

459 ms ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Nested Tensor (GPU)

In [19]:
# Set pathpyG's configured torch device to CUDA and move all paths and their frequencies to the GPU
# before constructing the nested walk object (the `mapping` from the CPU variant is reused)
pp.config['torch']["device"] = "cuda"
cuda_path_list = [path.cuda() for path in path_list]
cuda_path_freq = path_freq_tensor.cuda()
cuda_nested_walk = pp.WalkDataNested(cuda_path_list, path_freq=cuda_path_freq, mapping=mapping)
%timeit cuda_nested_walk.edge_index_k_weighted(2)

394 ms ± 51.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Indexing Implementation (CPU + GPU)

In [20]:
# We create a list of Data objects where each Data object contains the edge index of a path (could also be a DAG in theory)
# NOTE(review): assumes every entry of `path_list` is an edge index of shape (2, num_edges) — confirm
data_list = [Data(edge_index=path.long(), num_nodes=walk_length) for path in path_list]
# We use a dataloader from PyG to combine all the edge indices into a single graph with multiple disjoint subgraphs
# If two paths share a node, the node is duplicated in the resulting graph and the new higher order edges need to be aggregated afterwards
# Note that due to the `batch_size` parameter, we can also do computations on a set of paths that are too large to fit into memory at once
# Here `batch_size=n_walks`, so the first (and only) batch already contains all paths
walk_graph = next(iter(DataLoader(data_list, batch_size=n_walks)))
# Note: this rebinds the name `edge_index` that earlier cells used for the toy example
edge_index = walk_graph.edge_index

The following measures the time to do the LineGraph graph transformation for the edge index that contains all paths as disjoint subgraphs. Since the aggregations afterwards are omitted, the runtimes are not exactly comparable to the above. See the next section (With Weights and the Aggregation) for a full `edge_index_k_weighted` transformation.

In [21]:
%timeit lift_order_edge_index(edge_index)

540 ms ± 42.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
cuda_edge_index = edge_index.cuda()
%timeit lift_order_edge_index(cuda_edge_index)

44.8 ms ± 1.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### PyG Implementation (CPU + GPU)

In [23]:
%timeit line_graph_transform(walk_graph)

1min 19s ± 2.58 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# %timeit line_graph_transform(walk_graph.cuda())

### With Weights and the Aggregation (TODO: Change to indexing based)

In [25]:
# def edge_index_k_weighted(path_list, path_freq, aggregation="propagation", device="cuda"):
#     data_list = [
#         Data(
#             edge_index=path.long(), 
#             num_nodes=walk_length,
#             edge_attr=torch.ones(path.size(1), dtype=torch.float32) * path_freq[i],
#             node_idx=torch.arange(walk_length).unsqueeze(-1)
#         ) for i, path in enumerate(path_list)
#         ]
#     walk_graph = next(iter(DataLoader(data_list, batch_size=n_walks, follow_batch=["node_idx"]))).to(device)
#     edge_index = walk_graph.edge_index
#     edge_attr = walk_graph.edge_attr
#     node_idx = torch.arange(edge_index.max() + 1, device=device).unsqueeze(-1)
#     edge_index_2, edge_attr_2 = DeBruijnTransform(aggregation)(node_idx, edge_index, edge_attr)
#     orig_edge_index_2 = walk_graph.node_idx.squeeze()[edge_index_2]
#     unique_edge_index_2, inverse_idx = orig_edge_index_2.unique(dim=1, return_inverse=True)
#     edge_attr_2 = torch.zeros(unique_edge_index_2.size(1), device=device).index_add(0, inverse_idx, edge_attr_2)
#     return unique_edge_index_2, edge_attr_2

In [26]:
# %timeit edge_index_k_weighted(path_list, path_freq_tensor, device="cuda")

# Temporal Graph Benchmark

In [3]:
# !pip install py-tgb

from torch_geometric.edge_index import EdgeIndex, SortOrder
from tgb.linkproppred.dataset_pyg import PyGLinkPropPredDataset

from pathpyG import TemporalGraph
from pathpyG.algorithms import temporal_graph_to_event_dag

dataset = PyGLinkPropPredDataset(name="tgbl-wiki", root="datasets")
data = dataset.get_TemporalData()

raw file found, skipping download
Dataset directory is  /opt/conda/lib/python3.10/site-packages/tgb/datasets/tgbl_wiki
loading processed file


In [4]:
edge_index = EdgeIndex(data.edge_index, sparse_size=(data.num_nodes, data.num_nodes))
# Writes the private `_sort_order` attribute directly; no sorting or verification happens here.
# NOTE(review): `lift_order_edge_index` skips its own sort for inputs flagged as row-sorted, so the
# result is presumably only valid if `data.edge_index` really is ordered by source node — confirm.
edge_index._sort_order = SortOrder.ROW

In [5]:
%timeit lift_order_edge_index(edge_index, num_nodes=data.num_nodes)

1.1 ms ± 36.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
temporal_graph = TemporalGraph(data)
t = time()
temporal_graph_to_event_dag(temporal_graph, delta=100)
print(f"Temporal graph to event DAG: {time() - t:.1f}s")

Temporal graph to event DAG: 5.8s


In [7]:
large_dataset = PyGLinkPropPredDataset(name="tgbl-review", root="datasets")
large_data = large_dataset.get_TemporalData()
print(large_data.is_coalesced())

raw file found, skipping download
Dataset directory is  /opt/conda/lib/python3.10/site-packages/tgb/datasets/tgbl_review
loading processed file
True


In [8]:
large_edge_index = EdgeIndex(large_data.edge_index, sparse_size=(large_data.num_nodes, large_data.num_nodes))
# Writes the private `_sort_order` attribute directly; no sorting or verification happens here.
# NOTE(review): `lift_order_edge_index` skips its own sort for inputs flagged as row-sorted, so the
# result is presumably only valid if `large_data.edge_index` really is ordered by source node — confirm.
large_edge_index._sort_order = SortOrder.ROW

In [9]:
%timeit lift_order_edge_index(large_edge_index, num_nodes=large_data.num_nodes)

1.43 s ± 37.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Time the conversion of the larger temporal graph into an event DAG.
# NOTE(review): unlike the tgbl-wiki cell, no `delta` is passed here, so the function's default
# is used — confirm this is intended. No timing result is recorded below this cell.
large_temporal_graph = TemporalGraph(large_data)
t = time()
temporal_graph_to_event_dag(large_temporal_graph)
print(f"Temporal graph to event DAG: {time() - t:.1f}s")

: 