## Preparation

Install and import the necessary dependencies.

In [None]:
# !pip install torch
!pip install torch_geometric
!pip install git+https://github.com/pathpy/pathpyG.git

In [1]:
from math import isnan
from collections import defaultdict, deque

import pandas as pd
import torch
import numpy as np
from tqdm import tqdm

from torch_geometric.data import Data

import pathpyG as pp

### Utility Functions

In [2]:
def show_event_graph(event_graph):
    edge_index_array = event_graph.to_numpy()
    event_names, event_name_indices = np.unique(edge_index_array, return_inverse=True)
    edge_index = torch.tensor(event_name_indices.reshape(-1, 2)).T.contiguous()
    mapping = pp.IndexMap(list(event_names))
    G = pp.Graph(Data(edge_index=edge_index), mapping)
    pp.plot(G, node_label=list(G.nodes))

In [3]:
def from_DAG(event_graph: pd.DataFrame, event_node_mapping: pd.DataFrame, delta=float("inf")):
    event_graph = event_graph.copy()
    event_node_mapping = event_node_mapping.copy()

    # Replace underscores with spaces so that they can be used to separate event and node names
    event_graph["src"] = event_graph["src"].str.replace("_", " ").replace("(", "").replace(")", "").replace(",", "")
    event_graph["dst"] = event_graph["dst"].str.replace("_", " ").replace("(", "").replace(")", "").replace(",", "")
    event_node_mapping["event"] = event_node_mapping["event"].str.replace("_", " ").replace("(", "").replace(")", "").replace(",", "")
    event_node_mapping["node"] = event_node_mapping["node"].str.replace("_", " ").replace("(", "").replace(")", "").replace(",", "")
    
    # Create a pathpyG Graph from the event (scene) DAG
    event_edge_index_array = event_graph.to_numpy()
    event_names, event_name_indices = np.unique(event_edge_index_array, return_inverse=True)
    event_edge_index = torch.tensor(event_name_indices.reshape(-1, 2)).T.contiguous()
    event_graph["src_idx"] = event_edge_index[0]
    event_graph["dst_idx"] = event_edge_index[1]
    event_mapping = pp.IndexMap(list(event_names))
    event_G = pp.Graph(Data(edge_index=event_edge_index), event_mapping)

    # Compute the shortest path lengths between all pairs of events
    dist, predecessors = pp.algorithms.shortest_paths.shortest_paths_dijkstra(event_G)

    # Create cliques for each event
    event_node_mapping["event_idx"] = event_node_mapping["event"].apply(lambda x: event_mapping.to_idx(x))
    same_node_pairs = pd.merge(event_node_mapping, event_node_mapping, on="node")
    same_node_edges = same_node_pairs[dist[same_node_pairs["event_idx_x"], same_node_pairs["event_idx_y"]] <= delta].copy()
    same_node_edges["node_x"] = same_node_edges["node"]
    same_node_edges["node_y"] = same_node_edges["node"]
    same_node_edges = same_node_edges.drop(columns=["node"])

    cliques = pd.merge(event_node_mapping, event_node_mapping, on="event")
    cliques["event_x"] = cliques["event"]
    cliques["event_y"] = cliques["event"]
    cliques = cliques.drop(columns=["event"])

    possible_edges = pd.concat([same_node_edges, cliques], ignore_index=True).reset_index(drop=True)
    possible_edges["src_node_ID"] = possible_edges["event_x"] + "_" + possible_edges["node_x"]
    possible_edges["dst_node_ID"] = possible_edges["event_y"] + "_" + possible_edges["node_y"]

    graph_array = possible_edges[["src_node_ID", "dst_node_ID"]].to_numpy()
    unique_event_node_names, event_node_index_array = np.unique(graph_array, return_inverse=True)
    edge_index = torch.tensor(event_node_index_array.reshape(-1, 2)).T.contiguous()
    data = Data(edge_index=edge_index)
    mapping = pp.IndexMap(list(unique_event_node_names))
    node_names = list()
    for event_node_name in unique_event_node_names:
        event_name, node_name = event_node_name.split("_")
        node_names.append(node_name)
    node_names, node_id = np.unique(node_names, return_inverse=True)
    data.node_sequence = torch.tensor(node_id).unsqueeze(1)
    data.node_names = node_names
    return pp.Graph(data, mapping=mapping)

In [19]:
def temporal_betweenness_centrality(g: pp.Graph) -> dict[str, float]:
    """Calculate the temporal betweenness of nodes in a temporal graph.

    The temporal betweenness centrality definition is based on shortest 
    time-respecting paths with a given maximum time difference delta, where 
    the length of a path is given as the number of traversed edges (i.e. not 
    the temporal duration of a path or the earliest arrival at a node).

    The algorithm is an adaptation of Brandes' fast algorithm for betweenness 
    centrality based on the following work:

    S. Buss, H. Molter, R. Niedermeier, M. Rymar: Algorithmic Aspects of Temporal
    Betweenness, arXiv:2006.08668v2

    Different from the algorithm proposed above, the temporal betweenness centrality
    implemented in pathpyG is based on a directed acyclic event graph representation of 
    a temporal graph and it considers a maximum waiting time of delta. The complexity 
    is in O(nm) where n is the number of nodes in the temporal graph and m is the number 
    of time-stamped edges.

    Args:
        g: `TemporalGraph` object for which temporal betweenness centrality will be computed
        delta: maximum waiting time for time-respecting paths

    Example:
        ```py
        import pathpyG as pp
        t = pp.TemporalGraph.from_edge_list([('a', 'b', 1), ('b', 'c', 2),
                            ('b', 'd', 2), ('c', 'e', 3), ('d', 'e', 3)])
        bw = pp.algorithms.temporal_betweenness_centrality(t, delta=1)
        ```
    """
    # generate temporal event DAG
    edge_index = pp.algorithms.lift_order.lift_order_edge_index(G.data.edge_index, G.N)

    # Add indices of first-order nodes as src of paths in augmented
    # temporal event DAG
    src_edges_src = g.data.node_sequence.squeeze()[g.data.edge_index[0]] + g.M
    src_edges_dst = torch.arange(0, g.data.edge_index.size(1))

    # add edges from first-order source nodes to edge events
    src_edges = torch.stack([src_edges_src, src_edges_dst])
    edge_index = torch.cat([edge_index, src_edges], dim=1)
    src_indices = torch.unique(src_edges_src).tolist()

    event_graph = pp.Graph.from_edge_index(edge_index, num_nodes=g.M+g.N)

    e_i = pp.utils.convert.to_numpy(g.data.edge_index)

    fo_nodes = dict()
    for v in range(g.M+g.N):
        if v < g.M:  # return first-order target node otherwise
            fo_nodes[v] = g.data.node_sequence.squeeze()[e_i[1, v]]
        else:
            fo_nodes[v] = v - g.M

    bw: defaultdict[int, float] = defaultdict(lambda: 0.0)

    # for all first-order nodes
    for s in tqdm(src_indices):

        # for any given s, d[v] is the shortest path distance from s to v
        # Note that here we calculate topological distances from sources to events (i.e. time-stamped edges)
        delta_: defaultdict[int, float] = defaultdict(lambda: 0.0)

        # for any given s, sigma[v] counts shortest paths from s to v
        sigma: defaultdict[int, float] = defaultdict(lambda: 0.0)
        sigma[s] = 1.0

        sigma_fo: defaultdict[int, float] = defaultdict(lambda: 0.0)
        sigma_fo[fo_nodes[s]] = 1.0

        dist: defaultdict[int, int] = defaultdict(lambda: -1)
        dist[s] = 0

        dist_fo: defaultdict[int, int] = defaultdict(lambda: -1)
        dist_fo[fo_nodes[s]] = 0
                
        # for any given s, P[v] is the set of predecessors of v on shortest paths from s
        P = defaultdict(set)

        # Q is a queue, so we append at the right and pop from the left
        Q: deque = deque()
        Q.append(s)

        # S is a stack, so we append at the end and pop from the end
        S = list()
    
        # dijkstra with path counting
        while Q:
            v = Q.popleft()
            # for all successor events within delta
            for w in event_graph.successors(v):

                # we dicover w for the first time
                if dist[w] == -1:
                    dist[w] = dist[v] + 1
                    if dist_fo[fo_nodes[w]] == -1:
                        dist_fo[fo_nodes[w]] = dist[v] + 1
                    S.append(w)
                    Q.append(w)
                # we found a shortest path to event w via event v
                if dist[w] == dist[v] + 1:
                    sigma[w] += sigma[v]
                    P[w].add(v)
                    # we found a shortest path to first-order node of event w
                    if dist[w] == dist_fo[fo_nodes[w]]:
                        sigma_fo[fo_nodes[w]] += sigma[v]
        
        c = 0.0
        for i in dist_fo:
            if dist_fo[i] >= 0:
                c += 1.0
        bw[fo_nodes[s]] = bw[fo_nodes[s]] - c + 1.0

        while S:
            w = S.pop()
            # work backwards through paths to all targets and sum delta and sigma   
            if dist[w] == dist_fo[fo_nodes[w]]:
                x = sigma[w]/sigma_fo[fo_nodes[w]]
                if isnan(x):
                    x = 0.0
                delta_[w] += x
            for v in P[w]:
                x = sigma[v]/sigma[w]
                if isnan(x):
                    x = 0.0
                delta_[v] += x * delta_[w]
                bw[fo_nodes[v]] += delta_[w] * x
    
    # map index-based centralities to node IDs
    bw_id = defaultdict(lambda: 0.0)
    for idx in bw:
        bw_id[g.data.node_names[idx]] = bw[idx]
    return bw_id

## Read Data

In the following, we read the data from the book "John Sinclair 32: Turm der 1000 Schrecken".

In [4]:
time_line = np.array([[0, 29], [1, 0], [2, 1], [3, 2], [4, 3], [5, 4], [6, 5], [7, 6], [8, 17], [9, 7], [10, 8], [11, 9], [12, 10], [13, 18], [14, 19], [15, 20], [16, 21], [17, 22], [18, 23], [19, 24], [20, 25], [21, 11], [22, 12], [23, 13], [24, 14], [25, 15], [26, 16], [27, 30], [28, 31], [29, 32], [30, 33], [31, 26], [32, 27], [33, 28], [34, 34], [35, 35], [36, 42], [37, 36], [38, 37], [39, 39], [40, 38], [41, 41], [42, 43], [43, 44], [44, 45], [45, 40], [46, 46], [47, 50], [48, 47], [49, 51], [50, 52], [51, 53], [52, 48], [53, 54], [54, 49], [55, 55], [56, 56], [57, 57]], dtype=str)
figures = np.array([[0, 'Carla'], [1, 'Carla'], [2, 'George'], [3, 'George'], [4, 'John'], [4, 'Suko'], [5, 'John'], [5, 'Inspektor Grey'], [6, 'John'], [6, 'Suko'], [6, 'Jim'], [7, 'John'], [7, 'Suko'], [7, 'Jim'], [7, 'George'], [8, 'Carla'], [8, 'Cedric'], [8, 'Dave'], [8, 'Roy'], [8, 'Odetta'], [8, 'Jerry'], [8, 'Arthur'], [8, 'Laureen'], [8, 'Angela'], [8, 'Sylvia'], [9, 'John'], [9, 'Suko'], [9, 'George'], [10, 'John'], [10, 'Suko'], [11, 'Suko'], [11, 'Der magische Schatten'], [12, 'John'], [12, 'Suko'], [13, 'Carla'], [13, 'Cedric'], [13, 'Dave'], [13, 'Roy'], [13, 'Odetta'], [13, 'Jerry'], [13, 'Arthur'], [13, 'Laureen'], [13, 'Angela'], [13, 'Sylvia'], [14, 'Carla'], [14, 'Cedric'], [14, 'Dave'], [14, 'Roy'], [14, 'Odetta'], [14, 'Jerry'], [14, 'Arthur'], [14, 'Laureen'], [14, 'Angela'], [14, 'Sylvia'], [15, 'Carla'], [15, 'Cedric'], [15, 'Dave'], [15, 'Roy'], [15, 'Odetta'], [15, 'Jerry'], [15, 'Arthur'], [15, 'Laureen'], [15, 'Angela'], [15, 'Sylvia'], [16, 'Carla'], [16, 'Cedric'], [16, 'Dave'], [16, 'Roy'], [16, 'Odetta'], [16, 'Jerry'], [16, 'Arthur'], [16, 'Laureen'], [16, 'Angela'], [16, 'Sylvia'], [16, 'Herb Scatwell'], [17, 'Carla'], [17, 'Dave'], [18, 'Carla'], [19, 'Carla'], [20, 'Carla'], [21, 'John'], [21, 'Suko'], [21, 'Dr. Lesley Calhoun'], [22, 'John'], [22, 'Suko'], [23, 'Suko'], [24, 'Suko'], [24, 'Der magische Schatten'], [25, 'John'], [26, 'John'], [26, 'Suko'], [27, 'Cedric'], [27, 'Dave'], [27, 'Roy'], [27, 'Odetta'], [27, 'Jerry'], [27, 'Arthur'], [27, 'Laureen'], [27, 'Angela'], [27, 'Sylvia'], [27, 'Herb Scatwell'], [28, 'Cedric'], [28, 'Dave'], [28, 'Roy'], [28, 'Odetta'], [28, 'Jerry'], [28, 'Arthur'], [28, 'Laureen'], [28, 'Angela'], [28, 'Sylvia'], [28, 'Herb Scatwell'], [29, 'Cedric'], [29, 'Dave'], [29, 'Roy'], [29, 'Odetta'], [29, 'Jerry'], [29, 'Arthur'], [29, 'Laureen'], [29, 'Angela'], [29, 'Sylvia'], [29, 'Herb Scatwell'], [30, 'Cedric'], [30, 'Dave'], [30, 'Roy'], [30, 'Odetta'], [30, 'Jerry'], [30, 'Arthur'], [30, 'Laureen'], [30, 'Angela'], [30, 'Sylvia'], [31, 'Carla'], [32, 'Carla'], [33, 'Carla'], [33, 'Der magische Schatten'], [34, 'John'], [34, 'Inspektor Grey'], [35, 'Suko'], [35, 'Der magische Schatten'], [36, 'Carla'], [36, 'Der magische Schatten'], [37, 'Cedric'], [37, 'Odetta'], [38, 'Cedric'], [38, 'Der magische Schatten'], [39, 'John'], [39, 'Suko'], [40, 'Cedric'], [41, 'Odetta'], [42, 'Odetta'], [42, 'Carla'], [43, 'Odetta'], [43, 'Carla'], [44, 'Odetta'], [44, 'Carla'], [45, 'John'], [45, 'Cedric'], [46, 'Carla'], [46, 'Elizabeth Walters'], [47, 'John'], [47, 'Inspektor Grey'], [47, 'Suko'], [47, 'Cedric'], [47, 'Odetta'], [48, 'Carla'], [48, 'Der magische Schatten'], [49, 'John'], [49, 'Suko'], [50, 'John'], [50, 'Suko'], [51, 'John'], [51, 'Suko'], [51, 'Herb Scatwell'], [52, 'Carla'], [52, 'Der magische Schatten'], [53, 'John'], [53, 'Suko'], [53, 'Herb Scatwell'], [54, 'Carla'], [54, 'Der magische Schatten'], [55, 'John'], [55, 'Suko'], [56, 'John'], [56, 'Suko'], [57, 'John'], [57, 'Suko'], [57, 'Der magische Schatten']], dtype=str)

scenes = pd.DataFrame(time_line, columns=["src", "dst"])

characters = pd.DataFrame(figures, columns=["event", "node"])

The following contains the data for a smaller example that can alternatively be used. Remove comments (hashtags) to use it.

In [5]:
# scenes = pd.DataFrame({"src": ["a", "b", "b", "c", "d"],
#                             "dst": ["b", "c", "d", "d", "e"]})

# characters = pd.DataFrame({"event": ["a", "a", "b", "b", "c", "d", "d", "e"],
#                                    "node": ["1", "2", "1", "3", "2", "2", "3", "1"]})

The following graph visualizes how the different scenes relate to each other.

In [6]:
show_event_graph(scenes)

The following uses the provided data to construct a time unfolded graph representation where every character at each scene is connected to all other characters in the same scene and to itself if the character appears in later scenes. The parameter `delta` specifies how "far away" these scenes can be. It specifies the maximum walk length on the above scene graph.

In [7]:
G = from_DAG(scenes, characters, delta=float("inf"))

The following visualizes the time unfolded graph but is commented out due to its size.

In [8]:
# pp.plot(G, node_label=list(G.nodes))

We can now count the connections between each pair of characters and construct a static graph representation with the edge counts as weights:

In [9]:
static_graph = pp.algorithms.lift_order.aggregate_edge_index(G.data.edge_index, G.data.node_sequence)
static_mapping = pp.IndexMap(list(G.data.node_names))
static_graph.mapping = static_mapping
pp.plot(static_graph, node_label=list(static_graph.nodes), edge_size=static_graph.data.edge_weight.tolist())

<pathpyG.visualisations.network_plots.StaticNetworkPlot at 0x7f37517435b0>

Using this static representation, we can calculate centralities, e.g. the betweennes centrality (a bigger value suggest a more important role in the plot):

In [16]:
for key, item in pp.algorithms.centrality.betweenness_centrality(static_graph).items():
    print(f"{key}: {item:.2f}")

Dr. Lesley Calhoun: 0.00
George: 0.00
Jim: 0.00
Inspektor Grey: 0.00
John: 76.17
Suko: 76.17
Der magische Schatten: 6.50
Elizabeth Walters: 0.00
Sylvia: 0.00
Roy: 0.00
Odetta: 46.33
Laureen: 0.00
Jerry: 0.00
Herb Scatwell: 36.83
Dave: 0.00
Cedric: 55.00
Carla: 58.00
Arthur: 0.00
Angela: 0.00


Calculating the centrality measures as above, disregards the arrow of time. To only consider paths that respect the arrow of time, we can transform the graph using a line graph transformation. We call the resulting graph to be of second order.

## 2nd Order Transformation

In the following, we do a line graph transformation of the above graph to investigate higher-order patterns. In the graph, each edge represents a walk of length 2 and a node an edge of the original graph.

In [17]:
ho_index = pp.algorithms.lift_order.lift_order_edge_index(G.data.edge_index, G.N)
node_sequence = G.data.node_sequence
node_sequence = torch.cat([node_sequence[G.data.edge_index[0]], node_sequence[G.data.edge_index[1]][:, -1:]], dim=1)

In [18]:
event_graph = pp.Graph(Data(edge_index=ho_index))
# pp.plot(event_graph, node_label=[str(G.data.node_names[tensor]) for tensor in node_sequence])

We can see that the centralities calculated using this representation differ substantially from the static ones:

In [14]:
for key, item in temporal_betweenness_centrality(G).items():
    print(f"{key}: {item:.2f}")

100%|██████████| 19/19 [00:02<00:00,  7.81it/s]

Angela: 19.00
George: 25.00
Dr. Lesley Calhoun: -0.00
Jim: -0.00
Inspektor Grey: -0.00
Suko: 0.00
John: 29.28
Der magische Schatten: 12.00
Elizabeth Walters: -0.00
Herb Scatwell: 109.56
Odetta: 72.08
Dave: 21.33
Cedric: 87.22
Carla: 132.01
Sylvia: 0.00
Laureen: 19.00
Arthur: 19.00
Jerry: 19.00
Roy: 19.00



