In [1]:
import torch

import pathpyG as pp

print('Running on', pp.config['torch']['device'])

Running on cpu


In [92]:
paths = pp.PathData()
paths.add_walk(torch.tensor([[0,2,3],[2,3,4]]),freq=3) # A -> C -> D
paths.add_walk(torch.tensor([[0,2],[2,3]])) # A -> C -> D
paths.add_walk(torch.tensor([[1,2],[2,4]])) # B -> C -> E
paths.add_walk(torch.tensor([[4],[5]]))
# paths.add_walk(torch.tensor([[1,2],[2,4]])) # B -> C -> E


In [99]:
index, edge_weights = paths.edge_index_k_weighted(k=2)
index, edge_weights

(tensor([[[0, 2],
          [1, 2],
          [2, 3]],
 
         [[2, 3],
          [2, 4],
          [3, 4]]]),
 tensor([4., 1., 3.]))

In [80]:
k=3
torch.tensor([[0,2,3,4],[2,3,4,5]]).unfold(1,k,1)

tensor([[[0, 2, 3],
         [2, 3, 4]],

        [[2, 3, 4],
         [3, 4, 5]],

        [[2, 3, 4],
         [3, 4, 5]]])

In [None]:
index, edge_weights = paths.edge_index_k_weighted(k=1)

In [53]:
from collections import defaultdict
def node_traversals(paths):
    """Calculates the number of times any path traverses each of the nodes.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
    """
    traversals = defaultdict(lambda: 0)
    for path_id, path_edgelist in paths.paths.items():
        path_seq = paths.walk_to_node_seq(path_edgelist)
        for node in path_seq:
            traversals[node.item()] += paths.path_freq[path_id]
    return traversals
# node_traversals(paths)

def visitation_probabilities(paths):
    """Calculates the probabilities that a randomly chosen path passes through each of
    the nodes. If 5 out of 100 paths (of any length) traverse node v, node v will be
    assigned a visitation probability of 0.05. This measure can be interpreted as ground
    truth for the notion of importance captured by PageRank applied to a graphical
    abstraction of the paths.

    Parameters
    ----------
    paths: Paths

    Returns
    -------
    dict
    """
    if not isinstance(paths, pp.PathData):
        assert False, "`paths` must be an instance of Paths"
    # Log.add('Calculating visitation probabilities...', Severity.INFO)

    # entries capture the probability that a given node is visited on an arbitrary path
    # Note: this is identical to the subpath count of zero-length paths
    # (i.e. the relative frequencies of nodes across all pathways)
    visit_probabilities = node_traversals(paths)

    # total number of visits
    visits = 0.0
    for v in visit_probabilities:
        visits += visit_probabilities[v]

    for v in visit_probabilities:
        visit_probabilities[v] /= visits
    # Log.add('finished.', Severity.INFO)
    return visit_probabilities

visitation_probabilities(paths)



defaultdict(<function __main__.node_traversals.<locals>.<lambda>()>,
            {0: 0.3333333333333333,
             2: 0.3333333333333333,
             3: 0.3333333333333333})

In [235]:
from collections import defaultdict
import numpy as _np

def shortest_paths(paths):
    """
    Calculates all shortest paths between all pairs of nodes 
    based on a set of empirically observed paths.
    """
    s_p = defaultdict(lambda: defaultdict(set))
    s_p_lengths = defaultdict(lambda: defaultdict(lambda: _np.inf))

    p_length = 1
    index, edge_weights = paths.edge_index_k_weighted(k=p_length)
    sources = index[0]
    destinations = index[-1]
    for e, (s, d) in enumerate(zip(sources, destinations)):
        s = s.item()
        d = d.item()
        s_p_lengths[s][d] = p_length
        s_p[s][d] = set({torch.tensor([s,d])})
    p_length += 1
    while True: # until max path length
        try:
            index, edge_weights = paths.edge_index_k_weighted(k=p_length)
            sources = index[0, :, 0]
            destinations = index[1, :, -1]
            for e, (s, d) in enumerate(zip(sources, destinations)):
                s = s.item()
                d = d.item()
                if p_length < s_p_lengths[s][d]:
                    # update shortest path length
                    s_p_lengths[s][d] = p_length
                    # redefine set
                    s_p[s][d] = {paths.walk_to_node_seq(index[:, e])}
                elif p_length == s_p_lengths[s][d]:
                    s_p[s][d].add(paths.walk_to_node_seq(index[:, e]))
            p_length += 1
        except IndexError:
            print(f"IndexError occurred. Reached maximum path length of {p_length}")
            break
    return s_p
shortest_paths(paths)

IndexError occurred. Reached maximum path length of 4


defaultdict(<function __main__.shortest_paths.<locals>.<lambda>()>,
            {0: defaultdict(set,
                         {2: {tensor([0, 2])},
                          3: {tensor([0, 2, 3])},
                          4: {tensor([0, 2, 3, 4])}}),
             1: defaultdict(set,
                         {2: {tensor([1, 2])}, 4: {tensor([1, 2, 4])}}),
             2: defaultdict(set, {3: {tensor([2, 3])}, 4: {tensor([2, 4])}}),
             3: defaultdict(set, {4: {tensor([3, 4])}}),
             4: defaultdict(set, {5: {tensor([4, 5])}})})

In [254]:
# @betweenness.register(Paths)
def betweenness(paths, normalized=False):
    """Calculates the betweenness of nodes based on observed shortest paths
    between all pairs of nodes

    Parameters
    ----------
    paths:
        Paths object
    normalized: bool
        normalize such that largest value is 1.0

    Returns
    -------
    dict
    """
    assert isinstance(paths, pp.PathData), "argument must be an instance of pathpy.Paths"
    node_centralities = defaultdict(lambda: 0)

    # Log.add('Calculating betweenness in paths ...', Severity.INFO)

    all_paths = shortest_paths(paths)

    for s in all_paths:
        for d in all_paths[s]:
            for p in all_paths[s][d]:
                for x in p[1:-1]:
                    if s != d != x:
                        node_centralities[x.item()] += 1.0 / len(all_paths[s][d])
    if normalized:
        max_centr = max(node_centralities.values())
        for v in node_centralities:
            node_centralities[v] /= max_centr
    # assign zero values to nodes not occurring on shortest paths
    nodes = [v.item() for v in paths.edge_index.reshape(-1).unique(dim=0)]
    for v in nodes:
        node_centralities[v] += 0
    # Log.add('finished.')
    return node_centralities

betweenness(paths,normalized=False)

IndexError occurred. Reached maximum path length of 4


defaultdict(<function __main__.betweenness.<locals>.<lambda>()>,
            {2: 3.0, 3: 1.0, 0: 0, 1: 0, 4: 0, 5: 0})

In [243]:
paths.paths

{0: tensor([[0, 2, 3],
         [2, 3, 4]]),
 1: tensor([[0, 2],
         [2, 3]]),
 2: tensor([[1, 2],
         [2, 4]]),
 3: tensor([[4],
         [5]])}

In [37]:
paths.path_freq

{0: 3, 1: 1, 2: 1, 3: 1}

In [14]:
paths.num_paths

4

In [255]:
paths.node_id

[]

In [259]:
    # dist = defaultdict(lambda: defaultdict(lambda: _np.inf))

    # Log.add('Calculating distance matrix based on empirical paths ...', Severity.INFO)
    # # Node: no need to initialize shortest_path_lengths[v][v] = 0
    # # since paths of length zero are contained in self.paths

    # for v in paths.nodes:
    #     dist[v][v] = 0

    # for p_length in paths.paths:
    #     for p in paths.paths[p_length]:
    #         start = p[0]
    #         end = p[-1]
    #         if p_length < dist[start][end]:
    #             dist[start][end] = p_length

    # Log.add('finished.', Severity.INFO)

    # return dist
    ####################################################################
# NOTE: pp2 code (see above) for 'distance_matrix' also doesn t return a matrix


In [260]:

def distance_matrix(paths):
    """
    Calculates shortest path distances between all pairs of
    nodes based on the observed shortest paths (and subpaths)
    """
    dist = defaultdict(lambda: defaultdict(lambda: _np.inf))
    # Log.add('Calculating distance matrix based on empirical paths ...', Severity.INFO)
    nodes = [v.item() for v in paths.edge_index.reshape(-1).unique(dim=0)] # NOTE: modify once set of nodes can be obtained from path obeject
    for v in nodes:
        dist[v][v] = 0

    p_length = 1
    index, edge_weights = paths.edge_index_k_weighted(k=p_length)
    sources = index[0]
    destinations = index[-1]
    for e, (s, d) in enumerate(zip(sources, destinations)):
        s = s.item()
        d = d.item()
        dist[s][d] = p_length
        # s_p[s][d] = set({torch.tensor([s,d])})
    p_length += 1
    while True: # until max path length
        try:
            index, edge_weights = paths.edge_index_k_weighted(k=p_length)
            sources = index[0, :, 0]
            destinations = index[1, :, -1]
            for e, (s, d) in enumerate(zip(sources, destinations)):
                s = s.item()
                d = d.item()
                if p_length < dist[s][d]:
                    # update shortest path length
                    dist[s][d] = p_length
            p_length += 1
        except IndexError:
            print(f"IndexError occurred. Reached maximum path length of {p_length}")
            break
    return dist
distance_matrix(paths)
    

IndexError occurred. Reached maximum path length of 4


defaultdict(<function __main__.distance_matrix.<locals>.<lambda>()>,
            {0: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {0: 0, 2: 1, 3: 2, 4: 3}),
             1: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {1: 0, 2: 1, 4: 2}),
             2: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {2: 0, 3: 1, 4: 1}),
             3: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {3: 0, 4: 1}),
             4: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {4: 0, 5: 1}),
             5: defaultdict(<function __main__.distance_matrix.<locals>.<lambda>.<locals>.<lambda>()>,
                         {5: 0})})

In [263]:
def closeness(paths, normalized=False):
    """Calculates the closeness of nodes based on observed shortest paths
    between all nodes

    Parameters
    ----------
    paths: Paths
    normalized: bool
        normalize such that largest value is 1.0

    Returns
    -------
    dict
    """
    node_centralities = defaultdict(lambda: 0)
    distances = distance_matrix(paths)
    nodes = [v.item() for v in paths.edge_index.reshape(-1).unique(dim=0)] # NOTE: modify once set of nodes can be obtained from path obeject

    for x in nodes:
        # calculate closeness centrality of x
        for d in nodes:
            if x != d and distances[d][x] < _np.inf:
                node_centralities[x] += 1.0 / distances[d][x]

    # assign zero values to nodes not occurring
    
    for v in nodes:
        node_centralities[v] += 0

    if normalized:
        m = max(node_centralities.values())
        for v in nodes:
            node_centralities[v] /= m

    return node_centralities
closeness(paths, normalized=True)

IndexError occurred. Reached maximum path length of 4


defaultdict(<function __main__.closeness.<locals>.<lambda>()>,
            {2: 0.7058823529411765,
             3: 0.5294117647058824,
             4: 1.0,
             5: 0.35294117647058826,
             0: 0.0,
             1: 0.0})

In [None]:
# @distance_matrix.register(Paths)
# def _dm(paths):
#     """
#     Calculates shortest path distances between all pairs of
#     nodes based on the observed shortest paths (and subpaths)
#     """
#     dist = defaultdict(lambda: defaultdict(lambda: _np.inf))

#     Log.add('Calculating distance matrix based on empirical paths ...', Severity.INFO)
#     # Node: no need to initialize shortest_path_lengths[v][v] = 0
#     # since paths of length zero are contained in self.paths

#     for v in paths.nodes:
#         dist[v][v] = 0

#     for p_length in paths.paths:
#         for p in paths.paths[p_length]:
#             start = p[0]
#             end = p[-1]
#             if p_length < dist[start][end]:
#                 dist[start][end] = p_length

#     Log.add('finished.', Severity.INFO)

#     return dist




@closeness.register(Paths)
def _cl(paths, normalized=False):
    """Calculates the closeness of nodes based on observed shortest paths
    between all nodes

    Parameters
    ----------
    paths: Paths
    normalized: bool
        normalize such that largest value is 1.0

    Returns
    -------
    dict
    """
    node_centralities = defaultdict(lambda: 0)
    distances = distance_matrix(paths)
    nodes = paths.nodes

    for x in nodes:
        # calculate closeness centrality of x
        for d in nodes:
            if x != d and distances[d][x] < _np.inf:
                node_centralities[x] += 1.0 / distances[d][x]

    # assign zero values to nodes not occurring
    
    for v in nodes:
        node_centralities[v] += 0

    if normalized:
        m = max(node_centralities.values())
        for v in nodes:
            node_centralities[v] /= m

    return node_centralities




