In [1]:
import pathpyG as pp

import torch
import torch_geometric as pyG

In [2]:
walks = pp.WalkData()
walks.mapping = pp.IndexMap(list("abcde"))
walks.add_walk_seq(tuple("acd"), freq = 2)
walks.add_walk_seq(tuple("bce"),freq = 2)

AttributeError: module 'pathpyG' has no attribute 'WalkData'

In [3]:
print(walks.mapping)

a -> 0
b -> 1
c -> 2
d -> 3
e -> 4



In [4]:
walks.paths

{0: tensor([[0, 2],
         [2, 3]], dtype=torch.int32),
 1: tensor([[1, 2],
         [2, 4]], dtype=torch.int32)}

In [5]:
g2 = pp.HigherOrderGraph(walks, order = 2)
pp.plot(g2)

<pathpyG.visualisations.network_plots.StaticNetworkPlot at 0x7f34941c1c60>

In [6]:
from pathpyG import PathData
from pathpyG import HigherOrderGraph
from typing import Any

class MultiOrderGraph:
    """Hierarchy of higher-order graphs built from one set of paths.

    Together the layers form a multi-order model of the path statistics.

    Attributes are set in ``__init__``:
      * ``max_order`` -- highest order for which a layer is built.
      * ``layers``    -- dict mapping order ``k`` (1..max_order) to the
                         ``HigherOrderGraph`` of that order.
      * ``paths``     -- the path data all layers were built from.
    """

    def __init__(self, paths: PathData, max_order: int = 1, **kwargs: Any):
        """Build one ``HigherOrderGraph`` per order from 1 up to ``max_order``.

        ``kwargs`` is accepted but not used by this implementation.
        """
        self.max_order = max_order
        self.layers = {}
        self.paths = paths

        requested_orders = range(max_order + 1)

        # Order 0 is the first requested order whenever the range is
        # non-empty; it has no layer yet, so it is only reported.
        if len(requested_orders) > 0:
            print("support for zeroth order missing")

        for order in requested_orders[1:]:
            self.layers[order] = HigherOrderGraph(paths=paths, order=order)


# dof_zeroth = n_nodes - 1

In [7]:
mon = MultiOrderGraph(walks, max_order = 2)
pp.plot(mon.layers[2])

support for zeroth order missing


<pathpyG.visualisations.network_plots.StaticNetworkPlot at 0x7f35846c8160>

In [8]:
g1 = mon.layers[1]
g2 = mon.layers[2]

In [9]:
walks.edge_index

tensor([[0, 1, 2, 2],
        [2, 2, 3, 4]], dtype=torch.int32)

In [10]:
g1.data.edge_index

EdgeIndex([[0, 1, 2, 2],
           [2, 2, 3, 4]], dtype=torch.int32, sparse_size=(3, ?), nnz=4,
          sort_order=row)

In [11]:
walks.edge_index

tensor([[0, 1, 2, 2],
        [2, 2, 3, 4]], dtype=torch.int32)

Computing degrees of freedom

In [12]:
# Build the first-order topology as a plain pyG Data object.
# The cast to torch.int64 is necessary to avoid
# "RuntimeError: scatter(): Expected dtype int64 for index"
# raised inside the line-graph transformation (walks.edge_index is int32).
network_topology = pyG.data.Data(edge_index=walks.edge_index.type(torch.int64)
)  
# Directed line graph of the topology; its node count equals the number of
# first-order edges in this example (4) -- presumably one node per edge.
line_graph_tranform = pyG.transforms.LineGraph(force_directed=True)
line_graph = line_graph_tranform(network_topology)

# Degrees of freedom of the next-higher-order network:
# number of line-graph nodes minus the number of line-graph nodes that have
# at least one outgoing edge (distinct entries in the source row).
num_len_k_paths = line_graph.num_nodes
num_nonzero_outdegrees = len(line_graph.edge_index[0].unique())
dof = num_len_k_paths - num_nonzero_outdegrees



Computing likelihood

In [13]:
walks

<pathpyG.core.WalkData.WalkData at 0x7f34941c11e0>

In [14]:
g1

<pathpyG.core.HigherOrderGraph.HigherOrderGraph at 0x7f34941c2d70>

In [15]:
g2.edge_to_index

{(0, 2): 0, (1, 3): 1}

In [16]:
g2.data.edge_weight

tensor([2., 2.])

In [17]:
g2.mapping.id_to_idx

{('a', 'c'): 0, ('b', 'c'): 1, ('c', 'd'): 2, ('c', 'e'): 3}

In [18]:
# returns 
out_degrees = pyG.utils.degree(line_graph.edge_index[0])
out_degrees

tensor([2., 2.])

In [19]:
line_graph.num_nodes # 

4

In [20]:
print(g2.mapping)

('a', 'c') -> 0
('b', 'c') -> 1
('c', 'd') -> 2
('c', 'e') -> 3



In [21]:
g2.data.edge_index

EdgeIndex([[0, 1],
           [2, 3]], sparse_size=(2, 4), nnz=2, sort_order=row)

In [22]:
walks.paths

{0: tensor([[0, 2],
         [2, 3]], dtype=torch.int32),
 1: tensor([[1, 2],
         [2, 4]], dtype=torch.int32)}

In [23]:
mon.paths.edge_index_k_weighted(k=1)

(tensor([[0, 1, 2, 2],
         [2, 2, 3, 4]], dtype=torch.int32),
 tensor([2., 2., 2., 2.]))

In [24]:
mon.paths.edge_index_k_weighted(k=2)

(tensor([[[0, 2],
          [1, 2]],
 
         [[2, 3],
          [2, 4]]], dtype=torch.int32),
 tensor([2., 2.]))

In [25]:
# Build the bipartite edge index that links every node of one layer
# (g_source) to the corresponding node of a layer of a different order
# (g_target). Row 0 holds source-layer node indices, row 1 target-layer ones.
g_source = g1
g_target = g2



# NB(review): the original note here was left unfinished ("this function
#     might fail if we that ..."); the failure condition still needs to be
#     written down -- TODO.
# NB: the two directions are not symmetric:
#     upward   maps (1,2,x) to its prefix (1,2)  -> edge (1,2) -> (1,2,x);
#     downward maps (x,2,1) to its suffix (2,1)  -> edge (x,2,1) -> (2,1).
#     An alternative would be to map (1,2) to (1,x), so that a random-walk
#     step is taken in either direction.
# dk is the order difference between the two layers; it is not used below.
dk = max(g_source.order, g_target.order) - min(g_source.order, g_target.order)
if g_target.order > g_source.order: 
    if g_source.order == 1:
        # first-order ids are plain labels ('a'), not tuples, so take the
        # first element of the target tuple instead of a length-1 slice
        source_indexes = [g_source.mapping.to_idx(target_node[0]) for target_node in g_target.mapping.idx_to_id.values()]  
    else:
        # mapping to higher-order that continues current subpath:
        # each target node is linked from its prefix of length g_source.order
        source_indexes = [g_source.mapping.to_idx(target_node[:g_source.order]) for target_node in g_target.mapping.idx_to_id.values()]  
    source_to_target_bipartite_edge_index = torch.tensor([source_indexes, list(g_target.mapping.idx_to_id.keys())])
elif g_source.order > g_target.order:
    if g_target.order == 1:
        # first-order ids are plain labels, so take the last element only
        target_indexes = [g_target.mapping.to_idx(source_node[-1]) for source_node in g_source.mapping.idx_to_id.values()]
    else: 
        # mapping to lower order where only last k_low steps are remembered:
        # each source node is linked to its suffix of length g_target.order
        target_indexes = [g_target.mapping.to_idx(source_node[-g_target.order:]) for source_node in g_source.mapping.idx_to_id.values()]
    source_to_target_bipartite_edge_index = torch.tensor([ list(g_source.mapping.idx_to_id.keys()) ,target_indexes])
else: 
    # equal orders have no inter-layer mapping
    assert False, f"Source and target networks have the same order ({g_source.order},{g_target.order})"

In [26]:
source_indexes#[:2]

[0, 1, 2, 2]

In [27]:
g2.data

Data(edge_index=[2, 2], num_nodes=4, edge_weight=[2])

In [28]:
g2.data.edge_weight

tensor([2., 2.])

In [29]:
g2.data.edge_index

EdgeIndex([[0, 1],
           [2, 3]], sparse_size=(2, 4), nnz=2, sort_order=row)

In [30]:
g1.mapping.id_to_idx.values()

dict_values([0, 1, 2, 3, 4])

In [31]:
# Step-by-step walkthrough of turning first-order edge weights into
# transition probabilities; every intermediate tensor is printed.
edge_index = g1.data.edge_index
edge_weight = g1.data.edge_weight
num_nodes = g1.N

print(edge_index)
print(edge_weight)
row, col = edge_index[0], edge_index[1]
# group by the source row; cast to int64 because the edge index is int32
idx = row.to(torch.int64) # if flow == 'source_to_target' else row
# scatter sums the edge weights per entry of idx, i.e. per source node.
# NOTE: despite the name `in_weights`, this is the total OUTGOING weight
# of each node (idx is the source row).
in_weights = pyG.utils.scatter(edge_weight, idx, dim=0, dim_size=num_nodes, reduce='sum')
print(in_weights)
norm = 1/in_weights
print(norm)
# replaces infinities (from nodes without outgoing weight) with zeros
norm.masked_fill_(norm == float('inf'), 0)
print(norm)
# per-edge probability = edge weight / total outgoing weight of its source
out_probabilities = edge_weight*norm[idx]
print(out_probabilities)
print(list(g1.edges))
# Unused alternative (symmetric degree normalisation), kept for reference:
# deg_inv_sqrt = deg.pow_(-0.5)
# deg_inv_sqrt.masked_fill_(deg_inv_sqrt == float('inf'), 0)
# edge_weight = deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]



EdgeIndex([[0, 1, 2, 2],
           [2, 2, 3, 4]], dtype=torch.int32, sparse_size=(3, ?), nnz=4,
          sort_order=row)
tensor([2., 2., 2., 2.])
tensor([2., 2., 4., 0., 0.])
tensor([0.5000, 0.5000, 0.2500,    inf,    inf])
tensor([0.5000, 0.5000, 0.2500, 0.0000, 0.0000])
tensor([1.0000, 1.0000, 0.5000, 0.5000])
[('a', 'c'), ('b', 'c'), ('c', 'd'), ('c', 'e')]


In [32]:
def get_transition_probabilities(edge_index, edge_weight, num_nodes=None):
    """Compute per-edge transition probabilities from edge weights.

    Every edge weight is divided by the total weight of all edges that leave
    the same source node, so the probabilities of the outgoing edges of each
    node sum to one.

    Example:
        given edge_index = [[0, 1, 2, 2],[2, 2, 3, 4]] and
        edge_weight = [2., 2., 2., 2.],
        returns the out probability of the edges: [1.0000, 1.0000, 0.5000, 0.5000]

    Args:
        edge_index: integer tensor whose first row holds the source node
            index of every edge (the second row is not needed here).
        edge_weight: tensor with one weight per edge along dimension 0.
        num_nodes: number of nodes used to size the per-node weight sums.
            If ``None`` (default) it is inferred as the largest source
            index + 1, which is sufficient because only source nodes are
            looked up.

    Returns:
        Tensor shaped like ``edge_weight`` (floating point) holding the
        transition probability of every edge. Edges whose source node has a
        total outgoing weight of zero get probability 0.
    """
    # index_add_ needs an integer index; edge indices may arrive as int32.
    source = edge_index[0].to(torch.int64)

    if num_nodes is None:
        num_nodes = int(source.max()) + 1 if source.numel() > 0 else 0

    # Total outgoing weight per node (sum of edge weights grouped by source).
    out_weight = edge_weight.new_zeros((num_nodes, *edge_weight.shape[1:]))
    out_weight.index_add_(0, source, edge_weight)

    # Invert the sums; nodes without outgoing weight would yield +/-inf, so
    # their factor is set to zero explicitly.
    inverse_out_weight = (1 / out_weight).masked_fill(out_weight == 0, 0)

    return edge_weight * inverse_out_weight[source]

edge_index = g1.data.edge_index
edge_weight = g1.data.edge_weight
num_nodes = g1.N

print(edge_index)
print(edge_weight)
get_transition_probabilities(edge_index, edge_weight, num_nodes)

EdgeIndex([[0, 1, 2, 2],
           [2, 2, 3, 4]], dtype=torch.int32, sparse_size=(3, ?), nnz=4,
          sort_order=row)
tensor([2., 2., 2., 2.])


tensor([1.0000, 1.0000, 0.5000, 0.5000])

In [33]:
class Lh_conv(pyG.nn.MessagePassing):
    """Message-passing layer that accumulates log-likelihoods along edges.

    Every edge of a bipartite source -> target graph carries a transition
    probability. The message sent along an edge is the value stored at its
    source node plus the log of that probability; messages arriving at the
    same target node are summed. Node values live on the last tensor
    dimension (``node_dim=-1``).
    """

    def __init__(self):
        super().__init__(aggr="sum", flow="source_to_target", node_dim = -1)

    def forward(self, x_low, x_high, source_to_target_edge_index, transition_probability):
        """Propagate values from source nodes to target nodes.

        Args:
            x_low: values of the source nodes (nodes on the last dimension).
            x_high: values of the target nodes, or ``None`` when the target
                side carries no values.
            source_to_target_edge_index: tensor whose first row holds source
                indices and whose second row holds target indices.
            transition_probability: one probability per edge.

        Returns:
            The aggregated messages per target node, as returned by
            ``propagate``.
        """
        # The size passed to propagate must be the number of nodes on each
        # side, not the number of distinct indices that occur in the edge
        # index: the two differ whenever some node has no edge.
        num_source = self._side_size(x_low, source_to_target_edge_index[0], self.node_dim)
        num_target = self._side_size(x_high, source_to_target_edge_index[1], self.node_dim)
        return self.propagate(
                    source_to_target_edge_index,
                    size = (num_source, num_target),
                    x = (x_low, x_high),
                    transition_probability = transition_probability)

    @staticmethod
    def _side_size(x, index, node_dim):
        """Number of nodes on one side: feature size if given, else max index + 1."""
        if x is not None:
            return x.size(node_dim)
        return int(index.max()) + 1 if index.numel() > 0 else 0

    def message(self, x_j, transition_probability):
        """Return the source value plus the log transition probability per edge."""
        # torch.log keeps the computation on the tensor's device and inside
        # autograd, and does not depend on numpy being imported.
        return x_j + torch.log(transition_probability)

In [34]:
import numpy as np

# Sanity check of Lh_conv on a toy bipartite graph with a single source node
# (index 0) and two target nodes (0 and 1).
lh_conv = Lh_conv()
# log-likelihood of the single source node: log(1) = 0
x_low = torch.tensor([np.log(1)])
# the target side carries no values
x_high = None
# edges 0 -> 0 and 0 -> 1
source_to_target_edge_index = torch.tensor([[0,0],[0,1]])
# P(0|0) = .3 and P(1|0) = .7, in edge order
transition_probability = torch.tensor([.3,.7])
vec_log_lh = lh_conv(x_low, x_high, source_to_target_edge_index, transition_probability)
# total log-likelihood over both target nodes
print(vec_log_lh.sum())

# Reference value computed by hand:
# edgeset is (0,1) (0,0) with probabilities P(0) = 1, P(1|0) = .7 P(0|0) = .3
P_0 = 1
P_1_cond_0 = .7
P_0_cond_0 = .3
# likelihood of each length-one path = P(start) * P(next | start)
L_path_01 = P_0*P_1_cond_0
L_path_00 = P_0*P_0_cond_0
# log of the product of both path likelihoods; should match the printed sum
expected_log_lh = np.log(L_path_01*L_path_00)  
expected_log_lh

tensor(-1.5606, dtype=torch.float64)


-1.5606477482646683

The code above can compute the likelihood of a set of paths only when ALL paths have the same length.

In [35]:
walks = pp.WalkData()
walks.mapping = pp.IndexMap(list("abcdef"))
walks.add_walk_seq(tuple("acd"), freq = 2)
walks.add_walk_seq(tuple("bce"),freq = 2)
walks.add_walk_seq(tuple("acef"),freq = 1)

In [36]:
pp.plot(pp.HigherOrderGraph(walks, order = 1))

<pathpyG.visualisations.network_plots.StaticNetworkPlot at 0x7f35846c84c0>

What if we pad the paths and give every transition into the padding value a probability of 1?

The bipartite edge indices computed above could then be reused, etc.

In [37]:
# Turn every stored walk into its node sequence, then right-pad all sequences
# with -1 to the length of the longest one so they stack into one 2-D tensor
# (one row per walk).
list_node_seq_paths = [walks.walk_to_node_seq(p) for p in walks.paths.values()]
# length (number of nodes) of the longest walk
max_len = max([x.numel() for x in list_node_seq_paths])
# pad=(0, k) appends k padding values at the end of the last dimension
padded_paths = [torch.nn.functional.pad(x, pad=(0, max_len - x.numel()), mode='constant', value=-1) for x in list_node_seq_paths]
padded_paths = torch.stack(padded_paths)
padded_paths

tensor([[ 0,  2,  3, -1],
        [ 1,  2,  4, -1],
        [ 0,  2,  4,  5]], dtype=torch.int32)

In [38]:
# from torch.nn.utils import rnn as rnn_utils


# # https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
# def pad_collate(batch):
#   # (xx, yy) = zip(*batch)
#   # y_lens = [len(y) for y in yy]
#   X = [item[0] for item in batch]
#   x_lens = [len(x) for x in X]
#   y = [item[1] for item in batch]
#   X_pad = rnn_utils.pad_sequence(X, batch_first=True, padding_value = 0) # torch.as_tensor(target_train).view(-1, n_features).float()
#   return X_pad, x_lens, torch.as_tensor(np.array(y))


# class CustomDataset(Dataset):
#     def __init__(self, x, y):
#         self.x = [torch.as_tensor(s) for s in x]
#         self.y = torch.as_tensor(y) #.view(-1, n_nodes)
        
#     def __getitem__(self, index):
#         return (self.x[index], self.y[index])

#     def __len__(self):
#         return len(self.x)


# train_var_data = CustomDataset(source_train, target_train)

# sampler = WeightedRandomSampler(weights=train_subpaths_freq, num_samples=len(train_var_data), replacement=True)
# train_var_loader = DataLoader(train_var_data, batch_size=batch_size, shuffle=False, collate_fn=pad_collate, sampler=sampler)


# from torch.utils.data import DataLoader, Dataset, TensorDataset




In [39]:
g1.mapping.idx_to_id

{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}

In [40]:
# Idea for mapping index tensors back to node ids, see
# https://discuss.pytorch.org/t/transform-matrix-with-set-of-function-per-row/48718/3
# source.apply_(g1.mapping.idx_to_id.get)

# Slide over the columns of the padded path tensor. At position i:
#   source = the (up to max_order) nodes that precede position i,
#   target = the (up to max_order) nodes ending at position i.
# At i = 0 the source window is empty; padded positions show up as -1.
max_order = 2
for i in range(0,padded_paths.shape[1]):
    source = padded_paths[:,max(0,i-max_order):i]
    target = padded_paths[:,max(0,i-max_order+1):i+1]
    print(source)
    print(target)
    print("-------------------------------------------")

tensor([], size=(3, 0), dtype=torch.int32)
tensor([[0],
        [1],
        [0]], dtype=torch.int32)
-------------------------------------------
tensor([[0],
        [1],
        [0]], dtype=torch.int32)
tensor([[0, 2],
        [1, 2],
        [0, 2]], dtype=torch.int32)
-------------------------------------------
tensor([[0, 2],
        [1, 2],
        [0, 2]], dtype=torch.int32)
tensor([[2, 3],
        [2, 4],
        [2, 4]], dtype=torch.int32)
-------------------------------------------
tensor([[2, 3],
        [2, 4],
        [2, 4]], dtype=torch.int32)
tensor([[ 3, -1],
        [ 4, -1],
        [ 4,  5]], dtype=torch.int32)
-------------------------------------------


In [42]:
import torch

# Sample tensor
tensor = torch.tensor([[1, 2, 3],
                       [4, 5, 6]])

# Mapping dictionaries
mapping1 = {1: 'a', 2: 'b', 3: 'c'}
mapping2 = {4: 'd', 5: 'e', 6: 'f'}

# Convert tensor to strings using the first mapping
mapped_tensor = tensor.clone().reshape(-1).tolist()  # Convert tensor to 1D list
mapped_tensor = [mapping1.get(x, x) for x in mapped_tensor]  # Map using first mapping
mapped_tensor

# Reshape the mapped_tensor back to the original shape
# mapped_tensor = torch.tensor(mapped_tensor).reshape(tensor.shape)

# # Convert tensor to tuples of strings and apply second mapping
# final_tensor = tensor.clone().tolist()  # Convert tensor to list
# final_tensor = [[mapping2.get(x, x) for x in row] for row in final_tensor]  # Map using second mapping

# print(final_tensor)


['a', 'b', 'c', 4, 5, 6]

In [None]:
g2.mapping.id_to_idx #source

{('a', 'c'): 0, ('b', 'c'): 1, ('c', 'd'): 2, ('c', 'e'): 3}

In [None]:
import numpy as np

lh_conv = Lh_conv()
x_low = torch.tensor([np.log(1)])
x_high = None
source_to_target_edge_index = torch.tensor([[0,0],[0,1]])
transition_probability = torch.tensor([.3,.7])
vec_log_lh = lh_conv(x_low, x_high, source_to_target_edge_index, transition_probability)
print(vec_log_lh.sum())

# edgeset is (0,1) (0,0) with probabilities P(0) = 1, P(1|0) = .7 P(0|0) = .3
P_0 = 1
P_1_cond_0 = .7
P_0_cond_0 = .3
#
L_path_01 = P_0*P_1_cond_0
L_path_00 = P_0*P_0_cond_0
expected_log_lh = np.log(L_path_01*L_path_00)  
expected_log_lh

tensor(-1.5606, dtype=torch.float64)


-1.5606477482646683

In [44]:
walks.paths

{0: tensor([[0, 2],
         [2, 3]], dtype=torch.int32),
 1: tensor([[1, 2],
         [2, 4]], dtype=torch.int32),
 2: tensor([[0, 2, 4],
         [2, 4, 5]], dtype=torch.int32)}

A starting point is to do the calculation on the longest subpaths. Each of their observations (in the higher-order layer of the multi-order network `mon`) corresponds to one of the entries in the likelihood computation.


However, the counts of the longest subpaths do not correspond to the number of paths, so we cannot simply start from them and move down the computation graph.


Maybe the padding code above, combined with an `Lh_conv` applied at increasing orders, can help.


(It might then be enough to have a quick way to map the rows of the path tensor to higher-order node ids.)

It is probably important to use `torch.bincount` at that point: https://pytorch.org/docs/stable/generated/torch.bincount.html