In [1]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import scatter

import pathpyG as pp
# Set the torch device stored in pathpyG's configuration to the CPU.
pp.config['torch']['device'] = 'cpu'

In [2]:
def compute_weighted_outdegrees(graph):
    """
    Sums the weights of the outgoing edges of every node.

    Args:
        graph: Object whose `data` attribute exposes `edge_weight`,
            `edge_index` and `num_nodes`.

    Returns:
        The result of a 'sum' scatter of the edge weights keyed by the source
        row of `edge_index` (row 0), with `dim_size` set to the node count.
    """
    data = graph.data
    weights = data.edge_weight
    # Row 0 of edge_index holds the source node of every edge.
    sources = data.edge_index[0]
    return scatter(weights, sources, dim=0, dim_size=data.num_nodes, reduce='sum')

def compute_transition_probabilities(graph):
    """
    Normalises every edge weight by the weighted out-degree of its source node.

    Args:
        graph: Object whose `data` attribute exposes `edge_weight`,
            `edge_index` and `num_nodes`.

    Returns:
        One value per edge: the edge weight divided by the summed weight of
        all edges that leave the same source node.
    """
    out_strength = compute_weighted_outdegrees(graph)
    edge_sources = graph.data.edge_index[0]
    # Broadcast the per-node totals back onto the edges they belong to.
    per_edge_total = out_strength[edge_sources]
    return graph.data.edge_weight / per_edge_total


In [3]:
from torch.utils.data import Dataset
import torch

# https://en.wikipedia.org/wiki/Pairing_function
# https://math.stackexchange.com/questions/1377929/generalization-of-cantor-pairing-function-to-triples-and-n-tuples
def cantor_pairing(x, y):
    """
    Maps a pair of non-negative integers to a single integer (Cantor pairing).

    Only `+`, `*` and `//` are used, so the function also works element-wise
    on integer tensors (this is how `cantor_encode_tensor` calls it).

    NOTE(review): the result grows quadratically in `x + y`; repeatedly nesting
    the pairing on fixed-width integer tensors may overflow - confirm the value
    range for long sequences.

    Args:
        x (int): The first integer.
        y (int): The second integer.

    Returns:
        int: The Cantor pairing value of `x` and `y`.
    """
    diagonal = x + y
    # Number of pairs on all earlier diagonals, plus the offset on this one.
    return diagonal * (diagonal + 1) // 2 + y

def cantor_encode_tensor(tnsr):
    """
    Collapses every row of a 2-D integer tensor into one integer by nesting
    Cantor pairings from the last column to the first, i.e.
    pair(c0, pair(c1, ... pair(c_last, 0))).

    A tensor with zero columns yields the scalar tensor 0.

    Args:
        tnsr (torch.Tensor): Tensor whose rows are the integer sequences to
            encode; it is indexed along dimension 1.

    Returns:
        torch.Tensor: One Cantor-encoded value per row.
    """
    # Seed of the fold; it is also the result for a tensor without columns.
    encoded = torch.tensor(0)
    # Walk the columns right-to-left so the innermost pairing is applied first.
    for column in range(tnsr.size(1) - 1, -1, -1):
        encoded = cantor_pairing(tnsr[:, column], encoded)
    return encoded


class WalksDataset(Dataset):
    """
    Dataset of node walks, grouped by walk length and re-encoded as indices
    of higher-order nodes.

    Args:
        dag_data (object): Container whose `dags` attribute is iterated; every
            item must expose `node_sequence` and `edge_weight`.
        dict_cantor_to_honode_ixs_mapping (dict): For every order, a dict that
            maps a Cantor-encoded node sequence to a higher-order node index.
        max_order (int): Largest number of most recent nodes used as context.
    """

    def __init__(self, dag_data, dict_cantor_to_honode_ixs_mapping, max_order):
        self.dict_cantor_to_honode_ixs_mapping = dict_cantor_to_honode_ixs_mapping
        self.max_order = max_order
        self._preprocess_data(dag_data.dags)
        self._create_tensors_and_encodings()

    def _preprocess_data(self, dags):
        """
        Groups the walks and their counts by walk length.

        Args:
            dags: Iterable of DAG objects exposing `node_sequence` and
                `edge_weight`.
        """
        self.walks_by_length = {}
        self.walk_counts_by_length = {}
        self.total_sequences = 0

        for dag in dags:
            # First row of the transposed node sequence = the nodes of the walk.
            walk = dag.node_sequence.T[0]
            num_nodes = len(walk)
            self.walks_by_length.setdefault(num_nodes, []).append(walk)
            # The single distinct edge weight is used as the walk count;
            # int() raises unless exactly one distinct weight is present.
            self.walk_counts_by_length.setdefault(num_nodes, []).append(
                int(dag.edge_weight.unique())
            )
            self.total_sequences += 1

    def _create_tensors_and_encodings(self):
        """
        Stacks the walks of equal length into one tensor per length and
        computes the bipartite encoding of each stack.
        """
        stacked = {}
        for length, walks in self.walks_by_length.items():
            stacked[length] = torch.stack(walks, dim=0)
        # Must be assigned before encoding: _bipartite_encode reads it.
        self.walk_tensors_by_length = stacked

        encoded = {}
        for length in stacked:
            encoded[length] = self._bipartite_encode(length)
        self.bipartite_encoded_walks_by_length = encoded

    def _bipartite_encode(self, walk_length):
        """
        Encodes all walks of one length as sequences of higher-order node
        indices.

        Position i of the result refers to the window of the (at most)
        `self.max_order` most recent nodes ending at step i: the first
        `self.max_order` positions use growing windows of order 1, 2, ...,
        all later positions use a window of exactly `self.max_order` nodes.

        Args:
            walk_length (int): The length of the walks to encode.

        Returns:
            torch.Tensor: One row per walk and one column per step.
        """
        columns = []
        for end in range(1, walk_length + 1):
            start = max(0, end - self.max_order)
            window = self.walk_tensors_by_length[walk_length][:, start:end]
            codes = cantor_encode_tensor(window)
            order = min(end, self.max_order)
            lookup = self.dict_cantor_to_honode_ixs_mapping[order].get
            # In-place, element-wise translation of Cantor ids to node indices.
            columns.append(codes.apply_(lookup))

        return torch.stack(columns, dim=1)

    def __getitem__(self, index_tuple):
        """
        Returns one bipartite-encoded walk.

        Args:
            index_tuple (tuple): `(walk_length, index)`, i.e. the length of
                the walk and its position among the walks of that length.

        Returns:
            torch.Tensor: The bipartite-encoded node sequence.
        """
        # Tuple indexing cannot work with batching. It could work if each
        # walk length had its own Dataset object.
        length, position = index_tuple
        walks_of_length = self.bipartite_encoded_walks_by_length[length]
        return walks_of_length[position]

    def __len__(self):
        """
        Returns the number of walks over all lengths.

        Returns:
            int: Total number of sequences.
        """
        return self.total_sequences


In [4]:
# Toy data set: weighted walks over the five node ids "0".."4".
dag_data = pp.DAGData(pp.IndexMap(list("01234")))

# Each walk is passed as a list of single-character node ids together with a weight.
dag_data.append_walk(list("0230230"), weight=30)
dag_data.append_walk(list("1241241"), weight=70)
dag_data.append_walk(list("0230241"), weight=1)
# Disabled alternative: a much longer walk.
# dag_data.append_walk(list("0230230230230230230"), weight=30)


# Build the multi-order model from the walks.
# NOTE(review): max_order=21 exceeds the number of nodes in every walk above (7) - confirm this is intended.
m = pp.MultiOrderModel.from_DAGs(dag_data, max_order=21)


In [5]:
# For every layer of the model: Cantor id of a higher-order node's sequence -> its row position.
dict_cantor_to_honode_ixs_mapping = {}
# Relies on the row position in `node_sequence` being the index of the
# higher-order node (the ho-node sequences are sorted by their indices).
for order, hon in m.layers.items():
    cantor_ids = cantor_encode_tensor(hon.data.node_sequence)
    # Pair every Cantor id with its row position 0, 1, 2, ...
    cantor_to_node_ixs_mapping = dict(zip(cantor_ids.tolist(), range(len(cantor_ids))))
    dict_cantor_to_honode_ixs_mapping[order] = cantor_to_node_ixs_mapping


In [6]:
# Encode the toy walks using at most the 2 most recent nodes as context (max_order=2).
walk_data = WalksDataset(dag_data, dict_cantor_to_honode_ixs_mapping, max_order=2)

TODO: the following cells constrain all of this to a single value of the path length.

In [7]:
# Walk length to analyse; this is the key used by the per-length dicts of WalksDataset.
l =7
# Encoded walks of that length: one row per walk, one column per step.
source_to_target_from_walks = walk_data.bipartite_encoded_walks_by_length[l]
# Observation count of every walk, in the same order as the rows above.
path_counts = walk_data.walk_counts_by_length[l]

In [8]:
# REMOVE THIS!
# FIND ANOTHER WAY TO GET NODE COUNTS
from torch_geometric.loader import DataLoader
# batch_size equals the number of DAGs, so the first (and only) batch contains
# all of them; the batch is then moved to the configured torch device.
dag_graph = next(iter(DataLoader(dag_data.dags, batch_size=len(dag_data.dags)))).to(pp.config["torch"]["device"])


In [9]:
# Accumulates the count-weighted log-likelihood of all walks of length `l`.

# Relative frequency of every node id over the batched node sequences.
# NOTE(review): these counts are raw occurrences in `dag_graph.node_sequence`;
# the walk weights (`path_counts`) are not used here - confirm this is intended.
unique_nodes, counts = torch.unique(dag_graph.node_sequence, return_counts=True)
node_emission_probabilities = counts / counts.sum()

# Row 0 is a dummy all-zero source; only row 1 (first column of the encoded
# walks) is read below.
source_to_target_edge_index_zeroth = torch.stack([
    torch.zeros_like(source_to_target_from_walks[:, 0]),
    source_to_target_from_walks[:, 0]
])

# log likelihood
tot_log_lh = 0
# log likelihood for the 0-th steps
# NOTE(review): indexing the probabilities by the encoded index assumes that
# `unique_nodes` is exactly 0..n-1 and matches the first-order indices - confirm.
lh_l = torch.mul(torch.log(node_emission_probabilities[source_to_target_edge_index_zeroth[1]]), torch.tensor(path_counts))
tot_log_lh += lh_l.sum()

# One iteration per transition of the walk (l nodes -> l - 1 transitions).
for i in range(0, l - 1):
    # Per-edge transition probabilities of the layer of order min(i + 2, max_order).
    T = compute_transition_probabilities(m.layers[min(i + 2, walk_data.max_order)])
    # Prepare source_to_target_edge_index
    # Columns i and i + 1 of the encoded walks, transposed to shape (2, num_walks).
    source_to_target_edge_index = source_to_target_from_walks[:, i:i + 2].T.squeeze()
    # log likelihood for i-th steps
    # NOTE(review): `T` holds one value per edge of the selected layer, while
    # row 1 holds higher-order *node* indices produced by WalksDataset - confirm
    # that these indices really address the intended edges of that layer.
    lh_l = torch.mul(torch.log(T[source_to_target_edge_index[1]]), torch.tensor(path_counts))
    tot_log_lh += lh_l.sum()

# Notebook cell result: the accumulated log-likelihood.
tot_log_lh


tensor(-739.3706)

- Adapt to multiple walk lengths
- Better structure for package
- Degrees of freedom etc. (actually performing the likelihood-ratio test)
    - Everything above only computes the log-likelihood
    - Need degrees of freedom
    - Need to actually perform the likelihood-ratio test (see line 407 of this: https://github.com/IngoScholtes/pathpy/blob/master/pathpy/MultiOrderModel.py#L378).
- Test functions

In [29]:


def get_mon_dof(m, max_order=None, assumption="paths"):
    """
    Computes the degrees of freedom of a multi-order model up to `max_order`.

    The zeroth order contributes `n - 1` degrees of freedom (the node
    probabilities), where `n` is the number of nodes of the first-order layer.
    Each order k = 1..max_order then adds its own contribution:

    * 'paths': the degrees of freedom of the k-th layer depend on the number
      of different paths in the graph. Because of the conservation of
      probability, every non-zero row of the transition matrix must sum to
      one; this is one constraint per such row, which is subtracted from the
      count. The code uses the node count of layer k minus the number of
      distinct source nodes in its edge index.
    * 'ngram': the assumption normally made for higher-order Markov chains,
      i.e. every node can follow every preceding sequence of nodes, which
      adds `n**k * (n - 1)`.

    Both assumptions coincide if the first-order network is fully connected,
    i.e. if all possible paths actually occur.

    Args:
        m: Multi-order model whose `layers` dict maps an order to an object
            exposing `data.num_nodes` and `data.edge_index`.
        max_order (int | None): Maximum order up to which model layers are
            taken into account. `None` selects the highest order in `m.layers`.
        assumption (str): 'paths' (alias 'path') or 'ngram' (alias 'ngrams').

    Returns:
        int: The degrees of freedom of the model.

    Raises:
        ValueError: If `max_order` exceeds the highest available layer or if
            `assumption` is not one of the accepted values.
    """
    highest_order = max(m.layers)
    if max_order is None:
        max_order = highest_order
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if max_order > highest_order:
        raise ValueError(
            'Error: max_order cannot be larger than maximum order of multi-order network'
        )

    num_nodes = m.layers[1].data.num_nodes
    # For both assumptions, the zeroth order simply gives the node probabilities.
    dof = num_nodes - 1

    if assumption in ("paths", "path"):
        for order in range(1, max_order + 1):
            layer_data = m.layers[order].data
            # NOTE(review): the node count of layer k is used as the number of
            # paths of order k - confirm against the intended path length.
            num_order_paths = layer_data.num_nodes
            # One normalisation constraint per node with at least one out-edge.
            num_nonzero_outdegrees = len(layer_data.edge_index[0].unique())
            dof += num_order_paths - num_nonzero_outdegrees
    elif assumption in ("ngram", "ngrams"):
        for order in range(1, max_order + 1):
            dof += (num_nodes ** order) * (num_nodes - 1)
    else:
        raise ValueError(
            f"Unknown assumption {assumption!r} in input. "
            "The only accepted values are 'paths' and 'ngram'."
        )

    return int(dof)

# Example: degrees of freedom of the toy model `m` up to order 5 (default 'paths' assumption).
get_mon_dof(m,max_order=5)

8