In [30]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import spektral
from spektral.datasets.citation import Citation
from torch.utils.data import Dataset, DataLoader

In [36]:
dataset = Citation("citeseer", random_split=True, normalize=True)
dataset[0]

  self._set_arrayXarray(i, j, x)


Graph(n_nodes=3327, n_node_features=3703, n_edge_features=None, n_labels=6)

In [27]:
node_list = set([x[0] for x in edge_list] + [x[1] for x in edge_list])
len(node_list)

3327

In [40]:
node_ids = {k:v for v,k in enumerate(node_list)}

In [47]:
n = len(node_list)
adj_mat = np.zeros((n,n))

for x,y in edge_list:
    adj_mat[node_ids[x],node_ids[y]] = 1

In [60]:
dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

In [74]:
alpha = 0.15

Abar = np.matmul(adj_mat, dinv_mat)
I = np.diag(np.ones(n))
M = np.linalg.inv(I - (1-alpha)*Abar)

intimacy_mat = alpha*M

In [144]:
class CoraDataset(Dataset):
    def __init__(self, path='data/citeseer/', context_size=10) -> None:
        """
        Args:
            path (str, optional): path containing the Cora dataset. Defaults to 'data/citeseer/'.
            context_size (int, optional): number of nodes in a target node context, represent the
                                          topk nodes sorted by intimacy score. Defaults to 10.
        """
        super().__init__()

        self.context_size = context_size

        # swap nodes on each row as they are listed as target-source
        df = pd.read_csv(f"{path}/citeseer.cites", sep='\t', header=None)
        self.edge_list = [(x[1], x[0]) for x in df.values.tolist()]
        self.node_list = set([x[0] for x in self.edge_list] + [x[1] for x in self.edge_list])
        self.node_ids = {k:v for v,k in enumerate(node_list)}
        self.n = len(node_list)

        # pre-process graph to make data loader more efficient
        self.build_intimacy_matrix()
        self.build_contexts()

    def build_intimacy_matrix(self, alpha=0.15):
        # create adjacency matrix
        n = self.n
        adj_mat = np.zeros((n,n))

        for x,y in self.edge_list:
            adj_mat[self.node_ids[x], self.node_ids[y]] = 1
            adj_mat[self.node_ids[y], self.node_ids[x]] = 1
        
        # compute inverse of diagonal degrees matrix
        dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

        # compute final matrix, for details see (1) in Graph-BERT by Zhang et al. '20 page 3
        Abar = np.matmul(adj_mat, dinv_mat)
        I = np.diag(np.ones(n))
        M = np.linalg.inv(I - (1-alpha)*Abar)

        self.intimacy_mat = alpha*M
    
    def build_contexts(self):
        context_list = np.zeros((self.n, self.context_size+1), dtype=np.int)

        # always include target node into its context
        context_list[:,0] = range(self.n)

        # context of a node contains the topk nodes sorted by intimacy score
        context_list[:,1:] = (-self.intimacy_mat).argsort(axis=1)[:,:self.context_size]
        
        self.context_list = context_list
    
    def __len__(self):
        """ each node together with its context represent an instance of the graph """
        return self.n
    
    def __getitem__(self, idx):
        pass

In [145]:
cora = CoraDataset()

In [146]:
cora.context_list

array([[   0,  936,  576, ..., 3249, 3229,  855],
       [   1,    1, 2073, ..., 2215, 2216, 2217],
       [   2,    2, 2724, ..., 3063, 2994, 1925],
       ...,
       [3324, 1221,  844, ..., 1489,  206, 3063],
       [3325, 2734, 2786, ...,  126, 1143, 1662],
       [3326, 3278, 2561, ..., 2724, 1311,  522]])

In [136]:
(-cora.intimacy_mat[0,:]).argsort()

array([ 936,  576,  236, ..., 1136,  493, 2692], dtype=int64)

In [116]:
i = 50
cora.intimacy_mat[i,(-cora.intimacy_mat[i]).argsort()[:5]]

array([-0., -0., -0., -0., -0.])

In [109]:
(cora.intimacy_mat[10] > 0).sum()

1