In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import spektral
import hashlib
from spektral.datasets.citation import Citation
from torch.utils.data import Dataset, DataLoader

In [94]:
from math import cos


class CoraDataset(Dataset):
    def __init__(self, path='data/citeseer/', context_size=10, with_wl=True, wl_iterations=2) -> None:
        """
        Args:
            path (str, optional): path containing the Cora dataset. Defaults to 'data/citeseer/'.
            context_size (int, optional): number of nodes in a target node context, represent the
                                          topk nodes sorted by intimacy score. Defaults to 10.
        """
        super().__init__()

        self.context_size = context_size
        self.with_wl = with_wl
        self.wl_iterations = wl_iterations

        # swap nodes on each row as they are listed as target-source
        df = pd.read_csv(f"{path}/citeseer.cites", sep='\t', header=None)
        self.edge_list = [(x[1], x[0]) for x in df.values.tolist()]
        self.node_list = set([x[0] for x in self.edge_list] + [x[1] for x in self.edge_list])
        self.name_to_id = {k:v for v,k in enumerate(self.node_list)}
        self.id_to_name = {v:k for v,k in enumerate(self.node_list)}
        self.n = len(self.node_list)

        # distances between nodes
        ids_edge_list = [(self.name_to_id[x], self.name_to_id[y]) for x,y in self.edge_list]
        G = nx.from_edgelist(ids_edge_list, create_using=nx.Graph)
        self.distance_matrix = dict(nx.all_pairs_shortest_path_length(G))

        # load features and labels
        self.raw_features = pd.read_csv(f"{path}/citeseer.content", sep='\t', header=None, index_col=0)
        self.raw_features.index = self.raw_features.index.astype(str)
        self.labels = self.raw_features.iloc[:,-1].astype('category').cat.codes
        self.raw_features = self.raw_features.iloc[:,:-1]

        # pre-process graph to make data loader more efficient
        self.build_intimacy_matrix()
        self.build_contexts()

        self.wl_colors = None
        if with_wl:
            self.build_wl_coloring()

    def build_intimacy_matrix(self, alpha=0.15):
        # create adjacency matrix
        n = self.n
        adj_mat = np.zeros((n,n))
        name_to_id = self.name_to_id

        for x,y in self.edge_list:
            adj_mat[name_to_id[x], name_to_id[y]] = 1
            adj_mat[name_to_id[y], name_to_id[x]] = 1
        
        # compute inverse of diagonal degrees matrix
        dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

        # compute final matrix, for details see (1) in Graph-BERT by Zhang et al. '20 page 3
        Abar = np.matmul(adj_mat, dinv_mat)
        I = np.diag(np.ones(n))
        M = np.linalg.inv(I - (1-alpha)*Abar)

        self.intimacy_mat = alpha*M
    
    def build_contexts(self):
        context_list = np.zeros((self.n, self.context_size+1), dtype=np.int)

        # always include target node into its context
        context_list[:,0] = range(self.n)

        # context of a node contains the topk nodes sorted by intimacy score
        context_list[:,1:] = (-self.intimacy_mat).argsort(axis=1)[:,:self.context_size]
        
        self.context_list = context_list
    
    def build_wl_coloring(self):
        G = nx.from_edgelist(self.edge_list, create_using=nx.Graph)

        # initialize node colors
        wl_colors = {node:1 for node in G.nodes}

        for _ in range(self.wl_iterations):
            for node in sorted(G.nodes):
                # combine colors from neighbors
                code_list = [wl_colors[node]] + [wl_colors[x] for x in sorted(G.neighbors(node))]
                code = "".join(map(str, code_list))

                # update node code
                wl_colors[node] = hashlib.md5(code.encode()).hexdigest()
        
        color_to_num = {color:i for i,color in enumerate(set(wl_colors.values()))}
        wl_colors = {node:color_to_num[c] for node,c in wl_colors.items()}
        self.wl_colors = wl_colors
    
    def position_embed(self, v, col):
        x = np.zeros(v.shape)
        dim = len(v)

        for i in range(dim//2):
            x[i] = np.math.sin(col/(10000**((2*i)/dim)))
            x[i+1] = np.math.cos(col/10000**((2*i+1)/dim))
        
        return x
    
    def __len__(self):
        """ each node together with its context represent an instance of the graph """
        return self.n
    
    def __getitem__(self, idx):
        # ids of nodes in the context
        context = self.context_list[idx,:]

        n = len(context)
        
        # raw feature vector embedding
        X = np.array([self.raw_features.loc[self.id_to_name[x]].to_numpy() for x in context])

        # Weisfeiler-Lehman absolute role embedding
        colors = [self.wl_colors[self.id_to_name[i]] for i in context]
        C = np.array([self.position_embed(X[i], colors[i]) for i in range(n)])

        # intimacy based relative positional embedding
        I = np.array([self.position_embed(X[i], i) for i in range(n)])

        # hop based relative distance embedding
        H = np.array([self.position_embed(X[i], self.distance_matrix[context[0]][x]) for i,x in enumerate(context)])

        # labels
        y = np.array([self.labels[self.id_to_name[x]] for x in context])
        
        return X, C, I, H, y

In [95]:
cora = CoraDataset()

  if (await self.run_code(code, result,  async_=asy)):


In [98]:
cora[0][1]

array([[ 0.17595693,  0.49531559,  0.73566397, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87323855, -0.31859114,  0.98395305, ...,  0.        ,
         0.        ,  0.        ],
       [-0.41228327, -0.62492818,  0.98846784, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.99390196, -0.29573498,  0.73313466, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.86902542, -0.36147152, -0.98314087, ...,  0.        ,
         0.        ,  0.        ],
       [-0.3301331 ,  0.32226867, -0.3584567 , ...,  0.        ,
         0.        ,  0.        ]])

In [83]:
G = nx.from_edgelist(cora.edge_list, create_using=nx.Graph)
dist = dict(nx.all_pairs_shortest_path_length(G))


In [86]:
dist

KeyboardInterrupt: 

In [32]:
context = cora.context_list[0,:]
context

array([   0,  754, 1409, 1025, 1546, 3076,  474,  656, 2005, 2716, 2521])

In [34]:
X = np.array([cora.raw_features.loc[cora.id_to_name[x]].to_numpy() for x in context])
X.shape

(11, 3703)