In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
from torch import nn
import spektral
import hashlib
from spektral.datasets.citation import Citation
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import normalize

In [54]:
from math import cos


class CoraDataset(Dataset):
    def __init__(self, path='data/citeseer/', context_size=10, with_wl=True, wl_iterations=2, cutoff=99) -> None:
        """
        Args:
            path (str, optional): path containing the Cora dataset. Defaults to 'data/citeseer/'.
            context_size (int, optional): number of nodes in a target node context, represent the
                                          topk nodes sorted by intimacy score. Defaults to 10.
        """
        super().__init__()

        self.context_size = context_size
        self.with_wl = with_wl
        self.wl_iterations = wl_iterations
        self.hidden_size = hidden_size

        # swap nodes on each row as they are listed as target-source
        df = pd.read_csv(f"{path}/citeseer.cites", sep='\t', header=None)
        self.edge_list = [(x[1], x[0]) for x in df.values.tolist()]
        self.node_list = set([x[0] for x in self.edge_list] + [x[1] for x in self.edge_list])
        self.name_to_id = {k:v for v,k in enumerate(self.node_list)}
        self.id_to_name = {v:k for v,k in enumerate(self.node_list)}
        self.n = len(self.node_list)

        # distances between nodes
        ids_edge_list = [(self.name_to_id[x], self.name_to_id[y]) for x,y in self.edge_list]
        G = nx.from_edgelist(ids_edge_list, create_using=nx.Graph)
        self.distance_matrix = dict(nx.all_pairs_shortest_path_length(G), cutoff=cutoff)

        # load features and labels
        self.raw_features = pd.read_csv(f"{path}/citeseer.content", sep='\t', header=None, index_col=0)
        self.raw_features.index = self.raw_features.index.astype(str)
        self.labels = self.raw_features.iloc[:,-1].astype('category').cat.codes
        self.raw_features = self.raw_features.iloc[:,:-1]
        self.raw_features_size = self.raw_features.shape[1]

        # pre-process graph to make data loader more efficient
        self.build_intimacy_matrix()
        self.build_contexts()

        self.wl_colors = None
        if with_wl:
            self.build_wl_coloring()
        self.max_wl_colors = max(self.wl_colors.values())

    def build_intimacy_matrix(self, alpha=0.15):
        # create adjacency matrix
        n = self.n
        adj_mat = np.zeros((n,n))
        name_to_id = self.name_to_id

        for x,y in self.edge_list:
            adj_mat[name_to_id[x], name_to_id[y]] = 1
            adj_mat[name_to_id[y], name_to_id[x]] = 1
        
        # compute inverse of diagonal degrees matrix
        dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

        # compute final matrix, for details see (1) in Graph-BERT by Zhang et al. '20 page 3
        Abar = np.matmul(adj_mat, dinv_mat)
        I = np.diag(np.ones(n))
        M = np.linalg.inv(I - (1-alpha)*Abar)

        self.intimacy_mat = alpha*M
    
    def build_contexts(self):
        context_list = np.zeros((self.n, self.context_size+1), dtype=np.int)

        # always include target node into its context
        context_list[:,0] = range(self.n)

        # context of a node contains the topk nodes sorted by intimacy score
        context_list[:,1:] = (-self.intimacy_mat).argsort(axis=1)[:,:self.context_size]
        
        self.context_list = context_list
    
    def build_wl_coloring(self):
        G = nx.from_edgelist(self.edge_list, create_using=nx.Graph)

        # initialize node colors
        wl_colors = {node:1 for node in G.nodes}

        for _ in range(self.wl_iterations):
            for node in sorted(G.nodes):
                # combine colors from neighbors
                code_list = [wl_colors[node]] + [wl_colors[x] for x in sorted(G.neighbors(node))]
                code = "".join(map(str, code_list))

                # update node code
                wl_colors[node] = hashlib.md5(code.encode()).hexdigest()
        
        color_to_num = {color:i for i,color in enumerate(set(wl_colors.values()))}
        wl_colors = {node:color_to_num[c] for node,c in wl_colors.items()}
        self.wl_colors = wl_colors
    
    def position_embed(self, v, col):
        x = np.zeros(self.hidden_size)
        dim = len(v)

        for i in range(dim//2):
            x[i] = np.math.sin(col/(10000**((2*i)/dim)))
            x[i+1] = np.math.cos(col/10000**((2*i+1)/dim))
        
        return x
    
    def __len__(self):
        """ each node together with its context represent an instance of the graph """
        return self.n
    
    def __getitem__(self, idx):
        # ids of nodes in the context
        context = self.context_list[idx,:]

        n = len(context)
        
        # raw feature vector embedding
        X = torch.tensor([self.raw_features.loc[self.id_to_name[x]].to_numpy() for x in context])

        # Weisfeiler-Lehman absolute role embedding
        C = torch.tensor([self.wl_colors[self.id_to_name[i]] for i in context])

        # intimacy based relative positional embedding
        I = torch.tensor(range(n))

        # hop based relative distance embedding
        source = context[0]
        H = torch.tensor([self.distance_matrix[source][dest] for dest in context])

        # labels
        y = torch.tensor([self.labels[self.id_to_name[x]] for x in context])
        
        return X, C, I, H, y

In [49]:
cora = CoraDataset()

  if (await self.run_code(code, result,  async_=asy)):


In [53]:
cora[0]

(tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([ 562,  983,  176, 1295,  562, 1630,  710,  818,  531,  508, 1207]),
 tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 tensor([0, 5, 3, 5, 0, 4, 6, 4, 9, 4, 5]),
 tensor([0, 1, 3, 1, 0, 4, 3, 5, 3, 1, 5], dtype=torch.int8))

In [35]:
class GraphBertConfig():
    def __init__(
        self,
        residual_type = 'none',
        x_size=3000,
        y_size=7,
        k=5,
        max_wl_role_index = 100,
        max_hop_dis_index = 100,
        max_inti_pos_index = 100,
        hidden_size=32,
        num_hidden_layers=1,
        num_attention_heads=1,
        intermediate_size=32,
        hidden_act="gelu",
        hidden_dropout_prob=0.5,
        attention_probs_dropout_prob=0.3,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_decoder=False,
        **kwargs
    ):
        super(GraphBertConfig, self).__init__(**kwargs)
        self.max_wl_role_index = max_wl_role_index
        self.max_hop_dis_index = max_hop_dis_index
        self.max_inti_pos_index = max_inti_pos_index
        self.residual_type = residual_type
        self.x_size = x_size
        self.y_size = y_size
        self.k = k
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.is_decoder = is_decoder

In [36]:
config = GraphBertConfig()

In [38]:
class BertEmbeddings(nn.Module):
    def __init__(self, config: GraphBertConfig):
        super().__init__()
        self.raw_features_embeddings = nn.Linear(config.x_size, config.hidden_size)
        self.wl_absolute_role_embeddings = nn.Embedding(config.max_wl_role_index, config.hidden_size)
        self.intimacy_relative_embeddings = nn.Embedding(config.max_inti_pos_index, config.hidden_size)
        self.hop_distance_embeddings = nn.Embedding(config.max_hop_dis_index, config.hidden_size)
        
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    
    def forward(self, raw_features, wl_role_ids, init_pos_ids, hop_dis_ids):
        raw_feature_embeds = self.raw_feature_embeddings(raw_features)
        role_embeddings = self.wl_role_embeddings(wl_role_ids)
        position_embeddings = self.inti_pos_embeddings(init_pos_ids)
        hop_embeddings = self.hop_dis_embeddings(hop_dis_ids)

        embeddings = raw_feature_embeds + role_embeddings + position_embeddings + hop_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings

In [46]:
import pickle

with open("data.pickle", "rb") as f:
    data = pickle.load(f)

ModuleNotFoundError: No module named 'code.DatasetLoader'; 'code' is not a package