In [108]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import spektral
import hashlib
from spektral.datasets.citation import Citation
from torch.utils.data import Dataset, DataLoader

In [36]:
dataset = Citation("citeseer", random_split=True, normalize=True)
dataset[0]

  self._set_arrayXarray(i, j, x)


Graph(n_nodes=3327, n_node_features=3703, n_edge_features=None, n_labels=6)

In [27]:
node_list = set([x[0] for x in edge_list] + [x[1] for x in edge_list])
len(node_list)

3327

In [40]:
node_ids = {k:v for v,k in enumerate(node_list)}

In [47]:
n = len(node_list)
adj_mat = np.zeros((n,n))

for x,y in edge_list:
    adj_mat[node_ids[x],node_ids[y]] = 1

In [60]:
dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

In [74]:
alpha = 0.15

Abar = np.matmul(adj_mat, dinv_mat)
I = np.diag(np.ones(n))
M = np.linalg.inv(I - (1-alpha)*Abar)

intimacy_mat = alpha*M

In [112]:
class CoraDataset(Dataset):
    def __init__(self, path='data/citeseer/', context_size=10, with_wl=True, wl_iterations=2) -> None:
        """
        Args:
            path (str, optional): path containing the Cora dataset. Defaults to 'data/citeseer/'.
            context_size (int, optional): number of nodes in a target node context, represent the
                                          topk nodes sorted by intimacy score. Defaults to 10.
        """
        super().__init__()

        self.context_size = context_size
        self.with_wl = with_wl
        self.wl_iterations = wl_iterations

        # swap nodes on each row as they are listed as target-source
        df = pd.read_csv(f"{path}/citeseer.cites", sep='\t', header=None)
        self.edge_list = [(x[1], x[0]) for x in df.values.tolist()]
        self.node_list = set([x[0] for x in self.edge_list] + [x[1] for x in self.edge_list])
        self.name_to_id = {k:v for v,k in enumerate(self.node_list)}
        self.id_to_name = {v:k for v,k in enumerate(self.node_list)}
        self.n = len(self.node_list)

        # load features and labels
        self.raw_features = pd.read_csv(f"{path}/citeseer.content", sep='\t', header=None, index_col=0)
        self.raw_features.index = self.raw_features.index.astype(str)
        self.labels = self.raw_features.iloc[:,-1].astype('category').cat.codes
        self.raw_features = self.raw_features.iloc[:,:-1]

        # pre-process graph to make data loader more efficient
        self.build_intimacy_matrix()
        self.build_contexts()

        self.wl_colors = None
        if with_wl:
            self.build_wl_coloring()

    def build_intimacy_matrix(self, alpha=0.15):
        # create adjacency matrix
        n = self.n
        adj_mat = np.zeros((n,n))
        name_to_id = self.name_to_id

        for x,y in self.edge_list:
            adj_mat[name_to_id[x], name_to_id[y]] = 1
            adj_mat[name_to_id[y], name_to_id[x]] = 1
        
        # compute inverse of diagonal degrees matrix
        dinv_mat = np.nan_to_num(np.diag(adj_mat.sum(axis=0)))

        # compute final matrix, for details see (1) in Graph-BERT by Zhang et al. '20 page 3
        Abar = np.matmul(adj_mat, dinv_mat)
        I = np.diag(np.ones(n))
        M = np.linalg.inv(I - (1-alpha)*Abar)

        self.intimacy_mat = alpha*M
    
    def build_contexts(self):
        context_list = np.zeros((self.n, self.context_size+1), dtype=np.int)

        # always include target node into its context
        context_list[:,0] = range(self.n)

        # context of a node contains the topk nodes sorted by intimacy score
        context_list[:,1:] = (-self.intimacy_mat).argsort(axis=1)[:,:self.context_size]
        
        self.context_list = context_list
    
    def build_wl_coloring(self):
        G = nx.from_edgelist(self.edge_list, create_using=nx.Graph)

        # initialize node colors
        wl_colors = {node:1 for node in G.nodes}

        for _ in range(self.wl_iterations):
            for node in sorted(G.nodes):
                # combine colors from neighbors
                code_list = [wl_colors[node]] + [wl_colors[x] for x in sorted(G.neighbors(node))]
                code = "".join(map(str, code_list))

                # update node code
                wl_colors[node] = hashlib.md5(code.encode()).hexdigest()
        
        color_to_num = {color:i for i,color in enumerate(set(wl_colors.values()))}
        wl_colors = {node:color_to_num[c] for node,c in wl_colors.items()}
        self.wl_colors = wl_colors
    
    def __len__(self):
        """ each node together with its context represent an instance of the graph """
        return self.n
    
    def __getitem__(self, idx):
        context = self.context_list[idx,:]
        X = np.array([self.raw_features.loc[self.id_to_name[x]].to_numpy() for x in context])
        y = np.array([self.labels[self.id_to_name[x]] for x in context])

        if self.with_wl:
            C = np.array([self.wl_colors[self.id_to_name[x]] for x in context])
            return X, C, y
        else:
            return X, y

In [113]:
cora = CoraDataset()

  if (await self.run_code(code, result,  async_=asy)):


In [114]:
cora[0]

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([1399, 1160, 2023,  793, 1822, 1574, 1626, 1887, 1428, 1504,  935]),
 array([1, 4, 4, 1, 1, 1, 2, 2, 4, 5, 2], dtype=int8))

In [100]:
import networkx as nx
import hashlib

G = nx.from_edgelist(cora.edge_list, create_using=nx.Graph)

# initialize node colors
wl_colors = {node:1 for node in G.nodes}
num_iter = 2

for _ in range(num_iter):
    for node in sorted(G.nodes):
        # combine colors from neighbors
        code_list = [wl_colors[node]] + [wl_colors[x] for x in sorted(G.neighbors(node))]
        code = "".join(map(str, code_list))

        # update node code
        wl_colors[node] = hashlib.md5(code.encode()).hexdigest()

In [107]:
color_to_num = {color:i for i,color in enumerate(set(wl_colors.values()))}
wl_colors = {node:color_to_num[c] for node,c in wl_colors.items()}
wl_colors

{'100157': 2107,
 '364207': 793,
 '38848': 1772,
 'bradshaw97introduction': 1064,
 'bylund99coordinating': 2146,
 'dix01metaagent': 1525,
 'gray99finding': 2004,
 'labrou01standardizing': 1066,
 'labrou99agent': 1022,
 'nodine98overview': 653,
 'nodine99active': 53,
 'wagner97artificial': 212,
 '455651': 939,
 '100598': 804,
 'marquez00machine': 1771,
 'punyakanok01use': 1128,
 'krasnogor00memetic': 565,
 '101570': 2274,
 '131669': 70,
 '10227': 164,
 'sima00computational': 25,
 'sima01computational': 25,
 '140169': 565,
 '103027': 2274,
 'weiss00building': 1234,
 '105684': 494,
 '104129': 588,
 '106003': 1839,
 'singhal99document': 1350,
 'amer-yahia00boundingschemas': 2020,
 '106339': 143,
 'arlein99making': 392,
 'cluet99using': 27,
 'shasha02algorithmics': 597,
 '184462': 1163,
 '108321': 1915,
 '73962': 460,
 '83140': 303,
 '448486': 1658,
 '108573': 141,
 'bailey01analysis': 790,
 'bailey02eventconditionaction': 2179,
 'tova99active': 630,
 'cohen01learning': 1814,
 '108580': 182

In [75]:
np.array([cora.raw_features.loc[cora.id_to_name[x]].to_numpy() for x in cora[0]]).sum()

311

In [63]:
len(cora.id_to_name), len(cora.raw_features)

(3327, 3312)

In [69]:
cora.raw_features.loc['63694'].to_numpy()

array([0, 0, 0, ..., 0, 0, 'Agents'], dtype=object)

In [16]:
features = np.genfromtxt('data/citeseer/citeseer.content', delimiter='\t')
features.shape

(3312, 3705)

In [30]:
features_df = pd.read_csv('data/citeseer/citeseer.content', sep='\t', header=None, index_col=0)
features_df.index = features_df.index.astype(str)
features_df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,3695,3696,3697,3698,3699,3700,3701,3702,3703,3704
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
100598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,IR
105684,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
11099,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,DB
114091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AI


In [40]:
labels = features_df.iloc[:,-1].astype('category').cat.codes

0
100157    1
100598    4
105684    1
11099     2
114091    0
dtype: int8

In [29]:
featuresfeatures_df.index.astype(str)

Index(['100157', '100598', '105684', '11099', '114091', '11510', '115971',
       '117999', '120432', '126894',
       ...
       'zhang01evolutionary', 'zhang01maximum', 'zhang01personalized',
       'zhang01pvm', 'zhang99evolving', 'zhang99query', 'zhang99situated',
       'zhang99towards', 'zhou00implementation', '455346'],
      dtype='object', name=0, length=3312)