#### Libraries

In [2]:
import pickle as pkl
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from tqdm import tqdm
from itertools import chain
from scipy import sparse

In [3]:
with open('training_data.pkl', 'rb') as file:
    train_graphs = pkl.load(file)

#### Helper Functions

In [5]:

def SortLabels(GF, nodeindex, labeldict):
    '''Get all neighbours of a node at nodeindex and sort them'''
    neigh = list(GF.neighbors(nodeindex))
    label = [labeldict[i] for i in neigh]
    label = list(chain.from_iterable(label))
    label.sort()
    return label

def GlueLabels(nodelabeldict, index, newlist):
    '''Glue together a list of nodelabels with the node's own label'''
    nodelbl = nodelabeldict[index][0]
    x = ''.join([str(s) for s in newlist])
    return str(nodelbl) + x

def dictToArray(dict):
    '''Helper function'''
    a = np.array(list(dict.items()), dtype=object )
    a[:,1] = list(chain.from_iterable(a[:,1]))
    return a

def draw(G):
    '''Draw graph with its labels'''
    pos = nx.spring_layout(G, seed=1, k=0.3)
    nx.draw(G, pos)
    node_labels = nx.get_node_attributes(G, "labels")
    nx.draw_networkx_labels(G, pos, node_labels, font_size=8);

def getnodelblarr(GF):
    '''For a specific graph, generate a (sorted) dataframe of old labels and new glued labels for each node'''
    #Dictionary of labels for graph
    NLbl_dict = nx.get_node_attributes(GF, 'labels')
    #Generate new lbls for each vertex
    newlabels = []
    for v in NLbl_dict:
        sortedlbls = SortLabels(GF, v, NLbl_dict)
        gluedlbl = GlueLabels(NLbl_dict, v, sortedlbls)
        newlabels.append(gluedlbl)

    #Array with columns: node, label, newlabel
    Nlbl_arra0 = np.c_[dictToArray(NLbl_dict), newlabels]
    #And sort
    Nlbl_arra0 = Nlbl_arra0[Nlbl_arra0[:,2].argsort()]
    return Nlbl_arra0

def hashtodic(ALPHAbet, newlblarr, currentmax):
    '''Function to hash newly glued labels; then add the (unique) new ones to the overall alphabet.
    Return a Dataframe with new, old, hashed labels for each node'''
    #Get unique entries
    a = np.unique(newlblarr[:,2]) 
    #Get those that are new also
    a = [a[i] for i in range(len(a)) if a[i] not in ALPHAbet]
    #Hash values
    b = np.arange(len(a))+currentmax
    currentmax = currentmax+len(b)
    dic1 = ALPHAbet | {a[i] : b[i] for i in range(len(a))}
    #relabel
    newlblarr = np.c_[newlblarr, [dic1[newlblarr[:,2][_]] for _ in range(len(newlblarr))]]    
    
    return dic1, pd.DataFrame(newlblarr, columns=["node", 'oldlbl', 'hashed', 'newlbl']), currentmax
 
def assignewlabels(GF, nlblarr):
    '''Helper function: change a graph's labels. (Change directly as working on a copy) '''
    newdict = {nlblarr['node'][i] : [nlblarr['newlbl'][i]] for i in range(len(nlblarr))}
    H = GF.copy()
    nx.set_node_attributes(H, newdict, name='labels')
    return H


def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename)
    return sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])


#### Main Loop

In [None]:
#Number of hops
h = 4
hashedgraphs = train_graphs.copy()
hashedgraphs = hashedgraphs[0:len(train_graphs)//1]
Alphabet = {}#{f'{i}': i for i in range(49)} #{}
currentmax = 49

#Alphabet instances for each graph over all hops
#List of 6000 empty lists
l = [[] for _ in range(len(hashedgraphs))]
for i in range(len(hashedgraphs)):
    a = nx.get_node_attributes(hashedgraphs[i], 'labels').values()
    a = list(chain.from_iterable(a))
    l[i] = l[i] + a
pd.value_counts(l[0])

maxes = [currentmax]
#For each hop
for _ in range(h):
    #For each graph
    for i in tqdm(range(len(hashedgraphs))):
        Gi = hashedgraphs[i].copy()# Gi = train_graphs[i]
        #get biggest connected subgraph
        Gi = Gi.subgraph(sorted(nx.connected_components(Gi), key=len, reverse=True)[0])
        #For each node
        #Array with columns: node, label, newlabel
        lblarri = getnodelblarr(Gi)
        #Update big alphabet, hash
        Alphabet, lblarri, currentmax = hashtodic(Alphabet, lblarri, currentmax=currentmax)
        #sort and reset index
        lblarri = lblarri.sort_values(by='node').reset_index(drop=True)
        # lblarri = lblarri
        #relabel to a different graph
        Gi = assignewlabels(Gi, lblarri)
        #assign graph-value
        hashedgraphs[i] = Gi
        #add counts of labels
        a = nx.get_node_attributes(hashedgraphs[i], 'labels').values()
        a = list(chain.from_iterable(a))
        l[i] = l[i] + a

In [None]:
# plt.figure(figsize=(16,16))
# plt.subplot(2,2,1)
# draw(graphzero[0])
# plt.subplot(2,2,2)
# draw(graphzero[1])
# plt.subplot(2,2,3)
# draw(graphzero[2])

#### Generate feature vectors

In [None]:
#Creating matrix of feature vectors for each graph
M = np.zeros((len(hashedgraphs), len(Alphabet)+100))
for i in tqdm(range(len(hashedgraphs))):
    a = pd.value_counts(l[i])
    M[i, list(a.index)] = a.values


sM = sparse.csr_matrix(M)
#And save so this only needs to be saved once.
save_sparse_csr('WLfeaturevectors', sM)

In [None]:
from sys import getsizeof
print(M.shape, getsizeof(M), getsizeof(sM), sep='\n')

In [None]:
# plt.figure(figsize=(16,16))
# plt.subplot(2,2,1)
# draw(hashedgraphs[9])
# plt.subplot(2,2,2)
# draw(hashedgraphs[10])
# plt.subplot(2,2,3)
# # draw(train_graphs[59])
# nx.draw(train_graphs[9], pos, with_labels=True)
# plt.subplot(2,2,4)
# draw(train_graphs[0])

#### Create kernel matrix

In [6]:
sM = load_sparse_csr('WLfeaturevectors.npz')
sM.shape

(6000, 51160)

In [9]:
import scipy.sparse as ss
M = ss.csr_matrix.toarray(sM)

numpy.ndarray

In [14]:
def kernel(X, Y):
    # X is nxp, Y is mxp
    return np.dot(X, np.transpose(Y))

K = kernel(M, M)

In [15]:
sK = ss.csr_matrix(K)
save_sparse_csr('WLKernel', sM)