In [1]:
import ipycytoscape as ic  # visualise
import pandas as pd
import netaddr
from stellargraph import StellarGraph
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans  # sanity check embeddings
from stellargraph.data import UnsupervisedSampler
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification
from tensorflow import keras
from stellargraph.mapper import HinSAGENodeGenerator  # for getting embeddings

### Make some data

Create some data in pandas for StellarGraph ingestion, following https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html

In [2]:
# Graph in cytoscape JSON format: {'nodes': [{'data': {...}}, ...], 'edges': [{'data': {...}}, ...]}
_SUBNET_MASK = '255.255.255.0'

# Host address of nodes n0..n9, in node order.
_host_ips = [
    '192.168.0.10', '192.168.0.101', '192.168.0.9', '192.168.0.56', '192.168.0.12',
    '10.0.1.30', '10.0.1.56', '10.0.1.2', '10.0.1.102', '10.0.1.100',
]

# (source, target) node-id pairs, in edge order.
_links = [
    ('n0', 'n5'), ('n0', 'n1'), ('n0', 'n2'), ('n0', 'n8'), ('n0', 'n9'),
    ('n1', 'n2'), ('n8', 'n9'),
    ('n5', 'n3'), ('n5', 'n4'), ('n5', 'n6'), ('n5', 'n7'),
    ('n3', 'n4'), ('n6', 'n7'),
    ('nRoot', 'n1'),
]

# Addressed host nodes carry 'ip'/'mask'; the single root node carries 'id_num' instead.
_host_nodes = [
    {'data': {'id': 'n' + str(i), 'name': 'Node ' + str(i), 'ip': ip, 'mask': _SUBNET_MASK}}
    for i, ip in enumerate(_host_ips)
]
_root_node = {'data': {'id': 'nRoot', 'name': 'Node Root', 'id_num': 2011}}

data = {
    'nodes': _host_nodes + [_root_node],
    'edges': [
        {'data': {'id': src + '-' + dst, 'source': src, 'target': dst}}
        for src, dst in _links
    ],
}

In [3]:
# Flatten the cytoscape edge records into an edge list: keep source/target,
# drop the cytoscape-only 'id', and give every edge unit weight and one edge type.
edges_df = (
    pd.DataFrame([edge['data'] for edge in data['edges']])
    .drop(columns=['id'])
    .assign(weight=1.0, edge_type='sub_tree')
)
edges_df

Unnamed: 0,source,target,weight,edge_type
0,n0,n5,1.0,sub_tree
1,n0,n1,1.0,sub_tree
2,n0,n2,1.0,sub_tree
3,n0,n8,1.0,sub_tree
4,n0,n9,1.0,sub_tree
5,n1,n2,1.0,sub_tree
6,n8,n9,1.0,sub_tree
7,n5,n3,1.0,sub_tree
8,n5,n4,1.0,sub_tree
9,n5,n6,1.0,sub_tree


In [4]:
# Root-type nodes are the records that carry an 'id_num' field; index them by
# node id and keep only the numeric feature column.
nodes_root_df = (
    pd.DataFrame([node['data'] for node in data['nodes'] if 'id_num' in node['data']])
    .set_index(['id'])
    .drop(columns=['name'])
)
nodes_root_df

Unnamed: 0_level_0,id_num
id,Unnamed: 1_level_1
nRoot,2011


StellarGraph only accepts numeric node features, so the IP address and netmask strings need to be converted to numbers.

In [5]:
# Addressed nodes are the records that carry an 'ip' field; index them by node id.
addressed_nodes = pd.DataFrame(
    [node['data'] for node in data['nodes'] if 'ip' in node['data']]
).set_index(['id'])

# Parse each address together with its netmask.
networks = [
    netaddr.IPNetwork(ip + '/' + mask)
    for ip, mask in zip(addressed_nodes['ip'], addressed_nodes['mask'])
]

# int(IPAddress) gives the address as an integer directly; this avoids the
# round trip through a dotted bit string, which only parsed for the '.' separator.
# NOTE(review): the graph summary reports these features as float32, which cannot
# represent integers above 2**24 exactly. Hosts in the same /24 may therefore end up
# with identical ip_num features - consider per-octet or scaled features.
nodes_df = addressed_nodes.assign(
    ip_num=[int(network.ip) for network in networks],
    mask_num=[int(network.netmask) for network in networks],
).drop(columns=['ip', 'mask', 'name'])
nodes_df

Unnamed: 0_level_0,ip_num,mask_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1
n0,3232235530,4294967040
n1,3232235621,4294967040
n2,3232235529,4294967040
n3,3232235576,4294967040
n4,3232235532,4294967040
n5,167772446,4294967040
n6,167772472,4294967040
n7,167772418,4294967040
n8,167772518,4294967040
n9,167772516,4294967040


In [6]:
# Heterogeneous graph: one feature frame per node type, with edges typed by the
# 'edge_type' column of the edge list.
node_frames_by_type = {'main': nodes_df, 'root': nodes_root_df}
nodes_edges_sg = StellarGraph(
    node_frames_by_type,
    edges_df,
    edge_type_column='edge_type',
)
print(nodes_edges_sg.info())

StellarGraph: Undirected multigraph
 Nodes: 11, Edges: 14

 Node types:
  main: [10]
    Features: float32 vector, length 2
    Edge types: main-sub_tree->main, main-sub_tree->root
  root: [1]
    Features: float32 vector, length 1
    Edge types: root-sub_tree->main

 Edge types:
    main-sub_tree->main: [13]
        Weights: all 1 (default)
        Features: none
    main-sub_tree->root: [1]
        Weights: all 1 (default)
        Features: none


### HinSAGE

Here, we use HinSAGE, which is a variant of GraphSAGE for heterogeneous graphs. This follows https://stellargraph.readthedocs.io/en/stable/demos/embeddings/graphsage-unsupervised-sampler-embeddings.html
and https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/hinsage-link-prediction.html

Create a generator that yields node pairs sampled from the input graph.

In [7]:
# Fixed seed so the sampled node pairs are the same on every Restart & Run All.
SAMPLER_SEED = 42

# Walks are started from every node in the graph, whatever its node type.
nodes = list(nodes_edges_sg.nodes())
number_of_walks = 2  # walks started per node
length = 3  # walk length passed to the sampler
unsupervised_samples = UnsupervisedSampler(
    nodes_edges_sg,
    nodes=nodes,
    length=length,
    number_of_walks=number_of_walks,
    seed=SAMPLER_SEED,
)
# Per help(UnsupervisedSampler): the sampler emits node pairs with a binary label,
# with 0 and 1 labels in roughly equal proportion - TODO confirm against the docs.

In [8]:
batch_size = 50
num_samples = [10, 5]  # neighbours sampled at the 1-hop and 2-hop levels of the graph

# Both ends of every link fed to the model are declared to be 'main' nodes.
link_head_types = ['main', 'main']
generator = HinSAGELinkGenerator(
    nodes_edges_sg,
    batch_size,
    num_samples,
    head_node_types=link_head_types,
)

# Batches of (node pair, label) drawn on demand from the unsupervised sampler.
train_gen = generator.flow(unsupervised_samples)

In [9]:
# Flattened sampling tree for a ('main', 'main') link: entry i is
# (node type, indices of the entries holding its sampled neighbours).
n_hops = len(num_samples)
generator.schema.type_adjacency_list(generator.head_node_types, n_hops)


[('main', [2, 3]),
 ('main', [4, 5]),
 ('main', [6, 7]),
 ('root', [8]),
 ('main', [9, 10]),
 ('root', [11]),
 ('main', []),
 ('root', []),
 ('main', []),
 ('main', []),
 ('root', []),
 ('main', [])]

In [10]:
# Inspect the graph schema: for each node type, the edge types that start from it.
generator.schema.schema

{'root': [EdgeType(n1='root', rel='sub_tree', n2='main')],
 'main': [EdgeType(n1='main', rel='sub_tree', n2='main'),
  EdgeType(n1='main', rel='sub_tree', n2='root')]}

Create a HinSAGE model, which will have a two layer GCN under the hood

In [11]:
# One entry per HinSAGE layer, so len(layer_sizes) == len(num_samples); it is unclear
# whether a depth other than 2 is supported. Depending on whether the src or dst node
# encoder is used, this specifies the dimensionality of the node embeddings.
layer_sizes = [20, 20]

hinsage = HinSAGE(
    layer_sizes=layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
    normalize="l2",
)

# Build the model and expose the input and output tensors of the HinSAGE stack
# for node-pair inputs.
x_inp, x_out = hinsage.in_out_tensors()

Phrase the 'link prediction problem', which will be predicting the binary label on node pairs coming from unsupervised_samples. Note, the link prediction problem is just a means to an end of computing 2-hop kernel functions over nodes.

In [12]:
# x_inp / x_out were already created by hinsage.in_out_tensors() in the previous cell;
# calling it again here only rebuilt the same tensors, so the duplicate call is dropped.

# Final layer for training: combine the two node embeddings of each pair with an
# inner product ('ip') and squash the result to a single sigmoid link score.
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [13]:
# Specify the model for training: binary link classification on the sampled node pairs.
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    # `learning_rate` is the supported argument name; the `lr` alias is deprecated
    # in TF2 Keras and rejected by newer releases.
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

In [14]:
# NOTE: this cell currently breaks with "IndexError: list index out of range".
# Known bug due to UnsupervisedSampler with HinSAGE:
# https://github.com/stellargraph/stellargraph/issues/1022
epochs = 10
fit_options = dict(
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)
history = model.fit(train_gen, **fit_options)

IndexError: list index out of range

Generate some node embeddings from the HinSAGE layer stack

In [None]:
def _head_subtree_indices(adjacency, head_index=0):
    """Return the positions in ``adjacency`` that belong to one head node's sampling tree.

    ``adjacency`` is the list produced by ``schema.type_adjacency_list``: entry ``i`` is
    ``(node_type, child_indices)``. Starting at ``head_index``, children are followed
    breadth-first and the visited positions are returned in visiting order.
    """
    indices = [head_index]
    for position in indices:  # the list grows while iterating -> breadth-first walk
        indices.extend(adjacency[position][1])
    return indices


# The HinSAGE link model lists both head nodes first and then their sampled
# neighbours (see the type_adjacency_list output above), so the source node's inputs
# are NOT every second tensor: x_inp[0::2] would mix source and target inputs.
# Walk the adjacency list from head 0 instead; for the list shown above this
# selects positions [0, 2, 3, 6, 7, 8].
# NOTE(review): assumes x_inp is ordered like type_adjacency_list - confirm.
link_adjacency = generator.schema.type_adjacency_list(
    generator.head_node_types, len(num_samples)
)
x_inp_src = [x_inp[i] for i in _head_subtree_indices(link_adjacency)]
x_out_src = x_out[0]  # embedding of the first (source) head node
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [None]:
# Embed the 'main' nodes only: their ids are the index of nodes_df.
node_ids = nodes_df.index.values.tolist()

# The graph has two node types ('main' and 'root'), so the generator cannot infer
# the head node type by itself; name it explicitly to match the ids passed to flow().
node_gen = HinSAGENodeGenerator(
    nodes_edges_sg, batch_size, num_samples, head_node_type='main'
).flow(node_ids)
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)

In [None]:
# Sanity check on the embedding array; presumably (number of nodes, embedding size) - TODO confirm.
print(node_embeddings.shape)

Use KMeans to look at embedding clusters

In [None]:
# Fixed seed so the cluster assignment is the same on every run.
KMEANS_SEED = 42

X = node_embeddings
kmeans = KMeans(n_clusters=4, random_state=KMEANS_SEED).fit(X)

# Map each node id to its cluster label; labels_ is in the same order as node_ids.
node_to_label = dict(zip(node_ids, kmeans.labels_))

# Print the members of each cluster, one list per label, in ascending label order.
for label in sorted(set(kmeans.labels_)):
    print([node_id for node_id, assigned in node_to_label.items() if assigned == label])

In [None]:
# Style rule applied to every node: show its id as a centred black label.
node_label_style = {
    'selector': 'node',
    'css': {
        'content': 'data(id)',
        'text-valign': 'center',
        'color': 'black',
    },
}

# Render the original cytoscape-format graph as an interactive widget.
cytoscapeobj = ic.CytoscapeWidget()
cytoscapeobj.graph.add_graph_from_json(data)
cytoscapeobj.set_style([node_label_style])
display(cytoscapeobj)