In [1]:
import ipycytoscape as ic  # visualise
import pandas as pd
import netaddr
from stellargraph import StellarGraph
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans  # sanity check node2vec embeddings
from stellargraph.data import BiasedRandomWalk  # for node2vec
from gensim.models import Word2Vec  # for node2vec
from sklearn.manifold import TSNE  # for viewing embeddings form node2vec

Create some data in pandas for stellagraph ingestion. Using https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html

In [2]:
# cytoscape format, taken from spectral_clustering notebook
data = {'nodes': 
         [{'data': {'id': 'n0', 'name': 'Node 0', 'ip': '192.168.0.10', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n1', 'name': 'Node 1', 'ip': '192.168.0.101', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n2', 'name': 'Node 2', 'ip': '192.168.0.9', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n3', 'name': 'Node 3', 'ip': '192.168.0.56', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n4', 'name': 'Node 4', 'ip': '192.168.0.12', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n5', 'name': 'Node 5', 'ip': '10.0.1.30', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n6', 'name': 'Node 6', 'ip': '10.0.1.56', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n7', 'name': 'Node 7', 'ip': '10.0.1.2', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n8', 'name': 'Node 8', 'ip': '10.0.1.102', 'mask': '255.255.255.0'}},
          {'data': {'id': 'n9', 'name': 'Node 9', 'ip': '10.0.1.100', 'mask': '255.255.255.0'}}],
        'edges': 
        [{'data': {'id': 'n0-n5', 'source': 'n0', 'target': 'n5'}},
         {'data': {'id': 'n0-n1', 'source': 'n0', 'target': 'n1'}},
         {'data': {'id': 'n0-n2', 'source': 'n0', 'target': 'n2'}},
         {'data': {'id': 'n0-n8', 'source': 'n0', 'target': 'n8'}},
         {'data': {'id': 'n0-n9', 'source': 'n0', 'target': 'n9'}},
         {'data': {'id': 'n1-n2', 'source': 'n1', 'target': 'n2'}},
         {'data': {'id': 'n8-n9', 'source': 'n8', 'target': 'n9'}},
         {'data': {'id': 'n5-n3', 'source': 'n5', 'target': 'n3'}},
         {'data': {'id': 'n5-n4', 'source': 'n5', 'target': 'n4'}},
         {'data': {'id': 'n5-n6', 'source': 'n5', 'target': 'n6'}},
         {'data': {'id': 'n5-n7', 'source': 'n5', 'target': 'n7'}},
         {'data': {'id': 'n3-n4', 'source': 'n3', 'target': 'n4'}},
         {'data': {'id': 'n6-n7', 'source': 'n6', 'target': 'n7'}}]
       }

In [3]:
edges_df = pd.DataFrame([x['data'] for x in data['edges']])
edges_df = edges_df.drop(['id'], axis=1)
edges_df

Unnamed: 0,source,target
0,n0,n5
1,n0,n1
2,n0,n2
3,n0,n8
4,n0,n9
5,n1,n2
6,n8,n9
7,n5,n3
8,n5,n4
9,n5,n6


In [4]:
nodes_df = pd.DataFrame([x['data'] for x in data['nodes']])
nodes_df = nodes_df.set_index(['id'])
nodes_df

Unnamed: 0_level_0,name,ip,mask
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n0,Node 0,192.168.0.10,255.255.255.0
n1,Node 1,192.168.0.101,255.255.255.0
n2,Node 2,192.168.0.9,255.255.255.0
n3,Node 3,192.168.0.56,255.255.255.0
n4,Node 4,192.168.0.12,255.255.255.0
n5,Node 5,10.0.1.30,255.255.255.0
n6,Node 6,10.0.1.56,255.255.255.0
n7,Node 7,10.0.1.2,255.255.255.0
n8,Node 8,10.0.1.102,255.255.255.0
n9,Node 9,10.0.1.100,255.255.255.0


StellarGraph only takes node features as a numerical type, so need to cast strings to numeric

In [5]:
networks = [netaddr.IPNetwork(x[0]+'/'+x[1]) for x in zip(nodes_df['ip'], nodes_df['mask'])]
nodes_df['ip_num'] = [int(x.ip.bits().replace('.',''), 2) for x in networks]
nodes_df['mask_num'] = [int(x.netmask.bits().replace('.',''), 2) for x in networks]
nodes_df = nodes_df.drop(['ip', 'mask', 'name'], axis=1)
nodes_df

Unnamed: 0_level_0,ip_num,mask_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1
n0,3232235530,4294967040
n1,3232235621,4294967040
n2,3232235529,4294967040
n3,3232235576,4294967040
n4,3232235532,4294967040
n5,167772446,4294967040
n6,167772472,4294967040
n7,167772418,4294967040
n8,167772518,4294967040
n9,167772516,4294967040


In [6]:
nodes_edges_sg = StellarGraph(nodes_df, edges_df)
print(nodes_edges_sg.info())

StellarGraph: Undirected multigraph
 Nodes: 10, Edges: 13

 Node types:
  default: [10]
    Features: float32 vector, length 2
    Edge types: default-default->default

 Edge types:
    default-default->default: [13]
        Weights: all 1 (default)
        Features: none


### Node2Vec

Here, we use node2vec to convert the nodes to vector embeddings. These embeddings do not use any of the node/edge features.

Node2Vec first stage - generate some 'sentences' from random walks around the graph. The sentences are made up of words that are the node ids.

In [7]:
rw = BiasedRandomWalk(nodes_edges_sg)
walks = rw.run(
    nodes=list(nodes_edges_sg.nodes()),  # root nodes
    length=100,  # maximum length of a random walk
    n=12,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 120


Node2Vec second stage - convert sentences to vectors using Word2Vec. Using quite a small size here, since the size of the data is small

In [8]:
str_walks = [[str(n) for n in walk] for walk in walks]
str_walks[0][:5]

['n0', 'n5', 'n7', 'n6', 'n7']

In [9]:
model = Word2Vec(str_walks, size=16, window=5, min_count=0, sg=1, workers=2, iter=1)
model.wv["n0"]

array([ 0.17544858, -0.4518959 , -0.38296   , -0.37732312, -0.2734629 ,
        0.12075267, -0.40905276,  0.15864886, -0.52324057, -0.70424616,
       -0.31290355, -0.01093763,  0.10784699, -0.14036435, -0.34100693,
        0.06569256], dtype=float32)

Use KMeans to look at embedding clusters

In [14]:
X = (model.wv.vectors)
kmeans = KMeans(n_clusters=4).fit(X)
node_to_label = {x:y for x,y in zip(model.wv.index2word, kmeans.labels_)}
for label in set(kmeans.labels_):
    print([x for x, y in node_to_label.items() if y == label])

['n0', 'n1']
['n5', 'n6', 'n4', 'n7']
['n2', 'n9', 'n8']
['n3']


In [13]:
cytoscapeobj = ic.CytoscapeWidget()
cytoscapeobj.graph.add_graph_from_json(data)
cytoscapeobj.set_style([{
                            'selector': 'node',
                            'css': {
                                'content': 'data(id)',
                                'text-valign': 'center',
                                'color': 'black'
                            }
                        }])
display(cytoscapeobj)

CytoscapeWidget(cytoscape_layout={'name': 'cola'}, cytoscape_style=[{'selector': 'node', 'css': {'content': 'd…