In [1]:
import csv
import os

import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import pydot
import graphviz
from networkx.drawing.nx_pydot import graphviz_layout
import matplotlib.pyplot as plt
import random

import math
from tqdm import tqdm
import threading

data_dir = '../data/'
trace_data = os.path.join(data_dir, 'training_data', '2020_05_04', 'trace')

In [2]:
filenames = os.listdir(trace_data)
trace_df = [pd.read_csv(trace_data + os.sep + f) for f in filenames]

for i, df in enumerate(trace_df):
    if df['callType'].iloc[0] == 'JDBC':
      df['serviceName'] = df['dsName']
      df = df.drop(['dsName'], axis=1)
    elif df['callType'].iloc[0] == 'LOCAL':
      df = df.drop(['dsName'], axis=1)
    trace_df[i] = df

trace_df = pd.concat(trace_df)


In [3]:
grouped_traces = tuple(trace_df.groupby('traceId'))

del trace_df

In [4]:
def hierarchy_pos(G, root=None, width=1., vert_gap = 0.2, vert_loc = 0, leaf_vs_root_factor = 0.5):

    '''
    If the graph is a tree this will return the positions to plot this in a 
    hierarchical layout.
    
    Based on Joel's answer at https://stackoverflow.com/a/29597209/2966723,
    but with some modifications.  

    We include this because it may be useful for plotting transmission trees,
    and there is currently no networkx equivalent (though it may be coming soon).
    
    There are two basic approaches we think of to allocate the horizontal 
    location of a node.  
    
    - Top down: we allocate horizontal space to a node.  Then its ``k`` 
      descendants split up that horizontal space equally.  This tends to result
      in overlapping nodes when some have many descendants.
    - Bottom up: we allocate horizontal space to each leaf node.  A node at a 
      higher level gets the entire space allocated to its descendant leaves.
      Based on this, leaf nodes at higher levels get the same space as leaf
      nodes very deep in the tree.  
      
    We use use both of these approaches simultaneously with ``leaf_vs_root_factor`` 
    determining how much of the horizontal space is based on the bottom up 
    or top down approaches.  ``0`` gives pure bottom up, while 1 gives pure top
    down.   
    
    
    :Arguments: 
    
    **G** the graph (must be a tree)

    **root** the root node of the tree 
    - if the tree is directed and this is not given, the root will be found and used
    - if the tree is directed and this is given, then the positions will be 
      just for the descendants of this node.
    - if the tree is undirected and not given, then a random choice will be used.

    **width** horizontal space allocated for this branch - avoids overlap with other branches

    **vert_gap** gap between levels of hierarchy

    **vert_loc** vertical location of root
    
    **leaf_vs_root_factor**

    xcenter: horizontal location of root
    '''
    if not nx.is_tree(G):
        raise TypeError('cannot use hierarchy_pos on a graph that is not a tree')

    if root is None:
        if isinstance(G, nx.DiGraph):
            root = next(iter(nx.topological_sort(G)))  #allows back compatibility with nx version 1.11
        else:
            root = random.choice(list(G.nodes))

    def _hierarchy_pos(G, root, leftmost, width, leafdx = 0.2, vert_gap = 0.2, vert_loc = 0, 
                    xcenter = 0.5, rootpos = None, 
                    leafpos = None, parent = None):
        '''
        see hierarchy_pos docstring for most arguments

        pos: a dict saying where all nodes go if they have been assigned
        parent: parent of this branch. - only affects it if non-directed

        '''

        if rootpos is None:
            rootpos = {root:(xcenter,vert_loc)}
        else:
            rootpos[root] = (xcenter, vert_loc)
        if leafpos is None:
            leafpos = {}
        children = list(G.neighbors(root))
        leaf_count = 0
        if not isinstance(G, nx.DiGraph) and parent is not None:
            children.remove(parent)  
        if len(children)!=0:
            rootdx = width/len(children)
            nextx = xcenter - width/2 - rootdx/2
            for child in children:
                nextx += rootdx
                rootpos, leafpos, newleaves = _hierarchy_pos(G,child, leftmost+leaf_count*leafdx, 
                                    width=rootdx, leafdx=leafdx,
                                    vert_gap = vert_gap, vert_loc = vert_loc-vert_gap, 
                                    xcenter=nextx, rootpos=rootpos, leafpos=leafpos, parent = root)
                leaf_count += newleaves

            leftmostchild = min((x for x,y in [leafpos[child] for child in children]))
            rightmostchild = max((x for x,y in [leafpos[child] for child in children]))
            leafpos[root] = ((leftmostchild+rightmostchild)/2, vert_loc)
        else:
            leaf_count = 1
            leafpos[root]  = (leftmost, vert_loc)
#        pos[root] = (leftmost + (leaf_count-1)*dx/2., vert_loc)
#        print(leaf_count)
        return rootpos, leafpos, leaf_count

    xcenter = width/2.
    if isinstance(G, nx.DiGraph):
        leafcount = len([node for node in nx.descendants(G, root) if G.out_degree(node)==0])
    elif isinstance(G, nx.Graph):
        leafcount = len([node for node in nx.node_connected_component(G, root) if G.degree(node)==1 and node != root])
    rootpos, leafpos, leaf_count = _hierarchy_pos(G, root, 0, width, 
                                                    leafdx=width*1./leafcount, 
                                                    vert_gap=vert_gap, 
                                                    vert_loc = vert_loc, 
                                                    xcenter = xcenter)
    pos = {}
    for node in rootpos:
        pos[node] = (leaf_vs_root_factor*leafpos[node][0] + (1-leaf_vs_root_factor)*rootpos[node][0], leafpos[node][1]) 
#    pos = {node:(leaf_vs_root_factor*x1+(1-leaf_vs_root_factor)*x2, y1) for ((x1,y1), (x2,y2)) in (leafpos[node], rootpos[node]) for node in rootpos}
    xmax = max(x for x,y in pos.values())
    for node in pos:
        pos[node]= (pos[node][0]*width/xmax, pos[node][1])
    return pos

In [4]:
def process_trace(trace):
    ids = trace[trace['callType'] == 'CSF']
    relationship = {}

    def parse(row):
        # parent -> child
        if row['pid'] in ids:
            relationship[row['pid']] = row['cmdb_id']
            
    def apply(row):
        # parent -> new_parent
        if row['callType'] != 'CSF':
            return row
        else:
            if row['id'] in relationship:
                row['cmdb_id'] = relationship[row['id']]
            return row

    trace.apply(parse, axis=1)
    return trace.apply(apply, axis=1)


def trace_graph(trace, prev_graph):
    DG = nx.DiGraph(prev_graph)
    
    hosts = trace['cmdb_id'].unique()
    services = trace['serviceName'].unique()

    # Add nodes to the graph
    for node in hosts:
        DG.add_node(node, type='host')
    
    for node in services:
        DG.add_node(node, type='service')

    # Add edges to the graph
    for _, row in trace.iterrows():
        parent = trace[trace['id'] == row['pid']]['serviceName']
        service = row['serviceName']
        host = row['cmdb_id']
    
        # Parent service to current service
        if(len(parent)): # Parent may be empty
            DG.add_edge(parent.values[0], service)
        
        # Current service to its host
        DG.add_edge(service, host)

    return DG

In [9]:
num_threads = os.cpu_count()
data = [{} for _ in range(num_threads)]

def function(idx, grouped_traces):
    print('Entered')
    for index, t in enumerate(grouped_traces):
        DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)
        DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)    

        trace_dgs = data[idx]
        
        if DG_hash not in trace_dgs.keys():
            trace_dgs.update({DG_hash : DG})

        # iso = False
        # for k in trace_dgs.keys():
        #     if nx.is_isomorphic(G, DG):
        #         iso = True
        #         break
        # if not iso:
        #     trace_dgs.append(DG)

        if index % 1000 == 0: 
            print(idx, ':', index)
    print('Left')
    

step_size = round(len(grouped_traces) / num_threads + 0.5)
threads = []
for i in range(num_threads):
    print(i)
    t = threading.Thread(target=function, args=(i, grouped_traces[i*step_size:(i+1)*step_size]))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

    # root = trace[trace['pid'] == 'None']['id'].values[0]
            
    # plt.figure(figsize=(9,9))
    # pos = nx.draw_shell(DG, with_labels=True)
    # nx.draw_networkx_nodes(DG, pos, nodelist=hosts, node_color="r", node_size=1500)
    # nx.draw_networkx_nodes(DG, pos, nodelist=services, node_color="b", node_size=500)    
    # plt.show()

    # if i > 100:
    #     break
    # i += 1

0
Entered
1
Entered
2
Entered
3
Entered
4
Entered
5
Entered
2 : 0
6
Entered
0 : 0
7
Entered
8
Entered
9
Entered
10
1Entered
6 : 0
 11
Entered
: 0
5 : 0
10 : 0
4 8 : 0: 0

93 :  : 0
110
 : 0
7 : 0


In [6]:
data = {} 

for i, t in tqdm(enumerate(grouped_traces)):
    DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)
    DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)

    if DG_hash not in data.keys():
        data.update({DG_hash : DG})

print(data)

891it [00:35, 25.44it/s]


KeyboardInterrupt: 

In [19]:
print(trace_dgs, len(trace_dgs))

[<networkx.classes.digraph.DiGraph object at 0x000001914FDDAF98>, <networkx.classes.digraph.DiGraph object at 0x000001914FDCC7F0>, <networkx.classes.digraph.DiGraph object at 0x000001914FDDA278>, <networkx.classes.digraph.DiGraph object at 0x000001914FDE43C8>, <networkx.classes.digraph.DiGraph object at 0x000001914E708630>] 5


In [None]:
print(data)

In [30]:
print(grouped_traces)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000191636ECD68>


In [None]:
print(step_size)