In [2]:
import csv
import os

import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import pydot
import graphviz
from networkx.drawing.nx_pydot import graphviz_layout
import matplotlib.pyplot as plt
import random

import math
from tqdm import tqdm
import threading
import pickle

data_dir = '../data/'
trace_data = os.path.join(data_dir, 'training_data', '2020_05_04', 'trace')

In [3]:
filenames = os.listdir(trace_data)
trace_df = [pd.read_csv(trace_data + os.sep + f) for f in filenames]

for i, df in enumerate(trace_df):
    if df['callType'].iloc[0] == 'JDBC':
      df['serviceName'] = df['dsName']
      df = df.drop(['dsName'], axis=1)
    elif df['callType'].iloc[0] == 'LOCAL':
      df = df.drop(['dsName'], axis=1)
    trace_df[i] = df

trace_df = pd.concat(trace_df)


In [4]:
grouped_traces = tuple(trace_df.groupby('traceId'))
del trace_df

In [5]:
def process_trace(trace):
    ids = trace[trace['callType'] == 'CSF']
    relationship = {}

    def parse(row):
        # parent -> child
        if row['pid'] in ids:
            relationship[row['pid']] = row['cmdb_id']
            
    def apply(row):
        # parent -> new_parent
        if row['callType'] != 'CSF':
            return row
        else:
            if row['id'] in relationship:
                row['cmdb_id'] = relationship[row['id']]
            return row

    trace.apply(parse, axis=1)
    return trace.apply(apply, axis=1)


def trace_graph(trace, prev_graph):
    DG = nx.DiGraph(prev_graph)
    
    hosts = trace['cmdb_id'].unique()
    services = trace['serviceName'].unique()

    # Add nodes to the graph
    for node in hosts:
        DG.add_node(node, type='host')
    
    for node in services:
        DG.add_node(node, type='service')

    # Add edges to the graph
    for _, row in trace.iterrows():
        parent = trace[trace['id'] == row['pid']]['serviceName']
        service = row['serviceName']
        host = row['cmdb_id']
    
        # Parent service to current service
        if(len(parent)): # Parent may be empty
            DG.add_edge(parent.values[0], service)
        
        # Current service to its host
        DG.add_edge(service, host)

    return DG

In [None]:
# num_threads = os.cpu_count()
# data = [{} for _ in range(num_threads)]

# def function(idx, grouped_traces):
#     print('Entered')
#     for index, t in enumerate(grouped_traces):
#         DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)
#         DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)    

#         trace_dgs = data[idx]
        
#         if DG_hash not in trace_dgs:
#             trace_dgs.update({DG_hash : DG})

#         if index > 10:
#             break

#         if index % 1000 == 0: 
#             print(idx, ':', index)
#     print('Left')
    

# step_size = round(len(grouped_traces) / num_threads + 0.5)
# threads = []
# for i in range(num_threads):
#     print(i)
#     t = threading.Thread(target=function, args=(i, grouped_traces[i*step_size:(i+1)*step_size]))
#     t.start()
#     threads.append(t)

# for t in threads:
#     t.join()

In [25]:
# num_threads = os.cpu_count()
# data = [{} for _ in range(num_threads)]
data = {} 

for i, t in tqdm(enumerate(grouped_traces), total=len(grouped_traces)):
    DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)
    DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)

    if DG_hash not in data:
        data.update({DG_hash : DG})

# Save 
data_filename = 'unique_graphs.pickle'
with open(data_filename, 'wb') as f:
    pickle.dump(data, f)

print('\n'+('-'*40))
print('Saved pickle file')
print('Number of graphs:',  len(data.keys()))
print('Filename:', data_filename)
print('-'*40)

# print(data, len(data.keys()))


  0%|          | 100/730041 [00:03<7:47:20, 26.03it/s]
----------------------------------------
Saved pickle file
Number of graphs: 5
Filename: Z:unique_graphs.pickle
----------------------------------------



In [24]:

# with open(data_filename, 'rb') as f:
#     final_data_recovered = pickle.load(f)

# print(final_data_recovered)

{'ddcc80605bb7964484afd458d64f553c': <networkx.classes.digraph.DiGraph object at 0x000001676D4CA320>, '7e626134855656d21edad7ebea80e3b2': <networkx.classes.digraph.DiGraph object at 0x000001676D637A90>, '4bc5a73e0f726b9a3d336b8ca1d58c06': <networkx.classes.digraph.DiGraph object at 0x000001676D3CA4E0>, '31ea5d3843487227dbfbdfe7727d716f': <networkx.classes.digraph.DiGraph object at 0x000001676D1F8828>, '1572870b16a9c8bdae01f7446fc40cdf': <networkx.classes.digraph.DiGraph object at 0x000001676D1F8320>}


In [None]:
# final_data = {}
# for d in data:
#     for k in d.keys():
#         if k not in final_data.keys():
#             final_data[k] = d[k]

# print(final_data, len(final_data.keys()))


# with open(data_filename, 'wb') as f:
#     pickle.dump(final_data, f)
