In [1]:
# Libraries
import pandas as pd
import numpy as np

# Plots
import networkx as nx
import matplotlib
import matplotlib.cm as cm
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.offline as py
from plotly.subplots import make_subplots

# Dash
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# 1. Log parsing

In [2]:
# Relative location of the SAS log to parse. Forward slashes are accepted by
# Python's file APIs on Windows as well as on POSIX systems, whereas a
# backslash separator only resolves on Windows.
path = 'Data/New_PIT2.txt'

#### 1.1 Extract df from text

In [3]:
def txt_to_df(path):
    """Parse a log file into one row per ``/* JOBSPLIT: `` record.

    Parameters
    ----------
    path : str
        Location of the text log to read.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        * ``Text`` - the record text with the trailing ``*/`` marker removed
          (for ``STEP`` records the ``STEP SOURCE FOLLOWS */`` header is
          dropped and double newlines are collapsed instead),
        * ``Task Type`` - first whitespace-delimited token of the record,
        * ``Task ID`` - running count of ``TASKSTARTTIME`` / ``JOBENDTIME``
          records seen up to and including the row.
    """
    # 1.1 Read data; the context manager closes the handle even if read() fails
    with open(path, 'r') as f:
        content = f.read()
    df = pd.DataFrame(content.split('/* JOBSPLIT: '), columns=['Text'])

    # 1.2 Define task type
    df['Task Type'] = df['Text'].str.split().str[0]

    # 1.3 Assign task ID: every TASKSTARTTIME / JOBENDTIME record opens a new
    # task, so the ID is the cumulative number of such records
    opens_task = df['Task Type'].isin(['TASKSTARTTIME', 'JOBENDTIME'])
    df['Task ID'] = opens_task.cumsum().astype('int')

    # 1.4 Remove unnecessary text (literal replacements, not regular expressions)
    is_step = df['Task Type'] == 'STEP'
    step_text = (df['Text'].str.replace('\n\n', '\n', regex=False)
                           .str.replace('STEP SOURCE FOLLOWS */\n', '', regex=False))
    other_text = df['Text'].str.replace('*/\n', '', regex=False)
    df['Text'] = step_text.where(is_step, other_text)

    return df

df_log = txt_to_df(path)
df_log

Unnamed: 0,Text,Task Type,Task ID
0,,,0
1,JOBSTARTTIME 27JUL2021:10:34:25.72,JOBSTARTTIME,0
2,TASKSTARTTIME 27JUL2021:10:34:25.72,TASKSTARTTIME,1
3,CATALOG INPUT WORK.SASMAC1.MTF_IFRS9_PD_PIT_BA...,CATALOG,1
4,LIBNAME WORK V9 '/opt/sas/saswork/SAS_work2019...,LIBNAME,1
...,...,...,...
3976,ELAPSED 8,ELAPSED,163
3977,PROCNAME DATASETS,PROCNAME,163
3978,proc datasets lib = work nolist noprint memty...,STEP,163
3979,JOBENDTIME 27JUL2021:10:36:37.94,JOBENDTIME,164


#### 1.2. Extract info about Code

In [4]:
def df_to_code(df):
    """Collect per-task code, procedure name, start time and elapsed time.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed log with the columns ``Text``, ``Task Type`` and ``Task ID``.
        Only this frame is read; no notebook-level variable is consulted.

    Returns
    -------
    pandas.DataFrame
        One row per ``Task ID`` found in the ``STEP``, ``PROCNAME``,
        ``TASKSTARTTIME`` or ``ELAPSED`` records (outer-joined), with the
        columns ``Task ID``, ``Code``, ``Procedure``, ``Start Time`` (as
        string) and ``Elapsed Time`` (second token of the ELAPSED record
        divided by 1000).
    """
    def _records(task_type):
        # Rows of one record type, reduced to the join key and the raw text
        return df.loc[df['Task Type'].isin([task_type]), ['Task ID', 'Text']].copy()

    #-------------------------------------------
    # 2.1 Extract the Code Syntax
    df_code = _records('STEP').rename(columns={'Text': 'Code'})

    #-------------------------------------------
    # 2.2. Prepare info about Start Time
    df_taskstarttime = _records('TASKSTARTTIME')
    df_taskstarttime['Start Time'] = pd.to_datetime(df_taskstarttime['Text'].str.split().str[1],
                                                    format='%d%b%Y:%H:%M:%S.%f')
    df_taskstarttime = df_taskstarttime.drop(columns=['Text'])

    #-------------------------------------------
    # 2.3. Prepare info about Elapsed Time
    df_elapsedtime = _records('ELAPSED')
    df_elapsedtime['Elapsed Time'] = df_elapsedtime['Text'].str.split().str[1].astype('int')/1000
    df_elapsedtime = df_elapsedtime.drop(columns=['Text'])

    #-------------------------------------------
    # 2.4. Prepare info about Procedure Names
    df_procedure = _records('PROCNAME')
    df_procedure['Procedure'] = df_procedure['Text'].str.split().str[1]
    df_procedure = df_procedure.drop(columns=['Text'])

    #-------------------------------------------
    # 2.5 Output
    df_code = df_code.merge(df_procedure, on=['Task ID'], how='outer')\
                     .merge(df_taskstarttime, on=['Task ID'], how='outer')\
                     .merge(df_elapsedtime, on=['Task ID'], how='outer')

    df_code['Start Time'] = df_code['Start Time'].astype('str')
    df_code['Task ID'] = df_code['Task ID'].astype('int')

    return df_code

df_code = df_to_code(df_log)
df_code

Unnamed: 0,Task ID,Code,Procedure,Start Time,Elapsed Time
0,1,/*--------------------------------------------...,DATASTEP,2021-07-27 10:34:25.720,0.006
1,2,data _null_;\n set __mtf_pd_pit_version;\n mt...,DATASTEP,2021-07-27 10:34:25.730,0.007
2,3,proc sql noprint;\n create table __mtf_pd_pit...,SQL,2021-07-27 10:34:25.740,2.941
3,4,proc sql noprint;\n create table __mtf_pd_pit...,SQL,2021-07-27 10:34:28.680,2.490
4,5,proc sql noprint;\n select count(*) into :__i...,SQL,2021-07-27 10:34:31.170,0.128
...,...,...,...,...,...
158,159,proc transpose data=WORK.__MTF_PD_PIT_ODR_WID...,TRANSPOSE,2021-07-27 10:36:36.550,0.659
159,160,proc sql noprint _method;\n create table work...,SQL,2021-07-27 10:36:37.210,0.208
160,161,proc datasets lib = work nolist noprint memty...,DATASETS,2021-07-27 10:36:37.420,0.468
161,162,proc datasets lib = work nolist noprint memty...,DATASETS,2021-07-27 10:36:37.890,0.044


#### 1.3. Extract Edges

In [5]:
def df_to_edges(df):
    """Derive the table-to-table edge list from the parsed log.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed log with the columns ``Text``, ``Task Type`` and ``Task ID``.
        ``DATASET`` records are read as ``DATASET <type> <x> <table>`` and
        ``OPENTIME`` records as ``OPENTIME <table> DATE:<timestamp>``, where
        ``<table>`` ends in ``.<table type>``.

    Returns
    -------
    pandas.DataFrame
        One row per (input table, output table) pair of a task sub-step with
        the columns ``Task ID``, ``SubTask ID``, ``Source ID``, ``Target ID``,
        ``Input Table``, ``Output Table``, ``Start Time`` and ``Elapsed Time``.
        Node IDs have the form ``<task>:<sub-step>:<table>:<table type>``;
        tables that are read before any step wrote them get the prefix
        ``0:0:``. Missing inputs/outputs are replaced by ``No Input`` /
        ``_null_`` placeholders with matching ``...:No Input:Empty`` /
        ``...:No Output:Empty`` node IDs.
    """
    #-------------------------------------------
    # 3.1 Extract Table info
    # 3.1.a Get data
    df_tables = df.loc[df['Task Type'].isin(['DATASET', 'OPENTIME', 'TASKSTARTTIME', 'JOBENDTIME'])].copy()
    text_tokens = df_tables['Text'].str.split()
    is_dataset = df_tables['Task Type'].isin(['DATASET'])
    is_opentime = df_tables['Task Type'].isin(['OPENTIME'])

    # 3.1.b Extract table name and type
    df_tables['Table'] = np.where(is_dataset,
                                  text_tokens.str[3],
                                  np.where(is_opentime,
                                           text_tokens.str[1],
                                           ''))
    df_tables['Table type'] = df_tables['Table'].str.split('.').str[-1]                          # Extract table type
    df_tables['Table'] = df_tables['Table'].apply(lambda x: '.'.join(x.split('.')[:-1]))         # Extract table name

    # 3.1.c Define table type (Input, Output, Update)
    df_tables['Dataset type'] = np.where(is_dataset,
                                         text_tokens.str[1],                                     # Define Input, Output, Update tables
                                         '')
    df_tables['Dataset type'] = np.where(is_opentime,
                                         df_tables['Dataset type'].shift(),                      # Inherit type from the preceding row
                                         df_tables['Dataset type'])

    # 3.1.d Add sub-step indicator
    df_tables = df_tables.reset_index(drop=True)
    df_tables['Lag Task ID'] = df_tables['Task ID'].shift()
    df_tables['Lag Dataset type'] = df_tables['Dataset type'].shift()
    sub_step_list = [1]
    for index, row in df_tables.iterrows():
        if (row['Task ID'] != row['Lag Task ID']):
            sub_step_list.append(1)                       # Reset sub-step
        elif ((row['Lag Dataset type'] == 'OUTPUT') & (row['Dataset type'] == 'INPUT')) | \
             ((row['Lag Dataset type'] == 'OUTPUT') & (row['Dataset type'] == 'UPDATE')) | \
             ((row['Lag Dataset type'] == 'UPDATE') & (row['Dataset type'] == 'INPUT')):
            sub_step_list.append(sub_step_list[index] + 1) # Increase sub-step
        else:
            sub_step_list.append(sub_step_list[index])    # Keep sub-step
    df_tables['SubTask ID'] = sub_step_list[1:]

    # 3.1.e Correct case for updated tables: an UPDATE record acts both as an
    # input and as an output, so it is duplicated with both roles
    df_tables['Dataset type'] = df_tables['Dataset type'].replace({'UPDATE': 'UPDATE INPUT'})
    df_update = df_tables.loc[df_tables['Dataset type'].isin(['UPDATE INPUT'])].copy()
    df_update['Dataset type'] = df_update['Dataset type'].replace({'UPDATE INPUT': 'UPDATE OUTPUT'})
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    df_tables = pd.concat([df_tables, df_update]).sort_values(['Task ID', 'SubTask ID', 'Dataset type'])

    # 3.1.f Add node index
    node_id_dict = {}   # table name -> node ID of its most recent version
    node_id_list = []
    for index, row in df_tables.iterrows():
        # If update or output:
        if (row['Dataset type'] == 'OUTPUT') | (row['Dataset type'] == 'UPDATE OUTPUT'):
            # create node ID
            node_id = str(row['Task ID']) + ':' + \
                      str(row['SubTask ID']) + ':' + \
                      row['Table'] + ':' + \
                      row['Table type']
            # update column
            node_id_list.append(node_id)
            # update dict
            node_id_dict[row['Table']] = node_id

        # If source table:
        elif row['Table'] not in node_id_dict:
            # create node ID
            node_id = '0:0:' + \
                      row['Table'] + ':' + \
                      row['Table type']
            # update column
            node_id_list.append(node_id)
            # update dict
            node_id_dict[row['Table']] = node_id
        # If input:
        else:
            # use previous node ID
            node_id_list.append(node_id_dict[row['Table']])
    df_tables['Node Id'] = node_id_list

    # 3.1.g Remove leftover columns
    df_tables = df_tables.drop(['Table type', 'Lag Task ID', 'Lag Dataset type'], axis=1)

    #-------------------------------------------
    # 3.2. Prepare info about Input tables
    df_input = df_tables.loc[(df_tables['Task Type'].isin(['DATASET'])) &
                             (df_tables['Dataset type'].isin(['INPUT', 'UPDATE INPUT']))].copy()
    df_input = df_input.rename(columns={'Table': 'Input Table',
                                        'Node Id': 'Source ID'})
    df_input = df_input.drop(['Text', 'Dataset type', 'Task Type'], axis=1)

    #-------------------------------------------
    # 3.3. Prepare info about Output tables
    df_output = df_tables.loc[(df_tables['Task Type'].isin(['DATASET'])) &
                              (df_tables['Dataset type'].isin(['OUTPUT', 'UPDATE OUTPUT']))].copy()
    df_output = df_output.rename(columns={'Table': 'Output Table',
                                          'Node Id': 'Target ID'})
    df_output = df_output.drop(['Text', 'Dataset type', 'Task Type'], axis=1)

    #-------------------------------------------
    # 3.4. Prepare info about Time Calculations
    # 3.4.a Get Data
    df_time = df_tables.loc[df_tables['Task Type'].isin(['OPENTIME', 'TASKSTARTTIME', 'JOBENDTIME'])].copy()

    # 3.4.b Add time (third token for OPENTIME records, second token otherwise)
    df_time['Start Time'] = np.where(df_time['Task Type'].isin(['OPENTIME']),
                                     df_time['Text'].str.split().str[2].str.replace('DATE:', ''),
                                     df_time['Text'].str.split().str[1].str.replace('DATE:', ''))
    df_time['Start Time'] = pd.to_datetime(df_time['Start Time'], format='%d%b%Y:%H:%M:%S.%f')

    # 3.4.c Remove columns and rows; keep the first timestamp of each sub-step
    df_time = df_time.drop(['Table', 'Text', 'Task Type', 'Dataset type', 'Node Id'], axis=1)
    df_time = df_time.drop_duplicates(subset=['Task ID', 'SubTask ID'], keep='first')

    # 3.4.d Calculate time for each step as the gap to the next sub-step start
    df_time['Elapsed Time'] = (df_time['Start Time'].shift(-1) - df_time['Start Time']).dt.total_seconds()
    df_time['Elapsed Time'] = df_time['Elapsed Time'].round(2)

    #-------------------------------------------
    # 3.5. Output
    # 3.5.a Merge the results for input and output tables + Elapsed time
    df = df_input.merge(df_output, on=['Task ID', 'SubTask ID'], how='outer')\
                 .merge(df_time, on=['Task ID', 'SubTask ID'], how='left')\
                 .sort_values(['Task ID', 'SubTask ID'])\
                 .reset_index(drop=True)

    # 3.5.b Fill missing for specific cases
    df['Input Table'] = df['Input Table'].fillna('No Input')
    df['Output Table'] = df['Output Table'].fillna('_null_')

    # 3.5.c Correct Start time
    df['Start Time'] = df['Start Time'].fillna(df['Start Time'].shift())
    df['Elapsed Time'] = df['Elapsed Time'].fillna(0)

    # 3.5.d Correct Target node ID
    df['Target ID'] = df['Target ID'].fillna(df['Task ID'].astype('str') + ':' + df['SubTask ID'].astype('str') + ':No Output:Empty')

    # 3.5.e Correct Source node ID (columns are read by label, not by position)
    df['Source ID'] = df[['Input Table', 'Task ID', 'SubTask ID', 'Source ID']].apply(
        lambda x: str(x['Task ID']) + ':' + str(int(x['SubTask ID']) - 1) + ':No Input:Empty'
        if x['Input Table'] == 'No Input'
        else x['Source ID'], axis=1)

    # 3.5.f Reorder columns
    df = df[['Task ID', 'SubTask ID', 'Source ID', 'Target ID', 'Input Table', 'Output Table', 'Start Time', 'Elapsed Time']]

    return df

df_edges = df_to_edges(df_log)
df_edges

Unnamed: 0,Task ID,SubTask ID,Source ID,Target ID,Input Table,Output Table,Start Time,Elapsed Time
0,1,1,1:0:No Input:Empty,1:1:WORK.__MTF_PD_PIT_VERSION:DATA,No Input,WORK.__MTF_PD_PIT_VERSION,2021-07-27 10:34:25.720,0.01
1,2,1,1:1:WORK.__MTF_PD_PIT_VERSION:DATA,2:1:No Output:Empty,WORK.__MTF_PD_PIT_VERSION,_null_,2021-07-27 10:34:25.730,0.01
2,3,1,0:0:INPUT.PD_VIEW:DATA,3:1:WORK.__MTF_PD_PIT_DISTINCT_RATING1:DATA,INPUT.PD_VIEW,WORK.__MTF_PD_PIT_DISTINCT_RATING1,2021-07-27 10:34:25.740,2.93
3,3,1,0:0:INPUT.PD_VIEW:DATA,3:1:WORK.'SASTMP-000000348':UTILITY,INPUT.PD_VIEW,WORK.'SASTMP-000000348',2021-07-27 10:34:25.740,2.93
4,3,2,0:0:INPUT.PD_RR_RATING:DATA,3:2:WORK.__MTF_PD_PIT_DISTINCT_RATING2:DATA,INPUT.PD_RR_RATING,WORK.__MTF_PD_PIT_DISTINCT_RATING2,2021-07-27 10:34:28.670,0.00
...,...,...,...,...,...,...,...,...
286,160,1,159:1:WORK.__MTF_PD_PIT_ODR_LONG_DIS:DATA,160:1:WORK.TEST_PD_DIS_IN:DATA,WORK.__MTF_PD_PIT_ODR_LONG_DIS,WORK.TEST_PD_DIS_IN,2021-07-27 10:36:37.210,0.21
287,160,1,100:1:WORK.__MTF_PD_PIT_MARGINAL_BOTH_PD:DATA,160:1:WORK.TEST_PD_DIS_IN:DATA,WORK.__MTF_PD_PIT_MARGINAL_BOTH_PD,WORK.TEST_PD_DIS_IN,2021-07-27 10:36:37.210,0.21
288,161,1,161:0:No Input:Empty,161:1:WORK.'SASTMP-000000636':UTILITY,No Input,WORK.'SASTMP-000000636',2021-07-27 10:36:37.420,0.47
289,162,1,162:0:No Input:Empty,162:1:WORK.'SASTMP-000000638':UTILITY,No Input,WORK.'SASTMP-000000638',2021-07-27 10:36:37.890,0.04


## 2. Graph data

In [6]:
# Function for defining positions of tables
#====================================================================================  

def get_coords(G):
    """Compute plot coordinates for every node of the graph.

    Node names are expected to start with ``<task>:<sub-step>:``. The ``y``
    value is assigned while walking the nodes in depth-first order from each
    source node; the ``x`` value is assigned after sorting the nodes by task
    and sub-step.

    Returns a dict mapping node name to ``[x, y]``.
    """
    #--------------------------------------------------------------------
    # Source nodes: nothing points at them
    roots = sorted(n for n in G.nodes() if G.in_degree(n) == 0)

    #--------------------------------------------------------------------
    # Depth-first traversal order, one walk per source node
    ordered = []
    for root in roots:
        ordered.extend(nx.dfs_preorder_nodes(G, source=root, depth_limit=None))

    #--------------------------------------------------------------------
    # Append every node once more so that nodes missed above are present;
    # duplicates are removed below, keeping the first occurrence
    ordered.extend(sorted(G.nodes))

    #--------------------------------------------------------------------
    # Create df
    df_nodes = pd.DataFrame(ordered, columns=['Node'])
    name_parts = df_nodes['Node'].str.split(':')
    df_nodes['Task ID'] = name_parts.str[0].astype('float')
    df_nodes['SubTask ID'] = name_parts.str[1].astype('float')
    df_nodes = df_nodes.drop_duplicates(subset=['Node'], keep='first')

    #--------------------------------------------------------------------
    # Assign y: stay on the current row only when the node directly follows
    # one of its predecessors and belongs to a different (task, sub-step)
    df_nodes = df_nodes.reset_index(drop=True)
    df_nodes['Lag Node'] = df_nodes['Node'].shift()
    df_nodes['Lag Task ID'] = df_nodes['Task ID'].shift()
    df_nodes['Lag SubTask ID'] = df_nodes['SubTask ID'].shift()
    level = 0
    y_values = []
    for node, lag_node, task, lag_task, sub, lag_sub in zip(df_nodes['Node'],
                                                            df_nodes['Lag Node'],
                                                            df_nodes['Task ID'],
                                                            df_nodes['Lag Task ID'],
                                                            df_nodes['SubTask ID'],
                                                            df_nodes['Lag SubTask ID']):
        follows_parent = lag_node in list(G.predecessors(node))
        same_step = (task == lag_task) and (sub == lag_sub)
        if same_step or not follows_parent:
            level += 1
        y_values.append(level)
    df_nodes['y'] = y_values

    #--------------------------------------------------------------------
    # Reorder nodes
    df_nodes = df_nodes.sort_values(by=['Task ID', 'SubTask ID'])

    #--------------------------------------------------------------------
    # Assign x: +2 when the task changes, +1 when only the sub-step changes
    df_nodes = df_nodes.reset_index(drop=True)
    df_nodes['Lag Task ID'] = df_nodes['Task ID'].shift()
    df_nodes['Lag SubTask ID'] = df_nodes['SubTask ID'].shift()
    column = -2
    x_values = []
    for task, lag_task, sub, lag_sub in zip(df_nodes['Task ID'],
                                            df_nodes['Lag Task ID'],
                                            df_nodes['SubTask ID'],
                                            df_nodes['Lag SubTask ID']):
        if task != lag_task:
            column += 2
        elif sub != lag_sub:
            column += 1
        x_values.append(column)
    df_nodes['x'] = x_values

    #--------------------------------------------------------------------
    # Coords dict
    positions = df_nodes[['Node', 'x', 'y']].set_index('Node')
    coords = positions.T.to_dict('list')

    return coords

In [7]:
# Function for extracting data on connections
#====================================================================================  

def get_edge_df(G):
    """Collect plotting data for every edge of the graph.

    Each node must carry a ``coords`` attribute (``x, y``) and each edge the
    attributes ``Task ID``, ``SubTask ID``, ``Input Table``, ``Output Table``
    and ``Elapsed Time``.

    Returns a DataFrame with the edge end points (``x0``, ``y0``, ``x1``,
    ``y1``), a control point (``x_mean``, ``y_mean``), ``step_id``,
    ``color_values`` (elapsed time), ``text_labels`` (hover text) and ``hex``
    (color of the elapsed time on the reversed red-yellow-green scale). A
    graph without edges yields an empty frame with the same columns.
    """
    edge_dict = {'x0': [],
                 'y0': [],
                 'x1': [],
                 'y1': [],
                 'x_mean': [],
                 'y_mean': [],
                 'step_id': [],
                 'color_values': [],
                 'text_labels': []}

    for source, target, attrs in G.edges(data=True):
        #--------------------------------------------------------------------
        # Get Coords
        x0, y0 = G.nodes[source]['coords']
        x1, y1 = G.nodes[target]['coords']
        x_mean = round((x0 + x1) / 2, 4)
        y_mean = round((y0 + y1) / 2, 4)

        #--------------------------------------------------------------------
        # Shift the central point by 5% of the edge length so that straight
        # edges are drawn with a slight bend
        shift = (((x1 - x0)**2 + (y1 - y0)**2)**0.5) * 0.05
        # If vertical line (same x): move the control point sideways
        if x0 == x1:
            x_mean = x_mean + shift
        # If horizontal line (same y): move the control point along y
        elif y0 == y1:
            y_mean = y_mean - shift

        #--------------------------------------------------------------------
        # Colorbar values
        color_value = attrs['Elapsed Time']

        #--------------------------------------------------------------------
        # Hover text
        text_label = ('Task ID: ' + str(attrs['Task ID'])
                      + ' SubTask ID: ' + str(attrs['SubTask ID'])
                      + '<br> Input: ' + str(attrs['Input Table'])
                      + '<br> Output: ' + str(attrs['Output Table'])
                      + '<br> Elapsed Time: ' + str(attrs['Elapsed Time']))

        #--------------------------------------------------------------------
        # output
        edge_dict['x0'].append(x0)
        edge_dict['y0'].append(y0)
        edge_dict['x1'].append(x1)
        edge_dict['y1'].append(y1)
        edge_dict['x_mean'].append(x_mean)
        edge_dict['y_mean'].append(y_mean)
        edge_dict['step_id'].append(attrs['Task ID'])
        edge_dict['color_values'].append(color_value)
        edge_dict['text_labels'].append(text_label)

    edge_df = pd.DataFrame(edge_dict)

    # Without edges there is no value range to normalise (min()/max() of an
    # empty column would raise), so return the empty frame with its hex column
    if edge_df.empty:
        edge_df['hex'] = pd.Series(dtype='object')
        return edge_df

    #====================================================================================
    # Get color codes
    norm = matplotlib.colors.Normalize(vmin=min(edge_df['color_values']),
                                       vmax=max(edge_df['color_values']),
                                       clip=True)
    mapper = cm.ScalarMappable(norm=norm, cmap='RdYlGn_r')
    edge_df['hex'] = edge_df['color_values'].apply(lambda x: matplotlib.colors.to_hex(mapper.to_rgba(x), keep_alpha=False))

    return edge_df

In [8]:
# Function for extracting data on nodes
#====================================================================================  

def get_node_df(G):
    """Collect plotting data for every node of the graph.

    Node names have the form ``<task>:<sub-step>:<table>:<table type>`` and
    each node carries a ``coords`` attribute (``x, y``).

    Returns a DataFrame with the columns ``x``, ``y``, ``label`` (table
    name), ``step_id`` (task number as int), ``group`` (legend category),
    ``color`` and ``shape`` (marker style of the category).
    """
    # Legend category -> (marker shape, marker color)
    appearance = {'No Input': ('diamond-cross', 'grey'),
                  'No Output': ('square-cross', 'grey'),
                  'Technical Table': ('star-square-dot', 'silver'),
                  'Input Table': ('diamond', 'gold'),
                  'Output Table': ('square', 'cyan'),
                  'Updated Table': ('cross', 'orange'),
                  'Internal Table': ('circle', 'blue')}
    column_names = ['x', 'y', 'label', 'step_id', 'group', 'color', 'shape']

    rows = []
    for node in G.nodes():
        # Node coords and the pieces encoded in its name
        x, y = G.nodes[node]['coords']
        table_name = node.split(':')[2]
        step_id = node.split(':')[0]
        table_type = node.split(':')[3]

        # Table names of the direct predecessors
        parent_tables = [parent.split(':')[2] for parent in G.predecessors(node)]

        # Category; the order of the checks decides ties
        if table_name == 'No Input':
            group = 'No Input'
        elif table_name == 'No Output':
            group = 'No Output'
        elif table_type == 'UTILITY':
            group = 'Technical Table'      # temporary tables
        elif G.in_degree(node) == 0:
            group = 'Input Table'
        elif G.out_degree(node) == 0:
            group = 'Output Table'
        elif table_name in parent_tables:
            group = 'Updated Table'        # written from a table of the same name
        else:
            group = 'Internal Table'

        shape, color = appearance[group]
        rows.append((x, y, table_name, step_id, group, color, shape))

    # Column-wise dict keeps the column order fixed even when there are no nodes
    node_df = pd.DataFrame({name: [row[position] for row in rows]
                            for position, name in enumerate(column_names)})
    node_df['step_id'] = node_df['step_id'].astype('int')

    return node_df

In [9]:
def get_graph_data(df):
    """Build the directed table graph and derive the node and edge plot tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with the columns ``Source ID``, ``Target ID``, ``Task ID``,
        ``SubTask ID``, ``Input Table``, ``Output Table``, ``Start Time`` and
        ``Elapsed Time``.

    Returns
    -------
    tuple
        ``(node_df, edge_df)`` as produced by ``get_node_df`` and
        ``get_edge_df``. A one-line summary of the graph size is printed.
    """
    # Create graph
    G = nx.from_pandas_edgelist(df,
                                'Source ID', 'Target ID',
                                ['Task ID', 'SubTask ID', 'Input Table', 'Output Table', 'Start Time', 'Elapsed Time'],
                                create_using=nx.DiGraph())
    # Summary line built by hand: nx.info() is no longer available in
    # networkx 3.x, while these counters exist in every version
    print('{} with {} nodes and {} edges'.format(type(G).__name__,
                                                 G.number_of_nodes(),
                                                 G.number_of_edges()))

    # Define positions of tables
    coords = get_coords(G)
    # Add positions to the graph
    nx.set_node_attributes(G, coords, 'coords')

    # Create table for connections and label text
    edge_df = get_edge_df(G)

    # Create table for nodes
    node_df = get_node_df(G)

    return node_df, edge_df

node_df, edge_df = get_graph_data(df_edges)

DiGraph with 241 nodes and 291 edges


In [10]:
print(len(node_df))
print(len(edge_df))

241
291


In [11]:
edge_df

Unnamed: 0,x0,y0,x1,y1,x_mean,y_mean,step_id,color_values,text_labels,hex
0,2,78,3,78,2.5,77.95,1,0.01,Task ID: 1 SubTask ID: 1<br> Input: No Input<b...,#006837
1,3,78,5,78,4.0,77.90,2,0.01,Task ID: 2 SubTask ID: 1<br> Input: WORK.__MTF...,#006837
2,0,30,7,30,3.5,29.65,3,2.93,Task ID: 3 SubTask ID: 1<br> Input: INPUT.PD_V...,#249d53
3,0,30,7,31,3.5,30.50,3,2.93,Task ID: 3 SubTask ID: 1<br> Input: INPUT.PD_V...,#249d53
4,0,30,11,32,5.5,31.00,4,2.46,Task ID: 4 SubTask ID: 1<br> Input: INPUT.PD_V...,#18954f
...,...,...,...,...,...,...,...,...,...,...
286,353,41,355,41,354.0,40.90,159,0.32,Task ID: 159 SubTask ID: 1<br> Input: WORK.__M...,#036e3a
287,355,41,358,17,356.5,29.00,160,0.21,Task ID: 160 SubTask ID: 1<br> Input: WORK.__M...,#026c39
288,360,75,361,75,360.5,74.95,161,0.47,Task ID: 161 SubTask ID: 1<br> Input: No Input...,#04703b
289,363,76,364,76,363.5,75.95,162,0.04,Task ID: 162 SubTask ID: 1<br> Input: No Input...,#006837


In [12]:
df_test = edge_df.merge(node_df, on=['step_id'], how = 'outer')
df_test.to_csv('test.csv', sep = ';')

## 3. Dash plot

### 1. Node Trace

In [None]:
def get_node_trace(df):
    """Create one marker trace per node.

    ``df`` needs the columns ``x``, ``y``, ``shape``, ``color``, ``group``,
    ``label`` and ``step_id``. Every trace is named after the node's group
    (used as legend entry) and carries the step id in ``meta``.

    Returns the list of ``go.Scatter`` traces in row order.
    """
    def _single_node(node):
        # One-point scatter styled with the node's own marker settings
        return go.Scatter(x=[node['x']],
                          y=[node['y']],
                          mode='markers',
                          # Marker
                          marker=dict(symbol=node['shape'],
                                      color=node['color'],
                                      size=10),
                          name=node['group'],
                          marker_line_color='black',
                          marker_line_width=0.5,
                          # Text
                          text=str(node['label']),
                          textposition='top center',
                          # Legend
                          showlegend=True,
                          # Meta
                          meta=node['step_id'])

    return [_single_node(node) for _, node in df.iterrows()]

In [None]:
# network plot
node_trace = get_node_trace(node_df)

### 2. Edge Trace

In [None]:
def get_edge_trace(df):
    """Create one dotted spline trace per edge.

    ``df`` needs the columns ``x0``, ``y0``, ``x_mean``, ``y_mean``, ``x1``,
    ``y1``, ``hex`` and ``step_id``. Each line runs from the start point
    through the control point to the end point, is drawn in the edge's color,
    has hover disabled and carries the step id in ``meta``.

    Returns the list of ``go.Scatter`` traces in row order.
    """
    def _single_edge(edge):
        xs = [edge['x0'], edge['x_mean'], edge['x1'], None]
        ys = [edge['y0'], edge['y_mean'], edge['y1'], None]
        return go.Scatter(x=xs,
                          y=ys,
                          mode='lines',
                          # line
                          line_shape='spline',
                          line=dict(width=1,
                                    dash='dot',
                                    color=edge['hex']),
                          # Text
                          hoverinfo='none',
                          # Legend
                          showlegend=False,
                          # Meta
                          meta=edge['step_id'])

    return [_single_edge(edge) for _, edge in df.iterrows()]

In [None]:
edge_trace = get_edge_trace(edge_df)

### 3. Text trace

In [None]:
def get_text_trace(df):
    """Create the marker trace that carries the hover text of all edges.

    ``df`` needs the columns ``x_mean``, ``y_mean``, ``color_values``,
    ``text_labels`` and ``step_id``. One hexagram marker is placed on the
    control point of each edge, colored by ``color_values`` on the reversed
    ``RdYlGn`` scale, and the trace owns the figure's colorbar.

    Returns a single ``go.Scatter`` trace.
    """
    text_trace = go.Scatter(x=df['x_mean'],
                            y=df['y_mean'],
                            # Marker
                            mode='markers',
                            marker_symbol='hexagram',
                            marker=dict(showscale=True,
                                        colorscale='RdYlGn',
                                        reversescale=True,
                                        size=8,
                                        color=df['color_values'],
                                        # The title text and side are given in the
                                        # nested form; the flat `titleside` attribute
                                        # is deprecated and rejected by newer plotly
                                        colorbar=dict(thickness=15,
                                                      title=dict(text='Execution time (s)',
                                                                 side='right'),
                                                      xanchor='left')
                                       ),
                            # Text
                            text=df['text_labels'],
                            textposition='top center',
                            # Legend
                            showlegend=False,
                            # Meta
                            meta=df['step_id']
                           )

    return text_trace

In [None]:
text_trace = get_text_trace(edge_df)

### 4. Table filtering

In [None]:
# Operator dict
# Filter operators; each inner list holds the spellings of one operator and
# its first entry (stripped) is the name returned by split_filter_part
operators = [['ge ', '>='],
             ['le ', '<='],
             ['lt ', '<'],
             ['gt ', '>'],
             ['ne ', '!='],
             ['eq ', '='],
             ['contains '],
             ['datestartswith ']]

#======================================================================
# Filter query syntax split
def split_filter_part(filter_part):
    """Split one filter expression into column name, operator and value.

    The operators are tried in the order of ``operators`` and the first
    spelling found anywhere in ``filter_part`` wins. The column name is the
    text between the first ``{`` and the last ``}`` left of the operator.
    A value wrapped in matching ``'``, ``"`` or backtick quotes is unquoted;
    otherwise it is converted to ``float`` when possible and kept as text
    when not. A missing value is returned as an empty string.

    Returns ``(name, operator, value)``, or ``[None, None, None]`` when no
    operator is present.
    """
    for operator_type in operators:
        for operator in operator_type:
            if operator in filter_part:
                name_part, value_part = filter_part.split(operator, 1)
                name = name_part[name_part.find('{') + 1: name_part.rfind('}')]

                value_part = value_part.strip()
                # Guard against an empty value before looking at its first character
                v0 = value_part[0] if value_part else ''
                if value_part and v0 == value_part[-1] and v0 in ("'", '"', '`'):
                    value = value_part[1: -1].replace('\\' + v0, v0)
                else:
                    try:
                        value = float(value_part)
                    except ValueError:
                        value = value_part

                # word operators need spaces after them in the filter string,
                # but we don't want these later
                return name, operator_type[0].strip(), value

    return [None] * 3


### 5. Plot

In [None]:
# Assemble all edge, node and text traces into a single figure.
fig_net = go.Figure(edge_trace + node_trace + [text_trace])
# NOTE(review): the loop prints the complete figure on every iteration and
# never uses `elements`; this looks like leftover debugging output - confirm
# whether printing the figure once (or each element) was intended.
for elements in fig_net:
    print(fig_net)

In [None]:
# Server start
app = dash.Dash()

# Network figure: edges first, then nodes, then the hover/colorbar trace
fig_net = go.Figure(edge_trace + node_trace + [text_trace])
fig_net.update_layout(
    xaxis={'showgrid': False, 'zeroline': False},
    # y axis is reversed so that increasing y values are drawn downwards
    yaxis={'showgrid': False, 'zeroline': False, 'autorange': 'reversed'},
    legend={'orientation': 'h',
            'x': 1, 'y': 1.02,
            'xanchor': 'right', 'yanchor': 'bottom'},
    height=450,
)

#======================================================================
# Suppress duplicate legend entries: the first trace of each name keeps its
# legend setting, every later trace with the same name is hidden from it
names_legend = set()
for trace in fig_net.data:
    if trace.name in names_legend:
        trace.update(showlegend=False)
    else:
        names_legend.add(trace.name)

#======================================================================
# Dash plot
app.layout = html.Div([
    # Network plot
    dcc.Graph(id='graph', figure=fig_net),
    # Table base
    html.Div(
        className='row',
        children=[
            # Table with one row per task of df_code
            dash_table.DataTable(
                id='table',
                columns=[{'name': column, 'id': column} for column in df_code.columns],
                data=df_code.to_dict('records'),
                # Filtering
                filter_action='native',
                # Sorting
                sort_action='native',
                sort_mode='multi',
                # Table size
                fixed_rows={'headers': True},
                style_table={'height': 450,
                             'overflowY': 'scroll',
                             'border': 'thin lightgrey solid'},
                # Column size
                style_cell_conditional=[
                    {'if': {'column_id': column}, 'width': width}
                    for column, width in [('Task ID', '5%'),
                                          ('Code', '75%'),
                                          ('Procedure', '5%'),
                                          ('Start Time', '10%'),
                                          ('Elapsed Time', '5%')]
                ],
                # Cell colors
                style_header={'fontWeight': 'bold',
                              'backgroundColor': 'rgb(48, 84, 150)',
                              'color': 'white'},
                style_filter={'backgroundColor': 'rgb(142, 169, 219)',
                              'color': 'white'},
                style_cell={'textAlign': 'left',
                            'whiteSpace': 'pre-line',
                            'height': 'auto',
                            'backgroundColor': 'rgb(217, 225, 242)',
                            'color': 'black',
                            'minWidth': '10px',
                            'maxWidth': '800px'},
            ),
        ],
    ),
])

#======================================================================
# Add Scatter -> Table interactions
@app.callback(
    Output('table', 'data'),
    Input('graph', 'selectedData'))
def update_table(selectedData):
    """Restrict the table rows to the tasks selected on the graph.

    Parameters
    ----------
    selectedData : dict or None
        Value of the graph's ``selectedData`` property. When it is a dict,
        its ``'points'`` entries are scanned for a ``'meta'`` value, which is
        converted to an integer and matched against ``df_code['Task ID']``.

    Returns
    -------
    list of dict
        ``df_code`` (all rows when nothing is selected, otherwise only the
        rows whose ``Task ID`` was selected) as a list of records.
    """
    # Nothing selected on the graph: show the full table.
    if selectedData is None:
        return df_code.to_dict('records')

    # Collect the ids in a set so duplicates are dropped *after* the int
    # conversion (comparing the raw meta against converted ids never matches
    # when meta arrives as a string or float).
    task_ids = set()
    for point in selectedData.get('points', []):
        meta = point.get('meta')
        if meta is None:
            continue
        try:
            task_ids.add(int(meta))
        except (TypeError, ValueError):
            # Points whose meta is not a task number cannot match any row.
            continue

    selected_rows = df_code.loc[df_code['Task ID'].isin(list(task_ids))]
    return selected_rows.to_dict('records')
    
#======================================================================
# Add Table -> Scatter interactions
@app.callback(
    Output('graph', 'figure'),
    Input('table', 'filter_query'))
def update_scatter2(filter_query):
    """Highlight on the graph the tasks that match the table's filter.

    Parameters
    ----------
    filter_query : str or None
        Value of the table's ``filter_query`` property. Individual filters
        are separated by ``' && '`` and each one is parsed by
        ``split_filter_part`` into ``(column, operator, value)``.

    Returns
    -------
    plotly figure
        ``fig_net`` itself when no filter is active; otherwise a copy of it
        in which traces whose integer ``meta`` is a matching ``Task ID`` have
        all their points selected and the other integer-``meta`` traces have
        an empty selection. Traces without an integer ``meta`` are untouched.
    """
    #--------------------------------------------------------------------
    # 0. No active filter (None or the empty string): show the base figure.
    if not filter_query:
        return fig_net

    #--------------------------------------------------------------------
    # 1. Fetch task ids based on the filter query
    dff = df_code
    for filter_part in filter_query.split(' && '):
        col_name, operator, filter_value = split_filter_part(filter_part)
        if operator in ('eq', 'ne', 'lt', 'le', 'gt', 'ge'):
            # These operator names are also pandas Series comparison methods.
            dff = dff.loc[getattr(dff[col_name], operator)(filter_value)]
        elif operator == 'contains':
            # Literal substring match: user text such as "(" must not be
            # parsed as a regular expression, and missing values never match.
            dff = dff.loc[dff[col_name].str.contains(filter_value,
                                                     regex=False, na=False)]
        elif operator == 'datestartswith':
            dff = dff.loc[dff[col_name].str.startswith(filter_value, na=False)]
    matching_ids = set(dff['Task ID'].tolist())

    #--------------------------------------------------------------------
    # 2. Build the highlighted figure on a copy, so the shared module-level
    #    figure is never mutated and clearing the filter restores it.
    fig = go.Figure(fig_net)
    for trace in fig.data:
        meta = trace['meta']
        if not isinstance(meta, (int, np.integer)):
            continue
        if meta in matching_ids:
            # ``selectedpoints`` holds point indices *within* the trace, so
            # select every point the trace has (or clear the selection state
            # when the point count is unknown).
            x_values = trace['x']
            trace.selectedpoints = (list(range(len(x_values)))
                                    if x_values is not None else None)
        else:
            # Empty list = empty selection: every point gets the
            # "unselected" styling.
            trace.selectedpoints = []

    return fig

#======================================================================
# Start the Dash server for `app` with debug mode disabled.
app.run_server(debug=False)