In [1]:
# libraries
import pandas as pd
import numpy as np

## 1. Log parsing

In [2]:
# Location of the log file; it is passed to log_to_table() further down.
path = 'OLD_PIT.txt'

In [3]:
def _select_datasets(df_log, dataset_type, table_column):
    """Return the (table, Step) pairs of the DATASET rows of one dataset type.

    Parameters
    ----------
    df_log : pandas.DataFrame
        Parsed log holding the "Action Type", "Dataset type", "Table" and
        "Step" columns.
    dataset_type : str
        Value of "Dataset type" to keep ("INPUT", "OUTPUT" or "UPDATE").
    table_column : str
        Name given to the "Table" column in the result.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``[table_column, "Step"]``.
    """
    keep = (df_log["Action Type"] == "DATASET") & (df_log["Dataset type"] == dataset_type)
    return df_log.loc[keep, ["Table", "Step"]].rename(columns={"Table": table_column})


def log_to_table(path):
    """Turn the JOBSPLIT records of a log file into a table of table-to-table links.

    Only lines containing "JOBSPLIT" are used. For each of them the first and
    the last space-separated token are discarded and the "JOBSPLIT: " prefix is
    removed; the first remaining token is the action type. Only the actions
    DATASET, OPENTIME, TASKSTARTTIME and JOBENDTIME are kept:

    * DATASET  : token 1 is the dataset type (INPUT / OUTPUT / UPDATE) and
                 token 3 the table name.
    * OPENTIME : token 1 is the table name, token 2 the "DATE:" stamp and the
                 last token the "SIZE:" value.
    * other    : token 1 is the "DATE:" stamp.

    (Presumably these are SAS log lines such as
    ``/* JOBSPLIT: DATASET INPUT SEQ WORK.T.DATA */`` - confirm against a log.)

    Parameters
    ----------
    path : str or path-like
        Log file to read.

    Returns
    -------
    df : pandas.DataFrame
        One row per (input table, output table) pair of a step, with the columns
        'Step', 'Source Node Id', 'Target Node Id', 'Input Table',
        'Output Table', 'Start Time', 'Elapsed Time' and 'Action'.
        Missing table names are reported as "No Input". 'Start Time' and
        'Elapsed Time' stay empty (NaT / NaN) for a step without any time record.
    node_attr : dict
        Maps a 'Target Node Id' to its non-empty 'Action'
        ("UPDATE", "CREATED" or "SORTED").
    """
    #====================================================================================
    # 1. Get columns from text

    # 1.1. Read data: every log line becomes one row of the "Text" column
    df_log = pd.read_csv(path, sep="\t", header=None, names=["Text"])

    # 1.2. Keep only needed logs (copy, so the columns added below do not write into a view)
    is_jobsplit = df_log["Text"].str.contains("JOBSPLIT", regex=False, na=False)
    df_log = df_log.loc[is_jobsplit].copy()

    # 1.3. Drop the first and last token of each line and remove the "JOBSPLIT: " keyword
    df_log["Text"] = (df_log["Text"]
                      .apply(lambda line: " ".join(line.split(" ")[1:-1]))
                      .str.replace("JOBSPLIT: ", "", regex=False))

    # 1.4. Define action type
    df_log["Action Type"] = df_log["Text"].str.split().str[0]

    # 1.5. Keep the correct groups
    wanted_actions = ["DATASET", "OPENTIME", "TASKSTARTTIME", "JOBENDTIME"]
    df_log = df_log.loc[df_log["Action Type"].isin(wanted_actions)].copy()

    # 1.6. Get table name (empty for the pure time records)
    tokens = df_log["Text"].str.split()
    df_log["Table"] = np.where(df_log["Action Type"] == "DATASET",
                               tokens.str[3],
                               np.where(df_log["Action Type"] == "OPENTIME",
                                        tokens.str[1],
                                        ""))
    # The part after the last dot is the table type, the rest is the table name
    df_log["Table type"] = df_log["Table"].str.split(".").str[-1]
    df_log["Table"] = df_log["Table"].apply(lambda name: ".".join(name.split(".")[:-1]))

    # 1.7. Keep the correct tables
    df_log = df_log.loc[df_log["Table type"] != "UTILITY"].copy()

    # 1.8. Define type of dataset; an OPENTIME row takes the value of the row right before it
    df_log["Dataset type"] = np.where(df_log["Action Type"] == "DATASET",
                                      df_log["Text"].str.split().str[1],
                                      "")
    df_log["Dataset type"] = np.where(df_log["Action Type"] == "OPENTIME",
                                      df_log["Dataset type"].shift(),
                                      df_log["Dataset type"])

    # 1.9. Add IDs for each process: a new step starts on the transitions
    #      OUTPUT -> INPUT, OUTPUT -> UPDATE and UPDATE -> INPUT
    df_log = df_log.reset_index(drop=True)
    current_type = df_log["Dataset type"]
    previous_type = current_type.shift()
    starts_new_step = (((previous_type == "OUTPUT") & current_type.isin(["INPUT", "UPDATE"]))
                       | ((previous_type == "UPDATE") & (current_type == "INPUT")))
    df_log["Step"] = starts_new_step.cumsum() + 1

    # 1.10. Drop columns
    df_log = df_log.drop(columns=["Table type"])

    #====================================================================================
    # 2. Dataset Manipulations
    # 2.1. Input dataset
    df_input = _select_datasets(df_log, "INPUT", "Input Table")

    # 2.2. Output dataset
    df_output = _select_datasets(df_log, "OUTPUT", "Output Table")

    # 2.3. Update dataset: an updated table is both the input and the output
    df_update = _select_datasets(df_log, "UPDATE", "Input Table")
    df_update["Output Table"] = df_update["Input Table"]

    #====================================================================================
    # 3. Time dictionary
    df_time = df_log.loc[df_log["Action Type"].isin(["OPENTIME", "TASKSTARTTIME", "JOBENDTIME"])].copy()
    time_tokens = df_time["Text"].str.split()
    # Pick the token first and clean it afterwards: when no record has a third
    # token, `.str[2]` is an all-NaN float column on which `.str` is not available.
    stamps = pd.Series(np.where(df_time["Action Type"] == "OPENTIME",
                                time_tokens.str[2],
                                time_tokens.str[1]),
                       index=df_time.index,
                       dtype=object)
    df_time["Start Time"] = pd.to_datetime(stamps.str.replace("DATE:", "", regex=False),
                                           format="%d%b%Y:%H:%M:%S.%f")
    # The first time record of a step is its start time
    df_time = df_time[["Step", "Start Time"]].drop_duplicates(subset=["Step"], keep="first")
    # A step lasts until the next step starts; the last step has no successor and gets 0
    elapsed = (df_time["Start Time"].shift(-1) - df_time["Start Time"]).dt.total_seconds()
    df_time["Elapsed Time"] = elapsed.round(2).fillna(0)

    #====================================================================================
    # 4. Table size dictionary
    # NOTE: "Input File Size" is merged below but is not part of the returned columns.
    df_table = df_log.loc[df_log["Action Type"] == "OPENTIME", ["Text", "Table", "Step"]].copy()
    df_table["Input File Size"] = (df_table["Text"].str.split().str[-1]
                                   .str.replace("SIZE:", "", regex=False))
    df_table = (df_table.rename(columns={"Table": "Input Table"})
                        .drop(columns=["Text"])
                        .drop_duplicates(subset=["Input Table", "Step"], keep="first"))

    #====================================================================================
    # 5. Output
    # 5.1. Merge the results for input, output and updated tables + Time and Size dictionaries
    #      (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    df_links = pd.concat([df_input.merge(df_output, on="Step", how="outer"), df_update])
    df = (df_links.merge(df_time, on="Step", how="left")
                  .merge(df_table, on=["Step", "Input Table"], how="left")
                  .sort_values(["Step", "Start Time"])
                  .reset_index(drop=True))
    # Only the text columns get the placeholder, so the time columns keep their dtype
    df = df.fillna({"Input Table": "No Input",
                    "Output Table": "No Input",
                    "Input File Size": "No Input"})

    # 5.2. Indicate special actions (SORT, UPDATE AND CREATE TABLE)
    is_sort_tmp = df["Output Table"].str.contains("SORTTMP", regex=False).astype(bool)
    df["Action"] = np.where(df["Input Table"] == df["Output Table"],
                            "UPDATE",
                            np.where(df["Input Table"] == "No Input",
                                     "CREATED",
                                     np.where(is_sort_tmp, "SORTED", "")))

    # 5.3. Correct action labels: every row of a step that sorts is "SORTED".
    #      Flagging per step (instead of merging the SORTED rows back) cannot
    #      duplicate rows when a step has several SORTTMP rows.
    sorted_steps = df.loc[df["Action"] == "SORTED", "Step"].unique()
    df.loc[df["Step"].isin(sorted_steps), "Action"] = "SORTED"

    # 5.4. Drop tmp sorting output
    df = df.loc[~is_sort_tmp].copy()

    # 5.5. Add target node index
    df["Target Node Id"] = "Step " + df["Step"].astype("str") + ": " + df["Output Table"]

    # 5.6. Add source node index: the source of a row is the node that last
    #      produced its input table
    latest_node = {}
    source_node_ids = []
    node_attr = {}
    for step, table_in, table_out, target_id, action in zip(df["Step"],
                                                            df["Input Table"],
                                                            df["Output Table"],
                                                            df["Target Node Id"],
                                                            df["Action"]):
        if table_in in latest_node:
            source_node_ids.append(latest_node[table_in])
        elif table_in == "No Input":
            source_node_ids.append("Step " + str(step) + ": " + "No Input")
        else:
            # Table never produced by an earlier row: treat it as an external input
            source_node_ids.append("Step 0: " + table_in)
        # Update keys
        latest_node[table_out] = target_id

        # Node attributes
        if action != "":
            node_attr[target_id] = action
    df["Source Node Id"] = source_node_ids

    # 5.7. Reorder columns
    df = df[["Step", "Source Node Id", "Target Node Id", "Input Table", "Output Table",
             "Start Time", "Elapsed Time", "Action"]]

    return df, node_attr

In [4]:
# Parse the log at `path`: `df` is the table of links between tables,
# `node_attr` the second value returned by log_to_table().
df,node_attr = log_to_table(path)
# Bare expression so that the notebook renders the table below the cell.
df

Unnamed: 0,Step,Source Node Id,Target Node Id,Input Table,Output Table,Start Time,Elapsed Time,Action
0,1,Step 1: No Input,Step 1: WORK.__MTF_PD_PIT_VERSION,No Input,WORK.__MTF_PD_PIT_VERSION,2021-07-23 18:00:00.780,0.06,CREATED
1,2,Step 1: WORK.__MTF_PD_PIT_VERSION,Step 2: WORK.__MTF_PD_PIT_DISTINCT_RATING1,WORK.__MTF_PD_PIT_VERSION,WORK.__MTF_PD_PIT_DISTINCT_RATING1,2021-07-23 18:00:00.840,3.35,
2,2,Step 0: INPUT.PD_VIEW,Step 2: WORK.__MTF_PD_PIT_DISTINCT_RATING1,INPUT.PD_VIEW,WORK.__MTF_PD_PIT_DISTINCT_RATING1,2021-07-23 18:00:00.840,3.35,
3,3,Step 0: INPUT.PD_RR_RATING,Step 3: WORK.__MTF_PD_PIT_DISTINCT_RATING2,INPUT.PD_RR_RATING,WORK.__MTF_PD_PIT_DISTINCT_RATING2,2021-07-23 18:00:04.190,0.00,
4,4,Step 2: WORK.__MTF_PD_PIT_DISTINCT_RATING1,Step 4: WORK.__MTF_PD_PIT_DISTINCT_ZVAL1,WORK.__MTF_PD_PIT_DISTINCT_RATING1,WORK.__MTF_PD_PIT_DISTINCT_ZVAL1,2021-07-23 18:00:04.190,0.36,
...,...,...,...,...,...,...,...,...
140,80,Step 79: WORK.__MTF_PD_MARK_RATING_MIG_AVG1_20,Step 80: WORK.TEST_PD_STB_IN,WORK.__MTF_PD_MARK_RATING_MIG_AVG1_20,WORK.TEST_PD_STB_IN,2021-07-23 18:03:23.680,0.83,
141,80,Step 79: WORK.__MTF_PD_MARK_RATING_MIG_AVG1_20,Step 80: WORK.__MTF_PD_PIT_ODR_WIDE_DIS,WORK.__MTF_PD_MARK_RATING_MIG_AVG1_20,WORK.__MTF_PD_PIT_ODR_WIDE_DIS,2021-07-23 18:03:23.680,0.83,
142,81,Step 80: WORK.__MTF_PD_PIT_ODR_WIDE_DIS,Step 81: WORK.__MTF_PD_PIT_ODR_WIDE_TP_DIS,WORK.__MTF_PD_PIT_ODR_WIDE_DIS,WORK.__MTF_PD_PIT_ODR_WIDE_TP_DIS,2021-07-23 18:03:24.510,0.19,
143,82,Step 81: WORK.__MTF_PD_PIT_ODR_WIDE_TP_DIS,Step 82: WORK.__MTF_PD_PIT_ODR_LONG_DIS,WORK.__MTF_PD_PIT_ODR_WIDE_TP_DIS,WORK.__MTF_PD_PIT_ODR_LONG_DIS,2021-07-23 18:03:24.700,1.05,


In [5]:
# Write the table to 'test.csv' with ';' as the field separator.
df.to_csv('test.csv', sep = ";")

# Network visualisation

In [6]:
# libraries
import networkx as nx

import matplotlib
import matplotlib.cm as cm
from matplotlib import pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

#### Graph Setup

In [15]:
def get_coordinates(G):
    """Place the nodes of G and compute a bent three-point path for every edge.

    Node positions come from ``nx.multipartite_layout`` using the node
    attribute "offset" as subset key; they are rounded to 6 decimals and
    stored on the graph as the node attribute "pos".

    Every edge is described by its start point, a control point and its end
    point. The control point is the (rounded) middle of the edge moved away
    from the straight line by 5% of the edge length, so that the edge can be
    drawn bent instead of straight.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes carry an "offset" attribute.

    Returns
    -------
    tuple
        ``(G, edge_x_coords, edge_y_coords, edge_text_x_coords, edge_text_y_coords)``
        where the first two lists hold ``[start, control, end]`` per edge and
        the last two hold the control point coordinates per edge.
    """
    #====================================================================================
    # 1. Node coordinates, rounded and stored on the graph
    layout = nx.multipartite_layout(G, subset_key="offset")
    pos = {name: [round(xy[0], 6), round(xy[1], 6)] for name, xy in layout.items()}
    nx.set_node_attributes(G, pos, "pos")

    #====================================================================================
    # 2. Control point of every edge
    edge_x_coords, edge_y_coords = [], []
    edge_text_x_coords, edge_text_y_coords = [], []
    for source, target in G.edges():

        # 2.1 End points, middle point and shift length
        x0, y0 = G.nodes[source]['pos']
        x1, y1 = G.nodes[target]['pos']
        x_mid = round((x0 + x1) / 2, 6)
        y_mid = round((y0 + y1) / 2, 6)
        shift = (((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5) * 0.05

        # 2.2 Move the middle point
        if x0 == x1:
            # Both ends share the same x (vertical edge): move to the left
            x_mid = x_mid - shift
        elif y0 == y1:
            # Both ends share the same y (horizontal edge): move away from y = 0
            y_mid = y_mid + shift if y0 >= 0 else y_mid - shift
        else:
            # Diagonal edge: split the shift into an x part and a y part
            y_x_proportion = abs((x1 - x0) / (y1 - y0))
            l_y = ((shift ** 2) / (y_x_proportion ** 2 + 1)) ** 0.5
            l_x = l_y * y_x_proportion

            signs = None
            if y0 < y1:
                # Rising edge; the direction depends on whether it ends at or below y = 0
                signs = (1, -1) if y1 <= 0 else (-1, 1)
            elif y0 > y1:
                # Falling edge; the direction depends on whether it starts at or below y = 0
                signs = (-1, -1) if y0 <= 0 else (1, 1)

            if signs is not None:
                x_mid = x_mid + signs[0] * l_x
                y_mid = y_mid + signs[1] * l_y

        # 2.3 Output
        edge_x_coords.append([x0, x_mid, x1])
        edge_y_coords.append([y0, y_mid, y1])
        edge_text_x_coords.append(x_mid)
        edge_text_y_coords.append(y_mid)

    return G, edge_x_coords, edge_y_coords, edge_text_x_coords, edge_text_y_coords

In [16]:
def get_edge_trace(G, 
                   edge_x_coords, edge_y_coords, 
                   edge_text_x_coords, edge_text_y_coords):
    """Build the Plotly traces that draw the edges of G and their hover markers.

    Each edge becomes its own dotted spline coloured by its "Elapsed Time"
    attribute (colormap 'RdYlGn_r', scaled between the smallest and largest
    value found on the edges). One extra marker trace, placed at the given
    text coordinates, carries the hover text and the colour bar.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges carry the attributes 'Input Table', 'Output Table',
        'Step', 'Start Time' and 'Elapsed Time'.
    edge_x_coords, edge_y_coords : list
        Per edge, the x / y coordinates of the points of its line, in the
        order of ``G.edges()``.
    edge_text_x_coords, edge_text_y_coords : list
        Per edge, the x / y coordinate of its hover marker.

    Returns
    -------
    edge_trace : list of plotly.graph_objects.Scatter
        One line trace per edge (empty when G has no edges).
    edge_text_trace : plotly.graph_objects.Scatter
        Marker trace with the hover text and the colour bar.
    """
    #====================================================================================        
    # 1. Edge text values
    edge_text = []
    edge_color_values = []
    for _, _, attrs in G.edges(data=True):
        edge_color_values.append(attrs['Elapsed Time'])
        edge_text.append(' <br> '.join(['Input Table: '  + str(attrs['Input Table']),
                                        'Output Table: ' + str(attrs['Output Table']),
                                        'Step: '         + str(attrs['Step']),
                                        'Start Time: '   + str(attrs['Start Time']),
                                        'Elapsed Time: ' + str(attrs['Elapsed Time'])]))

    #====================================================================================        
    # 2. Map colors to values
    edge_color = []
    if edge_color_values:  # min()/max() raise on an empty sequence
        norm = matplotlib.colors.Normalize(vmin=min(edge_color_values),
                                           vmax=max(edge_color_values),
                                           clip=True)
        mapper = cm.ScalarMappable(norm=norm, cmap='RdYlGn_r')
        edge_color = [matplotlib.colors.to_hex(mapper.to_rgba(value), keep_alpha=False)
                      for value in edge_color_values]

    #====================================================================================        
    # 3. Edge plot: each edge is a separate trace because a trace has a single line color
    edge_trace = [go.Scatter(x = xs,
                             y = ys,
                             mode = 'lines',
                             line_shape = 'spline',
                             line = dict(width=1,
                                         dash='dot',
                                         color=color),
                             hoverinfo = 'none')
                  for xs, ys, color in zip(edge_x_coords, edge_y_coords, edge_color)]

    #====================================================================================        
    # 4. Edge text plot
    edge_text_trace = go.Scatter(x = edge_text_x_coords,
                                 y = edge_text_y_coords, 
                                 # Marker
                                 mode = 'markers',
                                 marker_symbol = 'hexagram',
                                 marker=dict(showscale=True, 
                                             colorscale='RdYlGn', 
                                             reversescale=True, 
                                             size = 8,
                                             color=edge_color_values, 
                                             # `title=dict(text=..., side=...)` replaces the
                                             # deprecated `titleside` attribute, which newer
                                             # Plotly releases no longer accept.
                                             colorbar=dict(thickness=15,
                                                           title=dict(text='Execution time (s)',
                                                                      side='right'),
                                                           xanchor='left')
                                            ),
                                 # Text
                                 text = edge_text, 
                                 textposition = 'top center', 
                                 hovertemplate = ' %{text}')

    return edge_trace, edge_text_trace

In [17]:
def get_node_trace(G):
    """Build the Plotly marker trace that draws the nodes of G.

    Coordinates are read from the node attribute "pos"; the hover text is the
    part of the node name after ": ". Symbol and colour are chosen from the
    node attribute "node_type" and the node's degree, first match wins:

    1. node_type "CREATED"  -> grey "diamond-cross"
    2. no incoming edge     -> gold "diamond"
    3. no outgoing edge     -> cyan "square"
    4. node_type "UPDATE"   -> orange "cross"
    5. node_type "SORTED"   -> purple "pentagon"
    6. anything else        -> blue "circle"

    Parameters
    ----------
    G : networkx graph
        Directed graph whose nodes carry the attributes "pos" and "node_type".

    Returns
    -------
    plotly.graph_objects.Scatter
        One marker per node.
    """
    def marker_style(name):
        """Return the (symbol, colour) pair of one node."""
        node_type = G.nodes[name]["node_type"]
        if node_type == "CREATED":
            return "diamond-cross", "grey"
        if G.in_degree(name) == 0:
            return "diamond", "gold"
        if G.out_degree(name) == 0:
            return "square", "cyan"
        if node_type == "UPDATE":
            return "cross", "orange"
        if node_type == "SORTED":
            return "pentagon", "purple"
        return "circle", "blue"

    #====================================================================================            
    # 1. Node attributes
    node_x, node_y = [], []
    node_table = []
    node_color, node_shape = [], []
    for name in G.nodes():
        x, y = G.nodes[name]['pos']
        node_x.append(x)
        node_y.append(y)

        node_table.append(name.split(": ")[1])

        shape, color = marker_style(name)
        node_shape.append(shape)
        node_color.append(color)

    #====================================================================================        
    # 2. Node plot
    node_trace = go.Scatter(x = node_x,
                            y = node_y,
                            mode = 'markers',
                            marker = dict(size=10,
                                          color=node_color,
                                          symbol=node_shape,
                                          line=dict(color="black", width=0.5)),
                            text = node_table,
                            textposition = 'top center',
                            hovertemplate = ' %{text}')

    return node_trace

In [18]:
#====================================================================================
# 1. Create Graph: one directed edge per row of the table
G = nx.from_pandas_edgelist(df,
                            source='Source Node Id',
                            target='Target Node Id',
                            edge_attr=['Input Table', 'Output Table', 'Step','Start Time','Elapsed Time'],
                            create_using=nx.DiGraph())

#====================================================================================
# 2. Add Attributes

# 2.1 Node type: empty by default, then overwritten for the nodes listed in node_attr
nx.set_node_attributes(G, values="", name="node_type")
nx.set_node_attributes(G, values=node_attr, name="node_type")

# 2.2 Table name and offset, both read from the node name "Step <n>: <table>"
#     (the offset is the step number plus one)
node_tables = {name: name.split(": ")[1] for name in G.nodes()}
node_offset = {name: int(name.split(" ")[1].replace(":", "")) + 1 for name in G.nodes()}
nx.set_node_attributes(G, values=node_tables, name="Table Name")
nx.set_node_attributes(G, values=node_offset, name="offset")

#====================================================================================
# 3. Drop the placeholder nodes that stand for "No Input"
remove = [name for name, table in G.nodes(data="Table Name") if table == 'No Input']
G.remove_nodes_from(remove)

#====================================================================================
# 4. Add coordinates
(G,
 edge_x_coords, edge_y_coords,
 edge_text_x_coords, edge_text_y_coords) = get_coordinates(G)

#====================================================================================
# 5. Create edge lines
edge_trace, edge_text_trace = get_edge_trace(G,
                                             edge_x_coords, edge_y_coords,
                                             edge_text_x_coords, edge_text_y_coords)

#====================================================================================
# 6. Create node markers
node_trace = get_node_trace(G)

#====================================================================================
# 7. Plot
# 7.1. Base: network on top, table below
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0,
                    specs=[[{"type": "scatter"}],
                           [{"type": "table"}]])

# 7.2. Network plot: edges first, then the nodes, then the hover markers of the edges
for single_edge in edge_trace:
    fig.add_trace(single_edge, row=1, col=1)
fig.add_trace(node_trace, row=1, col=1)
fig.add_trace(edge_text_trace, row=1, col=1)

# 7.3. Table plot
df_plot = df[['Step','Input Table','Output Table','Start Time','Elapsed Time']]
table_header = dict(values=list(df_plot.columns),
                    font=dict(size=10),
                    align="left")
table_cells = dict(values=[df_plot[column].tolist() for column in df_plot.columns],
                   font=dict(size=8),
                   align = "left")
fig.add_trace(go.Table(columnwidth=[1,6,6,4,2,2],
                       header=table_header,
                       cells=table_cells),
              row=2, col=1)

# 7.4. Plot correction: fixed size, no legend, no grid / zero line / tick labels
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
fig.update_layout(width=950, height=800,
                  showlegend=False,
                  title_text="SAS Code Process analysis",
                  xaxis=dict(hidden_axis),
                  yaxis=dict(hidden_axis))

fig.show()