In [4]:
import pandas as pd
import re
import numpy as np
import ast


In [5]:
def parse_log_file(file_path):
    """Extract process Start/End events from a log file into a DataFrame.

    A line is kept only when it has the shape
    ``<date> <time> - <level> - <message>`` and its message has the shape
    ``Start process-<id>: <payload>`` or ``End process-<id>: <payload>``.
    Every other line is skipped.

    Parameters
    ----------
    file_path : str or path-like
        Path of the log file; it is opened in text mode for reading.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, in file order, with the columns
        ``Timestamp`` (date and time joined by a single space, as a string),
        ``Message`` (the payload after ``process-<id>: ``),
        ``Event`` (``'Start'`` or ``'End'``),
        ``ProcessID`` (the digits of the id, kept as a string) and
        ``Level``. The frame is empty (but has these columns) when no line
        matches.
    """
    # Both patterns are loop-invariant, so compile them once up front.
    line_pattern = re.compile(r'(\S+) (\S+) - (\S+) - (.*)')
    process_pattern = re.compile(r'(Start|End) process-(\d+): (.*)')

    log_entries = []
    with open(file_path, 'r') as file:
        # Iterate the handle directly instead of readlines(): same lines,
        # but the whole log never has to sit in memory at once.
        for line in file:
            match = line_pattern.match(line)
            if not match:
                continue
            date, time, level, message = match.groups()

            process_match = process_pattern.match(message)
            if not process_match:
                continue
            event, process_id, payload = process_match.groups()
            log_entries.append([date + ' ' + time, payload, event, process_id, level])

    return pd.DataFrame(log_entries, columns=['Timestamp', 'Message', 'Event', 'ProcessID', 'Level'])




In [6]:
# Parse the log file
log_file_path = '../data/app.log'
df = parse_log_file(log_file_path)

# The format string expects a comma before the fractional seconds.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

# Group by the scalar column name rather than a one-element list: the group
# key is then the ProcessID itself on every pandas version. With a list key,
# pandas >= 2.0 yields 1-tuples while older versions yield the bare value, so
# indexing the key with [0] would keep only the first character there.
grouped = df.groupby('ProcessID')


# Collect one dict per process; the DataFrames are built once after the loop.
completed = []
incomplete = []
missing_start = []
for process_id, group in grouped:
    start_row = group[group['Event'] == 'Start']
    end_row = group[group['Event'] == 'End']

    # An End line without any Start line gives neither a start time nor a
    # start payload to report, so record the id and move on instead of
    # failing on .values[0] of an empty selection.
    if start_row.empty:
        missing_start.append(process_id)
        continue

    # Incomplete Process: never ended, or its first End line was logged at CRITICAL.
    if(len(end_row) == 0 or end_row['Level'].values[0] == 'CRITICAL'):
        # The payload is parsed as a Python literal and used as a dict below.
        di = ast.literal_eval(start_row['Message'].values[0])
        di['ProcessID'] = process_id
        incomplete.append(di)

    # Completed Process
    else:
        di = ast.literal_eval(end_row['Message'].values[0])
        # Only the first Start and first End row of the group are used.
        start_time = start_row['Timestamp'].values[0]
        end_time = end_row['Timestamp'].values[0]
        duration = end_time - start_time
        di['ProcessID'] = process_id
        # Dividing by a one-second timedelta converts the duration to float seconds.
        di['Duration'] = duration / np.timedelta64(1, 's')
        completed.append(di)


if(len(missing_start) > 0):
    print('Skipped processes with an End entry but no Start entry:', missing_start)

print('Complete Processes')
completed_df = pd.DataFrame(completed)
# With no completed process the frame has no columns at all, so selecting or
# sorting by 'ProcessID'/'Duration' would raise KeyError; guard it the same
# way the incomplete table is guarded.
if(len(completed) > 0):
    completed_col = ['ProcessID', 'Duration'] + [col for col in completed_df.columns if col not in ['ProcessID', 'Duration']]
    completed_df = completed_df[completed_col]
    display(completed_df.sort_values(by='Duration', ascending=False).head()) # Sorting in decreasing value of Duration.


if(len(incomplete) > 0):
    print('Incomplete Processes')
    incomplete_df = pd.DataFrame(incomplete)
    incomplete_col = ['ProcessID'] + [col for col in incomplete_df.columns if col not in ['ProcessID']]
    incomplete_df = incomplete_df[incomplete_col]
    display(incomplete_df)




Complete Processes


Unnamed: 0,ProcessID,Duration,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str,u,u_star,u_star_hat,z,optimal_cost,actual_cost
1943,2747,128.626,7,1.5,0.4,25,36,5,11.01842,0.531576,429.643263,0.025646,10.782755125109665_11.07029820398264_11.875847...,403.541753,358.807132,390.172434,0.919612,17.893848,5.347728
4458,511,112.676,7,2.0,0.05,15,35,5,13.98848,0.684816,417.247162,0.033526,13.164449274425104_13.735692606013847_14.71797...,493.416607,437.792072,479.089459,0.9138,2.781227,0.716357
3795,4414,79.29,7,1.0,0.05,20,37,5,7.719874,0.40109,370.456982,0.020839,7.635586176821438_7.955491704341508_7.07521850...,273.250922,224.997663,278.616425,0.807553,2.412663,20.0
2066,2858,66.448,7,1.0,0.05,30,45,5,7.999946,0.517747,238.747464,0.033508,7.894474522504195_8.88636074937826_7.948207297...,297.258802,275.213271,349.951212,0.786433,1.102277,30.0
1376,2236,60.349,2,2.0,0.1,30,40,5,3.277287,0.192101,291.051502,0.01126,3.521486231915436_3.2746549807957943_3.1676712...,145.0965,125.624812,127.433078,0.98581,1.947169,1.766342


Incomplete Processes


Unnamed: 0,ProcessID,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,intervals_str
0,3071,3,1,0.05,30,43,5,2.553718,0.111179,527.598917,0.00484,2.570858789336957_2.4564096334856984_2.7347184...
