In [23]:
import pandas as pd
import re
import numpy as np
import ast


In [24]:
def parse_log_file(file_path):
    """Extract process Start/End events from a log file into a DataFrame.

    A line is kept only when it has the shape
    ``<date> <time> - <level> - <message>`` and its message has the shape
    ``Start process-<digits>: <payload>`` or ``End process-<digits>: <payload>``.
    Every other line is ignored.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per kept line with columns ``Timestamp`` (the date and time
        fields joined by a single space), ``Message`` (the payload after the
        colon), ``Event`` (``'Start'`` or ``'End'``), ``ProcessID`` (the digits,
        kept as a string) and ``Level``. All values are strings.
    """
    line_pattern = re.compile(r'(\S+) (\S+) - (\S+) - (.*)')
    process_pattern = re.compile(r'(Start|End) process-(\d+): (.*)')

    rows = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            line_hit = line_pattern.match(raw_line)
            if line_hit is None:
                continue

            date, time, level, message = line_hit.groups()
            process_hit = process_pattern.match(message)
            if process_hit is None:
                # A well-formed log line that is not a process Start/End marker.
                continue

            event, process_id, payload = process_hit.groups()
            rows.append([f'{date} {time}', payload, event, process_id, level])

    return pd.DataFrame(rows, columns=['Timestamp', 'Message', 'Event', 'ProcessID', 'Level'])




In [25]:
# Parse the log file
log_file_path = 'app.log'
df = parse_log_file(log_file_path)

# Timestamps look like '2024-01-01 10:00:00,123' (comma before the fractional seconds).
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

# Group by the bare column label rather than a one-element list. With a list,
# the group key is a 1-tuple on pandas >= 2.0 but a plain string on older
# releases, where indexing the key with [0] would silently keep only the first
# character of the id. A scalar grouper yields the plain ProcessID everywhere.
grouped = df.groupby('ProcessID')


# Collect one record (dict) per process, split by outcome.
completed = []
incomplete = []
for process_id, group in grouped:
    start_row = group[group['Event'] == 'Start']
    end_row = group[group['Event'] == 'End']

    # End logged without any Start (e.g. the log begins mid-run): there is no
    # Start payload to parse and no duration to compute, so report the process
    # as incomplete with just its id instead of failing on an empty selection.
    if start_row.empty:
        incomplete.append({'ProcessID': process_id})
        continue

    # Incomplete Process: never ended, or its first End entry was logged as CRITICAL.
    if end_row.empty or end_row['Level'].values[0] == 'CRITICAL':
        # The Start payload is a Python dict literal; literal_eval parses it
        # without executing arbitrary code.
        di = ast.literal_eval(start_row['Message'].values[0])
        di['ProcessID'] = process_id
        incomplete.append(di)

    # Completed Process
    else:
        di = ast.literal_eval(end_row['Message'].values[0])
        start_time = start_row['Timestamp'].values[0]
        end_time = end_row['Timestamp'].values[0]
        duration = end_time - start_time
        di['ProcessID'] = process_id
        # Convert the numpy timedelta to fractional seconds.
        di['Duration'] = duration / np.timedelta64(1, 's')
        completed.append(di)



print('Complete Processes')
completed_df = pd.DataFrame(completed)
if completed:
    # Put the identifying columns first, then every parsed payload field in its original order.
    completed_col = ['ProcessID', 'Duration'] + [col for col in completed_df.columns if col not in ['ProcessID', 'Duration']]
    completed_df = completed_df[completed_col]
    display(completed_df.sort_values(by='Duration', ascending=False).head()) # Sorting in decreasing value of Duration.
else:
    # An empty record list produces a frame with no columns, so selecting
    # 'ProcessID'/'Duration' would raise KeyError; report the empty case instead.
    print('No completed processes found.')


if incomplete:
    print('Incomplete Processes')
    incomplete_df = pd.DataFrame(incomplete)
    incomplete_col = ['ProcessID'] + [col for col in incomplete_df.columns if col not in ['ProcessID']]
    incomplete_df = incomplete_df[incomplete_col]
    display(incomplete_df)




Complete Processes


Unnamed: 0,ProcessID,Duration,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,u,u_star,u_star_hat,z,optimal_cost,actual_cost,intervals_str
516,1462,0.052,6,1,0.666667,20,20,5,4.756871,0.766795,38.484388,0.123605,117.360582,110.593695,89.589507,1.234449,4.511258,18.51405,3.786094029914729_4.948222679433805_5.49283457...
2643,3377,0.046,4,3,0.066667,25,26,5,15.492691,6.225987,6.19209,2.502013,354.503525,257.350205,346.547692,0.742611,6.476888,0.530389,16.253607128067618_18.44068366333611_20.352194...
7219,7496,0.044,6,1,0.066667,25,19,5,4.651831,2.273663,4.185966,1.111292,103.226659,90.251438,66.246799,1.362352,0.865015,2.465324,4.8950438626482535_0.7264998110484501_6.339006...
2658,3390,0.044,2,1,0.666667,20,13,5,1.179015,0.269171,19.185894,0.061452,23.721069,18.764484,13.191394,1.422479,3.30439,7.019783,1.2168881104115095_1.1381010434572751_0.747755...
2656,3389,0.044,4,2,0.066667,25,13,5,5.471307,3.199741,2.923832,1.87128,91.067532,74.03904,46.50198,1.59217,1.135233,2.971037,6.088390856490961_3.165041784410251_10.7722073...
