In [1]:
import pandas as pd
import re
import numpy as np
import ast


In [2]:
def parse_log_file(file_path):
    """Collect process Start/End events from a log file into a DataFrame.

    A line is kept only when it has the shape
    ``<date> <time> - <level> - <message>`` and the message itself begins with
    ``Start process-<digits>: `` or ``End process-<digits>: ``. All other
    lines are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per retained line with the string columns ``Timestamp``
        (date and time joined by a space), ``Message`` (the text after the
        ``process-<id>: `` prefix), ``Event`` (``'Start'`` or ``'End'``),
        ``ProcessID`` (the digits, as a string) and ``Level``.
    """
    line_pattern = re.compile(r'(\S+) (\S+) - (\S+) - (.*)')
    event_pattern = re.compile(r'(Start|End) process-(\d+): (.*)')
    column_names = ['Timestamp', 'Message', 'Event', 'ProcessID', 'Level']

    rows = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            line_hit = line_pattern.match(raw_line)
            if line_hit is None:
                continue  # not a "<date> <time> - <level> - <message>" line

            day, clock, severity, text = line_hit.groups()
            event_hit = event_pattern.match(text)
            if event_hit is None:
                continue  # a log line, but not a process Start/End marker

            kind, pid, payload = event_hit.groups()
            rows.append([f'{day} {clock}', payload, kind, pid, severity])

    return pd.DataFrame(rows, columns=column_names)




In [3]:
# Parse the log file
log_file_path = 'app.log'
df = parse_log_file(log_file_path)

# Timestamps in the log use a comma before the fractional seconds.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

# Group by 'ProcessID'
grouped = df.groupby(['ProcessID'])


# Accumulate one record (dict) per process; the DataFrames are built once after the loop.
completed = []
incomplete = []
for group_key, group in grouped:
    # Grouping by a one-element list yields 1-tuples on recent pandas but a bare
    # scalar on older releases, where indexing with [0] would keep only the
    # first character of the id. Accept both shapes.
    process_id = group_key[0] if isinstance(group_key, tuple) else group_key

    start_row = group[group['Event'] == 'Start']
    end_row = group[group['Event'] == 'End']

    # Incomplete Process: no Start entry, no End entry, or the first End entry
    # was logged at CRITICAL level.
    if start_row.empty or end_row.empty or end_row['Level'].values[0] == 'CRITICAL':
        # The Start message carries the parameters as a dict literal; without a
        # Start entry only the id can be reported.
        di = {} if start_row.empty else ast.literal_eval(start_row['Message'].values[0])
        di['ProcessID'] = process_id
        incomplete.append(di)

    # Completed Process
    else:
        # The End message carries the parameters plus results as a dict literal.
        di = ast.literal_eval(end_row['Message'].values[0])
        start_time = start_row['Timestamp'].values[0]
        end_time = end_row['Timestamp'].values[0]
        duration = end_time - start_time
        di['ProcessID'] = process_id
        # Convert the numpy timedelta into float seconds.
        di['Duration'] = duration / np.timedelta64(1, 's')
        completed.append(di)



print('Complete Processes')
completed_df = pd.DataFrame(completed)
completed_col = ['ProcessID', 'Duration'] + [col for col in completed_df.columns if col not in ['ProcessID', 'Duration']]
# reindex instead of [] selection: when no process completed the frame has no
# columns, and [] selection would raise KeyError instead of showing an empty table.
completed_df = completed_df.reindex(columns=completed_col)
display(completed_df.sort_values(by='Duration', ascending=False).head()) # Sorting in decreasing value of Duration.


if(len(incomplete) > 0):
    print('Incomplete Processes')
    incomplete_df = pd.DataFrame(incomplete)
    incomplete_col = ['ProcessID'] + [col for col in incomplete_df.columns if col not in ['ProcessID']]
    incomplete_df = incomplete_df[incomplete_col]
    display(incomplete_df)




Complete Processes


Unnamed: 0,ProcessID,Duration,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,u,u_star,u_star_hat,z,optimal_cost,actual_cost,intervals_str
3507,4154,34.858,6,1,0.666667,25,21,5,5.286232,0.264014,400.903367,0.013186,130.127204,114.600261,108.300068,1.058173,10.351295,14.551424,5.433644726054029_4.946400713294464_5.27568703...
1370,2230,22.51,6,3,0.133333,20,25,5,15.821942,0.972216,264.846354,0.05974,458.148735,407.460086,384.671022,1.059243,6.758487,9.797028,15.99338100143342_15.723829195521585_14.243422...
3627,4262,18.65,6,3,0.666667,25,20,5,18.258698,1.11793,266.753908,0.068448,284.090882,363.251958,357.541298,1.015972,25.0,25.0,19.776614374341406_18.77884494763773_17.210625...
1164,2045,15.259,6,4,0.066667,30,23,5,20.81812,1.311042,252.144742,0.082564,528.78153,473.683558,462.629202,1.023895,3.673198,4.410155,22.05830588662582_21.47413913530587_21.6735784...
7631,7867,15.247,6,2,0.066667,10,27,5,8.422151,0.580157,210.743551,0.039964,329.117624,288.484106,220.059885,1.310935,2.708901,7.270516,9.12470907033516_8.753671810180096_7.749944854...


Incomplete Processes


Unnamed: 0,ProcessID,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat
0,9030,3,3,0.666667,10,39,5,9.637513,14.304453,0.453928,21.231347
