In [1]:
import pandas as pd
import re
import numpy as np
import ast


In [2]:
def parse_log_file(file_path):
    """Extract process Start/End events from a log file into a DataFrame.

    A line is kept only if it has the shape
    ``<date> <time> - <level> - <message>`` and its message begins with
    ``Start process-<digits>: `` or ``End process-<digits>: ``. Every other
    line is ignored.

    Parameters
    ----------
    file_path : str or path-like
        Location of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per retained line, in file order, with columns
        ``Timestamp`` (date and time joined by a space, still a string),
        ``Message`` (text after the ``process-<id>: `` prefix),
        ``Event`` (``'Start'`` or ``'End'``), ``ProcessID`` (digits, as a
        string) and ``Level``.
    """
    line_pattern = re.compile(r'(\S+) (\S+) - (\S+) - (.*)')
    event_pattern = re.compile(r'(Start|End) process-(\d+): (.*)')

    rows = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            line_hit = line_pattern.match(raw_line)
            if line_hit is None:
                continue

            day, clock, severity, text = line_hit.groups()
            event_hit = event_pattern.match(text)
            if event_hit is None:
                continue

            kind, pid, payload = event_hit.groups()
            rows.append([f'{day} {clock}', payload, kind, pid, severity])

    return pd.DataFrame(
        rows,
        columns=['Timestamp', 'Message', 'Event', 'ProcessID', 'Level'],
    )




In [3]:
# Parse the log file into one row per Start/End event.
log_file_path = 'app.log'
df = parse_log_file(log_file_path)

# Log timestamps look like "2024-01-01 10:00:00,123" (comma before the fractional part).
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

# Group by the bare column name rather than a one-element list: the group key
# is then the ProcessID string itself on every pandas version. With a list key
# the loop variable is a 1-tuple only on pandas >= 2.0; on older releases it is
# the bare string, so indexing it with [0] would keep just its first character.
grouped = df.groupby('ProcessID')


# Collect one dict per process, split by whether the run finished cleanly.
completed = []
incomplete = []
for process_id, group in grouped:
    start_row = group[group['Event'] == 'Start']
    end_row = group[group['Event'] == 'End']

    # End without a matching Start: there is no parameter dict to parse and no
    # start time for a duration, so record only the ID instead of raising
    # IndexError below.
    if len(start_row) == 0:
        incomplete.append({'ProcessID': process_id})
        continue

    # Incomplete Process: never ended, or its (first) End line is CRITICAL.
    # The parameters are read from the Start message in this case.
    if len(end_row) == 0 or end_row['Level'].values[0] == 'CRITICAL':
        di = ast.literal_eval(start_row['Message'].values[0])
        di['ProcessID'] = process_id
        incomplete.append(di)

    # Completed Process: results come from the End message, and the duration
    # is the gap between the first Start and first End timestamps.
    else:
        di = ast.literal_eval(end_row['Message'].values[0])
        start_time = start_row['Timestamp'].values[0]
        end_time = end_row['Timestamp'].values[0]
        duration = end_time - start_time
        di['ProcessID'] = process_id
        di['Duration'] = duration / np.timedelta64(1, 's')  # seconds as float
        completed.append(di)

completed_df = pd.DataFrame(completed)

incomplete_df = pd.DataFrame(incomplete)


# Bringing useful columns ahead
completed_col = ['ProcessID', 'Duration'] + [col for col in completed_df.columns if col not in ['ProcessID', 'Duration']]
incomplete_col = ['ProcessID'] + [col for col in incomplete_df.columns if col not in ['ProcessID']]

# reindex (instead of [] selection) gives the same column order but also works
# when a list above is empty: the key columns are created rather than raising
# KeyError, so the sort/display below still runs.
completed_df = completed_df.reindex(columns=completed_col)
incomplete_df = incomplete_df.reindex(columns=incomplete_col)


display(completed_df.sort_values(by='Duration', ascending=False).head()) # Sorting in decreasing value of Duration.
display(incomplete_df)


Unnamed: 0,ProcessID,Duration,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,u,u_star,u_star_hat,z,optimal_cost,actual_cost,intervals_str
2739,3833,60.704,6,3,0.666667,20,38,5,22.357182,5.308577,18,1.260489,708.550683,732.52581,876.980805,0.835281,20.0,20.0,17.219624366438882_18.624689380598795_19.77137...
2988,4096,53.825,6,3,0.066667,20,33,5,17.948164,4.084701,19,0.929609,629.06998,530.506301,540.452741,0.981596,6.570912,5.907816,15.513492797009901_25.224155824611216_16.17341...
2945,4052,50.912,6,1,0.133333,30,33,5,6.461526,1.473297,19,0.335927,194.549502,171.029996,192.475443,0.888581,3.135934,0.276541,7.820770678865009_8.177909372753204_5.97547564...
3785,4923,43.993,3,2,0.133333,30,29,5,4.076601,0.933251,19,0.213648,188.742044,140.854108,105.862666,1.330536,6.385058,11.050584,4.207102962385784_3.642258176752745_2.84734291...
3779,4915,43.65,6,2,0.133333,20,27,5,12.137143,2.933823,17,0.709172,344.8977,288.484106,299.92577,0.961852,7.521812,5.996257,13.18106849497249_14.414493246506785_10.374737...


Unnamed: 0,ProcessID,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat
0,1017,3,2,0.066667,20,10,5,9.824121,11.155708,1,12.667782
1,1019,4,2,0.666667,25,6,5,7.860422,1.348280,34,0.231267
2,1023,3,1,0.066667,30,13,5,2.842072,0.398845,51,0.055972
3,1027,2,3,0.666667,20,37,5,7.357311,6.671186,1,6.049047
4,1032,3,2,0.066667,10,23,5,5.215839,1.122711,22,0.241664
...,...,...,...,...,...,...,...,...,...,...,...
662,962,3,1,0.666667,25,20,5,2.770321,2.287650,1,1.889075
663,966,2,3,0.066667,10,20,5,4.484889,0.552302,66,0.068015
664,973,2,1,0.133333,10,19,5,1.688653,1.466724,1,1.273962
665,984,6,4,0.666667,10,16,5,29.353810,5.116026,33,0.891664
