In [175]:
import pandas as pd
import re
import numpy as np
import ast


In [176]:
def parse_log_file(file_path):
    """Extract process Start/End events from a log file into a DataFrame.

    A line is kept only if it has the shape
    ``<date> <time> - <level> - <message>`` and its message has the shape
    ``Start process-<digits>: <payload>`` or ``End process-<digits>: <payload>``.
    All other lines are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per retained line with the columns ``Timestamp``
        (``"<date> <time>"`` as a string), ``Message`` (the payload after the
        colon), ``Event`` (``"Start"`` or ``"End"``), ``ProcessID`` (the digits
        as a string) and ``Level``.
    """
    line_pattern = re.compile(r'(\S+) (\S+) - (\S+) - (.*)')
    event_pattern = re.compile(r'(Start|End) process-(\d+): (.*)')

    rows = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            header = line_pattern.match(raw_line)
            if header is None:
                continue
            day, clock, severity, body = header.groups()

            detail = event_pattern.match(body)
            if detail is None:
                # Well-formed log line, but not a process Start/End event.
                continue
            kind, pid, payload = detail.groups()

            rows.append([day + ' ' + clock, payload, kind, pid, severity])

    return pd.DataFrame(rows, columns=['Timestamp', 'Message', 'Event', 'ProcessID', 'Level'])




In [177]:
# Parse the log file
log_file_path = 'app.log'
df = parse_log_file(log_file_path)

df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S,%f')

# Group by the scalar key 'ProcessID' (not the one-element list ['ProcessID']).
# With a list, pandas >= 2.0 yields 1-tuple keys while older versions yield the
# bare string, so indexing the key with [0] would silently return only the first
# character of the id on older versions. A scalar key is the plain id everywhere.
grouped = df.groupby('ProcessID')


# Collect one dict per process, split by whether the process finished cleanly
completed = []
incomplete = []
for process_id, group in grouped:
    start_row = group[group['Event'] == 'Start']
    end_row = group[group['Event'] == 'End']

    # An End without a matching Start (e.g. a truncated log) has neither start
    # parameters nor a start time to report, so skip it instead of raising
    # IndexError on the empty selection below.
    if start_row.empty:
        continue

    # Incomplete Process: never ended, or ended with a CRITICAL-level line.
    # Its parameters come from the Start message.
    if len(end_row) == 0 or end_row['Level'].values[0] == 'CRITICAL':
        di = ast.literal_eval(start_row['Message'].values[0])
        di['ProcessID'] = process_id
        incomplete.append(di)

    # Completed Process: results come from the End message.
    else:
        di = ast.literal_eval(end_row['Message'].values[0])
        start_time = start_row['Timestamp'].values[0]
        end_time = end_row['Timestamp'].values[0]
        duration = end_time - start_time
        di['ProcessID'] = process_id
        # Convert the numpy timedelta to float seconds
        di['Duration'] = duration / np.timedelta64(1, 's')
        completed.append(di)

completed_df = pd.DataFrame(completed)

incomplete_df = pd.DataFrame(incomplete)


# Bringing useful columns ahead
completed_col = ['ProcessID', 'Duration'] + [col for col in completed_df.columns if col not in ['ProcessID', 'Duration']]
incomplete_col = ['ProcessID'] + [col for col in incomplete_df.columns if col not in ['ProcessID']]

# reindex (rather than df[cols]) so that an empty result, e.g. no incomplete
# processes at all, still yields a frame with the leading columns instead of
# raising KeyError.
completed_df = completed_df.reindex(columns=completed_col)
incomplete_df = incomplete_df.reindex(columns=incomplete_col)


display(completed_df.sort_values(by='Duration', ascending=False).head()) # Sorting in decreasing value of Duration.
display(incomplete_df)


Unnamed: 0,ProcessID,Duration,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat,u,u_star,u_star_hat,z,optimal_cost,actual_cost,intervals_str
36,42,109.528,4,3,0.666667,25,20,5,6.861675,0.573692,143,0.047965,215.632455,235.051991,132.334409,1.776197,25.0,55.532031,7.012735210487592_6.747495397130317_6.73981351...
29,35,9.192,3,1,0.066667,30,38,5,2.939091,0.822825,13,0.230357,118.582517,89.548564,100.456653,0.891415,1.935597,1.208391,2.1040952836412172_3.2623916754081517_2.155130...
30,36,6.021,2,2,0.666667,20,39,5,3.197452,1.09575,9,0.375508,154.116409,148.658515,123.530085,1.20342,3.638596,20.390883,2.055282084289818_2.840721153687751_4.81828648...
22,29,5.029,4,3,0.066667,20,26,5,7.81155,2.241046,12,0.642931,372.540581,260.664033,176.272384,1.478757,7.458437,13.084546,4.329763625013726_8.553691911833468_10.4512300...
0,0,4.668,4,3,0.133333,20,36,5,15.169167,5.80084,7,2.218298,410.476939,389.783183,517.482988,0.753229,2.759167,20.0,14.632774942608911_7.559445848295129_21.668558...


Unnamed: 0,ProcessID,alpha,beta,h,c,N,n,mean_n,std_n,alpha_hat,beta_hat
0,4,2,2,0.066667,30,22,5,3.368485,4.372408,1,5.675536
