In [2]:
import pandas as pd
import glob

In [20]:
# Collect every structured-log CSV and every template CSV found one directory
# below the working directory, each list in sorted order.
structured_paths = glob.glob('./*/*_structured.csv')
structured_paths.sort()
templates_paths = glob.glob('./*/*_templates.csv')
templates_paths.sort()

# Drop these two template files from the list (presumably so that only one
# template file per dataset remains -- TODO confirm). Both calls raise
# ValueError if the path is absent. The last expression is the removed BGL
# path, which the notebook displays as the cell output.
templates_paths.remove('./HDFS/HDFS_templates.csv')
templates_paths.pop(templates_paths.index('./BGL/BGL_templates.csv'))

'./BGL/BGL_templates.csv'

In [30]:
# Pair each structured-log CSV with its template CSV by position in the two
# sorted lists. zip() silently truncates to the shorter list, so a missing or
# extra file would mis-pair every later dataset; fail loudly instead.
if len(structured_paths) != len(templates_paths):
    raise ValueError(
        f'{len(structured_paths)} structured files but '
        f'{len(templates_paths)} template files; cannot pair them by position'
    )

paths = list(zip(structured_paths, templates_paths))

# Both members of a pair must share everything before their suffix
# (e.g. './Android/Android_2k.log'), otherwise they belong to different logs.
mismatched_pairs = [
    (structured_path, template_path)
    for structured_path, template_path in paths
    if structured_path[:-len('_structured.csv')] != template_path[:-len('_templates.csv')]
]
if mismatched_pairs:
    raise ValueError(f'structured/template files do not match: {mismatched_pairs}')

paths

[('./Android/Android_2k.log_structured.csv',
  './Android/Android_2k.log_templates.csv'),
 ('./Apache/Apache_2k.log_structured.csv',
  './Apache/Apache_2k.log_templates.csv'),
 ('./BGL/BGL_2k.log_structured.csv', './BGL/BGL_2k.log_templates.csv'),
 ('./HDFS/HDFS_2k.log_structured.csv', './HDFS/HDFS_2k.log_templates.csv'),
 ('./HPC/HPC_2k.log_structured.csv', './HPC/HPC_2k.log_templates.csv'),
 ('./Hadoop/Hadoop_2k.log_structured.csv',
  './Hadoop/Hadoop_2k.log_templates.csv'),
 ('./HealthApp/HealthApp_2k.log_structured.csv',
  './HealthApp/HealthApp_2k.log_templates.csv'),
 ('./Linux/Linux_2k.log_structured.csv', './Linux/Linux_2k.log_templates.csv'),
 ('./Mac/Mac_2k.log_structured.csv', './Mac/Mac_2k.log_templates.csv'),
 ('./OpenSSH/OpenSSH_2k.log_structured.csv',
  './OpenSSH/OpenSSH_2k.log_templates.csv'),
 ('./OpenStack/OpenStack_2k.log_structured.csv',
  './OpenStack/OpenStack_2k.log_templates.csv'),
 ('./Proxifier/Proxifier_2k.log_structured.csv',
  './Proxifier/Proxifier_2k.log

In [31]:
# Read the templates CSV of the second-to-last (structured, templates) pair
# so it can be inspected on its own.
template1_path = paths[-2][1]
template1 = pd.read_csv(template1_path)
template1

Unnamed: 0,EventId,EventTemplate
0,E1,<*> Created NT transaction (seq <*>) result <*...
1,E2,"<*> Creating NT transaction (seq <*>), objectn..."
2,E3,<*> CSI perf trace:
3,E4,<*> CSI Store <*> (<*>) initialized
4,E5,<*> IAdvancedInstallerAwareStore_ResolvePendin...
5,E6,<*> ICSITransaction::Commit calling IStorePend...
6,E7,<*> Performing <*> operations; <*> are not loc...
7,E8,<*> Store coherency cookie matches last scaven...
8,E9,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> CSI Transactio...
9,E10,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> CSI Transactio...


In [None]:
def count_variables(eventTemplate):
    """Count the variables in a log event template.

    Every '<*>' placeholder counts as one variable, except that a group of
    placeholders matching one of the timestamp patterns listed below is
    counted as a single variable.

    Args:
        eventTemplate (str): template text in which variables appear as '<*>'.

    Returns:
        int: number of variables after collapsing each matched timestamp
        pattern to one variable.
    """
    timestamp_patterns = [
        '<*>/<*>/<*>:<*>:<*>:<*>.<*>',
        'time=<*>,<*>,<*>,<*>,<*>,',
        '#<*> <*>:<*>:<*> EDT <*>',
        'seconds at <*>:<*>:<*> <*>',
        '[<*>:<*>:<*>.<*>]',
        '<*>:<*>:<*>.<*>',
        '<*>/<*>/<*> <*>:<*>:<*> ak Exp',
        'delay=<*>:<*>:<*>, xdelay=<*>:<*>:<*>'
    ]

    # Longest pattern first, so a long pattern is consumed before any shorter
    # pattern it contains (e.g. '[<*>:<*>:<*>.<*>]' before '<*>:<*>:<*>.<*>').
    timestamp_patterns.sort(key=len, reverse=True)

    # A matched pattern is swapped for this marker instead of being deleted:
    # deleting would splice the text on either side together, and the spliced
    # text could spuriously match a shorter pattern and be discounted again.
    consumed_marker = '\x00'

    occurrences = eventTemplate.count('<*>')

    for pattern in timestamp_patterns:
        matches = eventTemplate.count(pattern)
        if matches == 0:
            continue
        # Each match contributed pattern.count('<*>') placeholders to the
        # total but represents exactly one variable.
        occurrences -= matches * (pattern.count('<*>') - 1)
        eventTemplate = eventTemplate.replace(pattern, consumed_marker)

    return occurrences


In [41]:
# Read the structured-log CSV of the second-to-last (structured, templates)
# pair so it can be inspected on its own.
structure1_path = paths[-2][0]
structure1 = pd.read_csv(structure1_path)
structure1

Unnamed: 0,LineId,Date,Time,Level,Component,Content,EventId,EventTemplate
0,1,2016-09-28,04:30:30,Info,CBS,Loaded Servicing Stack v6.1.7601.23505 with Co...,E23,Loaded Servicing Stack <*> with Core: <*>\cbsc...
1,2,2016-09-28,04:30:31,Info,CSI,00000001@2016/9/27:20:30:31.455 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
2,3,2016-09-28,04:30:31,Info,CSI,00000002@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
3,4,2016-09-28,04:30:31,Info,CSI,00000003@2016/9/27:20:30:31.458 WcpInitialize ...,E13,<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize ...
4,5,2016-09-28,04:30:31,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization.
...,...,...,...,...,...,...,...,...
1995,1996,2016-09-29,02:04:40,Info,CBS,Read out cached package applicability for pack...,E29,Read out cached package applicability for pack...
1996,1997,2016-09-29,02:04:40,Info,CBS,Session: 30546354_3360174372 initialized by cl...,E36,Session: <*>_<*> initialized by client Windows...
1997,1998,2016-09-29,02:04:40,Info,CBS,Read out cached package applicability for pack...,E29,Read out cached package applicability for pack...
1998,1999,2016-09-29,02:04:40,Info,CBS,Session: 30546354_3363894584 initialized by cl...,E36,Session: <*>_<*> initialized by client Windows...


In [42]:
# Tally how many rows of the structured data carry each EventId.
event_id_column = structure1['EventId']
event_counts = event_id_column.value_counts()
event_counts

EventId
E36    608
E29    558
E50    280
E20    224
E18    224
E21     18
E24      8
E49      8
E13      6
E2       3
E1       3
E3       3
E4       3
E39      2
E37      2
E17      2
E38      2
E19      2
E44      2
E48      2
E40      2
E23      2
E42      2
E27      2
E5       2
E26      2
E25      2
E43      2
E47      2
E41      2
E28      1
E16      1
E46      1
E12      1
E14      1
E11      1
E15      1
E35      1
E45      1
E9       1
E22      1
E31      1
E33      1
E6       1
E8       1
E7       1
E32      1
E10      1
E30      1
E34      1
Name: count, dtype: int64

In [39]:
# Add a 'variables' column holding count_variables() of each EventTemplate.
template1_variable_counts = template1['EventTemplate'].apply(count_variables)
template1['variables'] = template1_variable_counts

In [43]:
# Join each EventId's row count with the variable count of its template.
# The frame is built directly from structure1 rather than by reassigning
# event_counts in place, so re-running this cell gives the same result
# instead of failing on an event_counts that is already a DataFrame.
event_counts_frame = structure1['EventId'].value_counts().to_frame().reset_index()
event_counts_frame.columns = ['EventId', 'count']
event_counts = event_counts_frame.merge(template1[['EventId', 'variables']], on='EventId')
event_counts

Unnamed: 0,EventId,count,variables
0,E36,608,2
1,E29,558,3
2,E50,280,0
3,E20,224,1
4,E18,224,1
5,E21,18,1
6,E24,8,3
7,E49,8,1
8,E13,6,4
9,E2,3,3


In [44]:
# Build one row per (dataset, EventId) holding how many log lines use the
# template and how many variables the template has.
# The per-dataset frames go into their own list so that
# loghub_variable_instance_counts is only ever bound to the final DataFrame.
per_dataset_counts = []

for structure_path, template_path in paths:
    structure = pd.read_csv(structure_path)
    template = pd.read_csv(template_path)
    # Paths come from the './*/*_structured.csv' glob, so component 1 of the
    # '/'-split path is the dataset directory name.
    dataset = structure_path.split('/')[1]

    # Count the number of variables in each event template
    template['unique_variables'] = template['EventTemplate'].apply(count_variables)

    # Count the rows per EventId in the structured data
    event_counts = structure['EventId'].value_counts()
    event_counts = event_counts.to_frame().reset_index()
    event_counts.columns = ['EventId', 'instance_count']
    # Default (inner) merge: EventIds absent from the template file are dropped.
    event_counts = event_counts.merge(template[['EventId', 'unique_variables']], on='EventId')
    event_counts['dataset'] = dataset

    per_dataset_counts.append(event_counts)

# ignore_index=True gives the combined frame one unique 0..n-1 index; without
# it every dataset restarts at 0 and row labels are duplicated.
loghub_variable_instance_counts = pd.concat(per_dataset_counts, ignore_index=True)
loghub_variable_instance_counts

Unnamed: 0,EventId,instance_count,unique_variables,dataset
0,E126,200,15,Android
1,E103,199,8,Android
2,E123,181,0,Android
3,E65,85,1,Android
4,E21,85,2,Android
...,...,...,...,...
45,E21,1,7,Zookeeper
46,E30,1,2,Zookeeper
47,E48,1,1,Zookeeper
48,E35,1,5,Zookeeper


In [46]:
# Store instance_count divided by 2000 as 'percentage_total' (a fraction, not
# a value scaled to 100). 2000 is presumably the number of log lines in each
# dataset -- TODO confirm.
instance_count_divisor = 2000
loghub_variable_instance_counts['percentage_total'] = (
    loghub_variable_instance_counts['instance_count'] / instance_count_divisor
)

In [47]:
# Write the per-template counts to CSV, leaving the DataFrame index out of the file.
instance_counts_csv = 'loghub_variable_instance_counts.csv'
loghub_variable_instance_counts.to_csv(instance_counts_csv, index=False)

In [64]:
def generate_dataset_summary_table(df):
    """Summarise the per-template variable counts of each dataset.

    Args:
        df: DataFrame with 'dataset', 'EventId' and 'unique_variables' columns.

    Returns:
        DataFrame with one row per dataset and the columns 'Dataset',
        'Number of Templates', 'Mean', 'Median', 'Minimum', 'Maximum' and
        'Standard Deviation'. Mean and standard deviation are rounded to two
        decimals; the other statistics are left as computed.
    """
    # Named aggregations: output column -> (input column, statistic).
    aggregations = {
        'num_templates': ('EventId', 'nunique'),
        'avg_vars': ('unique_variables', 'mean'),
        'median_vars': ('unique_variables', 'median'),
        'min_vars': ('unique_variables', 'min'),
        'max_vars': ('unique_variables', 'max'),
        'std_vars': ('unique_variables', 'std'),
    }
    per_dataset = df.groupby('dataset').agg(**aggregations).reset_index()

    for rounded_column in ('avg_vars', 'std_vars'):
        per_dataset[rounded_column] = per_dataset[rounded_column].round(2)

    # Presentation headers, in the same order as the aggregated columns.
    presentation_headers = [
        'Dataset',
        'Number of Templates',
        'Mean',
        'Median',
        'Minimum',
        'Maximum',
        'Standard Deviation',
    ]
    per_dataset.columns = presentation_headers

    return per_dataset

In [67]:
# Build the per-dataset summary from the combined counts and display it.
summary_table = generate_dataset_summary_table(df=loghub_variable_instance_counts)
summary_table

Unnamed: 0,Dataset,Number of Templates,Mean,Median,Minimum,Maximum,Standard Deviation
0,Android,166,2.14,1.0,0,20,3.1
1,Apache,6,1.5,1.5,1,2,0.55
2,BGL,120,2.18,1.0,0,25,2.85
3,HDFS,14,3.64,3.5,1,6,1.34
4,HPC,46,1.24,1.0,0,9,1.8
5,Hadoop,114,1.75,1.0,0,12,2.24
6,HealthApp,75,1.32,1.0,0,8,1.72
7,Linux,118,1.19,1.0,0,13,1.65
8,Mac,341,3.06,1.0,0,36,4.69
9,OpenSSH,27,2.07,2.0,0,4,1.27


In [68]:
# Write the summary table to CSV, leaving the DataFrame index out of the file.
summary_csv = 'loghub_variable_instance_summary.csv'
summary_table.to_csv(summary_csv, index=False)

In [70]:
# Write the summary table as a LaTeX table (no index column) with its caption
# and cross-reference label.
summary_table.to_latex(
    'loghub_variable_instance_summary.tex',
    index=False,
    caption='Summary of the number of variables in the LogHub datasets',
    label='tab:loghub_variable_instance_summary',
)