In [1]:
import time
import os
import logging
from pathlib import Path
import datetime
from config import config
from gpt_model import get_completion_from_gpt
from claude import get_completion_from_claude
from format_output import Format_output
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Set the ROOT_DIR to your repository root.
ROOT_DIR = os.path.dirname(os.path.abspath(''))
# Set the DATA_DIR to the directory where your data resides.
DATA_DIR = os.path.join(ROOT_DIR, 'data/loghub_2k')

In [3]:
save_dir_path = os.path.join(ROOT_DIR, 'results')

now_time = datetime.datetime.now()
date_string = "Semantic_" + now_time.strftime('%Y-%m-%d-%H-%M-%S')
save_dir_separator = "Semantic_" + now_time.strftime('%Y%m%d%H%M%S')

save_dir_now = os.path.join(save_dir_path, save_dir_separator)
raw_save_dir = os.path.join(save_dir_now, "semantic_raw_results/")
Path(raw_save_dir).mkdir(parents=True, exist_ok=True)
semantic_template_file_name = 'semantic_output.txt'
variables_output_file_name = 'variables_output.txt'
semantic_template_output_file_path = raw_save_dir + semantic_template_file_name
variables_output_file_path = raw_save_dir + variables_output_file_name


In [4]:
# File paths
ground_truth_file_path = os.path.join(DATA_DIR, "sample_ground_truth_template.csv")
raw_log_file_path = os.path.join(DATA_DIR, "sample_combined_raw_logs.txt")

In [5]:
# Load ground truth data
ground_truth_df = pd.read_csv(ground_truth_file_path)
ground_truth_log_templates = ground_truth_df['EventTemplate'].tolist()
ground_truth_variable_templates = ground_truth_df['VariableTemplate'].tolist()
ground_truth_systems = ground_truth_df['System'].tolist()

In [6]:
# Load raw log messages
with open(raw_log_file_path, 'r') as raw_file:
    raw_logs = [line.strip() for line in raw_file.readlines()]


In [7]:
# Step 1: Reformulate log messages with semantic understanding
counter=0
enhanced_prompts = []
enhanced_prompts_file_path = os.path.join(save_dir_now, "enhanced_prompts.txt")
for raw_log in raw_logs:
    new_prompt=f"""You are provided with a log message. Your task is to understand and extract the meaning behind the semi-structured log message.
                      
                    Log message: {raw_log}. 

                    A log message usually contains a header that is automatically produced by the logging framework, including information such as timestamp, class, and logging level (INFO, DEBUG, WARN etc.).
                    Ignore all these details and just understand meaning behind the natural languagae text which is in the log content.

                    The log content typically consists of many parts: 
                    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
                    2. Parameters/Variables - dynamic variables, which reflect specific runtime status;

                    Please capture the essential context and meaning from the log message to understand the reasoning behind each raw log.
                    Provide only the meaning in 20-25 words from each log message surrounded by <TPL> and </TPL>. 
                    Never provide an explanation of how the meaning is constructed.
                """
    
    enhanced_prompt = get_completion_from_gpt(new_prompt)
    enhanced_prompts.append(enhanced_prompt)
    if counter % 10 == 0:
        print(f'{counter}: {enhanced_prompt}')
    counter+=1    


0: <TPL> Initialization of worker environment successful for workers2 properties file at /etc/httpd/conf/workers2.properties. </TPL>
20: <TPL> Command has been aborted. </TPL>
30: <TPL> CPU has an L2 cache size of 256K. </TPL>
40: <TPL> Connection to tcpconn4.tencent.com:80 through proxy proxy.cse.cuhk.edu.hk:5070 failed due to unexpected closure by the proxy server. </TPL>
50: <TPL> autopurge.snapRetainCount set to 3 </TPL>
60: <TPL>RAS KERNEL FATAL disable store gathering</TPL>
70: <TPL> Z coordinate exceeds physical dimension in node map file. </TPL>
80: <TPL> Shuffle port for a specific task attempt in a Hadoop MapReduce job is assigned the port number 13562. </TPL>
90: <TPL>Path not allowed in target domain for Safari SearchHelper service due to missing bundle service in requestor's bundle.</TPL>
100: <TPL> Display woke up notification posted by WindowServer. </TPL>
110: <TPL> insertHiHealthData() bulkSaveDetailHiHealthData fail </TPL>

<TPL> errorCode = 4, errorMessage = ERR_DATA

In [8]:
print(len(enhanced_prompts))

193


In [9]:
# save and format output data in a csv file
Format_output.save_raw_output(enhanced_prompts_file_path, enhanced_prompts)

# Save all semantic log templates to a file
print(f"Semantic log templates saved to: {enhanced_prompts_file_path}")

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241203153952/enhanced_prompts.txt


In [10]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(enhanced_prompts_file_path, enhanced_prompts_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241203153952/enhanced_prompts.txt


In [11]:
# Step 2: Generate log template using zero-shot learning
counter_1 = 0
semantic_based_templates = []
for raw_log, enhanced_prompt in zip(raw_logs, enhanced_prompts):
    semantic_prompt = f"""You will be provided with a log message delimited by <MSG> and </MSG>. 
    You are also provided with the meaning or understanding from the log message as follow: {enhanced_prompt}. 
    
    The log message typically consists of two parts: 
    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
    2. Parameters/Variables - dynamic variables, which reflect specific runtime status.
    You must identify and abstract all the dynamic variables in the log message with suitable placeholders inside angle brackets to extract the corresponding template.
    You must output the template corresponding to the log message. Print only the input log's template surrounded by <TPL> and </TPL>. 
    Never print an explanation of how the template is constructed.
    Here are a few examples of log messages (labeled with Q:) and corresponding templates (labeled with A:):

    Q: <MSG>[081109 204453 34 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_2377150260128098806 size 67108864]</MSG>
    A: <TPL>[BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to <*> size <*>]</TPL>

    Q: <MSG>- 1129734520 2005.10.19 R17-M0-N0-I:J18-U01 2005-10-19-08.08.40.058960 R17-M0-N0-I:J18-U01 RAS KERNEL INFO shutdown complete</MSG>
    A: <TPL>shutdown complete</TPL>

    Q: <MSG>20231114T101914E ERROR 14 while processing line 123: cannot find input '42'</MSG>
    A: <TPL>ERROR <*> while processing line <*>: cannot find input <*></TPL>

    Q: <MSG>2023-01-14 23:05:14 INFO: Reading data from /user/input/file.txt</MSG>
    A: <TPL>Reading data from <*> </TPL>
    Here is the input log message: <MSG>{raw_log}</MSG>
    Please print the corresponding template.
    """
    response = get_completion_from_gpt(semantic_prompt)
    semantic_based_templates.append(response)
    
    if counter_1 % 10 == 0:
        print(f'{counter_1}: {response}')
        
    counter_1+=1   
   



0: <TPL>workerEnv.init() ok <*> </TPL>
20: <TPL>Command has been aborted</TPL>
30: <TPL>CPU: L2 cache: <*> </TPL>
40: <TPL>Could not connect through proxy <*> - Proxy closed the connection unexpectedly.</TPL>
50: <TPL>autopurge.snapRetainCount set to <*> </TPL>
60: <TPL>FATAL disable store gathering</TPL>
70: <TPL>ciod: Z coordinate <*> exceeds physical dimension <*> at line <*> of node map file <*> </TPL>
80: <TPL>Shuffle port returned by ContainerManager for attempt_<*>_<*>_<*>_<*> : 13562</TPL>
90: <TPL>Path not allowed in target domain for Safari SearchHelper service due to missing bundle service in requestor's bundle.</TPL>
100: <TPL>CGXDisplayDidWakeNotification [*]: posting kCGSDisplayDidWake</TPL>
110: <TPL>insertHiHealthData() bulkSaveDetailHiHealthData fail errorCode = <*> ,errorMessage = <*></TPL>
120: <TPL>Closed due to user request.</TPL>
130: <TPL>5 more authentication failures; logname= uid=0 euid=0 tty=ssh ruser= rhost=<*>.net.om  user=root</TPL>
140: <TPL>Reading broad

In [12]:
counter_1 = 0
variable_from_logs = []
for raw_log, enhanced_prompt in zip(raw_logs, enhanced_prompts):
    semantic_prompt = f"""You will be provided with a log message delimited by <MSG> and </MSG>. 
    You are also provided with the meaning or understanding from the log message as follow: {enhanced_prompt}. 
    
    I want you to categorize the variable(s) in each log message as variable template. 
    The variable should be classified within the category as below:
    1. Object ID [OID]	Identification information of an object
    2. Location Indicator   [LOI]	Location information of an object
    3. Object Name	[OBN]	Name of an object
    4.Type Indicator	[TID]	Type information of an object or an action
    5. Switch Indicator	[SID]	Status of a switch variable
    6. Time or Duration of an Action	[TDA]	Time or duration of an action
    7. Computing Resources	[CRS]	Information of computing resource
    8. Object Amount	[OBA]	Amount of an object
    9. Status Code	[STC]	Status code of an object or an action
    10. Other Parameters	[OTP]	Other information does not belong to the above categories]. 
    


    Static words/parts of the log message are to be annotated with O strictly
    Organize your variable template within <TPL></TPL> in the following format for each Q: 

    Q: <MSG>[Jul  1 22:08:16 calvisitor-10-105-163-202 WindowServer[184]: device_generate_desktop_screenshot: authw 0x7fa823c89600(2000), shield 0x7fa8258cac00(2001)]</MSG>
    A: <TPL>[O O O O O O O O OID OID OID OID OBA]</TPL>

    Q: <MSG>[nova-scheduler.log.1.2017-05-16_13:53:08 2017-05-16 00:00:57.129 25998 INFO nova.scheduler.host_manager [req-d724a3bd-e314-4f81-a41c-460aa91f24ae - - - - -] Successfully synced instances from host 'cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us']</MSG>
    A: <TPL>[O O O O O O O O O O O O LOI O LOI OBN O STC O OBA O TDA]</TPL>

    Above is an example of our log annotation process. Static words are annotated with O, object ID is annotated with OID, 
    and two location indicators are annotated with LOI.

    Here is the input log message: <MSG>{raw_log}</MSG>
    Print only the variable template surrounded by <TPL> and </TPL> and nothing else for each log message. 
    Never print an explanation of how the variable_template is constructed.
    """
    response = get_completion_from_gpt(semantic_prompt)
    variable_from_logs.append(response)    

    if counter_1 % 10 == 0:
        print(f'{counter_1}: {response}')
        
    counter_1+=1   

0: <TPL>[O O O O TID O O O O LOI O O O O O O O O O]</TPL>
10: <TPL>[OID OBN OBN TID OID OBA STC]</TPL>
20: <TPL>[O O TID TID OID OID TID]</TPL>
30: A: <TPL>[O O O O O O O O CRS:O O O O OBA]</TPL>
40: <TPL>[TDA O O O O O O OID O O O O O O O O O O O O O O O O O O O O O O O O O O O O]</TPL>
50: <TPL>[O O O O O O O TID O O O O O O O O O O O OBA]</TPL>
60: <TPL>[OID OBA OBA TID OTP]</TPL>
70: <TPL>[OID OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA OBA
80: <TPL>[O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID OID O

In [13]:
# save and format output data in a csv file
Format_output.save_raw_output(semantic_template_output_file_path, semantic_based_templates)
Format_output.save_raw_output(variables_output_file_path, variable_from_logs)
# Save all semantic log templates to a file
print(f"Semantic Log templates and Variable templates are saved to: {raw_save_dir}")

Semantic Log templates and Variable templates are saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241203153952/semantic_raw_results/


In [14]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(semantic_template_output_file_path, semantic_template_output_file_path)
Format_output.remove_TPL_from_output(variables_output_file_path, variables_output_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241203153952/semantic_raw_results/semantic_output.txt
Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241203153952/semantic_raw_results/variables_output.txt


In [15]:
# File paths
processed_log_template_file_path = semantic_template_output_file_path
processed_variable_template_file_path = variables_output_file_path


In [16]:
# Load processed output data
with open(processed_log_template_file_path, 'r') as processed_file:
    processed_log_templates = [line.strip() for line in processed_file.readlines()]

with open(processed_variable_template_file_path, 'r') as processed_file:
    processed_variable_templates = [line.strip() for line in processed_file.readlines()]    

# Ensure the lists are of the same length for comparison
min_length = min(len(ground_truth_log_templates), len(processed_log_templates), len(processed_variable_templates))
ground_truth_log_templates = ground_truth_log_templates[:min_length]
ground_truth_variable_templates = ground_truth_variable_templates[:min_length]
processed_log_templates = processed_log_templates[:min_length]
processed_variable_templates = processed_variable_templates[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]



In [17]:
# Calculate evaluation metrics for processed_log_templates
accuracy = accuracy_score(ground_truth_log_templates, processed_log_templates)
precision = precision_score(ground_truth_log_templates, processed_log_templates, average='weighted', zero_division=0)
recall = recall_score(ground_truth_log_templates, processed_log_templates, average='weighted', zero_division=0)
f1 = f1_score(ground_truth_log_templates, processed_log_templates, average='weighted', zero_division=0)

# Print evaluation metrics for processed_log_templates
print(f"Log Template Parsing Accuracy: {accuracy * 100:.2f}%")
print(f"Log Template Precision: {precision * 100:.2f}%")
print(f"Log Template Recall: {recall * 100:.2f}%")
print(f"Log Template F1 Score: {f1 * 100:.2f}%")


# Calculate evaluation metrics for processed_variable_templates
accuracy = accuracy_score(ground_truth_variable_templates, processed_variable_templates)
precision = precision_score(ground_truth_variable_templates, processed_variable_templates, average='weighted', zero_division=0)
recall = recall_score(ground_truth_variable_templates, processed_variable_templates, average='weighted', zero_division=0)
f1 = f1_score(ground_truth_variable_templates, processed_variable_templates, average='weighted', zero_division=0)

# Print evaluation metrics for processed_variable_templates
print(f"Variable Template Parsing Accuracy: {accuracy * 100:.2f}%")
print(f"Variable Template Precision: {precision * 100:.2f}%")
print(f"Variable Template Recall: {recall * 100:.2f}%")
print(f"Variable Template F1 Score: {f1 * 100:.2f}%")



Log Template Parsing Accuracy: 34.72%
Log Template Precision: 35.23%
Log Template Recall: 34.72%
Log Template F1 Score: 34.89%
Variable Template Parsing Accuracy: 0.00%
Variable Template Precision: 0.00%
Variable Template Recall: 0.00%
Variable Template F1 Score: 0.00%


In [18]:
# Calculate correctly parsed log templates for each system
correct_parsed_counts = {}
for system, gt_template, processed_template in zip(ground_truth_systems, ground_truth_log_templates, processed_log_templates):
    if gt_template == processed_template:
        if system not in correct_parsed_counts:
            correct_parsed_counts[system] = 0
        correct_parsed_counts[system] += 1

# Print correctly parsed log templates for each system
print("\nCorrectly Parsed Log Templates per System:")
total=0
for system, count in correct_parsed_counts.items():
    total +=count
    print(f"{system}: {count}")

print(f"Total correctly parsed log templates: {total}")


Correctly Parsed Log Templates per System:
Apache: 5
HPC: 1
Linux: 9
Zookeeper: 10
BGL: 5
Hadoop: 4
Mac: 3
HealthApp: 6
OpenSSH: 6
Spark: 12
OpenStack: 3
Thunderbird: 3
Total correctly parsed log templates: 67


In [19]:
# Calculate correctly parsed variable templates for each system
correct_parsed_counts = {}
for system, gt_template, processed_template in zip(ground_truth_systems, ground_truth_variable_templates, processed_variable_templates):
    if gt_template == processed_template:
        if system not in correct_parsed_counts:
            correct_parsed_counts[system] = 0
        correct_parsed_counts[system] += 1

# Print correctly parsed variable templates for each system
print("\nCorrectly Parsed Variable Templates per System:")
total=0
for system, count in correct_parsed_counts.items():
    total +=count
    print(f"{system}: {count}")

print(f"Total correctly parsed variable templates: {total}")


Correctly Parsed Variable Templates per System:
Total correctly parsed variable templates: 0
