In [89]:
import time
import os
import logging
from pathlib import Path
import datetime
from config import config
from gpt_model import get_completion_from_gpt
from claude import get_completion_from_claude
from format_output import Format_output
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [90]:
# Set the ROOT_DIR to your repository root.
ROOT_DIR = os.path.dirname(os.path.abspath(''))
# Set the DATA_DIR to the directory where your data resides.
DATA_DIR = os.path.join(ROOT_DIR, 'data/loghub_2k')

In [91]:
save_dir_path = os.path.join(ROOT_DIR, 'results')

now_time = datetime.datetime.now()
date_string = "Semantic_" + now_time.strftime('%Y-%m-%d-%H-%M-%S')
save_dir_separator = "Semantic_" + now_time.strftime('%Y%m%d%H%M%S')

save_dir_now = os.path.join(save_dir_path, save_dir_separator)
raw_save_dir = os.path.join(save_dir_now, "semantic_raw_results/")
Path(raw_save_dir).mkdir(parents=True, exist_ok=True)
semantic_raw_output_file_name = 'semantic_output.txt'
semantic_raw_output_file_path = raw_save_dir + semantic_raw_output_file_name

In [92]:
# File paths
ground_truth_file_path = os.path.join(DATA_DIR, "sample_ground_truth_template.csv")
raw_log_file_path = os.path.join(DATA_DIR, "sample_combined_raw_logs.txt")

In [93]:
# Load ground truth data
ground_truth_df = pd.read_csv(ground_truth_file_path)
ground_truth_templates = ground_truth_df['EventTemplate'].tolist()
ground_truth_systems = ground_truth_df['System'].tolist()

In [94]:
# Load raw log messages
with open(raw_log_file_path, 'r') as raw_file:
    raw_logs = [line.strip() for line in raw_file.readlines()]


In [95]:
# Ensure the lists are of the same length for comparison
min_length = min(len(ground_truth_templates), len(raw_logs))
ground_truth_templates = ground_truth_templates[:min_length]
raw_logs = raw_logs[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]

In [96]:
# Step 1: Reformulate log messages with semantic understanding
counter=0
enhanced_prompts = []
enhanced_prompts_file_path = os.path.join(save_dir_now, "enhanced_prompts.txt")
for raw_log in raw_logs:
    new_prompt=f"""You are provided with a log message. Your task is to understand and extract the meaning behind the semi-structured log message based on the given raw log.
                      
                    Raw log: {raw_log}. 

                    A raw log usually contains a header that is automatically produced by the logging framework, including information such as timestamp, class, and logging level (INFO, DEBUG, WARN etc.).
                    Ignore all these details and just understand the natural languagae text which is in the log content.

                    The log content typically consists of many parts: 
                    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
                    2. Parameters/Variables - dynamic variables, which reflect specific runtime status;

                    Please capture the essential context and meaning from the log message to understand the reasoning behind each raw log.
                    Provide only the meaning in 25 words from each raw log surrounded by <TPL> and </TPL>. 
                    Never provide an explanation of how the semantic knowledge is constructed.
                """
    
    enhanced_prompt = get_completion_from_gpt(new_prompt)
    enhanced_prompts.append(enhanced_prompt)
    if counter % 10 == 0:
        print(f'{counter}: {enhanced_prompt}')
    counter+=1    


0: <TPL> Initialization of worker environment successful for workers2 properties file located at /etc/httpd/conf/workers2.properties. </TPL>
20: <TPL> Command abort: 1111858191. </TPL>
30: <TPL> CPU has an L2 cache size of 256K. </TPL>
40: <TPL> Connection error while trying to connect to tcpconn4.tencent.com:80 through proxy proxy.cse.cuhk.edu.hk:5070. Proxy closed the connection unexpectedly. </TPL>
50: <TPL> autopurge.snapRetainCount set to 3 </TPL>
60: <TPL>RAS KERNEL FATAL disable store gathering.</TPL>
70: <TPL>RAS KERNEL INFO: Z coordinate 32 exceeds physical dimension 32 at line 33 of node map file.</TPL>
80: <TPL> ContainerLauncher retrieved Shuffle port for specific map task attempt. </TPL>
90: <TPL> Path not allowed in target domain for Safari SearchHelper service due to missing bundle service in WebKit origin. </TPL>
100: <TPL> Display woke up notification posted by WindowServer for the specific display ID. </TPL>
110: <TPL> insertHiHealthData() bulkSaveDetailHiHealthData f

In [97]:
print(len(enhanced_prompts))

193


In [98]:
# save and format output data in a csv file
Format_output.save_raw_output(enhanced_prompts_file_path, enhanced_prompts)

# Save all semantic log templates to a file
print(f"Semantic log templates saved to: {enhanced_prompts_file_path}")

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128133042/enhanced_prompts.txt


In [99]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(enhanced_prompts_file_path, enhanced_prompts_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128133042/enhanced_prompts.txt


In [100]:
# Step 2: Generate log template using zero-shot learning
counter = 0
enhanced_templates = []
semantic_log_template_output_file_path = semantic_raw_output_file_path
for raw_log, enhanced_prompt in zip(raw_logs, enhanced_prompts):
    semantic_prompt = f"""You will be provided with a log message delimited by <MSG> and </MSG>. 
    You are also provided with a new prompt that has the semantic undertanding of the raw log given. New prompt: {enhanced_prompt}. 
    
    The log message typically consists of two parts: 
    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
    2. Parameters/Variables - dynamic variables, which reflect specific runtime status.
    You must identify and abstract all the dynamic variables in the log message with suitable placeholders inside angle brackets to extract the corresponding template.
    You must output the template corresponding to the log message. Print only the input log's template surrounded by <TPL> and </TPL>. 
    Never print an explanation of how the template is constructed.
    Here are a few examples of log messages (labeled with Q:) and corresponding templates (labeled with A:):

    Q: <MSG>[081109 204453 34 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_2377150260128098806 size 67108864]</MSG>
    A: <TPL>[BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to <*> size <*>]</TPL>

    Q: <MSG>- 1129734520 2005.10.19 R17-M0-N0-I:J18-U01 2005-10-19-08.08.40.058960 R17-M0-N0-I:J18-U01 RAS KERNEL INFO shutdown complete</MSG>
    A: <TPL>shutdown complete</TPL>

    Q: <MSG>20231114T101914E ERROR 14 while processing line 123: cannot find input '42'</MSG>
    A: <TPL>ERROR <*> while processing line <*>: cannot find input <*></TPL>

    Q: <MSG>2023-01-14 23:05:14 INFO: Reading data from /user/input/file.txt</MSG>
    A: <TPL>Reading data from <*> </TPL>
    Here is the input log message: <MSG>{raw_log}</MSG>
    Please print the corresponding template.
    """
    response = get_completion_from_gpt(semantic_prompt)
    enhanced_templates.append(response)

    if counter % 10 == 0:
        print(f'{counter}: {response}')
    counter+=1      



In [101]:
# save and format output data in a csv file
Format_output.save_raw_output(semantic_log_template_output_file_path, enhanced_templates)

# Save all semantic log templates to a file
print(f"Semantic log templates saved to: {enhanced_prompts_file_path}")

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128133042/enhanced_prompts.txt


In [102]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(semantic_log_template_output_file_path, semantic_log_template_output_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128133042/semantic_raw_results/semantic_output.txt


In [103]:
# File paths
processed_output_file_path = semantic_log_template_output_file_path


In [104]:
# Load processed output data
with open(processed_output_file_path, 'r') as processed_file:
    processed_templates = [line.strip() for line in processed_file.readlines()]

# Ensure the lists are of the same length for comparison
min_length = min(len(ground_truth_templates), len(processed_templates))
ground_truth_templates = ground_truth_templates[:min_length]
processed_templates = processed_templates[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]



In [105]:
# Calculate evaluation metrics
accuracy = accuracy_score(ground_truth_templates, processed_templates)
precision = precision_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)
recall = recall_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)
f1 = f1_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)

# Print evaluation metrics
print(f"Parsing Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")



Parsing Accuracy: 32.12%
Precision: 32.64%
Recall: 32.12%
F1 Score: 32.30%


In [106]:
# Calculate correctly parsed templates for each system
correct_parsed_counts = {}
for system, gt_template, processed_template in zip(ground_truth_systems, ground_truth_templates, processed_templates):
    if gt_template == processed_template:
        if system not in correct_parsed_counts:
            correct_parsed_counts[system] = 0
        correct_parsed_counts[system] += 1

# Print correctly parsed templates for each system
print("\nCorrectly Parsed Templates per System:")
for system, count in correct_parsed_counts.items():
    print(f"{system}: {count}")


Correctly Parsed Templates per System:
Apache: 5
Linux: 9
Zookeeper: 5
BGL: 3
Hadoop: 5
Mac: 4
HealthApp: 6
OpenSSH: 8
Spark: 10
HDFS: 1
OpenStack: 3
Thunderbird: 3
