In [1]:
import time
import os
import logging
from pathlib import Path
import datetime
from config import config
from gpt_model import get_completion_from_gpt
from claude import get_completion_from_claude
from format_output import Format_output
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Set the ROOT_DIR to your repository root.
ROOT_DIR = os.path.dirname(os.path.abspath(''))
# Set the DATA_DIR to the directory where your data resides.
DATA_DIR = os.path.join(ROOT_DIR, 'data/loghub_2k')

In [3]:
save_dir_path = os.path.join(ROOT_DIR, 'results')

now_time = datetime.datetime.now()
date_string = "Semantic_" + now_time.strftime('%Y-%m-%d-%H-%M-%S')
save_dir_separator = "Semantic_" + now_time.strftime('%Y%m%d%H%M%S')

save_dir_now = os.path.join(save_dir_path, save_dir_separator)
raw_save_dir = os.path.join(save_dir_now, "semantic_raw_results/")
Path(raw_save_dir).mkdir(parents=True, exist_ok=True)
semantic_raw_output_file_name = 'semantic_output.txt'
semantic_raw_output_file_path = raw_save_dir + semantic_raw_output_file_name

In [4]:
# File paths
ground_truth_file_path = os.path.join(DATA_DIR, "ground_truth_template.csv")
raw_log_file_path = os.path.join(DATA_DIR, "combined_raw_logs.txt")

In [5]:
# Load ground truth data
ground_truth_df = pd.read_csv(ground_truth_file_path)
ground_truth_templates = ground_truth_df['EventTemplate'].tolist()
ground_truth_systems = ground_truth_df['System'].tolist()

In [6]:
# Load raw log messages
with open(raw_log_file_path, 'r') as raw_file:
    raw_logs = [line.strip() for line in raw_file.readlines()]


In [7]:
# Ensure the lists are of the same length for comparison
min_length = min(len(ground_truth_templates), len(raw_logs))
ground_truth_templates = ground_truth_templates[:min_length]
raw_logs = raw_logs[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]

In [8]:
# Step 1: Reformulate log messages with semantic understanding
counter=0
enhanced_prompts = []
enhanced_prompts_file_path = os.path.join(save_dir_now, "enhanced_prompts.txt")
for raw_log in raw_logs:
    new_prompt=f"""You are provided with a log message. Your task is to understand and extract the meaning behind the semi-structured log message.
                      
                    Log message: {raw_log}. 

                    A log message usually contains a header that is automatically produced by the logging framework, including information such as timestamp, class, and logging level (INFO, DEBUG, WARN etc.).
                    Ignore all these details and just understand meaning behind the natural languagae text which is in the log content.

                    The log content typically consists of many parts: 
                    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
                    2. Parameters/Variables - dynamic variables, which reflect specific runtime status;

                    Please capture the essential context and meaning from the log message to understand the reasoning behind each raw log.
                    Provide only the meaning in 20-25 words from each log message surrounded by <TPL> and </TPL>. 
                    Never provide an explanation of how the meaning is constructed.
                """
    
    enhanced_prompt = get_completion_from_gpt(new_prompt)
    enhanced_prompts.append(enhanced_prompt)
    if counter % 10 == 0:
        print(f'{counter}: {enhanced_prompt}')
    counter+=1    


0: <TPL> Initialization of worker environment successful for workers2 properties file located at /etc/httpd/conf/workers2.properties. </TPL>
10: <TPL> Node-162 started boot action with command 1911. </TPL>
20: <TPL> A ServerFileSystem domain panic occurred on storage442. </TPL>
30: <TPL> ServerFileSystem domain cluster_root_backup is no longer served by node node-96. </TPL>
40: <TPL> Temperature reading for gige7 interface is normal at 1073151998. </TPL>
50: <TPL> Node node-148 detected network connection on network 5.5.226.0 via interface alt0. </TPL>
60: <TPL>User unknown timed out after 900 seconds.</TPL>
70: <TPL> User 'root' successfully logged in via the 'LOGIN' service with root privileges. </TPL>
80: <TPL> Connection reset by peer, unable to retrieve client address in xinetd service. </TPL>
90: <TPL> klogd startup succeeded. </TPL>
100: <TPL> Kernel command line specifies read-only root filesystem with label 'LABEL=/' for booting the system quietly with Red Hat Graphical Boot. 

In [9]:
print(len(enhanced_prompts))

1146


In [10]:
# save and format output data in a csv file
Format_output.save_raw_output(enhanced_prompts_file_path, enhanced_prompts)

# Save all semantic log templates to a file
print(f"Semantic log templates saved to: {enhanced_prompts_file_path}")

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128221046/enhanced_prompts.txt


In [11]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(enhanced_prompts_file_path, enhanced_prompts_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128221046/enhanced_prompts.txt


In [12]:
# Step 2: Generate log template using zero-shot learning
counter_1 = 0
enhanced_templates = []
semantic_log_template_output_file_path = semantic_raw_output_file_path
for raw_log, enhanced_prompt in zip(raw_logs, enhanced_prompts):
    semantic_prompt = f"""You will be provided with a log message delimited by <MSG> and </MSG>. 
    You are also provided with the meaning or understanding from the log message as follow: {enhanced_prompt}. 
    
    The log message typically consists of two parts: 
    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
    2. Parameters/Variables - dynamic variables, which reflect specific runtime status.
    You must identify and abstract all the dynamic variables in the log message with suitable placeholders inside angle brackets to extract the corresponding template.
    You must output the template corresponding to the log message. Print only the input log's template surrounded by <TPL> and </TPL>. 
    Never print an explanation of how the template is constructed.
    Here are a few examples of log messages (labeled with Q:) and corresponding templates (labeled with A:):

    Q: <MSG>[081109 204453 34 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_2377150260128098806 size 67108864]</MSG>
    A: <TPL>[BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to <*> size <*>]</TPL>

    Q: <MSG>- 1129734520 2005.10.19 R17-M0-N0-I:J18-U01 2005-10-19-08.08.40.058960 R17-M0-N0-I:J18-U01 RAS KERNEL INFO shutdown complete</MSG>
    A: <TPL>shutdown complete</TPL>

    Q: <MSG>20231114T101914E ERROR 14 while processing line 123: cannot find input '42'</MSG>
    A: <TPL>ERROR <*> while processing line <*>: cannot find input <*></TPL>

    Q: <MSG>2023-01-14 23:05:14 INFO: Reading data from /user/input/file.txt</MSG>
    A: <TPL>Reading data from <*> </TPL>
    Here is the input log message: <MSG>{raw_log}</MSG>
    Please print the corresponding template.
    """
    response = get_completion_from_gpt(semantic_prompt)
    enhanced_templates.append(response)

    if counter_1 % 10 == 0:
        print(f'{counter_1}: {response}')
        
    counter_1+=1      



0: <TPL>workerEnv.init() ok <*> </TPL>
10: <TPL>node-162 action start <*> <*> boot (command <*>)</TPL>
20: <TPL>An ServerFileSystem domain panic has occurred on storage442</TPL>
30: <TPL>ServerFileSystem domain cluster_root_backup is no longer served by node <*> </TPL>
40: <TPL>gige temperature <*> <*> normal</TPL>
50: <TPL>node node-148 has detected an available network connection on network <*> via interface alt0</TPL>
60: <TPL>User unknown timed out after <*> seconds at <*> </TPL>
70: <TPL>session opened for user <*> by LOGIN(uid=*)</TPL>
90: <TPL>klogd startup succeeded</TPL>
100: <TPL>Kernel command line: ro root=LABEL=/ rhgb quiet</TPL>
110: <TPL>Console: colour VGA+ 80x25</TPL>
120: <TPL>rpc.idmapd startup succeeded</TPL>
130: <TPL>Enabling unmasked SIMD FPU exception support... done.</TPL>
140: <TPL>Linux Plug and Play Support v0.97 (c) Adam Belay</TPL>
150: <TPL>apm: BIOS version <*> Flags <*> (Driver version <*>)</TPL>
160: <TPL>Bringing up loopback interface:  succeeded</TPL

In [13]:
# save and format output data in a csv file
Format_output.save_raw_output(semantic_log_template_output_file_path, enhanced_templates)

# Save all semantic log templates to a file
print(f"Semantic log templates saved to: {enhanced_prompts_file_path}")

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128221046/enhanced_prompts.txt


In [14]:
# convert raw output into formatted file 
Format_output.remove_TPL_from_output(semantic_log_template_output_file_path, semantic_log_template_output_file_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128221046/semantic_raw_results/semantic_output.txt


In [15]:
# File paths
processed_output_file_path = semantic_log_template_output_file_path


In [16]:
# Load processed output data
with open(processed_output_file_path, 'r') as processed_file:
    processed_templates = [line.strip() for line in processed_file.readlines()]

# Ensure the lists are of the same length for comparison
min_length = min(len(ground_truth_templates), len(processed_templates))
ground_truth_templates = ground_truth_templates[:min_length]
processed_templates = processed_templates[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]



In [17]:
# Calculate evaluation metrics
accuracy = accuracy_score(ground_truth_templates, processed_templates)
precision = precision_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)
recall = recall_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)
f1 = f1_score(ground_truth_templates, processed_templates, average='weighted', zero_division=0)

# Print evaluation metrics
print(f"Parsing Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")



Parsing Accuracy: 35.51%
Precision: 35.38%
Recall: 35.51%
F1 Score: 35.37%


In [19]:
# Calculate correctly parsed templates for each system
correct_parsed_counts = {}
for system, gt_template, processed_template in zip(ground_truth_systems, ground_truth_templates, processed_templates):
    if gt_template == processed_template:
        if system not in correct_parsed_counts:
            correct_parsed_counts[system] = 0
        correct_parsed_counts[system] += 1

# Print correctly parsed templates for each system
print("\nCorrectly Parsed Templates per System:")
total=0
for system, count in correct_parsed_counts.items():
    total +=count
    print(f"{system}: {count}")

print(f"Total correctly parsed logs: {total}")


Correctly Parsed Templates per System:
Apache: 5
HPC: 4
Linux: 58
Zookeeper: 30
BGL: 28
Hadoop: 41
Mac: 112
HealthApp: 40
OpenSSH: 10
Spark: 20
HDFS: 1
OpenStack: 8
Thunderbird: 50
Total correctly parsed logs: 407
