In [15]:
import time
import os
import logging
from pathlib import Path
import datetime
from config import config
from gpt_model import get_completion_from_gpt
from claude import get_completion_from_claude
from format_output import Format_output
import pandas as pd

In [16]:
# Repository root: the parent of the directory the notebook is running in
# (os.path.abspath('') resolves to the current working directory).
notebook_dir = os.path.abspath('')
ROOT_DIR = os.path.split(notebook_dir)[0]
# Directory holding the input data, located under the repository root.
DATA_DIR = os.path.join(ROOT_DIR, 'data/loghub_2k')

In [24]:
# Each run writes into its own timestamped folder under <ROOT_DIR>/results.
save_dir_path = os.path.join(ROOT_DIR, 'results')

# A single timestamp is taken so both labels below describe the same instant.
now_time = datetime.datetime.now()
date_string = f"Semantic_{now_time:%Y-%m-%d-%H-%M-%S}"
save_dir_separator = f"Semantic_{now_time:%Y%m%d%H%M%S}"

# <results>/Semantic_<timestamp>/semantic_raw_results/ receives the raw model output.
save_dir_now = os.path.join(save_dir_path, save_dir_separator)
raw_save_dir = os.path.join(save_dir_now, "semantic_raw_results/")
Path(raw_save_dir).mkdir(parents=True, exist_ok=True)

# Full path of the file the raw semantic templates are written to.
semantic_raw_output_file_name = 'semantic_output.txt'
semantic_raw_output_file_path = os.path.join(raw_save_dir, semantic_raw_output_file_name)

In [None]:
# Input files, both resolved relative to DATA_DIR:
#   - the raw log lines to be parsed
#   - the CSV holding the ground-truth template for each log line
raw_log_file_path = os.path.join(DATA_DIR, "sample_combined_raw_logs.txt")
ground_truth_file_path = os.path.join(DATA_DIR, "sample_ground_truth_template.csv")
# Disabled: path to the templates produced by an earlier (pre-semantic) run.
# processed_output_file_path = os.path.join(ROOT_DIR, 'results/20241127211340/formatted_results/output_processed.txt')

In [19]:
# Read the ground-truth CSV and pull out the two columns used downstream
# as plain Python lists.
ground_truth_df = pd.read_csv(ground_truth_file_path)
ground_truth_templates, ground_truth_systems = (
    ground_truth_df[column_name].tolist()
    for column_name in ('EventTemplate', 'System')
)

In [None]:
# Read the raw log file, keeping one whitespace-trimmed entry per line.
with open(raw_log_file_path, 'r') as log_handle:
    raw_logs = [entry.strip() for entry in log_handle]

# Disabled: loader for the templates produced before semantic enhancement.
# with open(processed_output_file_path, 'r') as processed_file:
#     processed_templates_before = [line.strip() for line in processed_file.readlines()]

In [21]:
# Ensure the lists are of the same length for comparison.
#
# `processed_templates_before` is only assigned by loader code that is
# currently commented out in this notebook, so on a fresh kernel it does not
# exist. Fail early with an actionable message instead of a bare NameError.
if 'processed_templates_before' not in globals():
    raise NameError(
        "processed_templates_before is not defined. Load the pre-enhancement "
        "templates (see the commented-out loader next to the raw-log loading "
        "code) before running this cell."
    )

# Truncate every parallel list to the shortest one so that zip()-style
# comparisons later on line up element by element.
min_length = min(len(ground_truth_templates), len(processed_templates_before), len(raw_logs))
ground_truth_templates = ground_truth_templates[:min_length]
processed_templates_before = processed_templates_before[:min_length]
raw_logs = raw_logs[:min_length]
ground_truth_systems = ground_truth_systems[:min_length]

In [None]:
# Step 1: Reformulate log messages with semantic understanding
#
# For every raw log line, ask the model for an "enhanced prompt" that captures
# the semantic meaning of the line. The match status tells the model whether
# the earlier template extraction agreed with the ground truth.
enhanced_prompts = []
enhanced_prompts_file_path = os.path.join(save_dir_now, "enhanced_prompts.txt")
for counter, (raw_log, gt_template, processed_template) in enumerate(
    zip(raw_logs, ground_truth_templates, processed_templates_before)
):
    match_status = "True" if gt_template == processed_template else "False"
    new_prompt = f"""You are provided with a log message along with a match status indicating whether the previous template
                     extraction was correct. Your task is to enhance the prompt for better extraction of semantic meaning 
                     based on the given log message. Log Message: {raw_log} Match Status: {match_status}. Please reformulate 
                     the prompt to capture the essential context and meaning of the log message in a more effective way for 
                     template extraction."""

    enhanced_prompt = get_completion_from_gpt(new_prompt)
    enhanced_prompts.append(enhanced_prompt)

    # Lightweight progress indicator: prints 0, 50, 100, ...
    if counter % 50 == 0:
        print(counter)

    # Append after every call so an interrupted run keeps what it has produced.
    # A trailing newline separates consecutive prompts; previously they were
    # written back-to-back with no delimiter.
    with open(enhanced_prompts_file_path, 'a') as enhanced_file:
        enhanced_file.write(enhanced_prompt + '\n')

# The prompts were appended to the file incrementally inside the loop above.
print(f"Enhanced prompts saved to: {enhanced_prompts_file_path}")

Enhanced Prompt: Analyze the log message timestamped on Sun Dec 04 04:47:44 2005, indicating a notice from workerEnv.init() confirming successful initialization with workers2.properties located at /etc/httpd/conf/. Evaluate the accuracy of the previous template extraction based on this information.
Sure! Here is an enhanced prompt for better extraction of semantic meaning based on the given log message:

"Enhance the template extraction prompt to accurately capture the context and meaning of the log message: 
Log Message: 2573624 node-148 unix.hw net.niff.up 1074131743 1 NIFF: node node-148 reports network connection availability on network 5.5.226.0 via interface alt0.
Match Status: False. Refine the prompt to improve template extraction for this log message."
Prompt: Extract the essential details from the log message to understand the kernel command line configuration. Log Message: Jul 27 14:41:57 combo kernel: Kernel command line: ro root=LABEL=/ rhgb quiet.
Prompt: Improve the temp

KeyboardInterrupt: 

In [28]:
# Report how many enhanced prompts have been collected so far.
num_enhanced_prompts = len(enhanced_prompts)
print(num_enhanced_prompts)

953


In [None]:
# Step 2: Generate log template using zero-shot learning
#
# Each raw log line is sent to the model together with its enhanced prompt;
# the model's reply (expected to carry the template inside <TPL>...</TPL>) is
# collected in `enhanced_templates`. Nothing is written to disk here; the
# following cell persists the list via Format_output.save_raw_output.
semantic_log_template_output_file_path = semantic_raw_output_file_path
enhanced_templates = []

# Only the first `max_logs_to_process` log/prompt pairs are sent to the model.
max_logs_to_process = 50
counter = 0
for counter, (raw_log, enhanced_prompt) in enumerate(
    zip(raw_logs[:max_logs_to_process], enhanced_prompts), start=1
):
    semantic_prompt = f"""You will be provided with a log message delimited by <MSG> and </MSG>. 
    You are also provided with a new prompt that has the semantic undertanding of the raw log given. New prompt: {enhanced_prompt}. 
    
    The log message typically consists of two parts: 
    1. Template - message body, that contains constant strings (or keywords) describing the system events; 
    2. Parameters/Variables - dynamic variables, which reflect specific runtime status.
    You must identify and abstract all the dynamic variables in the log message with suitable placeholders inside angle brackets to extract the corresponding template.
    You must output the template corresponding to the log message. Print only the input log's template surrounded by <TPL> and </TPL>. 
    Never print an explanation of how the template is constructed.
    Here are a few examples of log messages (labeled with Q:) and corresponding templates (labeled with A:):

    Q: <MSG>[081109 204453 34 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_2377150260128098806 size 67108864]</MSG>
    A: <TPL>[BLOCK* NameSystem.addStoredBlock: blockMap updated: <*>:<*> is added to <*> size <*>]</TPL>

    Q: <MSG>- 1129734520 2005.10.19 R17-M0-N0-I:J18-U01 2005-10-19-08.08.40.058960 R17-M0-N0-I:J18-U01 RAS KERNEL INFO shutdown complete</MSG>
    A: <TPL>shutdown complete</TPL>

    Q: <MSG>20231114T101914E ERROR 14 while processing line 123: cannot find input '42'</MSG>
    A: <TPL>ERROR <*> while processing line <*>: cannot find input <*></TPL>

    Q: <MSG>2023-01-14 23:05:14 INFO: Reading data from /user/input/file.txt</MSG>
    A: <TPL>Reading data from <*> </TPL>
    Here is the input log message: <MSG>{raw_log}</MSG>
    Please print the corresponding template.
    """
    response = get_completion_from_gpt(semantic_prompt)
    enhanced_templates.append(response)



Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128091624/semantic_raw_results/semantic_output.txt


In [35]:
# Persist the collected model responses to the raw output file, then report
# where they were written.
Format_output.save_raw_output(
    semantic_log_template_output_file_path,
    enhanced_templates,
)
saved_message = f"Semantic log templates saved to: {semantic_log_template_output_file_path}"
print(saved_message)

Semantic log templates saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128091624/semantic_raw_results/semantic_output.txt


In [36]:
# Convert the raw model output into its formatted form.
# NOTE(review): the same path is passed as both arguments, so the raw file
# appears to be overwritten in place by the formatted result - confirm this is
# intended, given the directory is named "semantic_raw_results".
raw_templates_path = semantic_log_template_output_file_path
formatted_templates_path = semantic_log_template_output_file_path
Format_output.remove_TPL_from_output(raw_templates_path, formatted_templates_path)

Processed output saved to: /Users/navneetsharma/Documents/NMBU/MS Data Science @ NMBU/Master's Thesis/semantic_log_parsing/results/Semantic_20241128091624/semantic_raw_results/semantic_output.txt
