In [25]:
import pandas as pd
import random
import re
import os

from config import config
from gpt_model import get_completion_from_gpt


# --- Locations used throughout the notebook --------------------------------
# os.path.abspath('') resolves to the current working directory; the
# repository root is taken to be its parent directory.
_notebook_dir = os.path.abspath("")
ROOT_DIR = os.path.dirname(_notebook_dir)

# Directory holding the input data (the trailing slash is part of the value).
DATA_DIR = os.path.join(ROOT_DIR, "data/")

# Inputs: the raw HDFS log file and the anomaly label CSV. Only the paths are
# built here; the files themselves are read in a later cell.
hdfs_logs_file_path = os.path.join(DATA_DIR, "logs_and_annotations/HDFS/logs.txt")
anomaly_label_path = os.path.join(DATA_DIR, "deepLoglizer_data/anomaly_label.csv")

# Outputs: generated labels are written to <ROOT_DIR>/results/hdfs_log_labels.csv.
save_dir_path = os.path.join(ROOT_DIR, "results")
output_file_path = os.path.join(save_dir_path, "hdfs_log_labels.csv")



In [26]:
# Number of leading log lines to load (and later send to the LLM).
MAX_LOG_LINES = 200

# Anomaly label CSV as a DataFrame. It is not used by the cells visible here;
# presumably it holds the ground-truth labels for later comparison — confirm.
anomaly_label_df = pd.read_csv(anomaly_label_path)

# Read only the first MAX_LOG_LINES lines of the log file, stripped of
# surrounding whitespace. The file is iterated lazily instead of calling
# readlines(), which would load every line into memory just to discard all
# but the first few. zip() pulls from range() first, so iteration stops
# without reading any line beyond the limit.
with open(hdfs_logs_file_path, 'r') as log_file:
    hdfs_log_lines = [
        line.strip() for _, line in zip(range(MAX_LOG_LINES), log_file)
    ]



In [27]:
# Sanity check: show how many log lines were loaded (only the first 200 lines of the file are read).
print(len(hdfs_log_lines))


200


In [28]:
# Step 1: Generate log labels for HDFS logs
#
# Every entry of `hdfs_log_lines` is sent to the LLM with a few-shot prompt
# that asks for an answer of the form <ANM>BlockId,Label</ANM>. The responses
# are collected in `output_lines` (one entry per input log line, same order)
# and then written to `output_file_path` below a "BlockId,Label" header line.

# Create the results directory before any LLM call is made. Without this, a
# missing directory only surfaces at the final open(), after all requests have
# been issued, and the collected responses never reach disk.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

output_lines = []

for log_index, log in enumerate(hdfs_log_lines):
    # NOTE(review): the sentence "Provide the result in the following format:"
    # is not directly followed by a format; the format is only stated by the
    # "Output should always be like ..." line near the end of the prompt.
    # The prompt text is intentionally left unchanged.
    prompt = f"""
    You will be provided with a log message delimited by <MSG> and </MSG>. 
    The log texts describe various system events in a distributed file system. 
    Your task is to understand the semantics of the log message and extract the BlockId if present. 
    If no BlockId exists, output "NoBlockId". 
    Additionally, classify the log message as "Normal" or "Anomaly" based on its content.
    Provide the result in the following format:

    Here are a few examples of log messages (labeled with Q:) and corresponding templates (labeled with A:):

    Q: <MSG>081110 224958 16603 INFO dfs.DataNode$DataXceiver: Receiving block blk_-2989288139685694818 src: /10.250.19.227:60160 dest: /10.250.19.227:50010</MSG>
    A: <ANM>blk_-2989288139685694818,Normal</ANM>

    Q: <MSG>081110 023456 6415 WARN dfs.DataNode$DataXceiver: 10.251.67.225:50010:Got exception while serving blk_-6900989714336081087 to /10.251.25.237:</MSG>
    A: <ANM>blk_-6900989714336081087,Anomaly</ANM>
    
    Q: <MSG>081111 043140 19599 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block blk_-2680500627064966252 terminating</MSG>
    A: <ANM>blk_-2680500627064966252,Normal</ANM>
    
    Output should always be like <ANM>BlockId,Label</ANM>.
    Never print an explanation or description. Print only the result.
    Here is the input log message: <MSG>{log}</MSG>
    """

    # Replace get_completion_from_gpt with your LLM function
    response = get_completion_from_gpt(prompt)

    # The output file holds one record per line, so a response containing line
    # breaks would shift every following row. Collapse such breaks into single
    # spaces; for a single-line response this is identical to response.strip().
    output_lines.append(" ".join(response.strip().splitlines()))

    # Progress report: echo the raw response for every 10th log line.
    if log_index % 10 == 0:
        print(f'{log_index}: {response}')

# Save the results to a file
# NOTE(review): responses are stored as returned, including the <ANM>...</ANM>
# wrapper, so the rows do not parse as clean BlockId/Label columns despite the
# header. Any consumer has to strip the tags — confirm this is intended.
with open(output_file_path, "w") as output_file:
    output_file.write("BlockId,Label\n")
    output_file.write("\n".join(output_lines))

print(f"Log labels saved to {output_file_path}")


0: <ANM>blk_6888300867578983331,Normal</ANM>
10: <ANM>blk_-7548149518969960333,Normal</ANM>
20: <ANM>blk_1064470652608359218,Normal</ANM>
30: <ANM>blk_1512136249403454074,Normal</ANM>
40: <ANM>blk_8935202950442998446,Normal</ANM>
50: <ANM>blk_-3681974396824196300,Normal</ANM>
60: <ANM>blk_-997605125898553536,Normal</ANM>
70: <ANM>blk_-8380267327243110056,Normal</ANM>
80: <ANM>blk_1598414622053793245,Normal</ANM>
90: <ANM>blk_3764801892187716497,Normal</ANM>
100: <ANM>blk_7297060562345904886,Normal</ANM>
110: <ANM>blk_-3792836284792472725,Normal</ANM>
120: <ANM>blk_8861349372992394289,Normal</ANM>
130: <ANM>blk_-4139299269696044017,Normal</ANM>
140: <ANM>blk_4820650745157199554,Normal</ANM>
150: <ANM>blk_3073404534069578592,Normal</ANM>
160: <ANM>blk_8418106412701718933,Normal</ANM>
170: <ANM>blk_5100975846124291571,Normal</ANM>
180: <ANM>blk_-3693786175267111588,Normal</ANM>
190: <ANM>blk_-5852844080027817147,Normal</ANM>
Log labels saved to /Users/navneetsharma/Documents/NMBU/MS Data 