In [42]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

csv_file = "HDFS_2k.log_structured-INFO.csv"  # Replace with the path to your CSV file
log_data = pd.read_csv(csv_file)


In [43]:
# Initialize the BERT tokenizer and model
model_name = "bert-base-uncased"  # Choose the BERT model you prefer
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Step 1: Tokenize and preprocess the log data
tokenized_logs = []
attention_masks = []
lng=[]
for log_text in log_data["EventTemplate"]:

    # Tokenize the log message
    tokens = tokenizer.tokenize(log_text)
    lng.append(tokens)
    # Add special tokens and apply padding
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

    # Optionally, truncate or pad to a fixed length
    max_length = 100  # Set your desired maximum length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))

    tokenized_logs.append(input_ids)

    # Create attention mask
    attn_mask = [1] * len(input_ids) + [0] * (max_length - len(input_ids))
    attention_masks.append(attn_mask)




In [44]:
# Convert tokenized logs to PyTorch tensors
log_tensors = torch.tensor(tokenized_logs)
attention_masks = torch.tensor(attention_masks)

log_embeddings = []
count=0

for log_tensor, attn_mask in zip(log_tensors, attention_masks):
    # Convert log tensor and attention mask to PyTorch
    count+=1
    log_tensor = log_tensor.unsqueeze(0)
    attn_mask = attn_mask.unsqueeze(0)

    # Pass the log tensor and attention mask through the BERT model to obtain embeddings
    with torch.no_grad():
        outputs = model(log_tensor, attention_mask=attn_mask)

    # Extract the embedding for [CLS] token (outputs[0][:, 0, :])
    log_embedding = torch.mean(outputs[0][:, 0, :], dim=0).numpy()
    log_embeddings.append(log_embedding)
print(count)


1920


In [45]:
# Step 3: Anomaly Detection
# Calculate the mean (centroid) of all log embeddings
if len(log_embeddings) > 0:
    all_logs_centroid = np.mean(log_embeddings, axis=0)
else:
    # Handle the case where there are no logs
    all_logs_centroid = None


In [46]:
all_logs_centroid

array([-4.45565104e-01,  3.12775135e-01,  4.27508891e-01,  4.35265273e-01,
       -2.43269220e-01, -3.24049532e-01,  6.76855981e-01, -9.23638716e-02,
        5.10043725e-02, -3.99785675e-02, -1.81926027e-01, -5.18746912e-01,
       -1.55138150e-01,  4.41699862e-01,  3.50951374e-01,  8.37581813e-01,
       -3.17377716e-01,  4.56318915e-01, -5.16122542e-02, -2.65861675e-02,
        4.51254517e-01,  2.80960530e-01,  7.85265803e-01,  1.44672468e-01,
       -1.89078733e-01,  1.34110183e-01, -6.04096465e-02, -9.82895195e-01,
       -5.18841386e-01,  1.01434387e-01, -4.71658736e-01,  5.88276029e-01,
        3.46569359e-01, -2.82224327e-01,  3.73220265e-01, -1.79744154e-01,
       -4.67002809e-01, -1.07521802e-01,  4.94423389e-01,  9.74619538e-02,
        9.45832729e-02, -6.00739777e-01,  6.40910566e-01, -2.60894541e-02,
        4.79512699e-02, -6.59608781e-01, -3.17332053e+00,  3.99474919e-01,
        9.78691131e-02, -8.34250152e-01,  2.64434088e-02, -3.71937960e-01,
        6.25147223e-01,  

In [23]:
threshold = 0.9606 # Adjust the threshold as needed
logs_anomal=[]
# Compare each log with the mean log centroid using cosine similarity
for i, log_embedding in enumerate(log_embeddings):
    if all_logs_centroid is not None:
        similarity_score = cosine_similarity([log_embedding], [all_logs_centroid])[0][0]
        # Compare similarity score with the threshold
        if similarity_score < threshold:
            print(f"Anomaly detected: Log {i} with score {similarity_score}")
            logs_anomal.append(i)

In [47]:
csv_anomaly_file = "HDFS_2k.log_structured-Anomaly.csv"  # Replace with the path to your CSV file
log_anomaly_data = pd.read_csv(csv_anomaly_file)


In [48]:
# Step 1: Tokenize and preprocess the log data
tokenized_anomaly_logs = []
attention_anomaly_masks = []
anomaly_lng=[]
for log_text in log_anomaly_data["EventTemplate"]:

    # Tokenize the log message
    tokens = tokenizer.tokenize(log_text)
    anomaly_lng.append(tokens)
    # Add special tokens and apply padding
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

    # Optionally, truncate or pad to a fixed length
    max_length = 100  # Set your desired maximum length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))

    tokenized_anomaly_logs.append(input_ids)

    # Create attention mask
    attn_mask = [1] * len(input_ids) + [0] * (max_length - len(input_ids))
    attention_anomaly_masks.append(attn_mask)


In [49]:
log_tensors = torch.tensor(tokenized_anomaly_logs)
attention_masks = torch.tensor(attention_anomaly_masks)

log_anomaly_embeddings = []
count=0

for log_tensor, attn_mask in zip(log_tensors, attention_masks):
    # Convert log tensor and attention mask to PyTorch
    count+=1
    log_tensor = log_tensor.unsqueeze(0)
    attn_mask = attn_mask.unsqueeze(0)

    # Pass the log tensor and attention mask through the BERT model to obtain embeddings
    with torch.no_grad():
        outputs = model(log_tensor, attention_mask=attn_mask)

    # Extract the embedding for [CLS] token (outputs[0][:, 0, :])
    log_embedding = torch.mean(outputs[0][:, 0, :], dim=0).numpy()
    log_anomaly_embeddings.append(log_embedding)
    print(count)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80


In [50]:
threshold = 0.9906 # Adjust the threshold as needed
logs_anomal=[]
# Compare each log with the mean log centroid using cosine similarity
for i, log_embedding in enumerate(log_anomaly_embeddings):
    if all_logs_centroid is not None:
        similarity_score = cosine_similarity([log_embedding], [all_logs_centroid])[0][0]
        # Compare similarity score with the threshold
        if similarity_score < threshold:
            print(f"Anomaly detected: Log {i} with score {similarity_score}")
            logs_anomal.append(i)

Anomaly detected: Log 0 with score 0.9883904457092285
Anomaly detected: Log 1 with score 0.9883904457092285
Anomaly detected: Log 2 with score 0.9883904457092285
Anomaly detected: Log 3 with score 0.9883904457092285
Anomaly detected: Log 4 with score 0.9883904457092285
Anomaly detected: Log 5 with score 0.9883904457092285
Anomaly detected: Log 6 with score 0.9883904457092285
Anomaly detected: Log 7 with score 0.9883904457092285
Anomaly detected: Log 8 with score 0.9883904457092285
Anomaly detected: Log 9 with score 0.9883904457092285
Anomaly detected: Log 10 with score 0.9883904457092285
Anomaly detected: Log 11 with score 0.9883904457092285
Anomaly detected: Log 12 with score 0.9883904457092285
Anomaly detected: Log 13 with score 0.9883904457092285
Anomaly detected: Log 14 with score 0.9883904457092285
Anomaly detected: Log 15 with score 0.9883904457092285
Anomaly detected: Log 16 with score 0.9883904457092285
Anomaly detected: Log 17 with score 0.9883904457092285
Anomaly detected: Lo

In [51]:
cosine_similarity([log_embeddings[0]], [all_logs_centroid])[0][0]

np.float32(0.97572625)

In [55]:
def getembd(log_text):

    # Tokenize the log message
    tokens = tokenizer.tokenize(log_text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = tokenizer.build_inputs_with_special_tokens(input_ids)

    # Optionally, truncate or pad to a fixed length
    max_length = 100  # Set your desired maximum length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))
    attn_mask = [1] * len(input_ids) + [0] * (max_length - len(input_ids))

    log_tensor = torch.tensor(input_ids)
    attention_mask = torch.tensor(attn_mask)

    log_tensor = log_tensor.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Pass the log tensor and attention mask through the BERT model to obtain embeddings
    with torch.no_grad():
        outputs = model(log_tensor, attention_mask=attention_mask)

    # Extract the embedding for [CLS] token (outputs[0][:, 0, :])
    log_embedding = torch.mean(outputs[0][:, 0, :], dim=0).numpy()
    return log_embedding


In [56]:
getembd("a")

array([-5.17588794e-01,  4.16581899e-01,  5.01730323e-01,  4.92995113e-01,
        3.37186530e-02, -5.76818585e-01,  4.31254119e-01, -7.45090544e-02,
        1.12897903e-02, -2.11184904e-01, -4.76766467e-01, -4.61972117e-01,
       -2.17913557e-02,  4.43377733e-01,  4.66679722e-01,  9.65156674e-01,
       -3.36830854e-01,  4.34409052e-01, -1.64808139e-01,  1.01009451e-01,
        4.53219682e-01,  2.26642072e-01,  8.44880223e-01,  1.44102484e-01,
       -6.01752222e-01,  2.45675445e-03,  6.27504662e-02, -8.78156960e-01,
       -3.95616144e-01,  1.40480727e-01, -6.43445611e-01,  3.62279534e-01,
       -5.11741899e-02, -3.44391555e-01,  7.27903247e-02, -2.13535398e-01,
       -4.60022300e-01, -2.02711642e-01,  3.00796986e-01,  4.44190443e-01,
        1.77162766e-01, -6.14302039e-01,  4.76288736e-01, -5.55296987e-02,
       -7.82990009e-02, -6.07043743e-01, -3.17913771e+00,  3.33119571e-01,
       -3.75548750e-03, -5.22429824e-01,  1.37171075e-01, -4.56670225e-01,
        7.91515350e-01, -

In [67]:
cosine_similarity([getembd("PacketResponder <*> for block blk_<*> terminating")], [getembd("PacketResponder <*> for block blk_<*> initiating")])[0][0]

np.float32(0.9966758)