<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/preprocessing_BGL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=46dc107eec261aa09c24506bb954cf0c92076a89c6c55f96da19311419dc46f3
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tr

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [1]:
!wget 'https://zenodo.org/record/8196385/files/BGL.zip'
!unzip "/content/BGL.zip" -d "/content/"

--2023-09-10 15:10:50--  https://zenodo.org/record/8196385/files/BGL.zip
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57489019 (55M) [application/octet-stream]
Saving to: ‘BGL.zip’


2023-09-10 15:12:39 (524 KB/s) - ‘BGL.zip’ saved [57489019/57489019]

Archive:  /content/BGL.zip
  inflating: /content/BGL.log        
  inflating: /content/README.md      


In [21]:
import random

import pandas as pd
import numpy as np
from collections import OrderedDict
import re

from sklearn.utils import shuffle
import pickle
import string
import time
from datetime import datetime
import json



In [7]:
def clean(s):
    """Normalize a raw log message into lowercase, space-separated word tokens.

    The steps run in this order:
      1. the delimiters ``] [ ) ( = , ;`` are replaced by spaces;
      2. tokens written entirely in upper case are lowercased;
      3. a space is inserted before every run of capitals and before every
         capitalized word, which splits camelCase identifiers;
      4. every whitespace-separated token containing a digit is dropped;
      5. all remaining ``string.punctuation`` characters are deleted (no space
         is inserted, so ``machine-check`` becomes ``machinecheck``);
      6. the result is lowercased and whitespace is collapsed to single spaces.

    Parameters
    ----------
    s: str, raw log message

    Returns
    -------
    str, preprocessed log message without number tokens and special characters
    (may be the empty string when every token is dropped)
    """
    # Raw strings: '\]' and friends are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    s = re.sub(r'\]|\[|\)|\(|\=|\,|\;', ' ', s)
    s = " ".join([word.lower() if word.isupper() else word for word in s.strip().split()])
    # Inner substitution separates runs of capitals, outer one separates
    # capitalized words; together they break up camelCase tokens.
    s = re.sub(r'([A-Z][a-z]+)', r' \1', re.sub(r'([A-Z]+)', r' \1', s))
    s = " ".join([word for word in s.split() if not re.search(r'\d', word)])
    s = s.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word.lower() for word in s.split())





def load_supercomputers(log_file, train_ratio=0.8, windows_size=20, step_size=5):
    """Turn a raw supercomputer log into sliding windows of template ids.

    Every line is expected to start with a label token followed by a space; a
    line whose first character is not "-" marks its window as anomalous. The
    text after the first space is lowercased, passed through ``clean`` and
    mapped to an integer id (ids are handed out in order of first appearance).

    Parameters
    ----------
    log_file: str, the file path of raw log (extension: .log).
    train_ratio: float, fraction of the *normal* windows (in file order) used for training.
    windows_size: int, the window size for sliding window
    step_size: int, the step size for sliding window. if step_size is equal to window_size then fixed window is applied.

    Returns
    -------
    x_train: list of windows (each a list of int ids), the first ``train_ratio`` share of normal windows
    x_norm_test: list of windows, the remaining normal windows
    x_abnorm: list of windows containing at least one anomalous line
    E: dict mapping each cleaned message to its int id

    Raises
    ------
    ValueError: if ``windows_size`` or ``step_size`` is smaller than 1.
    """
    # step_size == 0 would never advance the window and loop forever.
    if windows_size < 1 or step_size < 1:
        raise ValueError("windows_size and step_size must be positive integers")

    print("Loading", log_file)

    with open(log_file, mode="r", encoding='utf8') as f:
        logs = [line.strip() for line in f]
    E = {}

    print("Loaded", len(logs), "lines!")
    x_norm, x_abnorm = [], []

    # Overlapping windows revisit the same line several times; remember the id
    # of each line so that it is cleaned only once.
    line_ids = [None] * len(logs)

    # Start of the last window that still fits completely. The comparison is
    # inclusive so that this final window is not dropped.
    last_start = len(logs) - windows_size
    i = 0
    windows_done = 0
    while i <= last_start:
        windows_done += 1
        if windows_done % 1000 == 0:
            print("\rLoading {0:.2f}% - {1} unique logs".format(i * 100 / len(logs), len(E)), end="")

        seq = []
        label = 0
        for j in range(i, i + windows_size):
            line = logs[j]
            # A blank line carries no label token: it must not crash the loader
            # and it does not flag the window as anomalous.
            if line and line[0] != "-":
                label = 1
            event_id = line_ids[j]
            if event_id is None:
                # remove label from log messages
                content = line[line.find(' ') + 1:]
                content = clean(content.lower())
                # ids are contiguous, so len(E) is the next free id
                event_id = E.setdefault(content, len(E))
                line_ids[j] = event_id
            seq.append(event_id)
        if label == 0:
            x_norm.append(seq)
        else:
            x_abnorm.append(seq)
        i = i + step_size
    print("\nlast index:", i)

    num_train = int(train_ratio * len(x_norm))
    x_train = x_norm[0:num_train]
    x_norm_test = x_norm[num_train:]

    num_train = len(x_train)
    num_norm_test = len(x_norm_test)
    num_abnorm_test = len(x_abnorm)
    num_total_norm = num_train + num_norm_test
    num_total = num_train + num_norm_test + num_abnorm_test

    print('Total: {} instances, {} anomaly, {} normal' \
          .format(num_total, num_abnorm_test, num_total_norm))
    print('Train: {} instances, {} anomaly, {} normal' \
          .format(num_train, 0, num_train))
    print('Test: {} instances, {} anomaly, {} normal\n' \
          .format(num_norm_test + num_abnorm_test, num_abnorm_test, num_norm_test))

    return x_train, x_norm_test, x_abnorm, E




In [8]:
# Raw BGL log extracted by the download cell.
log_file = "/content/BGL.log"

# Windows of 20 lines advanced 5 lines at a time; 80% of the normal windows
# go to training, everything else (normal remainder + anomalous) to testing.
x_train, x_norm_test, x_abnorm, E = load_supercomputers(
    log_file,
    train_ratio=0.8,
    windows_size=20,
    step_size=5,
)

Loading /content/BGL.log
Loaded 4747963 lines!
Loading 99.94% - 735 unique logs
last index: 4747945
Total: 949589 instances, 81033 anomaly, 868556 normal
Train: 694844 instances, 0 anomaly, 694844 normal
Test: 254745 instances, 81033 anomaly, 173712 normal



In [20]:
# Encode every unique cleaned message. E keeps insertion order and ids were
# handed out in that same order, so row k belongs to the message with id k.
unique_messages = list(E)
embeddings = model.encode(unique_messages)
embeddings.shape

(736, 384)

In [28]:
# Save the BGL preprocessing results under /content:
#   log2index           one cleaned log message per line (line k <-> id k)
#   X_train_index       training windows, space-separated ids, one window per line
#   Xnorm_test_index    held-out normal windows, same format
#   Xabnorm_test_index  anomalous windows, same format
#   index2embed         one JSON array per line (row k of `embeddings`)
def _write_index_sequences(path, sequences):
    """Write each window of ids as one space-separated line of `path`."""
    with open(path, 'w', encoding='utf8') as out:
        for window in sequences:
            out.write(' '.join(str(token) for token in window) + '\n')


# The handles get their own names so the `log_file` path variable set in the
# loading cell is not overwritten with a closed file object.
with open('/content/log2index', 'w', encoding='utf8') as vocab_file:
    for template in E:
        vocab_file.write(template + '\n')

_write_index_sequences('/content/X_train_index', x_train)
_write_index_sequences('/content/Xnorm_test_index', x_norm_test)
_write_index_sequences('/content/Xabnorm_test_index', x_abnorm)

# Each embedding row is stored as its own JSON array, one array per line.
with open('/content/index2embed', 'w', encoding='utf8') as embed_file:
    for vector in embeddings:
        json.dump(vector.tolist(), embed_file)
        embed_file.write('\n')  # Add a newline to separate arrays