<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/autoencoder4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# upload HDFS dataset
!wget 'https://zenodo.org/record/3227177/files/HDFS_1.tar.gz'
!tar -xzvf "/content/HDFS_1.tar.gz" -C "/content/"   #unzip the file

--2023-08-31 18:27:20--  https://zenodo.org/record/3227177/files/HDFS_1.tar.gz
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 161886385 (154M) [application/octet-stream]
Saving to: ‘HDFS_1.tar.gz’


2023-08-31 18:31:54 (578 KB/s) - ‘HDFS_1.tar.gz’ saved [161886385/161886385]

HDFS.log
anomaly_label.csv


In [2]:
import re
import string
import numpy as np

import pandas as pd

In [3]:
def find_blockid(text):
  """Return the first HDFS block id found in a log line.

  Parameters
  ----------
  text: str, raw log line

  Returns
  -------
  str or None, the first substring matching ``blk_[-\\w]+``
  (e.g. ``blk_-1608999687919862906``), or None when the line
  contains no block id
  """
  match = re.search(r"blk_[-\w]+", text)
  # re.search returns None when nothing matches; calling .group() on it
  # would raise AttributeError, so report "no id" explicitly instead.
  if match is None:
    return None
  return match.group()

In [4]:
# Group the raw log lines by the block id they mention.
logs = {}  # block id -> list of raw log lines, in file order

with open('/content/HDFS.log', "r") as file:
  for line in file:
    blockId = find_blockid(line)
    if blockId is None:
      # Defensive: a line without a block id cannot be assigned to any block.
      continue
    # setdefault creates the per-block list on first sight of the id.
    logs.setdefault(blockId, []).append(line)


In [5]:
# One row per block: the block id and the list of its raw log lines.
block_rows = list(logs.items())
data_df = pd.DataFrame(block_rows, columns=['BlockId', 'EventSequence'])

# Persist the frame without the positional index column.
data_df.to_csv("HDFS_sequence.csv", index=False)

In [6]:
# Preview the first rows of the per-block frame (BlockId, EventSequence).
data_df.head()

Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,[081109 203518 143 INFO dfs.DataNode$DataXceiv...
1,blk_7503483334202473044,[081109 203520 142 INFO dfs.DataNode$DataXceiv...
2,blk_-3544583377289625738,[081109 203521 145 INFO dfs.DataNode$DataXceiv...
3,blk_-9073992586687739851,[081109 203523 143 INFO dfs.DataNode$DataXceiv...
4,blk_7854771516489510256,[081109 203529 148 INFO dfs.DataNode$DataXceiv...


In [7]:
# Take a quick look at the labels: there is one label per block id.

labels = pd.read_csv('/content/anomaly_label.csv')
labels.head()

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [8]:
# Attach each block's label to data_df.
# Any 'Label' column left over from a previous run of this cell is dropped
# first; otherwise a re-run would produce 'Label_x'/'Label_y' columns and the
# later data_df['Label'] lookups would fail.
# validate='many_to_one' raises if a BlockId appears more than once in
# `labels`, which would otherwise silently duplicate rows.
data_df = (
    data_df
    .drop(columns='Label', errors='ignore')
    .merge(labels, on='BlockId', how='left', validate='many_to_one')
)
data_df.head(3)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1608999687919862906,[081109 203518 143 INFO dfs.DataNode$DataXceiv...,Normal
1,blk_7503483334202473044,[081109 203520 142 INFO dfs.DataNode$DataXceiv...,Normal
2,blk_-3544583377289625738,[081109 203521 145 INFO dfs.DataNode$DataXceiv...,Anomaly


In [9]:
# Separate the blocks labelled 'Normal' from all other blocks.
is_normal = data_df['Label'] == 'Normal'

hdfs_sequence_normal = data_df[is_normal]
hdfs_sequence_abnormal = data_df[~is_normal]  # every row whose label is not 'Normal'

# Report how many blocks fall on each side.
print("Normal Dataset Length:", len(hdfs_sequence_normal))
print("Abnormal Test Dataset Length:", len(hdfs_sequence_abnormal))

Normal Dataset Length: 558223
Abnormal Test Dataset Length: 16838


In [10]:
# Positional split of the normal blocks: the first `train_ratio` share of the
# rows becomes the training set, the remainder the normal test set.
# Rows are not shuffled.
train_ratio = 0.8
total_rows = len(hdfs_sequence_normal)
train_rows = int(total_rows * train_ratio)

train_hdfs_sequence_normal, test_hdfs_sequence_normal = (
    hdfs_sequence_normal.iloc[:train_rows],
    hdfs_sequence_normal.iloc[train_rows:],
)

In [11]:
# Preview the first rows of the normal training split.
train_hdfs_sequence_normal.head()

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1608999687919862906,[081109 203518 143 INFO dfs.DataNode$DataXceiv...,Normal
1,blk_7503483334202473044,[081109 203520 142 INFO dfs.DataNode$DataXceiv...,Normal
3,blk_-9073992586687739851,[081109 203523 143 INFO dfs.DataNode$DataXceiv...,Normal
4,blk_7854771516489510256,[081109 203529 148 INFO dfs.DataNode$DataXceiv...,Normal
5,blk_1717858812220360316,[081109 203530 145 INFO dfs.DataNode$DataXceiv...,Normal


In [14]:
!pip install -U sentence-transformers



In [15]:
from sentence_transformers import SentenceTransformer, util
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model; it is used
# below to embed cleaned log messages.
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [16]:
from sklearn.decomposition import PCA

In [17]:
# Patterns and the translation table are built once instead of on every call.
# Raw strings avoid the invalid-escape warnings that '\]', '\=', '\,' and '\;'
# trigger in ordinary string literals.
_DELIMITERS_RE = re.compile(r'[\[\]()=,;]')
_UPPER_RUN_RE = re.compile(r'([A-Z]+)')
_CAPITALIZED_RE = re.compile(r'([A-Z][a-z]+)')
_DIGIT_RE = re.compile(r'\d')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean(s):
    """ Preprocess log message
    Parameters
    ----------
    s: str, raw log message

    Returns
    -------
    str, preprocessed log message without number tokens and special characters;
    all remaining words are lower-cased and separated by single spaces
    """
    # Brackets, parentheses, '=', ',' and ';' act as word separators.
    s = _DELIMITERS_RE.sub(' ', s)
    # Lower-case all-caps words (e.g. "INFO") so the camel-case split below
    # does not break them apart.
    s = " ".join(word.lower() if word.isupper() else word for word in s.split())
    # Split camel-case identifiers by inserting a space before each run of
    # capitals, then before each capitalised word.
    s = _CAPITALIZED_RE.sub(r' \1', _UPPER_RUN_RE.sub(r' \1', s))
    # Drop every token that contains a digit (timestamps, block ids, IPs, ports).
    s = " ".join(word for word in s.split() if not _DIGIT_RE.search(word))
    # Remove the remaining punctuation characters.
    s = s.translate(_PUNCTUATION_TABLE)
    return " ".join(word.lower() for word in s.split())

In [12]:
# Sanity check: print the first raw log line of the first training block.
for event_sequence in train_hdfs_sequence_normal['EventSequence'].iloc[:1]:
  for line in event_sequence[:1]:
    print(line)

081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010



In [49]:
def structured(df, model, log2index=None, index2embed=None):
    """Replace every log line of every block by the index of its cleaned message.

    Parameters
    ----------
    df: DataFrame with a 'BlockId' column and an 'EventSequence' column that
        holds an iterable of raw log lines per row
    model: object whose ``encode(text)`` returns the embedding of a cleaned
        log message
    log2index: dict or None, cleaned message -> integer index. When given it is
        extended IN PLACE with newly seen messages; when omitted a fresh dict
        is created for this call
    index2embed: dict or None, integer index -> embedding. Handled like
        ``log2index``

    Returns
    -------
    tuple ``(log2index, index2embed, blockId_logs)`` where ``blockId_logs``
    maps each block id of ``df`` to the list of message indices of its lines
    """
    # Fresh containers per call: a ``{}`` default would be created once and
    # silently shared (and grown) across all calls that rely on the default.
    if log2index is None:
        log2index = {}
    if index2embed is None:
        index2embed = {}

    blockId_logs = {}   # Gathers logs (indices) belonging to specific block IDs
    # New messages continue the numbering; this assumes the existing indices
    # are exactly 0 .. len(log2index) - 1.
    index = len(log2index)

    # Iterating the two columns directly avoids building a Series per row,
    # which is what makes iterrows() slow on large frames.
    for blockId, event_sequence in zip(df['BlockId'], df['EventSequence']):
        indices = []
        blockId_logs[blockId] = indices

        for line in event_sequence:
            cleaned_line = clean(line)

            if cleaned_line not in log2index:
                # Encode before registering so a failing encode() cannot leave
                # a message in log2index without an embedding.
                embedding = model.encode(cleaned_line)
                log2index[cleaned_line] = index
                index2embed[index] = embedding
                index += 1

            indices.append(log2index[cleaned_line])

    return log2index, index2embed, blockId_logs


In [28]:
# Build the message->index and index->embedding mappings from the normal
# training blocks, plus the per-block index sequences.
log2index_train, index2embed_train, blockId_logs_train = structured(train_hdfs_sequence_normal,model)

In [60]:
# Sizes of the training outputs: blocks, distinct cleaned messages, embeddings.
for mapping in (blockId_logs_train, log2index_train, index2embed_train):
  print(len(mapping))

446578
52
52


In [61]:
# Stack the training embeddings into one array (one row per message index).
train_embeddings = np.array(list(index2embed_train.values()))

# A float n_components keeps as many components as needed to explain
# 90% of the variance.
pca = PCA(n_components=0.9)
pca.fit(train_embeddings)

In [54]:
# normal test data
# structured() extends the mappings it is given in place. Passing copies keeps
# log2index_train / index2embed_train limited to training messages, so
# re-running the PCA fit later cannot pick up test-set embeddings.
log2index_ntest, index2embed_ntest, blockId_logs_ntest = structured(
    test_hdfs_sequence_normal,
    model,
    dict(log2index_train),
    dict(index2embed_train),
)

print(len(log2index_ntest))
print(len(index2embed_ntest))
print(len(blockId_logs_ntest))

446578
52
52


In [57]:
# abnormal test data
# structured() extends the mappings it is given in place. Passing copies keeps
# log2index_ntest / index2embed_ntest free of messages that only occur in
# abnormal blocks.
log2index_atest, index2embed_atest, blockId_logs_atest = structured(
    hdfs_sequence_abnormal,
    model,
    dict(log2index_ntest),
    dict(index2embed_ntest),
)

print(len(log2index_atest))
print(len(index2embed_atest))
print(len(blockId_logs_atest))

52
52
16838


In [62]:
# Project all collected embeddings with the fitted PCA.
embeddings = np.array(list(index2embed_atest.values()))
reduced_embeddings = pca.transform(embeddings)

# Re-key the reduced rows by message index; dict order matches the row order
# of `embeddings`, so keys and rows pair up positionally.
reduced_index2embed = dict(zip(index2embed_atest.keys(), reduced_embeddings))

In [64]:
def _pairs_to_csv(mapping, columns, path):
  """Write the (key, value) pairs of `mapping` as a two-column CSV and return the frame."""
  frame = pd.DataFrame(list(mapping.items()), columns=columns)
  frame.to_csv(path, index=False)
  return frame


# NOTE(review): this rebinds `data_df`, which earlier cells use for the labelled
# per-block frame; re-running those cells afterwards needs a fresh rebuild.
data_df = _pairs_to_csv(blockId_logs_atest, ['BlockId', 'EventSequence'], "blockId_logs.csv")

df = _pairs_to_csv(log2index_atest, ['cleaned_log', 'index'], "log2index.csv")

df = _pairs_to_csv(reduced_index2embed, ['index', 'embedding'], "reduced_index2embed.csv")