<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/autoencoder4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the HDFS_1 log dataset archive from Zenodo and extract it into /content/.
# The archive is fetched into the current working directory by wget.
!wget 'https://zenodo.org/record/3227177/files/HDFS_1.tar.gz'
!tar -xzvf "/content/HDFS_1.tar.gz" -C "/content/"   #unzip the file

--2023-08-31 16:59:50--  https://zenodo.org/record/3227177/files/HDFS_1.tar.gz
Resolving zenodo.org (zenodo.org)... 188.185.124.72
Connecting to zenodo.org (zenodo.org)|188.185.124.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 161886385 (154M) [application/octet-stream]
Saving to: ‘HDFS_1.tar.gz’


2023-08-31 17:01:19 (1.75 MB/s) - ‘HDFS_1.tar.gz’ saved [161886385/161886385]

HDFS.log
anomaly_label.csv


In [3]:
import re
import string
import numpy as np

import pandas as pd

In [4]:
def find_blockid(text):
  """Return the first HDFS block id (``blk_<id>``) found in ``text``.

  Args:
    text: A string (typically one raw log line) to search.

  Returns:
    The matched substring, e.g. ``"blk_-1608999687919862906"``.

  Raises:
    ValueError: If ``text`` contains no block id. Without this check the
      failure would surface as an opaque ``AttributeError`` on ``None``.
  """
  match = re.search(r"blk_[-\w]+", text)
  if match is None:
    raise ValueError(f"no block id (blk_...) found in log line: {text!r}")
  return match.group()

In [9]:
# Group the raw log lines by the block id each line mentions.
# Lines are stored unmodified (including their trailing newline) and keep the
# order in which they appear in the file.
with open('/content/HDFS.log', "r") as file:

  logs = {}  # block id -> list of raw log lines that belong to that block

  for line in file:
    blockId = find_blockid(line)
    if blockId not in logs:
      logs[blockId] = []

    logs[blockId].append(line)


In [10]:
# One row per block: the block id and the list of raw log lines collected for it.
data_df = pd.DataFrame(
    data=list(logs.items()),
    columns=['BlockId', 'EventSequence'],
)

# Persist the grouped sequences without writing the row index.
data_df.to_csv("HDFS_sequence.csv", index=False)

In [11]:
# Preview the first rows: each row pairs a block id with its list of raw log lines.
data_df.head()

Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,[081109 203518 143 INFO dfs.DataNode$DataXceiv...
1,blk_7503483334202473044,[081109 203520 142 INFO dfs.DataNode$DataXceiv...
2,blk_-3544583377289625738,[081109 203521 145 INFO dfs.DataNode$DataXceiv...
3,blk_-9073992586687739851,[081109 203523 143 INFO dfs.DataNode$DataXceiv...
4,blk_7854771516489510256,[081109 203529 148 INFO dfs.DataNode$DataXceiv...


In [12]:
# Load the anomaly labels and take a quick look at them.
# The preview shows that labels are assigned per block id (BlockId), not per log line.

labels = pd.read_csv('/content/anomaly_label.csv')
labels.head()

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal


In [13]:
# Attach the per-block label to every sequence in data_df.
# A left join keeps every block of data_df; a block that is missing from
# `labels` ends up with NaN in the Label column.
# Any Label column left over from a previous run of this cell is dropped first,
# so re-running the cell does not produce Label_x / Label_y columns and break
# later lookups of data_df['Label'].
data_df = (
    data_df
    .drop(columns='Label', errors='ignore')
    .merge(labels, on='BlockId', how='left')
)
data_df.head(3)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1608999687919862906,[081109 203518 143 INFO dfs.DataNode$DataXceiv...,Normal
1,blk_7503483334202473044,[081109 203520 142 INFO dfs.DataNode$DataXceiv...,Normal
2,blk_-3544583377289625738,[081109 203521 145 INFO dfs.DataNode$DataXceiv...,Anomaly


In [14]:
# Split the labelled sequences into a normal subset and an abnormal subset.
hdfs_sequence_normal = data_df[data_df['Label'] == 'Normal']

# Rows whose Label is missing (NaN) are excluded here: a bare `!= 'Normal'`
# comparison is True for NaN and would silently count unlabelled blocks as
# anomalies.
hdfs_sequence_abnormal = data_df[data_df['Label'].notna() & (data_df['Label'] != 'Normal')]

# Report the size of the normal and abnormal subsets.
print("Normal Dataset Length:", len(hdfs_sequence_normal))
print("Abnormal Test Dataset Length:", len(hdfs_sequence_abnormal))

Normal Dataset Length: 558223
Abnormal Test Dataset Length: 16838


In [15]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the normal sequences for testing. A fixed random_state makes
# the shuffled split reproducible across kernel restarts.
train_hdfs_sequence_normal, test_hdfs_sequence_normal = train_test_split(
    hdfs_sequence_normal, test_size=0.2, random_state=42)

# Both subsets come from the normal sequences only, so label them as the
# train / test parts of the normal data.
print("Train Normal Dataset Length:", len(train_hdfs_sequence_normal))
print("Test Normal Dataset Length:", len(test_hdfs_sequence_normal))

Normal Dataset Length: 446578
Abnormal Test Dataset Length: 111645


In [16]:
# Preview a few rows of the normal-only training split.
train_hdfs_sequence_normal.head()

Unnamed: 0,BlockId,EventSequence,Label
353050,blk_-4153422275309731448,[081111 043113 19946 INFO dfs.DataNode$DataXce...,Normal
251396,blk_4942047441206983604,[081110 221005 28 INFO dfs.FSNamesystem: BLOCK...,Normal
84624,blk_-5975806018648702319,[081110 012619 33 INFO dfs.FSNamesystem: BLOCK...,Normal
346018,blk_-6428268436725554489,[081111 041909 19567 INFO dfs.DataNode$DataXce...,Normal
453015,blk_3520104078812274738,[081111 073057 23343 INFO dfs.DataNode$DataXce...,Normal


In [20]:
# Peek at a single raw log line: the first entry of the first training sequence.
# Both loops are bounded to one element, so nothing is printed if the frame or
# the sequence is empty.
for _, first_row in train_hdfs_sequence_normal.head(1).iterrows():
  for first_line in first_row['EventSequence'][:1]:
    print(first_line)

081111 043113 19946 INFO dfs.DataNode$DataXceiver: Receiving block blk_-4153422275309731448 src: /10.251.26.131:58476 dest: /10.251.26.131:50010



In [None]:
def structured(address):
  """Index the unique cleaned log messages of a log file and group them by block.

  Every line of the file is passed through ``clean``. Each distinct cleaned
  message receives a consecutive integer index (starting at 0) and is embedded
  exactly once with ``model.encode``. The collected embeddings are then reduced
  with ``PCA(n_components=0.9)`` (assuming scikit-learn's PCA, this keeps enough
  components to explain 90% of the variance -- confirm against the import).

  Relies on ``find_blockid``, ``clean``, ``model`` and ``PCA`` being defined in
  the notebook namespace before the function is called.

  Args:
    address: Path of the log file to read.

  Returns:
    A tuple ``(log2index, reduced_index2embed, blockId_logs)``:
      log2index: cleaned message -> integer index.
      reduced_index2embed: integer index -> reduced embedding of that message
        (empty when the file contains no lines).
      blockId_logs: block id -> list of message indices, in file order.
  """
  log2index = {}    # unique cleaned log -> its index
  index2embed = {}  # index -> sentence embedding of that unique cleaned log
  # Created locally so the function neither depends on nor mutates a global of
  # the same name (without this the function fails on a fresh kernel).
  blockId_logs = {}

  with open(address, "r") as file:
    for line in file:
      blockId = find_blockid(line)
      if blockId not in blockId_logs:
        blockId_logs[blockId] = []

      cleaned_line = clean(line)

      index = log2index.get(cleaned_line)
      if index is None:
        # First occurrence of this message: assign the next consecutive
        # index and compute its embedding once.
        index = len(log2index)
        log2index[cleaned_line] = index
        index2embed[index] = model.encode(cleaned_line)
      blockId_logs[blockId].append(index)

  if not index2embed:
    # Empty input: there is nothing to fit the PCA on.
    return log2index, {}, blockId_logs

  embeddings = np.array(list(index2embed.values()))
  pca = PCA(n_components=0.9)
  pca.fit(embeddings)
  reduced_embeddings = pca.transform(embeddings)

  # Rows of reduced_embeddings are in the same order as index2embed's keys.
  reduced_index2embed = dict(zip(index2embed.keys(), reduced_embeddings))

  return log2index, reduced_index2embed, blockId_logs