<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/LSTM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'

--2023-07-19 22:12:58--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 257875 (252K) [text/plain]
Saving to: ‘hdfs_train.5’


2023-07-19 22:12:59 (59.5 MB/s) - ‘hdfs_train.5’ saved [257875/257875]

--2023-07-19 22:12:59--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29284282 (28M) [text/plain]
Saving to: ‘hdfs_test_normal.5’


2023-07-19 22:13:01 (293 MB/s

In [4]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
n_templates = 28 #total number of unique templates

def template_to_tensor(template):
    tensor = [0] * n_templates
    if template != -1 :    # we put template= -1 for session less than windows_size in empty part and it will be all zero
      tensor[template] = 1
    return tensor

In [6]:
window_size = 10
inputs = []
outputs = []
row = '1 2 3 4 '
line = [int(i) - 1 for i in row.strip().split()] # we substract by one from templates index for starting from zero
line = line + [-1] * (window_size + 1 - len(line)) #if the length of the line is less than windows size, it covers by new defined template 28
for i in range(len(line) - window_size):
    inputs.append(line[i:i + window_size])
    outputs.append(line[i + window_size])

print(inputs)
print(outputs)

[[0, 1, 2, 3, -1, -1, -1, -1, -1, -1]]
[-1]


In [7]:
window_size = 10
inputs = []
outputs = []
row = '1 2 3 4 '
line = [int(i) - 1 for i in row.strip().split()] # we substract by one from templates index for starting from zero
line = line + [-1] * (window_size + 1 - len(line)) #if the length of the line is less than windows size, it covers by new defined template 28
line = [template_to_tensor(i) for i in line]
for i in range(len(line) - window_size):
    inputs.append(line[i:i + window_size])
    outputs.append(line[i + window_size])

print(inputs)
print(outputs)

[[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [8]:
name = 'hdfs_train'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
        for row in f:
            num_sessions += 1
            line = [int(i) - 1 for i in row.strip().split()] # we substract by one from templates index for starting from zero
            if len(line)<11:
              print(line)
            line = line + [-1] * (window_size + 1 - len(line)) #if the length of the line is less than windows size, it covers by new defined template 28
            line = [template_to_tensor(i) for i in line]
            for i in range(len(line) - window_size):
                inputs.append(line[i:i + window_size])
                outputs.append(line[i + window_size])

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs,dtype=torch.float ))

Number of sessions(hdfs_train): 4855
Number of seqs(hdfs_train): 46575


In [9]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))  # out.shape : [batch_size, sequence_length, hidden_size]
        out = self.fc(out[:, -1, :]) #The : before , -1, : indicates that we want to include all elements along the first dimension (batch dimension). -1 represents the index of the last element along the second dimension (sequence length). : after , -1 indicates that we want to include all elements along the third dimension (hidden size)
        return out

In [10]:
input_size = 28 # means each input has one feature(template's code)
num_layers = 2
hidden_size = 64
num_classes = 28
batch_size = 2048
num_epochs = 300
model_dir = 'model'

In [11]:
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [12]:
# Train the model
start_time = time.time()
total_step = len(dataloader)
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    train_loss = 0
    for step, (seq, label) in enumerate(dataloader):
        # Forward pass
        seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')

Epoch [1/300], train_loss: 3.0240
Epoch [2/300], train_loss: 1.9071
Epoch [3/300], train_loss: 1.8107
Epoch [4/300], train_loss: 1.7301
Epoch [5/300], train_loss: 1.5251
Epoch [6/300], train_loss: 1.2932
Epoch [7/300], train_loss: 1.0563
Epoch [8/300], train_loss: 0.8862
Epoch [9/300], train_loss: 0.8033
Epoch [10/300], train_loss: 0.7578
Epoch [11/300], train_loss: 0.7210
Epoch [12/300], train_loss: 0.6860
Epoch [13/300], train_loss: 0.6531
Epoch [14/300], train_loss: 0.6242
Epoch [15/300], train_loss: 0.6011
Epoch [16/300], train_loss: 0.5822
Epoch [17/300], train_loss: 0.5634
Epoch [18/300], train_loss: 0.5440
Epoch [19/300], train_loss: 0.5221
Epoch [20/300], train_loss: 0.5019
Epoch [21/300], train_loss: 0.4850
Epoch [22/300], train_loss: 0.4703
Epoch [23/300], train_loss: 0.4567
Epoch [24/300], train_loss: 0.4426
Epoch [25/300], train_loss: 0.4236
Epoch [26/300], train_loss: 0.4088
Epoch [27/300], train_loss: 0.3995
Epoch [28/300], train_loss: 0.3877
Epoch [29/300], train_loss: 0

In [13]:
num_candidates = 9 # on paper is g , top-g(here top 9) probabilities to appear next are considered normal


In [14]:
def generate(name):
    # If you what to replicate the DeepLog paper results(Actually, I have a better result than DeepLog paper results),
    # you should use the 'list' not 'set' to obtain the full dataset, I use 'set' just for test and acceleration.
    hdfs = set()
    # hdfs = []
    with open('/content/' + name, 'r') as f:
        for row in f:
            line = [int(i) - 1 for i in row.strip().split()]
            line = line + [-1] * (window_size + 1 - len(line)) #if the length of the line is less than windows size, it covers by -1
            hdfs.add(tuple(line))
            # hdfs.append(tuple(line))
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs

In [15]:
test_normal_loader = generate('hdfs_test_normal')
test_abnormal_loader = generate('hdfs_test_abnormal')

Number of sessions(hdfs_test_normal): 14177
Number of sessions(hdfs_test_abnormal): 4123


In [16]:
# Test the model
TP = 0
FP = 0

fp_set = set()
fn_set = set()  # Initialize the FN list

start_time = time.time()
with torch.no_grad():
    for line in test_normal_loader:
        for i in range(len(line) - window_size):
            session = line[i:i + window_size]
            seq = [template_to_tensor(temp) for temp in session]
            label = template_to_tensor(line[i + window_size])
            seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
            label = torch.tensor(label).view(-1).to(device)
            output = model(seq)
            predicted = torch.argsort(output, 1)[0][-num_candidates:]
            if torch.argmax(label) not in predicted:
                FP += 1
                fp_set.add(line)
                break   #with just one wrong prediction in a line , we assume , abnormal
with torch.no_grad():
    for line in test_abnormal_loader:
        for i in range(len(line) - window_size):
            session = line[i:i + window_size]
            seq = [template_to_tensor(temp) for temp in session]
            label = template_to_tensor(line[i + window_size])
            seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
            label = torch.tensor(label).view(-1).to(device)
            output = model(seq)
            predicted = torch.argsort(output, 1)[0][-num_candidates:]
            if torch.argmax(label) not in predicted:
                TP += 1
                break
            else:
                fn_set.add(line)  # Append the line to the FN list when a false negative occurs

elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
# Compute precision, recall and F1-measure
FN = len(test_abnormal_loader) - TP
P = 100 * TP / (TP + FP)
R = 100 * TP / (TP + FN)
F1 = 2 * P * R / (P + R)
print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
print('Finished Predicting')

elapsed_time: 181.796s
false positive (FP): 372, false negative (FN): 644, Precision: 90.340%, Recall: 84.380%, F1-measure: 87.259%
Finished Predicting


In [17]:
len(fp_set)

372

In [18]:
len(fn_set)

3964

In [39]:
fn_list = []
with torch.no_grad():
    for line in test_abnormal_loader:
      for i in range(len(line) - window_size):
          session = line[i:i + window_size]
          seq = [template_to_tensor(temp) for temp in session]
          label = template_to_tensor(line[i + window_size])
          seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
          label = torch.tensor(label).view(-1).to(device)
          output = model(seq)
          predicted = torch.argsort(output, 1)[0][-num_candidates:]
          if torch.argmax(label) not in predicted:
              TP += 1
              break
          else:
              if line[-11:-1] == session : #it means we came to the last sliding windows of the line(include line[-2], and line[-1] is our last label) without predicting abnormal
                fn_list.append(line)




In [40]:
len(fn_list)

649

In [42]:
k=0
for line in fp_list:
  k+=1
  for i in range(len(line) - window_size):
    session = line[i:i + window_size]
    seq = [template_to_tensor(temp) for temp in session]
    label = template_to_tensor(line[i + window_size])
    seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
    label = torch.tensor(label).view(-1).to(device)
    output = model(seq)
    predicted = torch.argsort(output, 1)[0][-num_candidates:]
    if torch.argmax(label) not in predicted:
      #print(seq)
      print(torch.argmax(label))
      print(output)
      print(torch.argsort(output, 1)[0][-15:])
      break
  if k==5:
    break

NameError: ignored

In [44]:
k=0
for line in fn_list:
  k+=1
  print(line)
  for i in range(len(line) - window_size):
    session = line[i:i + window_size]
    seq = [template_to_tensor(temp) for temp in session]
    label = template_to_tensor(line[i + window_size])
    seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
    label = torch.tensor(label).view(-1).to(device)
    output = model(seq)
    predicted = torch.argsort(output, 1)[0][-num_candidates:]
    if torch.argmax(label) in predicted:
      #print(seq)
      print(torch.argmax(label))
      print(output)
      print(torch.argsort(output, 1)[0][-15:])
      break
  if k==15:
    break

(4, 4, 21, 4, 10, 8, 25, 25, 10, 8, 10, 8, 25, 2, 3, 2, 24, 17, 4, 5, 15, 25, 25, 20, 24, 17, 4, 5, 15, 25, 25, 20, 2, 3, 2, 22, 22, 22, 20, 20, 20)
tensor(10, device='cuda:0')
tensor([[-3.9015, -0.0990,  1.5863, -3.8834, -1.0817, -3.5460, -3.1518, -3.9147,
          3.9864, -4.2262, 11.5035, -3.8253, -3.4973, -4.0609, -3.8128, -0.8991,
         -3.7993, -1.5726, -4.8105, -3.3805, -4.0684, -0.7707,  0.7654, -4.1127,
         -0.4764,  6.4379, -3.2284, -3.7026]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
tensor([12, 19, 26,  6, 17,  4, 15, 21, 24,  1, 22,  2,  8, 25, 10],
       device='cuda:0')
(21, 4, 4, 4, 10, 8, 10, 8, 25, 25, 10, 8, 24, 4, 17, 25, 5, 15, 25, 20, 22, 22, 22, 20, 20, 20)
tensor(10, device='cuda:0')
tensor([[-4.1400,  0.0220,  1.4041, -3.6026, -2.8916, -4.3265, -3.4762, -4.3217,
          3.7456, -4.2130,  9.1355, -4.0597, -3.7541, -4.0977, -4.0241, -2.0453,
         -3.9458, -1.4266, -4.7337, -3.5215, -4.1359, -0.4249,  1.4820, -4.0578,
         -1.1853,  8.2