<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/autoencoder6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# The pre-parsed log files are loaded from Google Drive because parsing the raw
# logs inside this notebook costs a lot of time and RAM.

# Mount Google Drive so the stored datasets can be read (and results saved).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the pre-parsed files (train / normal-test / abnormal-test sessions,
# log2index and the reduced embeddings) from Drive into /content/.
!cp '/content/drive/MyDrive/logs_train' '/content/'
!cp '/content/drive/MyDrive/logs_ntest' '/content/'
!cp '/content/drive/MyDrive/logs_atest' '/content/'
!cp '/content/drive/MyDrive/log2index' '/content/'
!cp '/content/drive/MyDrive/reduced_embeddings' '/content/'


In [None]:
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np


# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the device that was selected above.
device

device(type='cuda')

In [None]:
# Read the reduced embedding table from JSON; it is indexed by log index below.
with open('/content/reduced_embeddings', 'r') as embeddings_file:
    embeddings = json.load(embeddings_file)

In [None]:
# Length of a single embedding vector.
len(embeddings[0])


10

In [None]:

# Build the training set: every session (one line of the file) is cut into
# sliding windows of `window_size` consecutive embedding vectors.
name = 'logs_train'
window_size = 10
num_sessions = 0
inputs = []

with open('/content/' + name, 'r') as session_file:
    for session_row in session_file:
        num_sessions += 1
        # Replace each log index of the session by its embedding vector.
        vectors = [embeddings[int(token)] for token in session_row.strip().split()]
        # Window starts run over range(len - window_size), so a session that is
        # not longer than window_size contributes no window.
        inputs.extend(
            vectors[start:start + window_size]
            for start in range(len(vectors) - window_size)
        )

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
# One float tensor holding all windows; the autoencoder needs no separate labels.
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float))

Number of sessions(logs_train): 446578
Number of seqs(logs_train): 4704000


In [None]:
# LSTM autoencoder (bidirectional encoder, unidirectional decoder) with dropout
class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder that reconstructs a window of vectors.

    Args:
        input_dim: size of each vector in the input sequence.
        hidden_dim: hidden size of the encoder LSTM, per direction.
        num_layers: number of stacked LSTM layers in both encoder and decoder.
        dropout_prob: dropout probability passed to both LSTMs.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout_prob):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout_prob)
        # The encoder is bidirectional, so the decoder receives 2 * hidden_dim
        # features per step and maps them back to input_dim.
        self.decoder = nn.LSTM(hidden_dim * 2, input_dim, num_layers, batch_first=True, dropout=dropout_prob)

    def forward(self, x):
        """Encode `x` into one vector and decode it back to a sequence.

        Args:
            x: tensor laid out as (batch, seq_len, input_dim) (batch_first=True).

        Returns:
            Tensor with the same (batch, seq_len, input_dim) layout as `x`.
        """
        # Take the length from the input instead of hard-coding 10, so the
        # model also works for other window sizes.
        seq_len = x.size(1)
        encoded, _ = self.encoder(x)
        # Keep only the encoder output at the last time step: (batch, 1, 2 * hidden_dim).
        # NOTE(review): for a bidirectional LSTM the backward half of this vector
        # has only seen the final element; confirm this summary is intended.
        encoded = encoded[:, -1:, :]
        # Repeat the summary vector once per time step as decoder input.
        input_decode = torch.tile(encoded, (1, seq_len, 1))
        decoded, _ = self.decoder(input_decode)
        return decoded

In [None]:
# Instantiate the autoencoder and move it to the selected device.
model = LSTMAutoencoder(
    input_dim=10,
    hidden_dim=32,
    num_layers=2,
    dropout_prob=0.2,
).to(device)
model

LSTMAutoencoder(
  (encoder): LSTM(10, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (decoder): LSTM(64, 10, num_layers=2, batch_first=True, dropout=0.2)
)

In [None]:
# Smoke test: push one random window through the model and show the output shape.
data = torch.randn(1,10, 10)
model(data.to(device)).shape

torch.Size([1, 10, 10])

In [None]:
# Count every scalar entry across all parameter tensors of the model.
num_params = 0
for weight in model.parameters():
    num_params += weight.numel()
print(f"Number of Parameters: {num_params}")

Number of Parameters: 40272


In [None]:
# Training loader. Shuffle every epoch: `dataset` stores the windows in file
# order, so consecutive windows come from the same session and overlap heavily;
# without shuffling each mini-batch would consist of strongly correlated samples.
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

# Loss: mean squared reconstruction error (the optimizer is created further below).
criterion = nn.MSELoss()

In [None]:
len(dataloader) # number of mini-batches per epoch (depends on the batch size)

18375

In [None]:
# Look at the first mini-batch only, to confirm its shape.
for batch in dataloader:
    print(batch[0].shape)
    break

torch.Size([256, 10, 10])


In [None]:
def adjust_learning_rate(optimizer, epoch, lr_step=(15,25,35,45), lr_decay_ratio=0.5):
    """Apply warm-up and step decay to the learning rate of every parameter group.

    The update is cumulative and done in place, so the function is meant to be
    called exactly once per epoch, in order, starting at epoch 0:

    * epoch 0 divides the learning rate by 16 (start of warm-up);
    * epochs 1-4 double it each time, so the base rate is restored at epoch 4;
    * every epoch listed in `lr_step` multiplies it by `lr_decay_ratio`.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts with an 'lr' key.
        epoch: zero-based epoch index.
        lr_step: epochs at which the learning rate is decayed.
        lr_decay_ratio: multiplicative decay factor applied at those epochs.

    Returns:
        The same optimizer object, modified in place.
    """
    # Schedule all parameter groups, not only the first one, so optimizers
    # built with several groups do not keep an unscheduled learning rate.
    for group in optimizer.param_groups:
        if epoch == 0:
            group['lr'] /= 16
        elif epoch in (1, 2, 3, 4):  # warm-up: back to the base rate after epoch 4
            group['lr'] *= 2
        if epoch in lr_step:  # late in training: take shorter steps
            group['lr'] *= lr_decay_ratio
    return optimizer

# Adam optimizer; lr=0.001 is the base rate that adjust_learning_rate rescales during training.
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))

In [None]:
num_epochs = 50
input_size = 10 # embedding size

# Train the model
# evaluation() puts the model into eval mode; switch back explicitly so dropout
# is active whenever this cell is run (or re-run) after an evaluation.
model.train()
# NOTE(review): adjust_learning_rate changes the optimizer cumulatively, so
# re-running this cell without recreating the optimizer continues from the
# already-decayed learning rate.
start_time = time.time()
total_step = len(dataloader)
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    optimizer = adjust_learning_rate(optimizer, epoch)
    train_loss = 0
    for batch in dataloader:
        # Forward pass: the batch is its own reconstruction target.
        seq = batch[0].view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, seq)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the mean batch loss of this epoch.
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')


Epoch [1/50], train_loss: 0.0152
Epoch [2/50], train_loss: 0.0081
Epoch [3/50], train_loss: 0.0051
Epoch [4/50], train_loss: 0.0037
Epoch [5/50], train_loss: 0.0029
Epoch [6/50], train_loss: 0.0026
Epoch [7/50], train_loss: 0.0023
Epoch [8/50], train_loss: 0.0020
Epoch [9/50], train_loss: 0.0018
Epoch [10/50], train_loss: 0.0017
Epoch [11/50], train_loss: 0.0016
Epoch [12/50], train_loss: 0.0016
Epoch [13/50], train_loss: 0.0015
Epoch [14/50], train_loss: 0.0015
Epoch [15/50], train_loss: 0.0015
Epoch [16/50], train_loss: 0.0015
Epoch [17/50], train_loss: 0.0015
Epoch [18/50], train_loss: 0.0015
Epoch [19/50], train_loss: 0.0014
Epoch [20/50], train_loss: 0.0014
Epoch [21/50], train_loss: 0.0014
Epoch [22/50], train_loss: 0.0014
Epoch [23/50], train_loss: 0.0014
Epoch [24/50], train_loss: 0.0014
Epoch [25/50], train_loss: 0.0014
Epoch [26/50], train_loss: 0.0015
Epoch [27/50], train_loss: 0.0014
Epoch [28/50], train_loss: 0.0014
Epoch [29/50], train_loss: 0.0014
Epoch [30/50], train_lo

KeyboardInterrupt: ignored

In [None]:
# Save the model's state_dict (weights only) to Google Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/Autoencoder3_parameters.pth')

In [None]:
# Number of vectors currently in the embedding table.
len(embeddings)

53

In [None]:
# Append an all-zero vector to the embedding table.
# The guard keeps this cell idempotent: re-running it does not append a second
# zero vector. The vector length follows the embedding size instead of being
# hard-coded to 10.
# NOTE(review): the zero vector ends up at index len(embeddings) - 1; confirm
# that this is the padding index (52) hard-coded in generate().
zero_vector = [0.0] * len(embeddings[0])
if embeddings[-1] != zero_vector:
    embeddings.append(zero_vector)

In [None]:
def generate(name, window_size=10, pad_index=52, data_dir='/content/'):
    """Load a session file and count how often each distinct session occurs.

    Each line of the file is one session: whitespace-separated integer log
    indices. Sessions shorter than `window_size + 1` are right-padded with
    `pad_index`, so that iterating `range(len(session) - window_size)` yields at
    least one window.

    Args:
        name: file name inside `data_dir`.
        window_size: window length used when the sessions are evaluated.
        pad_index: log index used for padding.
            NOTE(review): the default 52 is the value that was hard-coded here;
            confirm it is the index of the all-zero embedding vector.
        data_dir: directory that contains the file.

    Returns:
        (hdfs, length): `hdfs` maps each distinct (padded) session tuple to its
        number of occurrences; `length` is the total number of lines read.
    """
    hdfs = {}  # distinct session tuple -> occurrence count
    length = 0
    with open(os.path.join(data_dir, name), 'r') as f:
        for row in f:
            line = [int(i) for i in row.strip().split()]
            # A non-positive repeat count gives an empty list, so sessions that
            # are already long enough stay unchanged.
            line += [pad_index] * (window_size + 1 - len(line))
            key = tuple(line)
            hdfs[key] = hdfs.get(key, 0) + 1
            length += 1
    # The reported figure is the number of distinct sessions; `length` holds
    # the total including duplicates.
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs, length

In [None]:
# Load the normal and abnormal test sessions: each call returns a dict of
# distinct sessions with their counts, plus the total number of sessions.
test_normal_loader, test_normal_length = generate('logs_ntest')
test_abnormal_loader, test_abnormal_length = generate('logs_atest')

Number of sessions(logs_ntest): 1091
Number of sessions(logs_atest): 4126


In [None]:
def _count_flagged_sessions(sessions, threshold):
    """Count sessions that contain at least one window with loss above `threshold`.

    `sessions` maps a session tuple of log indices to its number of occurrences;
    a flagged session contributes that whole count. Uses the notebook globals
    `model`, `embeddings`, `criterion`, `window_size`, `input_size` and `device`.
    """
    flagged = 0
    for line, count in sessions.items():
        for i in range(len(line) - window_size):
            window = [embeddings[temp] for temp in line[i:i + window_size]]
            seq = torch.tensor(window, dtype=torch.float).view(-1, window_size, input_size).to(device)
            loss = criterion(model(seq), seq)
            if loss.item() > threshold:
                flagged += count  # every duplicate of this session is flagged
                break  # one anomalous window is enough
    return flagged


def evaluation(threshold):
    """Evaluate the autoencoder on the test sessions and print the metrics.

    A session is predicted abnormal as soon as one of its windows has a
    reconstruction loss above `threshold`. Prints elapsed time, FP, FN,
    precision, recall and F1 (in percent). Reads the notebook globals
    `test_normal_loader`, `test_abnormal_loader` and `test_abnormal_length`.

    Args:
        threshold: reconstruction-loss value above which a window is anomalous.
    """
    # Test the model
    model.eval()

    start_time = time.time()
    with torch.no_grad():
        FP = _count_flagged_sessions(test_normal_loader, threshold)
        TP = _count_flagged_sessions(test_abnormal_loader, threshold)
    elapsed_time = time.time() - start_time
    print('elapsed_time: {:.3f}s'.format(elapsed_time))
    # Compute precision, recall and F1-measure. Each ratio falls back to 0 when
    # its denominator is 0 (e.g. a threshold so high that nothing is flagged),
    # instead of raising ZeroDivisionError.
    FN = test_abnormal_length - TP
    P = 100 * TP / (TP + FP) if (TP + FP) else 0.0
    R = 100 * TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) else 0.0
    print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
    print('Finished Predicting')

In [None]:
# Coarse sweep over candidate loss thresholds.
threshold = [0.0022, 0.0025, 0.0028, 0.0030, 0.0033]
for candidate in threshold:
    print('-------------------------------------------------------------------------')
    print('threshold = ', candidate)
    evaluation(candidate)

-------------------------------------------------------------------------
threshold =  0.0022
elapsed_time: 22.977s
false positive (FP): 3753, false negative (FN): 173, Precision: 81.619%, Recall: 98.973%, F1-measure: 89.462%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0025
elapsed_time: 23.257s
false positive (FP): 2979, false negative (FN): 179, Precision: 84.830%, Recall: 98.937%, F1-measure: 91.342%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0028
elapsed_time: 23.966s
false positive (FP): 2480, false negative (FN): 218, Precision: 87.016%, Recall: 98.705%, F1-measure: 92.493%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.003
elapsed_time: 22.943s
false positive (FP): 1952, false negative (FN): 1187, Precision: 88.911%, Recall: 92.950%, F1-measure: 90.886%
Finished Predicting
----------------

In [None]:
# Finer sweep between 0.0026 and 0.0029.
threshold = [0.0026, 0.0027, 0.0028, 0.0029]
for candidate in threshold:
    print('-------------------------------------------------------------------------')
    print('threshold = ', candidate)
    evaluation(candidate)

-------------------------------------------------------------------------
threshold =  0.0026
elapsed_time: 23.151s
false positive (FP): 2892, false negative (FN): 181, Precision: 85.206%, Recall: 98.925%, F1-measure: 91.555%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0027
elapsed_time: 23.274s
false positive (FP): 2803, false negative (FN): 189, Precision: 85.590%, Recall: 98.878%, F1-measure: 91.755%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0028
elapsed_time: 22.897s
false positive (FP): 2480, false negative (FN): 218, Precision: 87.016%, Recall: 98.705%, F1-measure: 92.493%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0029
elapsed_time: 23.310s
false positive (FP): 1973, false negative (FN): 1047, Precision: 88.893%, Recall: 93.782%, F1-measure: 91.272%
Finished Predicting


In [None]:
# Narrow sweep around 0.0028.
threshold = [0.00275, 0.0028, 0.00285]
for candidate in threshold:
    print('-------------------------------------------------------------------------')
    print('threshold = ', candidate)
    evaluation(candidate)

-------------------------------------------------------------------------
threshold =  0.00275
elapsed_time: 22.871s
false positive (FP): 2633, false negative (FN): 196, Precision: 86.340%, Recall: 98.836%, F1-measure: 92.166%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0028
elapsed_time: 24.295s
false positive (FP): 2480, false negative (FN): 218, Precision: 87.016%, Recall: 98.705%, F1-measure: 92.493%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.00285
elapsed_time: 24.732s
false positive (FP): 2437, false negative (FN): 222, Precision: 87.209%, Recall: 98.682%, F1-measure: 92.591%
Finished Predicting


In [None]:
# Final sweep between 0.00282 and 0.00288.
threshold = [0.00282, 0.00284, 0.00286, 0.00288]
for candidate in threshold:
    print('-------------------------------------------------------------------------')
    print('threshold = ', candidate)
    evaluation(candidate)

-------------------------------------------------------------------------
threshold =  0.00282
elapsed_time: 23.915s
false positive (FP): 2479, false negative (FN): 221, Precision: 87.018%, Recall: 98.687%, F1-measure: 92.486%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.00284
elapsed_time: 23.665s
false positive (FP): 2438, false negative (FN): 222, Precision: 87.205%, Recall: 98.682%, F1-measure: 92.589%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.00286
elapsed_time: 23.782s
false positive (FP): 2436, false negative (FN): 222, Precision: 87.214%, Recall: 98.682%, F1-measure: 92.594%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.00288
elapsed_time: 24.027s
false positive (FP): 2434, false negative (FN): 1047, Precision: 86.645%, Recall: 93.782%, F1-measure: 90.072%
Finished Predicting
