<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/BGL2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#upload log files
!wget 'https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/X_train_index'
!wget 'https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/Xabnorm_test_index'
!wget 'https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/Xnorm_test_index'
!wget 'https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/log2index'
!wget 'https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/reduced_index2embed'

--2023-09-04 16:05:04--  https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/X_train_index
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9341136 (8.9M) [text/plain]
Saving to: ‘X_train_index.1’


2023-09-04 16:05:04 (104 MB/s) - ‘X_train_index.1’ saved [9341136/9341136]

--2023-09-04 16:05:05--  https://raw.githubusercontent.com/mostafa-ja/Anomaly-detection/main/datasets/BGL/Xabnorm_test_index
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1568608 (1.5M) [text/plain]
Saving to: ‘Xabnorm_test_index.1

In [3]:
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np
from tqdm import tqdm

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
#Json file name
with open('/content/reduced_index2embed') as f:
    embeddings = [json.loads(line) for line in f.readlines()]

len(embeddings[0])

60

In [5]:
# Define the LSTM Autoencoder model with dropout
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, sequence_length, dropout_prob): #input_dim= embeddings_dim
        super(LSTMAutoencoder, self).__init__()
        self.sequence_length = sequence_length
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout_prob)
        self.decoder = nn.LSTM(hidden_dim , input_dim, num_layers, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, x):
        encoded, _ = self.encoder(x)
        #print(encoded.shape)
        encoded = encoded[:,-1:,:] # output of last cell
        encoded = self.fc(encoded)
        #print(encoded.shape)
        input_decode = torch.tile(encoded, (1, self.sequence_length, 1))
        decoded, _ = self.decoder(input_decode)
        return decoded

In [6]:
sequence_length = 5
input_size = len(embeddings[0]) #embedding vector dimension
hidden_dim = 128
num_layers = 2
dropout_prob = 0.2

model = LSTMAutoencoder(input_size, hidden_dim, 2, sequence_length, dropout_prob).to(device)
model

LSTMAutoencoder(
  (encoder): LSTM(60, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (decoder): LSTM(128, 60, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=256, out_features=128, bias=True)
)

In [7]:
data = torch.randn(1,sequence_length, input_size) #batch_size=1
model(data.to(device)).shape

torch.Size([1, 5, 60])

In [8]:
# Calculate the number of parameters
num_params = sum(p.numel() for p in model.parameters())
print(f"Number of Parameters: {num_params}")

Number of Parameters: 697600


In [9]:
name = 'X_train_index'
window_size = sequence_length
num_sessions = 0
inputs = []
#outputs = []

with open('/content/' + name, 'r') as f:
        for row in tqdm(f, desc="Processing Rows"):
            num_sessions += 1
            line = [ embeddings[int(i)] for i in row.strip().split()]
            for i in range(len(line) - window_size):
                inputs.append(line[i:i + window_size])
                #outputs.append(line[i + window_size])

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
#dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float))

Processing Rows: 173709it [00:04, 42944.59it/s]


Number of sessions(X_train_index): 173709
Number of seqs(X_train_index): 2605635


In [10]:
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)
len(dataloader) # regarding to batch size

5090

In [11]:
for step, (seq) in enumerate(dataloader):
  print(seq[0].shape)
  break

torch.Size([512, 5, 60])


In [12]:
def adjust_learning_rate(optimizer, epoch, lr_step=(8,12,16), lr_decay_ratio=0.2):
    """Adjust the learning rate based on the epoch number."""
    if epoch == 0:
        optimizer.param_groups[0]['lr'] /= 8
    elif epoch in [1, 2, 3]:  # in step five , we finish warm up ,and start normal learning rate
        optimizer.param_groups[0]['lr'] *= 2
    if epoch in lr_step: # in these steps , we are geting close to optimal point so we need to have shorter step
        optimizer.param_groups[0]['lr'] *= lr_decay_ratio
    return optimizer

In [13]:
num_epochs = 20
learning_rate = 0.001

# Loss and optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
criterion = nn.MSELoss()

In [16]:
# Train the model
model.train()
start_time = time.time()
total_step = len(dataloader)
for epoch in tqdm(range(num_epochs), desc="Processing Rows"):  # Loop over the dataset multiple times
    optimizer = adjust_learning_rate(optimizer, epoch)
    train_loss = 0
    for step, (seq) in enumerate(dataloader):
        # Forward pass
        seq = seq[0].clone().detach().view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, seq.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')


Processing Rows:   5%|▌         | 1/20 [01:10<22:22, 70.66s/it]

Epoch [1/20], train_loss: 0.0007


Processing Rows:  10%|█         | 2/20 [02:21<21:17, 70.96s/it]

Epoch [2/20], train_loss: 0.0007


Processing Rows:  15%|█▌        | 3/20 [03:32<20:07, 71.03s/it]

Epoch [3/20], train_loss: 0.0007


Processing Rows:  20%|██        | 4/20 [04:44<18:57, 71.11s/it]

Epoch [4/20], train_loss: 0.0006


Processing Rows:  25%|██▌       | 5/20 [05:55<17:45, 71.06s/it]

Epoch [5/20], train_loss: 0.0006


Processing Rows:  30%|███       | 6/20 [07:06<16:34, 71.06s/it]

Epoch [6/20], train_loss: 0.0006


Processing Rows:  35%|███▌      | 7/20 [08:17<15:25, 71.19s/it]

Epoch [7/20], train_loss: 0.0006


Processing Rows:  40%|████      | 8/20 [09:28<14:12, 71.04s/it]

Epoch [8/20], train_loss: 0.0005


Processing Rows:  45%|████▌     | 9/20 [10:39<13:03, 71.21s/it]

Epoch [9/20], train_loss: 0.0005


Processing Rows:  50%|█████     | 10/20 [11:51<11:52, 71.20s/it]

Epoch [10/20], train_loss: 0.0005


Processing Rows:  55%|█████▌    | 11/20 [13:02<10:40, 71.17s/it]

Epoch [11/20], train_loss: 0.0005


Processing Rows:  60%|██████    | 12/20 [14:13<09:29, 71.16s/it]

Epoch [12/20], train_loss: 0.0005


Processing Rows:  65%|██████▌   | 13/20 [15:24<08:17, 71.14s/it]

Epoch [13/20], train_loss: 0.0005


Processing Rows:  70%|███████   | 14/20 [16:35<07:06, 71.12s/it]

Epoch [14/20], train_loss: 0.0005


Processing Rows:  70%|███████   | 14/20 [16:39<07:08, 71.36s/it]


KeyboardInterrupt: ignored

In [None]:
# Mount Google Drive to save datasets
from google.colab import drive
drive.mount('/content/drive')

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/BGL1_parameters.pth')

# **EVALUATION**


in cpu
```
my_model = model.load_state_dict(torch.load('model_parameters.pth', map_location=torch.device(device)))

# Put the model in evaluation mode if necessary
model.eval()
```

in cuda


```
model.load_state_dict(torch.load('model_parameters.pth'))

# Put the model in evaluation mode if necessary
model.eval()
```



In [None]:
len(embeddings)

In [18]:
def generate(name):
    window_size = sequence_length
    hdfs = {} #store the unique sequences and their counts.
    length = 0
    with open('/content/' + name, 'r') as f:
        for row in f:
            line = [int(i) for i in row.strip().split()]
            hdfs[tuple(line)] = hdfs.get(tuple(line), 0) + 1   #If the tuple is not present in the dictionary, hdfs.get(tuple(ln), 0) returns 0, and the code initializes the count to 1.
            length += 1
            # hdfs.append(tuple(line))
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs, length

In [19]:
test_normal_loader, test_normal_length = generate('Xnorm_test_index')
test_abnormal_loader, test_abnormal_length = generate('Xabnorm_test_index')

Number of sessions(Xnorm_test_index): 17191
Number of sessions(Xabnorm_test_index): 3595


In [20]:
def evaluation(threshold):
  # Test the model
  model.eval()

  TP = 0
  FP = 0

  start_time = time.time()
  with torch.no_grad():
      for line in tqdm(test_normal_loader.keys(), desc="Processing Rows"):
          for i in range(len(line) - window_size):
              session = line[i:i + window_size]
              seq = [embeddings[temp] for temp in session]
              seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
              output = model(seq)

              loss = criterion(output, seq)

              if (loss.cpu().detach().numpy()>threshold):
                FP += test_normal_loader[line] # numbers of that set we have
                break
  with torch.no_grad():
      for line in tqdm(test_abnormal_loader.keys(), desc="Processing Rows"):
          for i in range(len(line) - window_size):
              session = line[i:i + window_size]
              seq = [embeddings[temp] for temp in session]
              seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
              output = model(seq)

              loss = criterion(output, seq)

              if (loss.cpu().detach().numpy()>threshold):
                TP += test_abnormal_loader[line]
                break
  elapsed_time = time.time() - start_time
  print('elapsed_time: {:.3f}s'.format(elapsed_time))
  # Compute precision, recall and F1-measure
  FN = test_abnormal_length - TP
  P = 100 * TP / (TP + FP)
  R = 100 * TP / (TP + FN)
  F1 = 2 * P * R / (P + R)
  print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
  print('Finished Predicting')

In [23]:
threshold = [0.001,0.0015,0.002,0.0025]
for i in threshold:
  print('-------------------------------------------------------------------------')
  print('threshold = ',i)
  evaluation(i)

-------------------------------------------------------------------------
threshold =  0.001


Processing Rows: 100%|██████████| 17191/17191 [00:25<00:00, 666.06it/s]
Processing Rows: 100%|██████████| 3595/3595 [00:04<00:00, 743.89it/s]


elapsed_time: 30.655s
false positive (FP): 23037, false negative (FN): 2773, Precision: 43.154%, Recall: 86.314%, F1-measure: 57.540%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0015


Processing Rows: 100%|██████████| 17191/17191 [00:26<00:00, 639.54it/s]
Processing Rows: 100%|██████████| 3595/3595 [00:04<00:00, 801.05it/s] 


elapsed_time: 31.383s
false positive (FP): 22839, false negative (FN): 2885, Precision: 43.208%, Recall: 85.761%, F1-measure: 57.464%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.002


Processing Rows: 100%|██████████| 17191/17191 [00:32<00:00, 521.82it/s]
Processing Rows: 100%|██████████| 3595/3595 [00:06<00:00, 596.56it/s]


elapsed_time: 38.983s
false positive (FP): 21009, false negative (FN): 2993, Precision: 45.113%, Recall: 85.228%, F1-measure: 58.998%
Finished Predicting
-------------------------------------------------------------------------
threshold =  0.0025


Processing Rows: 100%|██████████| 17191/17191 [00:38<00:00, 447.46it/s]
Processing Rows: 100%|██████████| 3595/3595 [00:08<00:00, 448.80it/s]

elapsed_time: 46.447s
false positive (FP): 20592, false negative (FN): 4650, Precision: 43.121%, Recall: 77.050%, F1-measure: 55.295%
Finished Predicting





In [None]:
threshold = [0.0057,0.0058,0.0059,0.0060,0.0061,0.0062,0.0063]
for i in threshold:
  print('-------------------------------------------------------------------------')
  print('threshold = ', i)
  evaluation(i)