<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/C(deeplog_model).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np
import pandas as pd

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [2]:
# Mount Google Drive to upload datasets (csv files)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import train and datasets based on ratio

def split_log_file(input_file, train_ratio=0.7):
    # Read the log file and split it into lines
    with open(input_file, 'r') as log_file:
        log_lines = log_file.readlines()

    # Calculate the number of lines for the train and test sets
    num_lines = len(log_lines)
    num_train_lines = int(num_lines * train_ratio)
    num_test_lines = num_lines - num_train_lines

    # Write the lines corresponding to the train set to a new train log file
    with open('hdfs_train', 'w') as train_file:
        train_file.writelines(log_lines[:num_train_lines])

    # Write the remaining lines (test set) to a new test log file
    with open('hdfs_test_normal', 'w') as test_file:
        test_file.writelines(log_lines[num_train_lines:])

# split normal log file
split_log_file('/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_sequence_normal', train_ratio=0.1)

#copy test abnormal file to current directory
!cp '/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_test_sequence_abnormal' '/content/'

alternative files :


```

# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'
     
```
be careful these files are logs , not csv


In [3]:
# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'

--2023-07-24 16:04:15--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 257875 (252K) [text/plain]
Saving to: ‘hdfs_train’


2023-07-24 16:04:16 (14.4 MB/s) - ‘hdfs_train’ saved [257875/257875]

--2023-07-24 16:04:16--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29284282 (28M) [text/plain]
Saving to: ‘hdfs_test_normal’


2023-07-24 16:04:18 (203 MB/s) - ‘h

In [14]:
# count session for each dataset

def count_sessions(dataset):
    num_sessions = 0
    with open('/content/'+ dataset, 'r') as f:
        for row in f:
            num_sessions += 1
    print('Number of sessions({}): {}'.format(dataset, num_sessions))

datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']

for dataset in datasets:
  count_sessions(dataset)

Number of sessions(hdfs_train): 4855
Number of sessions(hdfs_test_normal): 553366
Number of sessions(hdfs_test_abnormal): 16838


In [15]:
# all templates in our datasets

datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']
templates = set()

for dataset in datasets:
  with open('/content/' + dataset, 'r') as f:
          for row in f:
            for temp in row.split():
              templates.add(temp)

print(sorted(templates))
print('nember of templates : ',len(templates))

['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '3', '4', '5', '6', '7', '8', '9']
nember of templates :  28


test:


```
name = 'hdfs_train_sequence'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
        for row in f:
            num_sessions += 1
            line = [ int(i) for i in row.strip().split()]
            print(line)
            for i in range(len(line) - window_size):
                print(line[i:i + window_size])
                print(line[i + window_size])
                break
            break

ans:
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3, 4, 4, 4, 5,...]
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3]
4
```



In [4]:
name = 'hdfs_train'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
        for row in f:
            num_sessions += 1
            line = [ int(i) for i in row.strip().split()]
            for i in range(len(line) - window_size):
                inputs.append(line[i:i + window_size])
                outputs.append(line[i + window_size])

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))


Number of sessions(hdfs_train): 4855
Number of seqs(hdfs_train): 46575


In [5]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))  # out.shape : [batch_size, sequence_length, hidden_size]
        out = self.fc(out[:, -1, :]) #The : before , -1, : indicates that we want to include all elements along the first dimension (batch dimension). -1 represents the index of the last element along the second dimension (sequence length). : after , -1 indicates that we want to include all elements along the third dimension (hidden size)
        return out


In [6]:
input_size = 1
num_layers = 2
hidden_size = 64
num_classes = 28  # templates + 1 abnormal output
batch_size = 2048
num_epochs = 150

In [7]:
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [8]:

# Train the model
start_time = time.time()
total_step = len(dataloader)
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    train_loss = 0
    for step, (seq, label) in enumerate(dataloader):
        # Forward pass
        seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')

Epoch [1/150], train_loss: 2.7101
Epoch [2/150], train_loss: 1.8622
Epoch [3/150], train_loss: 1.7668
Epoch [4/150], train_loss: 1.6340
Epoch [5/150], train_loss: 1.4068
Epoch [6/150], train_loss: 1.1704
Epoch [7/150], train_loss: 0.9617
Epoch [8/150], train_loss: 0.8200
Epoch [9/150], train_loss: 0.7285
Epoch [10/150], train_loss: 0.6682
Epoch [11/150], train_loss: 0.6153
Epoch [12/150], train_loss: 0.5736
Epoch [13/150], train_loss: 0.5403
Epoch [14/150], train_loss: 0.5183
Epoch [15/150], train_loss: 0.5070
Epoch [16/150], train_loss: 0.4896
Epoch [17/150], train_loss: 0.4789
Epoch [18/150], train_loss: 0.4691
Epoch [19/150], train_loss: 0.4584
Epoch [20/150], train_loss: 0.4478
Epoch [21/150], train_loss: 0.4376
Epoch [22/150], train_loss: 0.4236
Epoch [23/150], train_loss: 0.4098
Epoch [24/150], train_loss: 0.4005
Epoch [25/150], train_loss: 0.3917
Epoch [26/150], train_loss: 0.3838
Epoch [27/150], train_loss: 0.3804
Epoch [28/150], train_loss: 0.3728
Epoch [29/150], train_loss: 0

KeyboardInterrupt: ignored

In [9]:
def generate(name):
    # If you what to replicate the DeepLog paper results(Actually, I have a better result than DeepLog paper results),
    # you should use the 'list' not 'set' to obtain the full dataset, I use 'set' just for test and acceleration.
    hdfs = set()
    # hdfs = []
    with open('/content/' + name, 'r') as f:
        for row in f:
            line = [int(i) for i in row.strip().split()]
            line = line + [0] * (window_size + 1 - len(line)) #if the length of the line is less than windows size, it covers by -1
            hdfs.add(tuple(line))
            # hdfs.append(tuple(line))
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs


In [10]:

test_normal_loader = generate('hdfs_test_normal')
test_abnormal_loader = generate('hdfs_test_abnormal')

Number of sessions(hdfs_test_normal): 14177
Number of sessions(hdfs_test_abnormal): 4123


In [11]:
num_candidates = 9 # on paper is g , top-g(here top 9) probabilities to appear next are considered normal

In [16]:
# Test the model
model.eval()

TP = 0
FP = 0

start_time = time.time()
with torch.no_grad():
    for line in test_normal_loader:
        for i in range(len(line) - window_size):
            seq = line[i:i + window_size]
            label = line[i + window_size]
            seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
            label = torch.tensor(label).view(-1).to(device)
            output = model(seq)
            predicted = torch.argsort(output, 1)[0][-num_candidates:]
            if label not in predicted:
                FP += 1
                break   #with just one wrong prediction in a line , we assume , abnormal
with torch.no_grad():
    for line in test_abnormal_loader:
        for i in range(len(line) - window_size):
            seq = line[i:i + window_size]
            label = line[i + window_size]
            seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
            label = torch.tensor(label).view(-1).to(device)
            output = model(seq)
            predicted = torch.argsort(output, 1)[0][-num_candidates:]
            if label not in predicted:
                TP += 1
                break
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
# Compute precision, recall and F1-measure
FN = len(test_abnormal_loader) - TP
P = 100 * TP / (TP + FP)
R = 100 * TP / (TP + FN)
F1 = 2 * P * R / (P + R)
print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
print('Finished Predicting')

elapsed_time: 148.871s
false positive (FP): 1048, false negative (FN): 272, Precision: 78.608%, Recall: 93.403%, F1-measure: 85.369%
Finished Predicting
