<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/C(loganomaly_model).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# Select the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [36]:
# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'

--2023-07-26 16:53:20--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 257875 (252K) [text/plain]
Saving to: ‘hdfs_train.1’


2023-07-26 16:53:20 (57.5 MB/s) - ‘hdfs_train.1’ saved [257875/257875]

--2023-07-26 16:53:21--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29284282 (28M) [text/plain]
Saving to: ‘hdfs_test_normal.1’


2023-07-26 16:53:23 (291 MB/s

In [37]:
# count session for each dataset

def count_sessions(dataset, data_dir='/content/'):
    """Count and print the number of sessions (lines) in a dataset file.

    Args:
        dataset: File name of the dataset; every line is counted as one session.
        data_dir: Directory that holds the file. Defaults to '/content/'.

    Returns:
        int: The number of lines in the file.
    """
    with open(os.path.join(data_dir, dataset), 'r') as f:
        # Stream the file; only the line count is needed.
        num_sessions = sum(1 for _ in f)
    print('Number of sessions({}): {}'.format(dataset, num_sessions))
    return num_sessions

# File names of the training set and the two test sets (normal / abnormal sessions).
datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']

# Print the session count of each file.
for dataset in datasets:
  count_sessions(dataset)

Number of sessions(hdfs_train): 4855
Number of sessions(hdfs_test_normal): 553366
Number of sessions(hdfs_test_abnormal): 16838


In [38]:
# Collect every distinct template id that occurs in any of the three files.

datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']
templates = set()

for dataset in datasets:
    with open('/content/' + dataset, 'r') as f:
        for row in f:
            # Each row is one session: whitespace-separated template ids.
            templates.update(row.split())

# The ids are still strings here, so sorted() orders them lexicographically.
print(sorted(templates))
print('nember of templates : ',len(templates))

['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '3', '4', '5', '6', '7', '8', '9']
nember of templates :  28


In [39]:
name = 'hdfs_train'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
    for row in f:
        num_sessions += 1
        # Template ids in the file are 1-based; shift them to 0-based keys.
        line = [int(tok) - 1 for tok in row.strip().split()]
        # One sample per position that still has a successor key:
        # the window is the input, the key right after it is the target.
        num_windows = len(line) - window_size
        inputs.extend(line[s:s + window_size] for s in range(num_windows))
        outputs.extend(line[s + window_size] for s in range(num_windows))

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))


Number of sessions(hdfs_train): 4855
Number of seqs(hdfs_train): 46575


In [40]:
#import json
#def read_json(filename):
    #with open(filename, 'r') as load_f:
        #file_dict = json.load(load_f)
    #return file_dict

from collections import Counter

def sliding_window(data_dir, window_size, num_keys=28):
    '''
    Slice every session of a log-key file into fixed-size windows.

    Each line of the file is one session made of whitespace-separated,
    1-based template ids, which are shifted to 0-based keys. For every
    position i that still has a successor, line[i:i + window_size] becomes one
    sample and line[i + window_size] its label. Sessions with at most
    `window_size` keys therefore produce no samples.

    Only the sequential and the quantitative feature are built; semantic
    vectors are not generated here.

    Args:
        data_dir: path of the file to read (a file path, despite the name).
        window_size: number of keys per window.
        num_keys: length of the per-window count vector. Defaults to 28,
            the number of templates found in the HDFS files.

    Returns:
        (result_logs, labels)
            result_logs['Sequentials']   : list of windows (lists of keys)
            result_logs['Quantitatives'] : list of count vectors, one per window
            labels                       : list of the key following each window
    '''
    num_sessions = 0
    result_logs = {'Sequentials': [], 'Quantitatives': []}
    labels = []

    with open(data_dir, 'r') as f:
        # Iterate the file lazily instead of loading it whole with readlines().
        for row in f:
            num_sessions += 1
            keys = [int(tok) - 1 for tok in row.strip().split()]
            for i in range(len(keys) - window_size):
                sequential_pattern = keys[i:i + window_size]

                quantitative_pattern = [0] * num_keys
                for key, count in Counter(sequential_pattern).items():
                    # The filter is meant to keep a -1 key out of the count vector.
                    # NOTE(review): `key > 0` also skips key 0 (template id 1 after
                    # the -1 shift), so that template is never counted. Left as-is
                    # because changing it changes the features fed to the model;
                    # if it is corrected to `key >= 0`, the inference-time count
                    # vector must be built the same way and the model retrained.
                    if key > 0:
                        quantitative_pattern[key] = count

                result_logs['Sequentials'].append(sequential_pattern)
                result_logs['Quantitatives'].append(quantitative_pattern)
                labels.append(keys[i + window_size])

    print('File {}, number of sessions {}'.format(data_dir, num_sessions))
    print('File {}, number of seqs {}'.format(data_dir, len(result_logs['Sequentials'])))

    return result_logs, labels


In [41]:
# Build the sequential / quantitative features and next-key labels for the
# training file, using windows of 10 keys.
result_logs, labels = sliding_window('/content/hdfs_train', 10)

File /content/hdfs_train, number of sessions 4855
File /content/hdfs_train, number of seqs 46575


In [42]:
# Bundle the two feature lists and the labels into one dataset; every item is a
# (sequential window, count vector, next key) triple.
dataset = TensorDataset(
    torch.tensor(result_logs['Sequentials'], dtype=torch.float),
    torch.tensor(result_logs['Quantitatives'], dtype=torch.float),
    torch.tensor(labels),
)
dataloader = DataLoader(dataset, batch_size=1)


In [43]:
from torch.autograd import Variable

class loganomaly(nn.Module):
    """Two-branch LSTM classifier that predicts the next log key.

    `lstm0` reads the first feature (the sequential window) and `lstm1` the
    second one (the quantitative count vector). The last time-step outputs of
    both LSTMs are concatenated and mapped to `num_keys` logits by `fc`.

    Args:
        input_size: feature size per time step, shared by both LSTMs.
        hidden_size: hidden units per LSTM layer.
        num_layers: number of stacked layers in each LSTM.
        num_keys: number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm0 = nn.LSTM(input_size,
                             hidden_size,
                             num_layers,
                             batch_first=True)
        self.lstm1 = nn.LSTM(input_size,
                             hidden_size,
                             num_layers,
                             batch_first=True)
        self.fc = nn.Linear(2 * hidden_size, num_keys)
        self.attention_size = self.hidden_size

        # Attention weights used only by attention_net(). They are registered as
        # NON-persistent buffers so that .to(device) / .cuda() moves them together
        # with the module (plain tensor attributes would stay on the CPU), while
        # state_dict() keeps exactly the lstm0 / lstm1 / fc entries and therefore
        # stays compatible with checkpoints saved before this change.
        self.register_buffer(
            'w_omega',
            torch.zeros(self.hidden_size, self.attention_size),
            persistent=False)
        self.register_buffer(
            'u_omega', torch.zeros(self.attention_size), persistent=False)

        # Kept for backward compatibility; attention_net() reads the sequence
        # length from its input instead.
        self.sequence_length = 28

    def attention_net(self, lstm_output):
        """Return an attention-weighted sum of LSTM outputs over the time axis.

        Not called by forward(). Expects `lstm_output` laid out as
        (batch, seq_len, hidden_size), matching the batch_first LSTMs above.

        Returns:
            Tensor of shape (batch, hidden_size).
        """
        seq_len = lstm_output.size(1)
        flat = lstm_output.reshape(-1, self.hidden_size)
        attn_tanh = torch.tanh(torch.mm(flat, self.w_omega))
        scores = torch.mm(attn_tanh, self.u_omega.reshape(-1, 1))
        # softmax over the time axis == exp(score) / sum(exp(score)), but stable.
        alphas = torch.softmax(scores.reshape(-1, seq_len), dim=1)
        return torch.sum(lstm_output * alphas.unsqueeze(-1), 1)

    def forward(self, features, device=None):
        """Compute next-key logits.

        Args:
            features: sequence whose first two items are the inputs of `lstm0`
                and `lstm1`, each laid out as (batch, steps, input_size).
            device: device on which the initial LSTM states are created.
                Defaults to the device of the first input.

        Returns:
            Tensor of shape (batch, num_keys) with unnormalised scores.
        """
        input0, input1 = features[0], features[1]
        if device is None:
            device = input0.device

        def zero_state(batch):
            # Fresh all-zero hidden / cell state for one LSTM.
            return torch.zeros(self.num_layers, batch, self.hidden_size,
                               device=device)

        out0, _ = self.lstm0(
            input0, (zero_state(input0.size(0)), zero_state(input0.size(0))))
        out1, _ = self.lstm1(
            input1, (zero_state(input1.size(0)), zero_state(input1.size(0))))

        # Use only the last time step of each branch.
        multi_out = torch.cat((out0[:, -1, :], out1[:, -1, :]), -1)
        out = self.fc(multi_out)
        return out

In [44]:

# --- model ---
input_size = 1     # feature size per time step passed to both LSTMs
hidden_size = 64   # hidden units per LSTM layer
num_layers = 2     # stacked LSTM layers per branch
num_classes = 28  # templates

# --- data / optimisation ---
window_size = 10   # keys per input window
batch_size = 2048
num_epochs = 370

In [45]:
# Instantiate the network on the selected device.
model = loganomaly(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_keys=num_classes,
).to(device)

# Shuffled mini-batches for training.
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

criterion = nn.CrossEntropyLoss()


In [46]:
def adjust_learning_rate(optimizer, epoch, lr_step=(300, 350), lr_decay_ratio=0.1,
                         warmup_epochs=(2, 4, 6, 8, 10)):
    """Apply the warm-up / step-decay schedule for the given epoch, in place.

    The learning rate of EVERY parameter group is updated (not only the first):
      * epoch 0: divided by 2 ** len(warmup_epochs) (32 with the defaults);
      * each epoch in `warmup_epochs`: doubled, so the rate is back at its
        starting value after the last warm-up epoch;
      * each epoch in `lr_step`: multiplied by `lr_decay_ratio`.

    The function rescales the current value, so it is meant to be called
    exactly once per epoch, starting at epoch 0.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts with an 'lr' key.
        epoch: zero-based epoch index.
        lr_step: epochs at which the rate is decayed.
        lr_decay_ratio: decay factor applied at each epoch in `lr_step`.
        warmup_epochs: epochs at which the rate is doubled during warm-up.

    Returns:
        The same optimizer object.
    """
    for group in optimizer.param_groups:
        if epoch == 0:
            group['lr'] /= 2 ** len(warmup_epochs)
        elif epoch in warmup_epochs:
            group['lr'] *= 2
        if epoch in lr_step:
            # Take smaller steps late in training.
            group['lr'] *= lr_decay_ratio
    return optimizer

# Optimisation settings read by the training loop.
options = dict(
    lr=0.001,                # base learning rate
    lr_step=(300, 350),      # epochs at which the learning rate is decayed
    lr_decay_ratio=0.1,      # factor applied at each of those epochs
    # Add other options here
)

optimizer = optim.Adam(
    model.parameters(),
    lr=options['lr'],
    betas=(0.9, 0.999),
)


In [47]:

# Train the model
# Switch to training mode explicitly: the evaluation code calls model.eval(),
# so re-running this cell afterwards would otherwise train in eval mode.
model.train()
start_time = time.time()
total_step = len(dataloader)  # number of mini-batches per epoch
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    optimizer = adjust_learning_rate(optimizer, epoch, options['lr_step'], options['lr_decay_ratio'])
    print(optimizer.param_groups[0]['lr'])
    train_loss = 0
    for seq, quan, label in dataloader:
        # Reshape to (batch, steps, input_size) and move the batch to the device.
        seq = seq.view(-1, window_size, input_size).to(device)
        quan = quan.view(-1, num_classes, input_size).to(device)
        label = label.to(device)

        # Forward pass
        output = model(features=(seq, quan), device=device)
        loss = criterion(output, label)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Report the mean batch loss of this epoch.
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')

3.125e-05
Epoch [1/370], train_loss: 3.3239
3.125e-05
Epoch [2/370], train_loss: 3.2961
6.25e-05
Epoch [3/370], train_loss: 3.2530
6.25e-05
Epoch [4/370], train_loss: 3.1835
0.000125
Epoch [5/370], train_loss: 3.0139
0.000125
Epoch [6/370], train_loss: 2.4787
0.00025
Epoch [7/370], train_loss: 1.8865
0.00025
Epoch [8/370], train_loss: 1.7778
0.0005
Epoch [9/370], train_loss: 1.6388
0.0005
Epoch [10/370], train_loss: 1.3952
0.001
Epoch [11/370], train_loss: 1.0811
0.001
Epoch [12/370], train_loss: 0.8493
0.001
Epoch [13/370], train_loss: 0.7217
0.001
Epoch [14/370], train_loss: 0.6399
0.001
Epoch [15/370], train_loss: 0.5818
0.001
Epoch [16/370], train_loss: 0.5439
0.001
Epoch [17/370], train_loss: 0.5187
0.001
Epoch [18/370], train_loss: 0.4846
0.001
Epoch [19/370], train_loss: 0.4644
0.001
Epoch [20/370], train_loss: 0.4494
0.001
Epoch [21/370], train_loss: 0.4550
0.001
Epoch [22/370], train_loss: 0.4410
0.001
Epoch [23/370], train_loss: 0.4266
0.001
Epoch [24/370], train_loss: 0.4053

In [53]:
# Mount Google Drive at /content/drive (Colab only) so the model weights can be
# written to and read from a location under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Save the model weights (state_dict only) to Google Drive.
model_path = '/content/drive/MyDrive/HDFS/loganomaly_model_parameter'
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [55]:

# Load the saved weights back into the model.
model_path = '/content/drive/MyDrive/HDFS/loganomaly_model_parameter'
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can also be restored on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [48]:
def generate(name, window_size=10, data_dir='/content/'):
    """Load a test file and collapse identical sessions.

    Each line is one session of whitespace-separated, 1-based template ids.
    The ids are shifted to 0-based keys and sessions shorter than
    `window_size + 1` are right-padded with -1 up to that length, so every
    session has at least one full window plus a label.

    Args:
        name: file name of the dataset.
        window_size: window length used for padding. Defaults to 10.
        data_dir: directory holding the file. Defaults to '/content/'.

    Returns:
        (hdfs, length)
            hdfs   : dict mapping each distinct session (tuple of keys) to the
                     number of times it occurs in the file
            length : total number of lines (sessions) read
    """
    hdfs = {}
    length = 0
    with open(os.path.join(data_dir, name), 'r') as f:
        # Stream the file instead of loading it whole with readlines().
        for row in f:
            keys = [int(tok) - 1 for tok in row.strip().split()]
            # A non-positive repeat count yields [], so long sessions are untouched.
            keys += [-1] * (window_size + 1 - len(keys))
            session = tuple(keys)
            hdfs[session] = hdfs.get(session, 0) + 1
            length += 1
    # The printed figure is the number of DISTINCT sessions, not `length`.
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs, length

In [49]:

# Despite the "_loader" names these are plain dicts {session tuple: occurrence count};
# the "_length" values are the total number of sessions in each file.
test_normal_loader, test_normal_length = generate('hdfs_test_normal')
test_abnormal_loader, test_abnormal_length = generate('hdfs_test_abnormal')

Number of sessions(hdfs_test_normal): 14177
Number of sessions(hdfs_test_abnormal): 4123


In [50]:
num_candidates = 9 # "g" in the paper: a next key is treated as normal if it is among the top-g (here top-9) predicted keys

In [51]:
def test_data(model, data_loader, num_candidates, device):
    """Count the sessions that the model flags as anomalous.

    A session is flagged as soon as one of its windows is followed by a key
    that is not among the model's `num_candidates` highest-scoring keys; the
    remaining windows of that session are then skipped.

    Relies on the notebook-level settings `window_size`, `input_size` and
    `num_classes`.

    Args:
        model: network called as model(features=[seq, counts], device=device).
        data_loader: dict mapping a session (tuple of keys) to the number of
            times that session occurs.
        num_candidates: how many top-scoring keys count as a normal prediction.
        device: device on which inference runs.

    Returns:
        Number of flagged sessions, each distinct session weighted by its
        occurrence count.
    """
    model.eval()

    S = 0
    with torch.no_grad():
        for line in tqdm(data_loader.keys()):
            for i in range(len(line) - window_size):
                window = line[i:i + window_size]
                label = line[i + window_size]

                # Count vector of the window, one slot per template.
                counts = [0] * num_classes
                for key, count in Counter(window).items():
                    # The filter keeps the -1 padding key out of the count vector.
                    # NOTE(review): `key > 0` also skips key 0 (template id 1 after
                    # the -1 shift). Left as-is so that these features match the
                    # ones the model was trained on; changing it to `key >= 0`
                    # requires the same change at training time and retraining.
                    if key > 0:
                        counts[key] = count

                seq0 = torch.tensor(window, dtype=torch.float).view(-1, window_size, input_size).to(device)
                seq1 = torch.tensor(counts, dtype=torch.float).view(-1, num_classes, input_size).to(device)
                output = model(features=[seq0, seq1], device=device)
                # Indices of the num_candidates highest scores (argsort is ascending).
                predicted = torch.argsort(output, 1)[0][-num_candidates:].tolist()
                if label not in predicted:
                    S += data_loader[line]
                    break

    return S

In [52]:
# Test the model on normal data: every flagged normal session is a false positive
FP_normal = test_data(model, test_normal_loader, num_candidates, device)

# Test the model on abnormal data: every flagged abnormal session is a true positive
TP_abnormal = test_data(model, test_abnormal_loader, num_candidates, device)

# Compute precision, recall, and F1-measure (in percent).
# Each ratio falls back to 0.0 when its denominator is zero (nothing flagged,
# or an empty abnormal set) instead of raising ZeroDivisionError.
FN = test_abnormal_length - TP_abnormal
P = 100 * TP_abnormal / (TP_abnormal + FP_normal) if (TP_abnormal + FP_normal) else 0.0
R = 100 * TP_abnormal / (TP_abnormal + FN) if (TP_abnormal + FN) else 0.0
F1 = 2 * P * R / (P + R) if (P + R) else 0.0

print('------------------------------------------------')
print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP_normal, FN, P, R, F1))
print('Finished Predicting')

100%|██████████| 14177/14177 [03:46<00:00, 62.62it/s]
100%|██████████| 4123/4123 [00:50<00:00, 81.31it/s]

------------------------------------------------
false positive (FP): 654, false negative (FN): 1259, Precision: 95.971%, Recall: 92.523%, F1-measure: 94.215%
Finished Predicting



