<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/C2(deeplog_model).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np
import pandas as pd

# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Mount Google Drive (Colab only) so the dataset files stored under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the train and test datasets by splitting the normal log file according to a ratio

def split_log_file(input_file, train_ratio=0.7,
                   train_path='hdfs_train', test_path='hdfs_test_normal'):
    """Split a log file into a leading train part and a trailing test part.

    The first ``int(num_lines * train_ratio)`` lines of ``input_file`` are
    written to ``train_path`` and all remaining lines to ``test_path``.
    Line order is preserved; nothing is shuffled.

    Args:
        input_file: Path of the log file to split (one session per line).
        train_ratio: Fraction of lines, between 0 and 1, that go to the
            train file.
        train_path: Output path of the train file. The default is a path
            relative to the current working directory.
        test_path: Output path of the test file. The default is a path
            relative to the current working directory.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            'train_ratio must be between 0 and 1, got {}'.format(train_ratio))

    # Read the whole log file into memory, one list entry per line.
    with open(input_file, 'r') as log_file:
        log_lines = log_file.readlines()

    # Index of the first line that belongs to the test set.
    num_train_lines = int(len(log_lines) * train_ratio)

    # Leading lines -> train file.
    with open(train_path, 'w') as train_file:
        train_file.writelines(log_lines[:num_train_lines])

    # Remaining lines -> test file.
    with open(test_path, 'w') as test_file:
        test_file.writelines(log_lines[num_train_lines:])

# Split the normal-session log file: the first 10% of its lines are written to
# 'hdfs_train' and the remaining 90% to 'hdfs_test_normal'.
split_log_file('/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_sequence_normal', train_ratio=0.1)

# Copy the abnormal test file from Drive into /content/.
!cp '/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_test_abnormal' '/content/'

Alternative: instead of splitting your own file, download the pre-split files:


```

# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'
     
```
Be careful: these files are plain-text log sequences, not CSV files.


In [None]:
# Download the pre-split HDFS datasets (train, normal test, abnormal test)
# from the logdeep repository into the current directory.
# NOTE(review): if the split cell above already created files with these
# names, wget presumably saves the downloads under a ".1" suffix instead of
# overwriting them — confirm which files the later cells actually read.
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'

--2023-07-25 16:22:07--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 257875 (252K) [text/plain]
Saving to: ‘hdfs_train’


2023-07-25 16:22:07 (44.0 MB/s) - ‘hdfs_train’ saved [257875/257875]

--2023-07-25 16:22:08--  https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29284282 (28M) [text/plain]
Saving to: ‘hdfs_test_normal’


2023-07-25 16:22:08 (60.3 MB/s) - ‘

In [None]:
# count session for each dataset

def count_sessions(dataset, base_dir='/content/'):
    """Count and print the number of sessions (lines) in a dataset file.

    Args:
        dataset: File name of the dataset, relative to ``base_dir``.
        base_dir: Directory that contains the dataset file.

    Returns:
        The number of lines in the file.
    """
    with open(os.path.join(base_dir, dataset), 'r') as f:
        # Each line of the file holds one session.
        num_sessions = sum(1 for _ in f)
    print('Number of sessions({}): {}'.format(dataset, num_sessions))
    return num_sessions

# File names of the three datasets used in this notebook.
datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']

# Print the number of sessions (lines) in each dataset file.
for dataset in datasets:
  count_sessions(dataset)

Number of sessions(hdfs_train): 4855
Number of sessions(hdfs_test_normal): 553366
Number of sessions(hdfs_test_abnormal): 16838


In [None]:
# all templates in our datasets

datasets = ['hdfs_train','hdfs_test_normal','hdfs_test_abnormal']
templates = set()

# Every whitespace-separated token on every line is a template id; gather the
# distinct ids across all three dataset files.
for dataset in datasets:
    with open('/content/' + dataset, 'r') as f:
        templates.update(token for row in f for token in row.split())

# The ids are strings, so sorted() orders them lexicographically.
print(sorted(templates))
print('nember of templates : ',len(templates))

['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '3', '4', '5', '6', '7', '8', '9']
nember of templates :  28


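Test: print the first session and its first sliding window (the input sequence and the next-event label) to check the windowing logic: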


```
name = 'hdfs_train_sequence'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
        for row in f:
            num_sessions += 1
            line = [ int(i) for i in row.strip().split()]
            print(line)
            for i in range(len(line) - window_size):
                print(line[i:i + window_size])
                print(line[i + window_size])
                break
            break

ans:
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3, 4, 4, 4, 5,...]
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3]
4
```



In [2]:
name = 'hdfs_train'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

# Slide a window of `window_size` events over every session: each window is
# one input sequence and the event right after it is the prediction target.
# Sessions with at most `window_size` events contribute no training windows.
with open('/content/' + name, 'r') as f:
    for num_sessions, row in enumerate(f, start=1):
        # Shift the event ids read from the file down by one so they can be
        # used as 0-based class indices.
        line = [int(token) - 1 for token in row.strip().split()]
        for start in range(len(line) - window_size):
            stop = start + window_size
            inputs.append(line[start:stop])
            outputs.append(line[stop])

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
# Inputs are stored as floats (they are fed to the LSTM as values); targets keep
# their integer dtype because they are used as class labels.
dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))


Number of sessions(hdfs_train): 4855
Number of seqs(hdfs_train): 46575


In [3]:
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer that scores the next event.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of hidden units in each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output scores (one per candidate next event).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for the event following each sequence.

        Args:
            x: Tensor of shape [batch_size, sequence_length, input_size].

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        # Create the zero initial states on the same device and dtype as the
        # input instead of relying on a global `device`, so the module works
        # wherever the model and its input live.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))  # out.shape : [batch_size, sequence_length, hidden_size]
        # Keep only the last time step of every sequence: all batch entries
        # (first axis), index -1 on the sequence axis, all hidden units.
        out = self.fc(out[:, -1, :])
        return out


In [9]:
input_size = 1  # one feature per time step: the event id itself
num_layers = 2  # number of stacked LSTM layers
hidden_size = 64  # hidden units per LSTM layer
num_classes = 28  # one class per log template: 28 distinct template ids were found, shifted to 0..27 when loaded
batch_size = 2048  # training windows per optimisation step
num_epochs = 375  # full passes over the training windows

In [10]:
# Build the network on the selected device and batch the training windows.
model = Model(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
).to(device)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Loss function (the optimizer is created in a later cell)
criterion = nn.CrossEntropyLoss()


# learning rate warm-up
The reason for this warm-up strategy is that when the training starts, the model's weights are randomly initialized, and the optimizer might make large updates in the first few iterations. If the learning rate is too high during this phase, it can cause the model to diverge or take overly large steps and result in unstable training.

If your data set is highly differentiated, you can suffer from a sort of "early over-fitting". If your shuffled data happens to include a cluster of related, strongly-featured observations, your model's initial training can skew badly toward those features -- or worse, toward incidental features that aren't truly related to the topic at all.

Warm-up is particularly useful when using large-batch training, as it helps prevent sharp changes in the model's parameters, which can destabilize the optimization process.



In deep learning, especially when using advanced optimization techniques like learning rate scheduling or weight decay, you may have multiple parameter groups with different learning rates, weight decay values, or other optimization-specific settings.

Each parameter group typically corresponds to a specific set of model parameters. For example, in transfer learning, you may have one parameter group for the pre-trained layers with a lower learning rate and another parameter group for the newly added layers with a higher learning rate.

For example, consider the following code:


```
optimizer = torch.optim.Adam([
    {'params': model.fc.parameters(), 'lr': 0.001},  # Learning rate for the fully connected layer
    {'params': model.conv.parameters(), 'lr': 0.0001},  # Learning rate for the convolutional layers
], weight_decay=0.01)
```
In this case, the optimizer has two parameter groups: one for the fully connected layer and another for the convolutional layers. The for loop can be used to access and modify the learning rates for each parameter group as follows:



```
for param_group in optimizer.param_groups:
    print(param_group['lr'])  # Print the learning rate for each parameter group
    param_group['lr'] *= 0.1  # Multiply the learning rate by 0.1 for each parameter group
```




In [34]:
# Demo optimizer, created only to inspect what a parameter group contains.
# NOTE: the training optimizer is created again further down with
# lr=options['lr'], which replaces this one.
optimizer = optim.Adam(model.parameters(),lr=0.01)


# All model parameters were passed as a single iterable, so there is only one
# parameter group (index 0) and no loop over the groups is needed here.
print(optimizer.param_groups[0].keys())
print(optimizer.param_groups[0]['lr'])
print(optimizer.param_groups[0]['betas'])


dict_keys(['params', 'lr', 'betas', 'eps', 'weight_decay', 'amsgrad', 'maximize', 'foreach', 'capturable', 'differentiable', 'fused'])
0.01
(0.9, 0.999)


betas=(0.9, 0.999) indicates that the first moment (mean) will be updated using a moving average with a decay rate of 0.9, and the second moment (uncentered variance) will be updated using a moving average with a decay rate of 0.999. These values are commonly used as the default in many deep learning frameworks.

In [12]:
def adjust_learning_rate(optimizer, epoch, lr_step=(300, 350), lr_decay_ratio=0.1, warmup_epochs=5):
    """Apply warm-up and step decay to the learning rate of every parameter group.

    The learning rates are modified in place, relative to their current value.
    The function is therefore meant to be called exactly once per epoch, in
    order, starting at epoch 0 with the base learning rate set on the optimizer.

    Schedule:
        * epoch 0: the learning rate is divided by ``2 ** warmup_epochs``
          (32 with the default of 5);
        * epochs 1..warmup_epochs: the learning rate is doubled each epoch, so
          it is back at the base value once the warm-up is finished;
        * every epoch listed in ``lr_step``: the learning rate is multiplied by
          ``lr_decay_ratio`` to take smaller steps late in training.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts with an
            ``'lr'`` entry (e.g. a ``torch.optim`` optimizer).
        epoch: Zero-based index of the epoch that is about to start.
        lr_step: Epochs at which the learning rate is decayed.
        lr_decay_ratio: Multiplicative factor applied at each ``lr_step`` epoch.
        warmup_epochs: Number of doubling epochs that follow epoch 0.

    Returns:
        The same ``optimizer`` object, for convenient reassignment.
    """
    # Adjust every parameter group, not only the first one, so optimizers
    # configured with several groups keep a consistent schedule.
    for param_group in optimizer.param_groups:
        if epoch == 0:
            param_group['lr'] /= 2 ** warmup_epochs
        elif epoch in range(1, warmup_epochs + 1):
            param_group['lr'] *= 2
        if epoch in lr_step:
            param_group['lr'] *= lr_decay_ratio
    return optimizer

# Optimizer settings used by the training loop
options = dict(
    lr=0.001,             # learning rate handed to the optimizer
    lr_step=(300, 350),   # epochs at which the learning rate is decayed
    lr_decay_ratio=0.1,   # multiplicative decay applied at each lr_step epoch
    # Add other options here
)

optimizer = optim.Adam(params=model.parameters(), lr=options['lr'], betas=(0.9, 0.999))


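Validation part (draft, not executed). Note: as written, the loop below still calls `loss.backward()` and `optimizer.step()` after `model.eval()`, so it would keep training on the training data. A real validation pass should iterate over a held-out loader inside `torch.no_grad()` and must not update the weights.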

```

    if epoch >= num_epochs // 2 and epoch % 2 == 0:
        model.eval()
        total_losses = 0
        for step, (seq, label) in enumerate(dataloader):
            # Forward pass
            seq = seq.clone().detach().view(-1, window_size, input_size).to(device)
            output = model(seq)
            loss = criterion(output, label.to(device))

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))```



In [None]:

# Train the model
# NOTE: adjust_learning_rate() rescales the learning rate in place, so create a
# fresh optimizer before re-running this cell; otherwise the warm-up is applied
# on top of the already adjusted rate.
start_time = time.time()
total_step = len(dataloader)
# Switch to training mode explicitly: if this cell is re-run after the
# evaluation cell called model.eval(), training would otherwise run in eval
# mode (cuDNN refuses to backpropagate through an RNN in eval mode).
model.train()
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    optimizer = adjust_learning_rate(optimizer, epoch, options['lr_step'], options['lr_decay_ratio'])
    print(optimizer.param_groups[0]['lr'])
    train_loss = 0
    for seq, label in dataloader:
        # Forward pass: reshape each window to [batch, window_size, input_size].
        seq = seq.view(-1, window_size, input_size).to(device)
        output = model(seq)
        loss = criterion(output, label.to(device))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Report the mean batch loss of this epoch.
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')

3.125e-05
Epoch [1/375], train_loss: 3.2987
6.25e-05
Epoch [2/375], train_loss: 3.2606
0.000125
Epoch [3/375], train_loss: 3.1662
0.00025
Epoch [4/375], train_loss: 2.7627
0.0005
Epoch [5/375], train_loss: 1.9722
0.001
Epoch [6/375], train_loss: 1.6034
0.001
Epoch [7/375], train_loss: 1.3347
0.001
Epoch [8/375], train_loss: 1.1209
0.001
Epoch [9/375], train_loss: 0.9453
0.001
Epoch [10/375], train_loss: 0.8230
0.001
Epoch [11/375], train_loss: 0.7400
0.001


In [None]:
# Mount Google Drive (Colab only); the model parameters are saved to and
# loaded from /content/drive in the following cells.
# NOTE(review): this repeats the mount cell near the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save the model: only the parameters (state_dict) are written to Google Drive,
# not the model object itself.
torch.save(model.state_dict(), '/content/drive/MyDrive/LSTM_model_parameter')

In [None]:

# Load the saved parameters back into the model
model_path = '/content/drive/MyDrive/LSTM_model_parameter'
# map_location remaps the stored tensors onto the current device, so parameters
# saved on a GPU runtime can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
def generate(name, window=None, base_dir='/content/'):
    """Load a dataset file as a list of per-session event tuples.

    Each line of the file is one session of whitespace-separated integer event
    ids. Every id is shifted down by one, and sessions shorter than
    ``window + 1`` events are right-padded with -1 so that each session yields
    at least one (window, label) pair.

    Note: for a padded session the only label is the padding value -1, which
    can never be among the predicted class indices, so the evaluation loop
    always flags such sessions as anomalous.

    Args:
        name: File name of the dataset, relative to ``base_dir``.
        window: Window length used for padding. When None, the notebook-level
            ``window_size`` is used.
        base_dir: Directory that contains the dataset file.

    Returns:
        A list with one tuple of ints per line of the file, duplicates kept.
    """
    if window is None:
        window = window_size
    min_length = window + 1

    # A list keeps every session, including duplicates, which is needed to
    # evaluate on the full dataset; a set of unique sessions would be faster
    # but would change the reported counts.
    hdfs = []
    with open(os.path.join(base_dir, name), 'r') as f:
        for row in f:
            line = [(int(i) - 1) for i in row.strip().split()]
            # Pad short sessions with -1 up to window + 1 events; the
            # multiplier is <= 0 for long enough sessions, which adds nothing.
            line = line + [-1] * (min_length - len(line))
            hdfs.append(tuple(line))
    print('Number of sessions({}): {}'.format(name, len(hdfs)))
    return hdfs


In [None]:

# Load both test sets. Despite the "_loader" names, these are the plain lists
# of per-session tuples returned by generate(), not DataLoader objects.
test_normal_loader = generate('hdfs_test_normal')
test_abnormal_loader = generate('hdfs_test_abnormal')

Number of sessions(hdfs_test_normal): 553366
Number of sessions(hdfs_test_abnormal): 16838


In [None]:
# This is "g" in the DeepLog paper: an observed next event is considered normal
# when it is among the g (here 9) events with the highest predicted scores.
num_candidates = 9

In [None]:
# Test the model
model.eval()


def _count_flagged_sessions(sessions):
    """Return how many sessions contain at least one unexpected next event.

    Every window of `window_size` consecutive events is fed to the model. If
    the event that follows the window is not among the `num_candidates`
    highest-scoring predictions, the whole session is flagged and the rest of
    it is skipped.

    NOTE(review): windows are scored one at a time (batch size 1), which is
    what makes this cell slow; batching the windows of a session would be
    faster but is left unchanged here to keep the results identical.
    """
    flagged = 0
    with torch.no_grad():
        for line in sessions:
            for i in range(len(line) - window_size):
                seq = torch.tensor(line[i:i + window_size], dtype=torch.float)
                seq = seq.view(-1, window_size, input_size).to(device)
                label = line[i + window_size]
                output = model(seq)
                # argsort is ascending, so the last entries are the top candidates.
                predicted = torch.argsort(output, 1)[0][-num_candidates:]
                if not bool((predicted == label).any()):
                    flagged += 1
                    break  # a single wrong prediction marks the session as abnormal
    return flagged


start_time = time.time()
# Normal sessions that get flagged are false positives; abnormal sessions that
# get flagged are true positives.
FP = _count_flagged_sessions(test_normal_loader)
TP = _count_flagged_sessions(test_abnormal_loader)
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
# Compute precision, recall and F1-measure (as percentages). The guards avoid
# a ZeroDivisionError when nothing is flagged or the abnormal set is empty.
FN = len(test_abnormal_loader) - TP
P = 100 * TP / (TP + FP) if (TP + FP) else 0.0
R = 100 * TP / (TP + FN) if (TP + FN) else 0.0
F1 = 2 * P * R / (P + R) if (P + R) else 0.0
print('false positive (FP): {}, false negative (FN): {}, Precision: {:.3f}%, Recall: {:.3f}%, F1-measure: {:.3f}%'.format(FP, FN, P, R, F1))
print('Finished Predicting')

elapsed_time: 3368.678s
false positive (FP): 867, false negative (FN): 1258, Precision: 94.729%, Recall: 92.529%, F1-measure: 93.616%
Finished Predicting


In [None]:
# Raw confusion counts, one per line: true positives, false negatives, false positives.
print(TP, FN, FP, sep='\n')
print()

15580
1258
867

