<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/C(deeplog_model).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:

import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import os
import numpy as np
import pandas as pd

# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Alternative data files:


```

# download datasets
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_train'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_normal'
!wget 'https://raw.githubusercontent.com/donglee-afar/logdeep/master/data/hdfs/hdfs_test_abnormal'
     
```
Be careful: these files are plain-text log sequence files, not CSV files.


In [1]:
# Mount Google Drive at /content/drive so that the dataset files stored there
# can be read by the cells below (Colab-only; requires interactive authorisation).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [70]:
# Split the normal sequence file into train and test sets according to a ratio

def split_log_file(input_file, train_ratio=0.7,
                   train_path='hdfs_train_sequence',
                   test_path='hdfs_test_sequence_normal'):
    """Split a line-oriented log file into a train file and a test file.

    The first ``int(num_lines * train_ratio)`` lines of ``input_file`` are
    written to ``train_path`` and the remaining lines to ``test_path``.
    The split is positional (no shuffling), and both output files are
    overwritten if they already exist.

    Args:
        input_file: Path of the file to split; one record per line.
        train_ratio: Fraction of lines (between 0 and 1 inclusive) that
            goes to the train file.
        train_path: Destination of the train split. The default keeps the
            file name the rest of the notebook expects.
        test_path: Destination of the test split. The default keeps the
            file name the rest of the notebook expects.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    # A ratio outside [0, 1] would silently produce a nonsensical split
    # (e.g. a negative value slices from the end of the list), so reject it.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            'train_ratio must be between 0 and 1, got {!r}'.format(train_ratio))

    # Read the log file and split it into lines (line endings are kept)
    with open(input_file, 'r') as log_file:
        log_lines = log_file.readlines()

    # Number of leading lines that belong to the train set
    num_train_lines = int(len(log_lines) * train_ratio)

    # Write the lines corresponding to the train set to the train file
    with open(train_path, 'w') as train_file:
        train_file.writelines(log_lines[:num_train_lines])

    # Write the remaining lines (test set) to the test file
    with open(test_path, 'w') as test_file:
        test_file.writelines(log_lines[num_train_lines:])

# Split the normal-session file 70/30; the two parts are written to
# 'hdfs_train_sequence' and 'hdfs_test_sequence_normal' in the current working directory
split_log_file('/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_sequence_normal', train_ratio=0.7)

# Copy the abnormal test file from Drive into /content/ (IPython shell escape)
!cp '/content/drive/MyDrive/HDFS/structured_hdfs/hdfs_test_sequence_abnormal' '/content/'

In [79]:
names = ['hdfs_train_sequence','hdfs_test_sequence_normal','hdfs_test_sequence_abnormal']
templates = set()

# Collect every distinct whitespace-separated token that appears in the
# train, normal-test and abnormal-test files.
for name in names:
    with open('/content/' + name, 'r') as f:
        for row in f:
            templates.update(row.split())

# Tokens are strings, so the sorted listing is in lexicographic order.
print(sorted(templates))
print('nember of templates : ',len(templates))

['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '5', '6', '7', '8', '9']
nember of templates :  48


Test: preview the first sliding window and its next-event label for the first training session:


```
name = 'hdfs_train_sequence'
window_size = 10
num_sessions = 0
inputs = []
outputs = []

with open('/content/' + name, 'r') as f:
        for row in f:
            num_sessions += 1
            line = [ int(i) for i in row.strip().split()]
            print(line)
            for i in range(len(line) - window_size):
                print(line[i:i + window_size])
                print(line[i + window_size])
                break
            break

Output:
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3, 4, 4, 4, 5,...]
[0, 1, 0, 0, 2, 2, 3, 3, 2, 3]
4
```



In [83]:
name = 'hdfs_train_sequence'
window_size = 10
num_sessions = 0
inputs, outputs = [], []

# Each row of the file is one session: a whitespace-separated list of integer
# event ids. Slide a window of `window_size` events over the session; the
# window is the model input and the event right after it is the target.
with open('/content/' + name, 'r') as f:
    for row in f:
        num_sessions += 1
        line = [int(token) for token in row.strip().split()]
        # Sessions with at most `window_size` events yield no samples.
        for start in range(len(line) - window_size):
            stop = start + window_size
            inputs.append(line[start:stop])
            outputs.append(line[stop])

print('Number of sessions({}): {}'.format(name, num_sessions))
print('Number of seqs({}): {}'.format(name, len(inputs)))
# Inputs are stored as floats (they are fed to the LSTM as values); targets
# keep the default integer dtype so they can serve as class indices.
dataset = TensorDataset(
    torch.tensor(inputs, dtype=torch.float),
    torch.tensor(outputs),
)


Number of sessions(hdfs_train_sequence): 390756
Number of seqs(hdfs_train_sequence): 4200230


In [78]:
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Size of the output layer (number of scores per sample).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores for a batch of sequences.

        Args:
            x: Tensor of shape [batch_size, sequence_length, input_size].

        Returns:
            Tensor of shape [batch_size, num_classes] with unnormalised scores.
        """
        # Allocate the zero initial states directly on the input's device and
        # with its dtype, instead of relying on a module-level `device` global
        # and copying from the CPU on every call.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(x, (h0, c0))  # out.shape : [batch_size, sequence_length, hidden_size]
        # Keep only the hidden state of the last time step for every sample:
        # out[:, -1, :] has shape [batch_size, hidden_size].
        return self.fc(out[:, -1, :])


In [80]:
# Hyperparameters for the LSTM model and its training run
input_size = 1  # features per time step: each event id is fed as a single value
num_layers = 2  # number of stacked LSTM layers
hidden_size = 64  # size of the LSTM hidden state
num_classes = 49  # 48 templates + 1 abnormal output
batch_size = 2048  # samples per DataLoader batch
num_epochs = 150  # full passes over the training dataset

In [84]:
# Build the network and move its parameters to the selected device
model = Model(input_size, hidden_size, num_layers, num_classes).to(device)
# Reshuffle the samples every epoch; pin_memory=True makes the loader return batches in pinned host memory
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

# Loss and optimizer
# Cross-entropy between the model's raw scores and the integer next-event labels
criterion = nn.CrossEntropyLoss()
# Adam with its default settings (no learning rate or other options are passed)
optimizer = optim.Adam(model.parameters())

In [None]:

# Train the model
# Make sure the model is in training mode even if this cell is re-run after
# an evaluation step switched it to eval mode.
model.train()
start_time = time.time()
total_step = len(dataloader)  # number of batches per epoch
for epoch in range(num_epochs):  # Loop over the dataset multiple times
    train_loss = 0
    for step, (seq, label) in enumerate(dataloader):
        # Forward pass
        # Batches coming from the DataLoader carry no autograd history, so no
        # clone()/detach() copy is needed; just reshape to
        # [batch, window_size, input_size]. The loader pins its batches, so
        # the host-to-device copies can be issued as non-blocking.
        seq = seq.view(-1, window_size, input_size).to(device, non_blocking=True)
        label = label.to(device, non_blocking=True)
        output = model(seq)
        loss = criterion(output, label)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the mean batch loss of this epoch
    print('Epoch [{}/{}], train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / total_step))
elapsed_time = time.time() - start_time
print('elapsed_time: {:.3f}s'.format(elapsed_time))
print('Finished Training')