In [1]:
!pip install spellpy

Collecting spellpy
  Downloading spellpy-0.0.9-py3-none-any.whl (19 kB)
Installing collected packages: spellpy
Successfully installed spellpy-0.0.9


In [2]:
import os
import sys
import logging
import pandas as pd
from spellpy import spell
from sklearn.model_selection import train_test_split

In [5]:
input_dir = "./data/banking_simulation"
output_dir = "./data/banking_simulation/result"

# 2023-11-03 01:46:40 - IP: 152.237.212.155 - Location: Berlin, Germany - Device ID: D7487C - User: ryan - Failed login attempt from user ryan.
log_format = "<Date> <Time> - IP: <IP> - Location: <Location> - Device ID: <DeviceID> - User: <User> - <Content>"
log_main = "banking_simulation"
tau = 0.5

def preprocess():
    parser = spell.LogParser(
        indir=input_dir,
        outdir=output_dir,
        log_format=log_format,
        logmain=log_main,
        tau=tau
    )

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    log_files = [
        'banking_simulation_normal.log',
        'banking_simulation_abnormal.log'
    ]
    for log_name in log_files:
        parser.parse(log_name)

    df_normal = pd.read_csv(f'{output_dir}/banking_simulation_normal.log_structured.csv')
    df_abnormal = pd.read_csv(f'{output_dir}/banking_simulation_abnormal.log_structured.csv')

    return df_normal, df_abnormal

df_normal, df_abnormal = preprocess()

In [6]:
df_normal.head()

Unnamed: 0,LineId,Date,Time,IP,Location,DeviceID,User,Content,EventId,EventTemplate,ParameterList
0,1,INFO:,2023-11-09 00:40:41,125.185.177.95,"Berlin, Germany",D86D92,aiden,Server status: Service interruption in progres...,8c6357bd,Server status Service interruption in progress...,[]
1,2,INFO:,2023-11-09 00:40:41,65.4.98.176,"New York, USA",475C29,aiden,Failed login attempt from user aiden.,6e63e1cc,<*> login attempt from user <*>,"['Failed', 'aiden']"
2,3,WARNING:,2023-11-09 00:40:41,71.194.191.10,"Tokyo, Japan",C3A664,aiden,Memory usage: 86% used.,a0f90bd7,Memory usage <*> used.,['86%']
3,4,INFO:,2023-11-09 00:40:41,186.119.229.225,"New York, USA",76AE17,aiden,CPU utilization: 28% utilized.,39075140,CPU utilization <*> utilized.,['28%']
4,5,INFO:,2023-11-09 00:45:41,65.4.98.176,"New York, USA",475C29,aiden,Successful login attempt from user aiden.,6e63e1cc,<*> login attempt from user <*>,"['Successful', 'aiden']"


In [7]:
df_normal.shape

(85447, 11)

In [8]:
df_normal['EventTemplate'].value_counts()

Memory usage <*> used.                                              11939
CPU utilization <*> utilized.                                       11939
Execution time Execution time <*> seconds.                          11939
Disk space <*> free.                                                11327
Successful <*> of <*> units.                                         8193
Network activity <*> network <*>                                     7963
<*> login attempt from user <*>                                      5620
Server status <*> and <*>                                            4814
Server status Service interruption in progress. Investigating...     2417
Server status All systems are operational.                           2403
Server status Minor issues detected but operational.                 2305
Disk space Disk space critically low only <*> free.                   612
Name: EventTemplate, dtype: int64

In [9]:
df_normal_train, df_normal_test = train_test_split(df_normal, test_size=0.2, shuffle=False)

In [10]:
%%time
# def group_logs_by_datetime(df, event_id_map):
#     df["Datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"])
#     df = df[["Datetime", "EventId"]]
#     df["EventId"] = df["EventId"].apply(lambda e: event_id_map[e] if event_id_map.get(e) else -1)
#     deeplog_df = df.set_index("Datetime").resample("1min").apply(lambda arr: list(arr)).reset_index()
#     return deeplog_df

def group_by_user(df, event_id_map):
    df = df[["User", "EventId"]]
    df["EventId"] = df["EventId"].apply(lambda e: event_id_map[e] if event_id_map.get(e) else -1)
    deeplog_df = df.groupby('User')['EventId'].agg(list).reset_index()
    return deeplog_df

def save_deeplog_df(filename, df):
    with open(filename, "w") as f:
        for event_id_list in df["EventId"]:
            for event_id in event_id_list:
                f.write(str(event_id) + " ")
            f.write("\n")

def generate_log_key_sequences(df_normal_train, df_normal_test, df_abnormal):
    event_id_map = {}
    for index, event_id in enumerate(df_normal_train["EventId"].unique(), 1):
        event_id_map[event_id] = index

    print(f"Número de log keys únicos {len(event_id_map)}")

    print(event_id_map)
    deeplog_train_df = group_by_user(df_normal_train, event_id_map)
    save_deeplog_df("train", deeplog_train_df)

    deeplog_test_normal = group_by_user(df_normal_test, event_id_map)
    save_deeplog_df("test_normal", deeplog_test_normal)

    deeplog_test_abnormal = group_by_user(df_abnormal, event_id_map)
    save_deeplog_df("test_abnormal", deeplog_test_abnormal)

generate_log_key_sequences(df_normal_train, df_normal_test, df_abnormal)

Número de log keys únicos 13
{'8c6357bd': 1, '6e63e1cc': 2, 'a0f90bd7': 3, '39075140': 4, 'e760a141': 5, '05d9848a': 6, 'a57ba91d': 7, '36c47a43': 8, 'f5727082': 9, 'a156c921': 10, '2ef54992': 11, '9399db72': 12, '1383d305': 13}
CPU times: user 88.5 ms, sys: 3.1 ms, total: 91.6 ms
Wall time: 91.6 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
import json

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

NUM_CLASSES = 13
NUM_CANDIDATES = 5
EPOCHS = 20
WINDOW_SIZE =  3
BATCH_SIZE = 64
SEED = 1

INPUT_SIZE = 1
HIDDEN_SIZE = 64
NUM_LAYERS = 2

In [12]:
class Generate:
    def __init__(self):
        self.file = None

    def generate(self, filename, window_size):
        num_sessions = 0
        inputs = []
        outputs = []

        line = self.init_line(filename)
        while line:
            line = tuple(map(lambda n: n - 1, map(int, line.strip().split())))
            for i in range(len(line) - window_size):
                inputs.append(line[i:i+window_size])
                outputs.append(line[i+window_size])
            line = self.readline()
            num_sessions += 1

        print('Number of session({}): {}'.format(filename, len(inputs)))
        print('Number of seqs({}): {}'.format(filename, len(inputs)))

        dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))

        return dataset

    def init_line(self, filename):
        f = open(filename, 'r')
        self.file = f
        line = self.file.readline()
        return line

    def readline(self):
        line = self.file.readline()
        return line

def get_train_data_loader():
    print("Get train data loader")
    generate = Generate()
    sequence_dataset = generate.generate(filename="train", window_size=WINDOW_SIZE)
    dataloader = DataLoader(sequence_dataset, batch_size=BATCH_SIZE, shuffle=None, sampler=None)
    return dataloader


def save_model(model, model_dir):
    print("Saving the model.")
    path = os.path.join(model_dir, 'model.pth')
    torch.save(model.cpu().state_dict(), path)
    model_info_path = os.path.join(model_dir, 'model_info.pth')
    with open(model_info_path, 'wb') as f:
        model_info = {
            'input_size': INPUT_SIZE,
            'hidden_size': HIDDEN_SIZE,
            'num_layers': NUM_LAYERS,
            'num_classes': NUM_CLASSES,
            'num_candidates': NUM_CANDIDATES,
            'window_size': WINDOW_SIZE,
        }
        torch.save(model_info, f)

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

In [14]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        h0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size).to(input.device)
        c0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size).to(input.device)
        out, _ = self.lstm(input, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [15]:
%%time
torch.manual_seed(SEED)
train_loader = get_train_data_loader()

print("processed {}/{} ({:.0f}%) of traind data".format(
    len(train_loader.sampler), len(train_loader.dataset),
    100. * len(train_loader.sampler) / len(train_loader.dataset)
))

model = Model(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_loss = 0
    for seq, label in train_loader:
        seq = seq.clone().detach().view(-1, WINDOW_SIZE, INPUT_SIZE).to(device)
        optimizer.zero_grad()
        output = model(seq)
        loss = criterion(output, label.to(device))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print('Epoch [{}/{}], Train_loss: {}'.format(
        epoch, EPOCHS, round(train_loss/len(train_loader.dataset), 4)
    ))

print('Finished Training')

Get train data loader
Number of session(train): 68216
Number of seqs(train): 68216
processed 68216/68216 (100%) of traind data
Epoch [1/20], Train_loss: 0.0204
Epoch [2/20], Train_loss: 0.0131
Epoch [3/20], Train_loss: 0.012
Epoch [4/20], Train_loss: 0.0116
Epoch [5/20], Train_loss: 0.0114
Epoch [6/20], Train_loss: 0.0113
Epoch [7/20], Train_loss: 0.0113
Epoch [8/20], Train_loss: 0.0112
Epoch [9/20], Train_loss: 0.0112
Epoch [10/20], Train_loss: 0.0112
Epoch [11/20], Train_loss: 0.0111
Epoch [12/20], Train_loss: 0.0111
Epoch [13/20], Train_loss: 0.0111
Epoch [14/20], Train_loss: 0.0111
Epoch [15/20], Train_loss: 0.0111
Epoch [16/20], Train_loss: 0.0111
Epoch [17/20], Train_loss: 0.0111
Epoch [18/20], Train_loss: 0.0111
Epoch [19/20], Train_loss: 0.0111
Epoch [20/20], Train_loss: 0.0111
Finished Training
CPU times: user 56.7 s, sys: 2.25 s, total: 58.9 s
Wall time: 1min 6s


In [16]:
!mkdir model

In [17]:
model

Model(
  (lstm): LSTM(1, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=13, bias=True)
)

In [18]:
save_model(model, 'model')

Saving the model.


## Prediccion

In [19]:
def load_model(model_dir):
    print("Loading the model.")

    model_info = {}
    with open(os.path.join(model_dir, "model_info.pth"), "rb") as f:
        model_info = torch.load(f)

    print("model_info: {}".format(model_info))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Current device: {}".format(device))

    model = Model(
        input_size=model_info["input_size"],
        hidden_size=model_info["hidden_size"],
        num_layers=model_info["num_layers"],
        num_classes=model_info["num_classes"],
    )

    with open(os.path.join(model_dir, "model.pth"), "rb") as f:
        model.load_state_dict(torch.load(f))

    return {
        "model": model.to(device),
        "window_size": model_info["window_size"],
        "input_size": model_info["input_size"],
        "num_candidates": model_info["num_candidates"]
    }

def predict(input_data, model_info):
    line = input_data['line']
    num_candidates = model_info['num_candidates']
    input_size = model_info['input_size']
    window_size = model_info['window_size']
    model = model_info['model']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    predict_cnt = 0
    anomaly_cnt = 0
    predict_list = [0] * len(line)
    for i in range(len(line) - window_size):
        seq = line[i:i + window_size]
        label = line[i + window_size]
        seq = torch.tensor(seq, dtype=torch.float).view(-1, window_size, input_size).to(device)
        label = torch.tensor(label).view(-1).to(device)
        output = model(seq)
        predict = torch.argsort(output, 1)[0][-num_candidates:]
        if label not in predict:
            anomaly_cnt += 1
            predict_list[i + window_size] = 1
            predict_cnt += 1
    return {'anomaly_cnt': anomaly_cnt, 'predict_cnt': predict_cnt, 'predict_list': predict_list}

In [20]:
%%time
model_info = load_model("./model")
test_abnormal_list = []
with open('test_abnormal', 'r') as f:
    for line in f.readlines():
        line = list(map(lambda n: n - 1, map(int, line.strip().split())))
        response = predict(json.loads(json.dumps({"line": line})), model_info)
        test_abnormal_list.append(response)

test_normal_list = []
with open('test_normal', 'r') as f:
    for line in f.readlines():
        line = list(map(lambda n: n - 1, map(int, line.strip().split())))
        response = predict(json.loads(json.dumps({"line": line})), model_info)
        test_normal_list.append(response)

Loading the model.
model_info: {'input_size': 1, 'hidden_size': 64, 'num_layers': 2, 'num_classes': 13, 'num_candidates': 5, 'window_size': 3}
Current device: cuda
CPU times: user 12.7 s, sys: 30.5 ms, total: 12.7 s
Wall time: 12.8 s


In [21]:
%%time
threshold = 5
abnormal_has_anomaly = [1 if t["anomaly_cnt"] > threshold else 0 for t in test_abnormal_list]
abnormal_cnt_anomaly = [t["anomaly_cnt"] for t in test_abnormal_list]
abnormal_predict = []
for test_abnormal in test_abnormal_list:
    abnormal_predict += test_abnormal["predict_list"]

normal_has_anomaly = [1 if t['anomaly_cnt'] > threshold else 0 for t in test_normal_list]
normal_cnt_anomaly = [t['anomaly_cnt'] for t in test_normal_list]
normal_predict = []
for test_normal in test_normal_list:
    normal_predict += test_normal['predict_list']

ground_truth = [1]*len(abnormal_has_anomaly) + [0]*len(normal_has_anomaly)
predict = abnormal_has_anomaly + normal_has_anomaly
TP = 0
FP = 0
TN = 0
FN = 0
accu = 0
for p, t in zip(predict, ground_truth):
    if p == t:
        accu += 1

    if p == 1 and t == 1:
        TP += 1
    elif p == 1 and t == 0:
        FP += 1
    elif p == 0 and t == 1:
        FN += 1
    else:
        TN += 1

print(f'thres: {threshold}')
print(f'TP: {TP}')
print(f'FP: {FP}')
print(f'TN: {TN}')
print(f'FN: {FN}')

accuracy = accu / len(predict)
precision = TP / (TP + FP) if (TP + FP) else 0
recall = TP / (TP + FN) if (TP + FN) else 0
F1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

print(f'accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {F1}')

thres: 5
TP: 45
FP: 21
TN: 26
FN: 2
accuracy: 0.7553191489361702
Precision: 0.6818181818181818
Recall: 0.9574468085106383
F1: 0.7964601769911505
CPU times: user 3.24 ms, sys: 26 µs, total: 3.27 ms
Wall time: 6.86 ms
