In [1]:
import os
import sys
import logging
import pandas as pd
from spellpy import spell
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the raw .log files, and the directory the parsed output is written to / read back from.
input_dir = "./data/banking_simulation"
output_dir = "./data/banking_simulation/result"

# Example raw log line that `log_format` below is meant to match:
# 2023-11-03 01:46:40 - IP: 152.237.212.155 - Location: Berlin, Germany - Device ID: D7487C - User: ryan - Failed login attempt from user ryan.
log_format = "<Date> <Time> - IP: <IP> - Location: <Location> - Device ID: <DeviceID> - User: <User> - <Content>"
# Passed to spell.LogParser as `logmain` in preprocess().
log_main = "banking_simulation"
# Passed to spell.LogParser as `tau` in preprocess(); presumably Spell's template-matching
# threshold -- TODO confirm against the spellpy documentation.
tau = 0.5

def preprocess():
    """Parse the normal and abnormal banking logs with Spell and load the results.

    Uses the module-level ``input_dir``, ``output_dir``, ``log_format``,
    ``log_main`` and ``tau`` settings. Each log file is passed to
    ``spell.LogParser.parse`` and the matching ``<name>_structured.csv`` file
    is then read from ``output_dir``.

    Returns:
        tuple: ``(df_normal, df_abnormal)`` DataFrames read from the
        structured CSV files, in that order.
    """
    # Create the output directory up front; exist_ok avoids the
    # check-then-create race and keeps the cell safe to re-run.
    os.makedirs(output_dir, exist_ok=True)

    parser = spell.LogParser(
        indir=input_dir,
        outdir=output_dir,
        log_format=log_format,
        logmain=log_main,
        tau=tau
    )

    log_files = [
        'banking_simulation_normal.log',
        'banking_simulation_abnormal.log'
    ]
    for log_name in log_files:
        parser.parse(log_name)

    # Derive the CSV paths from the same list that was parsed so the two
    # cannot drift apart.
    df_normal, df_abnormal = (
        pd.read_csv(f'{output_dir}/{log_name}_structured.csv')
        for log_name in log_files
    )

    return df_normal, df_abnormal

df_normal, df_abnormal = preprocess()

[2023-11-03 02:32:18,627][INFO]: Parsing file: ./data/banking_simulation/banking_simulation_normal.log
[2023-11-03 02:32:18,760][INFO]: Loaded 36.9% of log lines.
[2023-11-03 02:32:18,856][INFO]: Loaded 73.7% of log lines.
[2023-11-03 02:32:18,927][INFO]: Loaded 100.0% of log lines.
[2023-11-03 02:32:18,945][INFO]: load_data() finished!
[2023-11-03 02:32:18,952][INFO]: Load objects done, lastestLineId: 87121
[2023-11-03 02:32:19,240][INFO]: Processed 36.9% of log lines.
[2023-11-03 02:32:19,511][INFO]: Processed 73.7% of log lines.
[2023-11-03 02:32:19,722][INFO]: Processed 100.0% of log lines.
[2023-11-03 02:32:20,438][INFO]: Output parse file
[2023-11-03 02:32:20,660][INFO]: lastestLindId: 87121
[2023-11-03 02:32:21,798][INFO]: rootNodePath: ./data/banking_simulation/result/rootNode.pkl
[2023-11-03 02:32:21,800][INFO]: logCluLPath: ./data/banking_simulation/result/logCluL.pkl
[2023-11-03 02:32:21,802][INFO]: Store objects done.
[2023-11-03 02:32:21,802][INFO]: Parsing done. [Time tak

In [3]:
df_normal.head()

Unnamed: 0,LineId,Date,Time,IP,Location,DeviceID,User,Content,EventId,EventTemplate,ParameterList
0,87122,2023-11-03,02:13:05,66.214.242.20,"New York, USA",3C00BA,jack,Failed login attempt from user jack.,6e63e1cc,<*> login attempt from user <*>,"['Failed', 'jack']"
1,87123,2023-11-03,02:18:05,66.214.242.20,"New York, USA",3C00BA,jack,Successful login attempt from user jack.,6e63e1cc,<*> login attempt from user <*>,"['Successful', 'jack']"
2,87124,2023-11-03,02:13:05,193.179.221.227,"Mumbai, India",AAC6A9,grace,Failed login attempt from user grace.,6e63e1cc,<*> login attempt from user <*>,"['Failed', 'grace']"
3,87125,2023-11-03,02:18:05,193.179.221.227,"Mumbai, India",AAC6A9,grace,Successful login attempt from user grace.,6e63e1cc,<*> login attempt from user <*>,"['Successful', 'grace']"
4,87126,2023-11-03,02:13:05,120.43.83.79,"London, UK",7E9E7E,penelope,Successful login attempt from user penelope.,6e63e1cc,<*> login attempt from user <*>,"['Successful', 'penelope']"


In [4]:
df_normal.shape

(27121, 11)

In [5]:
df_normal['EventTemplate'].value_counts()

Successful <*> of <*> units. <*>    18243
<*> login attempt from user <*>      8878
Name: EventTemplate, dtype: int64

In [6]:
df_normal['EventId'].value_counts()

73d80640    18243
6e63e1cc     8878
Name: EventId, dtype: int64

In [7]:
df_normal['User'].value_counts()

alice        659
oscar        636
leah         636
daniel       624
quinn        617
bob          604
victor       603
katherine    603
zane         601
ethan        601
david        600
sophia       596
ella         595
jack         592
mason        591
lily         588
ivy          587
james        586
lucy         581
emma         578
eve          576
zoe          574
ava          574
oliver       572
olivia       572
aiden        572
grace        570
liam         569
xander       569
hannah       565
noah         560
sophie       559
ursula       557
chloe        557
willow       557
penelope     556
nora         556
jacob        555
thomas       555
mia          554
charlie      547
ryan         547
benjamin     543
frank        542
yasmine      532
harper       530
amelia       523
Name: User, dtype: int64

In [None]:
# shuffle=False keeps the rows in their existing order: the first 80% of df_normal
# becomes the training split and the last 20% the test split.
df_normal_train, df_normal_test = train_test_split(df_normal, test_size=0.2, shuffle=False)

In [None]:
%%time
def group_logs_by_datetime(df, event_id_map):
    """Group mapped log keys into one list per 1-minute window.

    Args:
        df: DataFrame with string columns ``Date`` and ``Time`` and an
            ``EventId`` column. It is not modified.
        event_id_map: dict mapping an EventId to its integer log key.
            EventIds missing from the map are encoded as ``-1``.

    Returns:
        DataFrame with a ``Datetime`` column (start of each 1-minute bin) and
        an ``EventId`` column holding the list of log keys that fall in it.
    """
    # Build a fresh frame instead of adding columns to the caller's frame:
    # the input is usually a slice (e.g. from train_test_split), so writing
    # into it raised SettingWithCopyWarning and leaked a "Datetime" column.
    keyed = df[["EventId"]].assign(
        Datetime=pd.to_datetime(df["Date"] + " " + df["Time"]),
        # dict.get with a default distinguishes "missing" from a falsy key.
        EventId=lambda frame: frame["EventId"].apply(lambda e: event_id_map.get(e, -1)),
    )
    deeplog_df = keyed.set_index("Datetime").resample("1min").apply(lambda arr: list(arr)).reset_index()
    return deeplog_df


def save_deeplog_df(filename, df):
    """Write ``df["EventId"]`` to ``filename``, one sequence per line.

    Every id in a sequence is written followed by a single space, so a
    non-empty line ends with a trailing space and an empty sequence yields
    an empty line.
    """
    with open(filename, "w") as out:
        for sequence in df["EventId"]:
            out.write("".join(str(key) + " " for key in sequence))
            out.write("\n")

def generate_log_key_sequences(df_normal_train, df_normal_test, df_abnormal):
    """Encode the three splits as log-key sequences and write them to disk.

    The EventId -> integer key map is built from the training split only
    (keys start at 1, in order of first appearance). Each split is grouped
    with ``group_logs_by_datetime`` and written with ``save_deeplog_df`` to
    the files ``train``, ``test_normal`` and ``test_abnormal``.
    """
    event_id_map = {
        event_id: key
        for key, event_id in enumerate(df_normal_train["EventId"].unique(), 1)
    }

    print(f"Número de log keys únicos {len(event_id_map)}")

    print(event_id_map)

    # Processed in the same order as before: train, normal test, abnormal test.
    splits = (
        ("train", df_normal_train),
        ("test_normal", df_normal_test),
        ("test_abnormal", df_abnormal),
    )
    for target_file, frame in splits:
        grouped = group_logs_by_datetime(frame, event_id_map)
        save_deeplog_df(target_file, grouped)

generate_log_key_sequences(df_normal_train, df_normal_test, df_abnormal)

In [None]:
import json

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Output size of the model's final linear layer (passed to Model as num_classes).
# NOTE(review): df_normal['EventId'].value_counts() earlier in this notebook shows only 2
# distinct log keys, so 1143 looks carried over from another dataset -- confirm.
NUM_CLASSES = 1143
# Number of top-scoring classes predict() accepts as "normal" for the next log key.
# NOTE(review): with so few real log keys, a top-114 window may accept every known key -- confirm.
NUM_CANDIDATES = 114
# Number of passes over the training DataLoader.
EPOCHS = 35
# Number of consecutive log keys used as input to predict the next one.
WINDOW_SIZE =  3
# Batch size of the training DataLoader.
BATCH_SIZE = 64
# Seed passed to torch.manual_seed before training.
SEED = 1

# LSTM settings passed to Model: features per time step, hidden units, stacked layers.
INPUT_SIZE = 1
HIDDEN_SIZE = 64
NUM_LAYERS = 2

In [None]:
class Generate:
    """Builds sliding-window training samples from a log-key sequence file."""

    def __init__(self):
        # Handle of the file currently being read; None when no file is open.
        self.file = None

    def generate(self, filename, window_size):
        """Read ``filename`` and return a TensorDataset of (window, next key) pairs.

        Each line of the file is one session of whitespace-separated 1-based
        log keys. Keys are shifted to 0-based, and every run of
        ``window_size`` consecutive keys becomes an input whose label is the
        key that follows it. Sessions with ``window_size`` keys or fewer
        contribute no samples.

        Args:
            filename: path of the sequence file to read.
            window_size: number of keys in each input window.

        Returns:
            TensorDataset of float inputs shaped (num_samples, window_size)
            and their integer labels.
        """
        num_sessions = 0
        inputs = []
        outputs = []

        line = self.init_line(filename)
        try:
            # A blank line is still "\n" (truthy); only EOF returns "".
            while line:
                keys = tuple(int(token) - 1 for token in line.split())
                for start in range(len(keys) - window_size):
                    inputs.append(keys[start:start + window_size])
                    outputs.append(keys[start + window_size])
                num_sessions += 1
                line = self.readline()
        finally:
            # Previously the handle was left open for the life of the object.
            self.file.close()
            self.file = None

        # Report the session count; the original printed len(inputs) twice.
        print('Number of session({}): {}'.format(filename, num_sessions))
        print('Number of seqs({}): {}'.format(filename, len(inputs)))

        dataset = TensorDataset(torch.tensor(inputs, dtype=torch.float), torch.tensor(outputs))

        return dataset

    def init_line(self, filename):
        """Open ``filename`` for reading, keep the handle and return its first line."""
        f = open(filename, 'r')
        self.file = f
        line = self.file.readline()
        return line

    def readline(self):
        """Return the next line of the file opened by ``init_line``."""
        line = self.file.readline()
        return line

def get_train_data_loader():
    """Return a DataLoader over the sliding-window samples read from the ``train`` file.

    Uses the module-level ``WINDOW_SIZE`` and ``BATCH_SIZE``; batches are
    served without an explicit shuffle flag or sampler.
    """
    print("Get train data loader")
    train_dataset = Generate().generate(filename="train", window_size=WINDOW_SIZE)
    return DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=None, sampler=None)


def save_model(model, model_dir):
    """Save the model weights and the hyper-parameters needed to rebuild it.

    Writes ``model.pth`` (the state dict) and ``model_info.pth`` (a dict of
    the module-level size/window constants) into ``model_dir``, creating the
    directory if it does not exist yet.

    Args:
        model: the trained ``nn.Module``. Note that ``model.cpu()`` moves the
            module itself to the CPU, so it is on the CPU after this call.
        model_dir: target directory for both files.
    """
    print("Saving the model.")
    # Create the target directory so saving does not depend on a separate
    # mkdir step having been run first.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    path = os.path.join(model_dir, 'model.pth')
    torch.save(model.cpu().state_dict(), path)

    model_info = {
        'input_size': INPUT_SIZE,
        'hidden_size': HIDDEN_SIZE,
        'num_layers': NUM_LAYERS,
        'num_classes': NUM_CLASSES,
        'num_candidates': NUM_CANDIDATES,
        'window_size': WINDOW_SIZE,
    }
    model_info_path = os.path.join(model_dir, 'model_info.pth')
    with open(model_info_path, 'wb') as f:
        torch.save(model_info, f)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')

In [None]:
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """Create the LSTM (batch-first) and the linear output layer.

        Args:
            input_size: number of features per time step.
            hidden_size: number of hidden units in each LSTM layer.
            num_layers: number of stacked LSTM layers.
            num_classes: size of the output vector.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        """Return one score vector of length ``num_classes`` per batch row.

        The LSTM starts from all-zero hidden and cell states created on the
        same device as ``input``; only the output of the final time step is
        fed to the linear layer.
        """
        state_shape = (self.num_layers, input.size(0), self.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=input.device)
        initial_cell = torch.zeros(state_shape, device=input.device)
        sequence_out, _ = self.lstm(input, (initial_hidden, initial_cell))
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
%%time
# Seed PyTorch before the model is created so weight initialisation is repeatable.
torch.manual_seed(SEED)
train_loader = get_train_data_loader()

print("processed {}/{} ({:.0f}%) of traind data".format(
    len(train_loader.sampler), len(train_loader.dataset),
    100. * len(train_loader.sampler) / len(train_loader.dataset)
))

model = Model(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

num_train_samples = len(train_loader.dataset)

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_loss = 0.0
    for seq, label in train_loader:
        # Reshape each batch to (batch, WINDOW_SIZE, INPUT_SIZE) for the batch-first LSTM.
        seq = seq.view(-1, WINDOW_SIZE, INPUT_SIZE).to(device)
        label = label.to(device)
        optimizer.zero_grad()
        output = model(seq)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        # CrossEntropyLoss returns the batch mean. Weight it by the batch size
        # so that dividing by the sample count below gives the true per-sample
        # average (the original divided a sum of batch means by the sample
        # count, under-reporting the loss by roughly BATCH_SIZE).
        train_loss += loss.item() * label.size(0)
    print('Epoch [{}/{}], Train_loss: {}'.format(
        epoch, EPOCHS, round(train_loss / num_train_samples, 4)
    ))

print('Finished Training')

In [None]:
# Create the model directory from Python: unlike `!mkdir model`, this does not
# error when the cell is re-run and does not depend on the shell.
os.makedirs('model', exist_ok=True)

In [None]:
model

In [None]:
save_model(model, 'model')

## Predicción

In [None]:
def load_model(model_dir):
    """Rebuild the model saved in ``model_dir`` and return it with its settings.

    Reads ``model_info.pth`` (hyper-parameters) and ``model.pth`` (state
    dict) from ``model_dir``, constructs a ``Model`` with the stored sizes,
    loads the weights and moves the model to the GPU when one is available.

    Args:
        model_dir: directory containing ``model_info.pth`` and ``model.pth``.

    Returns:
        dict with keys ``model`` (in eval mode, on the selected device),
        ``window_size``, ``input_size`` and ``num_candidates``.
    """
    print("Loading the model.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    with open(os.path.join(model_dir, "model_info.pth"), "rb") as f:
        model_info = torch.load(f)

    print("model_info: {}".format(model_info))
    print("Current device: {}".format(device))

    model = Model(
        input_size=model_info["input_size"],
        hidden_size=model_info["hidden_size"],
        num_layers=model_info["num_layers"],
        num_classes=model_info["num_classes"],
    )

    # map_location lets a checkpoint saved on another device load here.
    with open(os.path.join(model_dir, "model.pth"), "rb") as f:
        model.load_state_dict(torch.load(f, map_location=device))

    model = model.to(device)
    # The loaded model is only used for inference.
    model.eval()

    return {
        "model": model,
        "window_size": model_info["window_size"],
        "input_size": model_info["input_size"],
        "num_candidates": model_info["num_candidates"]
    }

def predict(input_data, model_info):
    """Flag unexpected log keys in one session.

    For every window of ``window_size`` consecutive keys the model scores
    the next key; the key is flagged as anomalous when it is not among the
    ``num_candidates`` highest-scoring classes.

    Args:
        input_data: dict with key ``line``, a list of 0-based log keys.
        model_info: dict with keys ``model``, ``num_candidates``,
            ``input_size`` and ``window_size`` (as returned by ``load_model``).

    Returns:
        dict with ``anomaly_cnt`` (number of flagged keys), ``predict_cnt``
        (number of predictions made) and ``predict_list`` (one 0/1 flag per
        key in ``line``; the first ``window_size`` entries are always 0).
    """
    line = input_data['line']
    num_candidates = model_info['num_candidates']
    input_size = model_info['input_size']
    window_size = model_info['window_size']
    model = model_info['model']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    predict_cnt = 0
    anomaly_cnt = 0
    predict_list = [0] * len(line)
    # Inference only: skip building the autograd graph for every window.
    with torch.no_grad():
        for i in range(len(line) - window_size):
            window = line[i:i + window_size]
            label = line[i + window_size]
            seq = torch.tensor(window, dtype=torch.float).view(-1, window_size, input_size).to(device)
            output = model(seq)
            # argsort is ascending, so the last entries are the top-scoring classes.
            candidates = torch.argsort(output, 1)[0][-num_candidates:].tolist()
            # Count every prediction; the original only counted anomalous
            # ones, which made predict_cnt always equal anomaly_cnt.
            predict_cnt += 1
            if label not in candidates:
                anomaly_cnt += 1
                predict_list[i + window_size] = 1
    return {'anomaly_cnt': anomaly_cnt, 'predict_cnt': predict_cnt, 'predict_list': predict_list}

In [None]:
%%time
def _predict_sessions(path, model_info):
    """Run ``predict`` on every line of a log-key file and return the responses.

    Each line holds whitespace-separated 1-based log keys; they are shifted
    to 0-based before being passed to ``predict``.
    """
    responses = []
    with open(path, 'r') as f:
        for line in f:
            keys = [int(token) - 1 for token in line.split()]
            responses.append(predict({"line": keys}, model_info))
    return responses


model_info = load_model("./model")
# One response dict per session, for the abnormal and the normal test file.
test_abnormal_list = _predict_sessions('test_abnormal', model_info)
test_normal_list = _predict_sessions('test_normal', model_info)

In [None]:
%%time
# A session is labelled anomalous when it has more than `threshold` flagged log keys.
threshold = 25
abnormal_has_anomaly = [1 if t["anomaly_cnt"] > threshold else 0 for t in test_abnormal_list]
abnormal_cnt_anomaly = [t["anomaly_cnt"] for t in test_abnormal_list]
# Per-key flags of all abnormal sessions, concatenated.
abnormal_predict = [flag for t in test_abnormal_list for flag in t["predict_list"]]

normal_has_anomaly = [1 if t['anomaly_cnt'] > threshold else 0 for t in test_normal_list]
normal_cnt_anomaly = [t['anomaly_cnt'] for t in test_normal_list]
# Per-key flags of all normal sessions, concatenated.
normal_predict = [flag for t in test_normal_list for flag in t['predict_list']]

# Every session from the abnormal file counts as a positive, every session
# from the normal file as a negative.
ground_truth = [1]*len(abnormal_has_anomaly) + [0]*len(normal_has_anomaly)
# Deliberately not named `predict`: that would overwrite the predict() function
# defined in an earlier cell and break re-running the inference cell.
session_predictions = abnormal_has_anomaly + normal_has_anomaly
TP = 0
FP = 0
TN = 0
FN = 0
accu = 0
for p, t in zip(session_predictions, ground_truth):
    if p == t:
        accu += 1

    if p == 1 and t == 1:
        TP += 1
    elif p == 1 and t == 0:
        FP += 1
    elif p == 0 and t == 1:
        FN += 1
    else:
        TN += 1

print(f'thres: {threshold}')
print(f'TP: {TP}')
print(f'FP: {FP}')
print(f'TN: {TN}')
print(f'FN: {FN}')

# Guard every ratio against an empty denominator, including accuracy.
accuracy = accu / len(session_predictions) if session_predictions else 0
precision = TP / (TP + FP) if (TP + FP) else 0
recall = TP / (TP + FN) if (TP + FN) else 0
F1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

print(f'accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {F1}')