# AutoEncoder vs Local Outlier Factor

Here we want to understand whether an AutoEncoder (AE) can be more efficient than Local Outlier Factor (LOF) at detecting anomalous log messages.

In [1]:
import os
import time
import numpy as np
import logging

from itertools import product
import pandas as pd
import re
import gensim as gs
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt

from multiprocessing import Pool
from functools import partial

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow

# Define functions

#### 1. Log Preprocessing

In [2]:
def preprocess(data):
    """Normalise a DataFrame of log records in place.

    Missing values are replaced with the placeholder "EMPTY", then the
    ``message`` column is tokenised with ``clean_message`` and every other
    column is converted with ``to_str``.

    :param data: pandas DataFrame of log records; its columns are overwritten.
    :return: None (the caller's DataFrame is modified directly).
    """
    for col in data.columns:
        convert = clean_message if col == "message" else to_str
        # Fill before converting: afterwards NaN would already be the string
        # "nan" (to_str) or would have crashed the regex in clean_message.
        # Assigning the column back is what makes the change visible to the
        # caller; rebinding the local name `data` would not be.
        data[col] = data[col].fillna("EMPTY").apply(convert)
    
def clean_message(line):
    """Split a log message into its alphabetic words.

    Every maximal run of ASCII letters becomes one token; digits, punctuation
    and whitespace are dropped.

    :param line: raw log message string
    :return: list of words, in order of appearance
    """
    return [match.group(0) for match in re.finditer("[a-zA-Z]+", line)]

def to_str(x):
    """Return a string form of ``x`` suitable for Word2Vec input.

    A list is rendered as its items, each passed through ``str``, joined by
    single spaces; any other value is simply passed through ``str``.
    """
    if isinstance(x, list):
        return " ".join(map(str, x))
    return str(x)

#### 2. Text encoding

In [3]:
def create(logs, vector_length, window_size):
    """Train and return a new Word2Vec model.

    :param logs: iterable of tokenised log messages (each a list of words)
    :param vector_length: dimensionality of the word vectors
    :param window_size: context window passed to Word2Vec
    :return: the trained ``gensim`` Word2Vec model
    """
    sentences = list(logs)
    # NOTE(review): the `size` keyword looks like the gensim 3.x spelling -
    # confirm the installed gensim version before upgrading.
    return gs.models.Word2Vec(sentences=sentences, size=vector_length, window=window_size)

def get_vectors(model, logs, vector_length):
    """Replace every word of every log message with its word vector.

    :param model: object exposing word vectors as ``model.wv``
    :param logs: iterable of tokenised messages (each an iterable of words)
    :param vector_length: length of the zero vector used for unknown words
    :return: list with one list of vectors per message
    """
    def embed(word):
        # Words missing from the model fall back to an all-zero vector.
        if word in model.wv:
            return model.wv[word]
        return np.array([0] * vector_length)

    return [[embed(word) for word in message] for message in logs]

def _log_words_to_one_vector(log_words_vectors):
        result = []
        log_array_transposed = np.array(log_words_vectors, dtype=object).transpose()
        for coord in log_array_transposed:
            result.append(np.mean(coord))
        return result

def vectorized_logs_to_single_vectors(vectors):
    """Turn each log message into one vector built from its word vectors.

    :params vectors: list of log messages, each represented as a list of
            word vectors: [[wordvec11, wordvec12], [wordvec21, wordvec22], ...]
    :return: numpy array with one averaged vector per log message
    """
    return np.array([_log_words_to_one_vector(word_vectors) for word_vectors in vectors])

#### 3. Training

Decorator that measures execution time

In [4]:
def timeit(method):
    """Decorator that measures how long each call to ``method`` takes.

    When the call includes a ``log_time`` keyword argument (a dict), the
    elapsed time in whole milliseconds is stored in it under ``log_name``
    (if given) or the upper-cased function name. Otherwise the timing is
    printed. Both keywords are also forwarded to ``method``, so it must
    accept them when they are used.

    :param method: callable to wrap
    :return: wrapper that returns whatever ``method`` returns
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from functools import wraps

    @wraps(method)  # keep the wrapped function's __name__ and docstring
    def timed(*args, **kw):
        # perf_counter is a monotonic high-resolution clock; time.time() can
        # jump when the system clock is adjusted.
        started = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - started) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [5]:
@timeit
def train_lof(X, n_neighbors, metric):
    """Fit Local Outlier Factor on ``X`` twice: once to label, once to reuse.

    :param X: feature matrix, one row per log message
    :param n_neighbors: number of neighbours used by LOF
    :param metric: distance metric name passed to LOF
    :return: ``(pred, lof_model)`` - the labels from ``fit_predict`` on ``X``
             and a separate estimator fitted with ``novelty=True``
    """
    lof_params = {"n_neighbors": n_neighbors, "metric": metric}
    # First estimator: labels for the training samples themselves.
    pred = LocalOutlierFactor(**lof_params).fit_predict(X)
    # Second estimator: novelty mode, kept for predicting on new samples later.
    lof_model = LocalOutlierFactor(novelty=True, **lof_params)
    lof_model.fit(X)
    return pred, lof_model

In [6]:
class AutoEncoder(Model):
    """Dense autoencoder: 64-32-16-code encoder mirrored by the decoder.

    Every hidden Dense layer uses ReLU and is followed by Dropout(0.1). The
    output layer uses a sigmoid, so reconstructions lie in (0, 1); the
    notebook feeds it inputs scaled to [0, 1].

    Parameters
    ----------
    output_units: int
      Number of output units

    code_size: int
      Number of units in bottle neck
    """

    def __init__(self, output_units, code_size=8):
        super().__init__()
        self.encoder = Sequential([
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(code_size, activation='relu')
        ])
        self.decoder = Sequential([
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid')
        ])

    def call(self, inputs):
        """Encode ``inputs`` to the bottleneck and decode them back."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

    @timeit
    def fitit(self, *args, **kwargs):
        """Timed wrapper around ``fit``; arguments are forwarded unchanged.

        :return: whatever ``fit`` returns, so callers can inspect the
                 training history instead of receiving ``None``.
        """
        return self.fit(*args, **kwargs)

In [7]:
def find_threshold(model, x_train_scaled):
    """Derive the anomaly threshold from reconstruction errors.

    :param model: fitted model exposing ``predict``
    :param x_train_scaled: scaled training samples
    :return: mean plus one standard deviation of the per-sample MSLE
             between the reconstructions and the inputs
    """
    reconstructions = model.predict(x_train_scaled)
    # One loss value per sample.
    errors = tensorflow.keras.losses.msle(reconstructions, x_train_scaled).numpy()
    return np.mean(errors) + np.std(errors)

def get_predictions(model, x_test_scaled, threshold):
    """Flag samples whose reconstruction error exceeds ``threshold``.

    :param model: fitted model exposing ``predict``
    :param x_test_scaled: scaled samples to score
    :param threshold: error value above which a sample is an anomaly
    :return: ``(preds, errors)`` - a pandas Series of 1 (anomaly) / 0 (normal)
             and the per-sample MSLE values
    """
    reconstructed = model.predict(x_test_scaled)
    # One loss value per sample.
    errors = tensorflow.keras.losses.msle(reconstructed, x_test_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    # 1 = anomaly, 0 = normal
    preds = exceeds_threshold.map(lambda flagged: 1 if flagged else 0)
    return preds, errors

#### 4. Prediction

In [8]:
def infer_lof(log, lof, loglist, vector_length=25, window_size=5):
    """Score a single raw log message with a fitted novelty-mode LOF model.

    :param log: raw log message string
    :param lof: fitted estimator exposing ``predict`` and ``score_samples``
    :param loglist: list of tokenised log messages used as Word2Vec context
    :param vector_length: word-vector dimensionality (previously fixed at 25)
    :param window_size: Word2Vec context window (previously fixed at 5)
    :return: ``(flag, score)`` where ``flag`` is 1 when ``lof`` predicts -1
             and 0 otherwise, and ``score`` is ``abs(score_samples)``
    """
    log = pd.DataFrame({"message": log}, index=[1])
    preprocess(log)
    words = log.message.iloc[0]

    # NOTE(review): a new Word2Vec model is trained on every call, so these
    # embeddings are not the ones `lof` was fitted on - confirm this is
    # intended before relying on the scores.
    w2v = gs.models.Word2Vec([words] + loglist,
                             min_count=1, size=vector_length, window=window_size)
    # Same membership test as get_vectors; unknown words map to a zero vector.
    vector = [
        w2v.wv[word] if word in w2v.wv else np.array([0] * vector_length)
        for word in words
    ]
    one_vector = _log_words_to_one_vector(vector)
    pred = lof.predict([one_vector])
    score = abs(lof.score_samples([one_vector])[0])
    if pred[0] == -1:
        return 1, score
    return 0, score

# Implementation

In [41]:
# NOTE(review): absolute, machine-specific path - replace with a configurable
# location before sharing the notebook.
data_path = r"file:///home/nadzya/Apps/log-anomaly-detector/validation_data/solidex.by.json"
# `orient` takes a format name; the builtin type `str` was being passed.
# "records" assumes the file is a JSON list of objects - TODO confirm.
data = pd.DataFrame(pd.read_json(data_path, orient="records").message).iloc[:10000]

# Work on a copy so `data` keeps the raw message strings for later display.
preprocessed_data = data.copy()
preprocess(preprocessed_data)

# Tokenised messages, one list of words per log line.
logs_list = list(preprocessed_data.message)

In [42]:
# Train Word2Vec on the tokenised messages (25-dimensional vectors, window of 5).
w2v = create(logs_list, vector_length=25, window_size=5)

In [43]:
# Look up a vector for every word (vector_length matches the model trained above),
# then average the word vectors of each message into one vector per log.
vectors = get_vectors(model=w2v, logs=logs_list, vector_length=25)
logs_as_vectors = vectorized_logs_to_single_vectors(vectors)

## LOF

In [44]:
# Fit LOF with 100 neighbours and Euclidean distance: `pred` holds the labels
# for the training logs, `lof` is the separately fitted novelty-mode model.
pred, lof = train_lof(logs_as_vectors, 100, 'euclidean')

'train_lof'  3829.86 ms


In [45]:
# Convert LOF labels to anomaly flags: label 1 -> 0 (normal), anything else -> 1 (anomaly).
anomaly_lof = [0 if label == 1 else 1 for label in pred]

In [46]:
# Percentage of logs flagged as anomalies by LOF.
100 * anomaly_lof.count(1) / len(data)

1.19

In [47]:
# Raw messages plus the LOF anomaly flag, built as a new frame so `data` stays untouched.
data_lof = data.assign(anomaly=anomaly_lof)

## AE

In [48]:
# Scale every feature to [0, 1]; the autoencoder's output layer is a sigmoid,
# so its reconstructions live in that range.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
logs_scaled = min_max_scaler.fit_transform(logs_as_vectors.copy())

In [49]:
# One output unit per input feature; train on MSLE and report MSE as a metric.
ae = AutoEncoder(output_units=logs_scaled.shape[1])
ae.compile(loss='msle', metrics=['mse'], optimizer='adam')

In [50]:
# Train the autoencoder to reconstruct its own input (20 epochs, batches of 512);
# the elapsed time is printed by the timing decorator.
ae_result = ae.fitit(logs_scaled, logs_scaled, epochs=20, batch_size=512)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
'fitit'  3268.38 ms


In [51]:
# Anomaly threshold derived from the reconstruction errors on the training logs.
threshold = find_threshold(ae, logs_scaled)
threshold

0.0149065366831122

In [52]:
# Score the same scaled logs the threshold was derived from (no held-out set here).
predictions, ae_errors = get_predictions(ae, logs_scaled, threshold)

In [53]:
# Percentage of logs flagged as anomalies by the autoencoder.
100 * int((predictions == 1).sum()) / len(data)

7.27

In [54]:
# Raw messages plus the autoencoder anomaly flag.
data_ae = data.copy()
# Assign by position, as the LOF cell does with its list: assigning the Series
# itself would align on index labels and yield NaN wherever the index of
# `data` differs from the Series' default 0..n-1 index.
data_ae["anomaly"] = predictions.to_numpy()

## AE vs LOF

In [55]:
# Raw text of every log row that LOF flagged as an anomaly (duplicates kept).
lof_anomaly_msgs = list(data_lof.loc[data_lof["anomaly"] == 1].message)

In [56]:
# Raw text of every log row that the autoencoder flagged as an anomaly (duplicates kept).
ae_anomaly_msgs = list(data_ae.loc[data_ae["anomaly"] == 1].message)

In [57]:
# Messages flagged by exactly one detector. The comparison is by message text,
# so identical log lines collapse into a single entry. The set differences
# already give both answers; no per-message list scans are needed.
lof_diff_logs = list(set(lof_anomaly_msgs) - set(ae_anomaly_msgs))
ae_diff_logs = list(set(ae_anomaly_msgs) - set(lof_anomaly_msgs))

In [58]:
# Summary of both detectors. The anomaly counts are per log row, while the two
# "detected, but ... did not" counts are per distinct message text.
print("Total logs", len(data))
print("Anomalies, detected by LOF:", len(lof_anomaly_msgs))
print("Anomaly percentage, detected by LOF:", len(lof_anomaly_msgs)*100/len(data), "%")

print("Anomalies, detected by AE:", len(ae_anomaly_msgs))
print("Anomaly percentage, detected by AE:", len(ae_anomaly_msgs)*100/len(data), "%")

print("Number of messages, that LOF detected, but AE did not:", len(lof_diff_logs))
print("Number of messages, that AE detected, but LOF did not:", len(ae_diff_logs))

Total logs 10000
Anomalies, detected by LOF: 119
Anomaly percentage, detected by LOF: 1.19 %
Anomalies, detected by AE: 727
Anomaly percentage, detected by AE: 7.27 %
Number of messages, that LOF detected, but AE did not: 7
Number of messages, that AE detected, but LOF did not: 97


In [59]:
# Messages flagged only by LOF.
lof_diff_logs

['<158>Nov 25 15:18:31 195-137-160-145 nginx-access 34.230.24.123 - - [25/Nov/2021:15:18:24 +0300] "GET / HTTP/1.1" 301 0 "-" "CCBot/2.0 (https://commoncrawl.org/faq/)"',
 '<158>Nov 25 16:01:54 195-137-160-145 nginx-access 35.85.55.209 - - [25/Nov/2021:16:01:46 +0300] "GET /wp-content/themes/wsp/assets/img/favicon-32x32.png HTTP/1.1" 200 1839 "-" "Go-http-client/1.1"',
 '<158>Nov 25 16:53:56 195-137-160-145 nginx-access 109.252.51.37 - - [25/Nov/2021:16:53:50 +0300] "GET /category/stati/feed/ HTTP/1.1" 200 28856 "-" "Reeder/5000.99.01 CFNetwork/1325.0.1 Darwin/21.1.0"',
 '<158>Nov 25 14:53:40 195-137-160-145 nginx-access 109.252.51.37 - - [25/Nov/2021:14:53:40 +0300] "GET /category/stati/feed/ HTTP/1.1" 200 28856 "-" "Reeder/5000.99.01 CFNetwork/1325.0.1 Darwin/21.1.0"',
 '<158>Nov 25 14:23:49 195-137-160-145 nginx-access 109.252.51.37 - - [25/Nov/2021:14:23:39 +0300] "GET /category/stati/feed/ HTTP/1.1" 200 28856 "-" "Reeder/5000.99.01 CFNetwork/1325.0.1 Darwin/21.1.0"',
 '<158>Nov 25

In [60]:
# Messages flagged only by the autoencoder.
ae_diff_logs

['<158>Nov 25 16:32:55 195-137-160-145 nginx-access 63.143.42.253 - - [25/Nov/2021:16:32:49 +0300] "HEAD / HTTP/1.1" 200 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 16:32:55 195-137-160-145 nginx-access 63.143.42.253 - - [25/Nov/2021:16:32:48 +0300] "HEAD / HTTP/1.1" 301 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 14:22:49 195-137-160-145 nginx-access 63.143.42.253 - - [25/Nov/2021:14:22:48 +0300] "HEAD / HTTP/1.1" 301 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 14:41:20 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:14:41:19 +0300] "HEAD / HTTP/1.1" 200 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 15:56:13 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:15:56:13 +0300] "HEAD / HTTP/1.1" 301 0 "ht

### The result

AE detects more anomaly messages than LOF, but some of these messages are not actually anomalies.

Both LOF and AE detect the same types of anomalous messages, but AE marks all messages of such a type (for example, "login failed") as anomalies. In contrast, LOF detects only some of these messages.

# Ensembling

In [61]:
# Absolute LOF score of every training log.
# NOTE(review): presumably score_samples returns negated LOF values, so abs()
# recovers the LOF itself - confirm against the scikit-learn documentation.
lof_scores = abs(lof.score_samples(logs_as_vectors))
# Scores divided by the maximum; not used by the cells shown below.
lof_scores_normalized = lof_scores/max(lof_scores)

In [62]:
# One (LOF score, AE reconstruction error) pair per log, as plain Python floats for the AE part.
lof_ae_scores = list(zip(list(lof_scores), list(map(float, ae_errors))))

In [63]:
# Autoencoder error threshold reused by the ensemble rule.
threshold

0.0149065366831122

In [64]:
# Ensemble rule: keep a log only when both detectors agree, i.e. its AE error
# exceeds the threshold and its absolute LOF score exceeds 1.
ensemble_anomalies = [
    message
    for i, message in enumerate(data.message)
    if ae_errors[i] > threshold and lof_scores[i] > 1
]

In [65]:
# AE anomalies that the ensemble rule discarded.
set(ae_anomaly_msgs) - set(ensemble_anomalies)

{'<158>Nov 25 13:06:15 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:13:06:13 +0300] "HEAD / HTTP/1.1" 301 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 13:06:15 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:13:06:14 +0300] "HEAD / HTTP/1.1" 200 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 13:16:15 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:13:16:13 +0300] "HEAD / HTTP/1.1" 301 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 13:16:15 195-137-160-145 nginx-access 69.162.124.230 - - [25/Nov/2021:13:16:14 +0300] "HEAD / HTTP/1.1" 200 0 "http://solidex.by" "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)"',
 '<158>Nov 25 13:27:56 195-137-160-145 nginx-access 63.143.42.253 - - [25/Nov/2021:13:27:48 +0300] "HEAD / HTTP/1.1" 301 0 "

In [68]:
# LOF anomalies that the ensemble rule discarded.
set(lof_anomaly_msgs) - set(ensemble_anomalies)

{'<158>Nov 25 14:23:49 195-137-160-145 nginx-access 109.252.51.37 - - [25/Nov/2021:14:23:39 +0300] "GET /category/stati/feed/ HTTP/1.1" 200 28856 "-" "Reeder/5000.99.01 CFNetwork/1325.0.1 Darwin/21.1.0"',
 '<158>Nov 25 14:53:40 195-137-160-145 nginx-access 109.252.51.37 - - [25/Nov/2021:14:53:40 +0300] "GET /category/stati/feed/ HTTP/1.1" 200 28856 "-" "Reeder/5000.99.01 CFNetwork/1325.0.1 Darwin/21.1.0"',
 '<158>Nov 25 15:18:31 195-137-160-145 nginx-access 34.230.24.123 - - [25/Nov/2021:15:18:24 +0300] "GET / HTTP/1.1" 301 0 "-" "CCBot/2.0 (https://commoncrawl.org/faq/)"',
 '<158>Nov 25 15:18:31 195-137-160-145 nginx-access 34.230.24.123 - - [25/Nov/2021:15:18:27 +0300] "GET / HTTP/1.1" 200 3717 "-" "CCBot/2.0 (https://commoncrawl.org/faq/)"',
 '<158>Nov 25 16:01:54 195-137-160-145 nginx-access 35.85.55.209 - - [25/Nov/2021:16:01:46 +0300] "GET /wp-content/themes/wsp/assets/img/favicon-32x32.png HTTP/1.1" 200 1839 "-" "Go-http-client/1.1"',
 '<158>Nov 25 16:53:46 195-137-160-145 nginx

In [67]:
# Messages flagged by both detectors (duplicates kept).
ensemble_anomalies

['<158>Nov 25 12:04:11 195-137-160-145 nginx-access 167.99.37.163 - - [25/Nov/2021:12:04:10 +0300] "GET /courses/ HTTP/1.1" 200 3207 "http://www.solidex.by/o-kompanii/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"',
 '<158>Nov 25 13:01:14 195-137-160-145 nginx-access 54.36.148.92 - - [25/Nov/2021:13:01:11 +0300] "GET /blogs/solidex/building-the-house-or-the-network-in-common.php HTTP/1.1" 301 0 "-" "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"',
 '<158>Nov 25 13:01:14 195-137-160-145 nginx-access 54.36.148.92 - - [25/Nov/2021:13:01:11 +0300] "GET /blogs/solidex/building-the-house-or-the-network-in-common.php HTTP/1.1" 301 0 "-" "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)"',
 '<158>Nov 25 14:24:59 195-137-160-145 nginx-access 216.244.66.231 - - [25/Nov/2021:14:24:56 +0300] "GET /robots.txt HTTP/1.1" 404 169 "-" "Mozilla/5.0 (compatible; DotBot/1.2; +https://opensiteexplorer.org/d

### The result

AE helps to remove false positives (FP) from the LOF predictions and adds some extra anomaly entries. LOF, in turn, helps to remove the garbage from the AE predictions.