# AutoEncoder vs Local Outlier Factor

Here we want to understand whether an AutoEncoder (AE) can be more efficient than the Local Outlier Factor (LOF) for detecting anomalous log messages.

In [1]:
import os
import time
import numpy as np
import logging

from itertools import product
import pandas as pd
import re
import gensim as gs
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt

from multiprocessing import Pool
from functools import partial

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow

# Define functions

#### 1. Log Preprocessing

In [2]:
def preprocess(data):
    """Normalize every column of a log DataFrame in place for Word2Vec.

    Missing values are replaced with the placeholder ``"EMPTY"`` first; then
    the ``"message"`` column is tokenized with ``clean_message`` and every
    other column is converted with ``to_str``.

    :param data: pandas DataFrame of raw logs; it is modified in place.
    :return: None
    """
    for col in data.columns:
        # Fill gaps *before* converting: str(NaN) would otherwise become the
        # literal "nan", and the regex in clean_message raises on a float NaN.
        filled = data[col].fillna("EMPTY")
        converter = clean_message if col == "message" else to_str
        data[col] = filled.apply(converter)
    
def clean_message(line):
    """Split a message into its alphabetic words.

    Every maximal run of ASCII letters becomes one token; digits, punctuation
    and whitespace only act as separators and are dropped.
    """
    return re.findall("[a-zA-Z]+", line)

def to_str(x):
    """Return *x* as a string for Word2Vec.

    A list is rendered as its items, each converted with ``str``, joined by
    single spaces; any other value is simply passed through ``str``.
    """
    if not isinstance(x, list):
        return str(x)
    return " ".join(map(str, x))

#### 2. Text encoding

In [3]:
def create(logs, vector_length, window_size):
    """Train and return a new gensim Word2Vec model.

    :param logs: iterable of tokenized log messages; it is materialized into
        a list before being handed to gensim.
    :param vector_length: passed to Word2Vec as ``size``.
    :param window_size: passed to Word2Vec as ``window``.
    """
    sentences = list(logs)
    return gs.models.Word2Vec(
        sentences=sentences,
        size=vector_length,
        window=window_size,
    )

def get_vectors(model, logs, vector_length):
    """Replace every word of every log with its embedding vector.

    :param model: object whose ``wv`` attribute supports ``in`` and ``[]``
        lookups by word (a trained Word2Vec model).
    :param logs: iterable of tokenized log messages.
    :param vector_length: length of the all-zero vector used for words that
        are missing from ``model.wv``.
    :return: list with one inner list of vectors per log message.
    """
    def embed(token):
        keyed_vectors = model.wv
        if token in keyed_vectors:
            return keyed_vectors[token]
        # Unknown word: fall back to a zero vector of the requested length.
        return np.array([0] * vector_length)

    return [[embed(token) for token in message] for message in logs]

def _log_words_to_one_vector(log_words_vectors):
    """Average the word vectors of one log message coordinate by coordinate.

    The vectors are stacked into an object array and transposed so that each
    row holds one coordinate across all words; the mean of every row is
    collected into a plain list. An empty input yields an empty list.
    """
    per_coordinate = np.array(log_words_vectors, dtype=object).T
    return [np.mean(values) for values in per_coordinate]

def vectorized_logs_to_single_vectors(vectors):
    """Collapse each log message into one vector built from its word vectors.

    :params vectors: list of log messages, each given as a list of word
            vectors: [[wordvec11, wordvec12], [wordvec21, wordvec22], ...]
    :return: numpy array holding one averaged vector per log message
    """
    return np.array(
        [_log_words_to_one_vector(word_vectors) for word_vectors in vectors]
    )

#### 3. Training

Decorator for measuring execution time

In [4]:
def timeit(method):
    """Decorator that measures how long each call of ``method`` takes.

    If the call receives a ``log_time`` keyword argument (a dict), the elapsed
    time in whole milliseconds is stored in it under the key given by the
    ``log_name`` keyword argument, defaulting to the upper-cased function
    name. Otherwise the timing is printed.

    NOTE: ``log_time`` / ``log_name`` are forwarded unchanged to ``method``,
    so the wrapped callable must accept them for the dict mode to work.

    :param method: callable to wrap.
    :return: wrapper that returns whatever ``method`` returns.
    """
    # Local import: the file only imports `partial` from functools.
    from functools import wraps

    @wraps(method)  # keep __name__/__doc__ of the decorated callable
    def timed(*args, **kw):
        # perf_counter is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted.
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [5]:
@timeit
def train_lof(X, n_neighbors, metric):
    """Fit Local Outlier Factor on ``X`` in both of its modes.

    :return: tuple ``(pred, lof_model)`` where ``pred`` is the result of
        ``fit_predict`` from a default LOF and ``lof_model`` is a separate
        LOF fitted with ``novelty=True``, which can score unseen samples.
    """
    shared_params = dict(n_neighbors=n_neighbors, metric=metric)
    # Two estimators are needed: fit_predict is only offered without novelty
    # mode, predict/score_samples on new data only with it.
    pred = LocalOutlierFactor(**shared_params).fit_predict(X)
    lof_model = LocalOutlierFactor(novelty=True, **shared_params)
    lof_model.fit(X)
    return pred, lof_model

In [6]:
class AutoEncoder(Model):
    """Dense autoencoder with a mirrored encoder/decoder (64-32-16-code).

    Each hidden layer uses ReLU and is followed by 10% dropout. The output
    layer uses a sigmoid, so reconstructions lie in (0, 1).

    Parameters
    ----------
    output_units: int
      Number of output units

    code_size: int
      Number of units in bottle neck
    """

    def __init__(self, output_units, code_size=8):
        super().__init__()
        self.encoder = Sequential([
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(code_size, activation='relu'),
        ])
        self.decoder = Sequential([
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid'),
        ])

    def call(self, inputs):
        """Run ``inputs`` through the encoder and then the decoder."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

    @timeit
    def fitit(self, *args, **kwargs):
        """Timed wrapper around ``Model.fit``.

        All arguments are forwarded to ``fit``. The value returned by ``fit``
        is passed back to the caller (it used to be discarded, so callers
        always received ``None``).
        """
        return super().fit(*args, **kwargs)

In [78]:
def find_threshold(model, x_train_scaled):
    """Derive the anomaly threshold from training reconstruction errors.

    The per-sample MSLE between the model's reconstructions and the inputs
    is computed, and the threshold is their mean plus one standard deviation.
    """
    reconstructions = model.predict(x_train_scaled)
    # One loss value per individual instance.
    per_sample_errors = tensorflow.keras.losses.msle(
        reconstructions, x_train_scaled
    ).numpy()
    return np.mean(per_sample_errors) + np.std(per_sample_errors)

def get_predictions(model, x_test_scaled, threshold):
    """Flag samples whose reconstruction error exceeds ``threshold``.

    :return: tuple ``(preds, errors)``; ``preds`` is a pandas Series with
        1 for an anomaly and 0 for a normal sample, ``errors`` holds the
        per-sample MSLE values as returned by Keras.
    """
    reconstructed = model.predict(x_test_scaled)
    # One loss value per individual instance.
    errors = tensorflow.keras.losses.msle(reconstructed, x_test_scaled)
    exceeds_threshold = pd.Series(errors) > threshold
    # 1 = anomaly, 0 = normal
    preds = exceeds_threshold.map(lambda flagged: int(bool(flagged)))
    return preds, errors

#### 4. Prediction

In [8]:
def infer_lof(log, lof, loglist):
    """Classify one raw log message with a fitted novelty LOF model.

    The message is preprocessed, embedded with a Word2Vec model trained on
    the message plus ``loglist`` (25 dimensions, window 5), averaged into a
    single vector and scored by ``lof``.

    NOTE(review): a fresh Word2Vec model is trained on every call, so its
    embedding space presumably differs from the one ``lof`` was fitted on;
    confirm this is intended.

    :param log: raw log message.
    :param lof: fitted LocalOutlierFactor exposing ``predict`` and
        ``score_samples``.
    :param loglist: list of tokenized log messages used as extra corpus.
    :return: tuple ``(flag, score)``; ``flag`` is 1 when LOF predicts -1,
        otherwise 0, and ``score`` is the absolute value of ``score_samples``.
    """
    frame = pd.DataFrame({"message": log}, index=[1])
    preprocess(frame)
    tokens = frame.message.iloc[0]

    w2v = gs.models.Word2Vec([tokens] + loglist,
                             min_count=1, size=25, window=5)
    word_vectors = [
        w2v.wv[word] if word in w2v.wv.vocab else np.array([0]*25)
        for word in tokens
    ]
    one_vector = _log_words_to_one_vector(word_vectors)

    pred = lof.predict([one_vector])
    score = abs(lof.score_samples([one_vector])[0])
    flag = 1 if pred[0] == -1 else 0
    return flag, score

# Implementation

In [196]:
# Load the validation logs and keep the "message" field of the first 10000.
data_path = r"file:///home/nadzya/Apps/log-anomaly-detector/validation_data/slx.json"
# `orient` expects one of pandas' orient strings. The builtin type `str` was
# passed before, which matched none of them and fell through to the branch
# that also handles "records"; spell that out explicitly.
data = pd.DataFrame(pd.read_json(data_path, orient="records").message).iloc[:10000]

# Preprocess a copy so `data` keeps the raw messages for later inspection.
preprocessed_data = data.copy()
preprocess(preprocessed_data)

# One entry per log message, as produced by preprocess for "message".
logs_list = list(preprocessed_data.message)

In [197]:
# Train Word2Vec on the tokenized logs: 25-dimensional vectors, window of 5.
w2v = create(logs_list, vector_length=25, window_size=5)

In [198]:
# Embed every word of every log, then average the word vectors of each log
# into a single vector per message.
vectors = get_vectors(model=w2v, logs=logs_list, vector_length=25)
logs_as_vectors = vectorized_logs_to_single_vectors(vectors)

## LOF

In [None]:
# Fit LOF with 100 neighbours and the euclidean metric; `pred` holds the
# fit_predict labels and `lof` the novelty-mode model.
pred, lof = train_lof(logs_as_vectors, 100, 'euclidean')

In [None]:
# Convert LOF labels to anomaly flags: label 1 becomes 0, anything else 1.
anomaly_lof = [0 if label == 1 else 1 for label in pred]

In [None]:
# Percentage of logs flagged as anomalies by LOF.
100 * anomaly_lof.count(1) / len(data)

In [None]:
# Attach the LOF anomaly flags to a copy of the raw messages.
data_lof = data.copy()
data_lof["anomaly"] = anomaly_lof

## AE

In [None]:
# Rescale every feature to [0, 1]; the scaler is fitted on a copy of the
# log vectors.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
logs_scaled = min_max_scaler.fit_transform(logs_as_vectors.copy())

In [None]:
# Build the autoencoder with one output unit per input feature and compile
# it with MSLE loss, an MSE metric and the Adam optimizer.
ae = AutoEncoder(output_units=logs_scaled.shape[1])
ae.compile(loss='msle', metrics=['mse'], optimizer='adam')

In [None]:
# Train the autoencoder to reconstruct its own input (20 epochs, batches of
# 512); `ae_result` receives whatever `fitit` returns.
ae_result = ae.fitit(logs_scaled, logs_scaled, epochs=20, batch_size=512)

In [None]:
# Anomaly threshold derived from the reconstruction errors on the training
# data; the bare name displays it in the notebook.
threshold = find_threshold(ae, logs_scaled)
threshold

In [None]:
# Flag every log (1 = anomaly, 0 = normal) and keep the per-log errors.
predictions, ae_errors = get_predictions(ae, logs_scaled, threshold)

In [None]:
# Percentage of logs flagged as anomalies by the autoencoder.
100 * int((predictions == 1).sum()) / len(data)

In [None]:
# Attach the autoencoder anomaly flags to a copy of the raw messages.
# NOTE(review): assigning a Series aligns on index — this assumes `data` and
# `predictions` share the same index; confirm.
data_ae = data.copy()
data_ae["anomaly"] = predictions

## AE vs LOF

In [None]:
# Messages that LOF flagged as anomalies.
lof_anomaly_msgs = data_lof.loc[data_lof["anomaly"] == 1, "message"].tolist()

In [None]:
# Messages that the autoencoder flagged as anomalies.
ae_anomaly_msgs = data_ae.loc[data_ae["anomaly"] == 1, "message"].tolist()

In [None]:
# Distinct messages flagged by exactly one of the two detectors.
# Set differences give the answer directly; the previous loop re-tested list
# membership for every candidate, which is quadratic in the number of
# flagged messages.
lof_diff_logs = list(set(lof_anomaly_msgs) - set(ae_anomaly_msgs))
ae_diff_logs = list(set(ae_anomaly_msgs) - set(lof_anomaly_msgs))

In [None]:
# Summary: totals and the share of logs each detector flagged.
print("Total logs", len(data))
print("Anomalies, detected by LOF:", len(lof_anomaly_msgs))
print("Anomaly percentage, detected by LOF:", len(lof_anomaly_msgs)*100/len(data), "%")

print("Anomalies, detected by AE:", len(ae_anomaly_msgs))
print("Anomaly percentage, detected by AE:", len(ae_anomaly_msgs)*100/len(data), "%")

# The two diff lists hold distinct messages, so these counts are of unique
# message texts rather than of log rows.
print("Number of messages, that LOF detected, but AE did not:", len(lof_diff_logs))
print("Number of messages, that AE detected, but LOF did not:", len(ae_diff_logs))

In [None]:
# Display the messages flagged only by LOF.
lof_diff_logs

In [None]:
# Display the messages flagged only by the autoencoder.
ae_diff_logs

### The result

AE detects more anomaly messages than LOF, but some of these messages are not actually anomalies.

Both LOF and AE detect the same types of anomaly messages, but AE marks all messages of such a type (for example, login failed) as anomalies. In contrast, LOF detects only part of such messages.

# Ensembling

In [None]:
# Absolute LOF scores of the training vectors, plus a copy scaled so that
# the largest score equals 1.
lof_scores = np.abs(lof.score_samples(logs_as_vectors))
lof_scores_normalized = lof_scores / lof_scores.max()

In [None]:
# Pair each log's LOF score with its autoencoder error (as a Python float).
lof_ae_scores = [
    (lof_score, float(ae_error))
    for lof_score, ae_error in zip(lof_scores, ae_errors)
]

In [None]:
# Display the autoencoder threshold again for reference.
threshold

In [None]:
# Keep a message only when both detectors agree: the autoencoder error is
# above its threshold and the absolute LOF score is above 1.
# NOTE(review): the cut-off of 1 is hard-coded here and is not the decision
# rule behind `anomaly_lof` — confirm it is the intended LOF threshold.
ensemble_anomalies = [
    data.iloc[i].message
    for i in range(len(anomaly_lof))
    if ae_errors[i] > threshold and lof_scores[i] > 1
]

In [None]:
# Messages flagged by the autoencoder that the ensemble did not keep.
set(ae_anomaly_msgs) - set(ensemble_anomalies)

In [None]:
# Number of log rows the ensemble flagged.
len(ensemble_anomalies)