In [17]:
import os
import os.path as path
from os.path import exists

from csv import (writer, DictWriter)

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (confusion_matrix, precision_score, recall_score, f1_score)

In [18]:
data_dir = 'data'
input_dir = 'input'
output_dir = 'output'
log_file_name = 'bgl_2k.log'

WINDOW_SIZE = 5

## Setup Directories

In [19]:
project_root = os.getcwd()

log_csv_file_name = 'bgl_2k.csv'
log_csv_file = log_file = path.abspath(path.join(project_root, input_dir, log_csv_file_name))

if exists(log_csv_file):
    os.remove(log_csv_file)

main_structured_csv_filename = 'BGL_main_structured.csv'
main_structured_csv_file = path.abspath(path.join(project_root, output_dir, main_structured_csv_filename))

if exists(main_structured_csv_file):
    os.remove(main_structured_csv_file)

clusters_csv_filename = 'BGL_main_templates.csv'
clusters_csv_file = path.abspath(path.join(project_root, output_dir, clusters_csv_filename))

if exists(clusters_csv_file):
    os.remove(clusters_csv_file)

## Convert BGL.log file to CSV format.

In [20]:
log_file = path.abspath(path.join(project_root, '../', data_dir, log_file_name))
logs = open(log_file, 'r')

with open(log_csv_file, 'a') as log_csv_file_obj:
    log_csv_writer_obj = writer(log_csv_file_obj)
    for line in logs:
        split_data = line.rstrip('\n').split(' ')
        split_data[9] = ' '.join(split_data[9:])
        log_csv_writer_obj.writerow(split_data[0:10])
    log_csv_file_obj.close

## EDA

In [21]:
original_df = pd.read_csv(log_csv_file, names=['Anomaly Type', 'Timestamp (ms)', 'Date', 'Node', 'Timestamp', 'Node Repeat', 'Message Type', 'Component', 'Level', 'Content'])
original_df['Anomaly Label'] = np.where(original_df['Anomaly Type'] == '-', 0, 1)
original_df.head(n=20)

Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content,Anomaly Label
0,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
1,-,1117838573,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.53.276129,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
2,-,1117838976,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.49.36.156884,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
3,-,1117838978,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.49.38.026704,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
4,-,1117842440,2005.06.03,R23-M0-NE-C:J05-U01,2005-06-03-16.47.20.730545,R23-M0-NE-C:J05-U01,RAS,KERNEL,INFO,63543 double-hummer alignment exceptions,0
5,-,1117842974,2005.06.03,R24-M0-N1-C:J13-U11,2005-06-03-16.56.14.254137,R24-M0-N1-C:J13-U11,RAS,KERNEL,INFO,162 double-hummer alignment exceptions,0
6,-,1117843015,2005.06.03,R21-M1-N6-C:J08-U11,2005-06-03-16.56.55.309974,R21-M1-N6-C:J08-U11,RAS,KERNEL,INFO,141 double-hummer alignment exceptions,0
7,-,1117848119,2005.06.03,R16-M1-N2-C:J17-U01,2005-06-03-18.21.59.871925,R16-M1-N2-C:J17-U01,RAS,KERNEL,INFO,"CE sym 2, at 0x0b85eee0, mask 0x05",0
8,APPREAD,1117869872,2005.06.04,R04-M1-N4-I:J18-U11,2005-06-04-00.24.32.432192,R04-M1-N4-I:J18-U11,RAS,APP,FATAL,ciod: failed to read message prefix on control...,1
9,APPREAD,1117869876,2005.06.04,R27-M1-N4-I:J18-U01,2005-06-04-00.24.36.222560,R27-M1-N4-I:J18-U01,RAS,APP,FATAL,ciod: failed to read message prefix on control...,1


In [22]:
df = original_df[['Anomaly Label', 'Content']]
df.head(20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,63543 double-hummer alignment exceptions
5,0,162 double-hummer alignment exceptions
6,0,141 double-hummer alignment exceptions
7,0,"CE sym 2, at 0x0b85eee0, mask 0x05"
8,1,ciod: failed to read message prefix on control...
9,1,ciod: failed to read message prefix on control...


## Create Log Sequences and Labels for Sequences

In [23]:
n = math.floor(df['Content'].index.size/WINDOW_SIZE)
r = math.floor(df['Content'].index.size%WINDOW_SIZE)

if r != 0:
    log_seqs = np.array(np.split(np.array(df['Content'])[:-r], n))
else:
    log_seqs = np.array(np.split(np.array(df['Content']), n))

In [24]:
if r != 0:
    log_seq_idx = np.array(np.split(df.index.to_numpy()[:-r], n))
else:
    log_seq_idx = np.array(np.split(df.index.to_numpy(), n))

In [25]:
log_seq_anomaly_labels = np.empty([n], dtype=int)
i = 0
for seq in log_seq_idx:
    if np.sum(df.loc[seq]['Anomaly Label'].values) > 0:
        log_seq_anomaly_labels[i] = 1
    else:
        log_seq_anomaly_labels[i] = 0
    i += 1
log_seq_anomaly_labels.shape

(400,)

## Log Parsing & Numericalization

In [26]:
class DrainParser(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        
        drain_config_file = 'drain3.ini'
        config = TemplateMinerConfig()
        config.load(drain_config_file)

        self.template_miner = TemplateMiner(config=config)

    def fit(self, X, y = None):
        return self

    def transform(self, log_seqs, y = None):
        self.parsed = []
        log_seqs_list = log_seqs.reshape([-1]).tolist()

        for line in log_seqs_list:
            self.parsed.append(self.template_miner.add_log_message(line))

        with open(main_structured_csv_file, 'w') as main_structured_csv_file_obj:
            main_structured_csv_filewriter = DictWriter(main_structured_csv_file_obj, fieldnames=['template_mined', 'cluster_id', 'change_type', 'cluster_size', 'cluster_count'])
            main_structured_csv_filewriter.writeheader()
            for line in self.parsed:
                main_structured_csv_filewriter.writerow(line)
            main_structured_csv_file_obj.close
            
        clusters = self.template_miner.drain.clusters

        with open(clusters_csv_file, 'a') as clusters_csv_file_obj:
            clusters_csv_filewriter = writer(clusters_csv_file_obj)
            clusters_csv_filewriter.writerow(header for header in ['cluster_id', 'template', 'size'])
            for line in clusters:
                clusters_csv_filewriter.writerow(self.cluster_template_to_tuple(line))
            clusters_csv_file_obj.close
        
        template_seq = [x['cluster_id']-1 for x in self.parsed]
        n = math.floor(len(template_seq)/WINDOW_SIZE)
        template_seqs = np.array(np.split(np.array(template_seq), n))

        return template_seqs
    
    def cluster_template_to_tuple(self, cluster):
        return (cluster.cluster_id, cluster.get_template(), cluster.size,)

## Word Embedding

In [27]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, output_dir, template_seqs_filename))
        np.savetxt(template_seqs_file, template_seqs.astype(int), fmt='%i')
        self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)
        os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        template_seqs_ = template_seqs.copy()
        template_seqs_ = np.apply_along_axis(self.average_embeddings, 1, template_seqs)

        return template_seqs_

    def average_embeddings(self, num_lse_vector):
        w2v_vector = [self.fasttext_model.get_word_vector(word) for word in np.vectorize(str)(num_lse_vector)]
        return np.average(w2v_vector, axis=0)

## Evaluation

In [28]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

pipe = Pipeline(steps=[('parsing', DrainParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])

pred_log_seq_anomaly_labels = pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict(log_seqs[test])

(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)
(320,) (80,)


Read 0M words
Number of words:  97
Number of labels: 0
Progress: 100.0% words/sec/thread:    8752 lr:  0.000000 avg.loss:  4.121424 ETA:   0h 0m 0s


In [29]:
print('Confusion Matrix: ', confusion_matrix(log_seq_anomaly_labels[test], pred_log_seq_anomaly_labels))
print('Precision: ', precision_score(log_seq_anomaly_labels[test], pred_log_seq_anomaly_labels))
print('Recall: ', recall_score(log_seq_anomaly_labels[test], pred_log_seq_anomaly_labels))
print('F1 Score: ', f1_score(log_seq_anomaly_labels[test], pred_log_seq_anomaly_labels))

Confusion Matrix:  [[63  6]
 [ 2  9]]
Precision:  0.6
Recall:  0.8181818181818182
F1 Score:  0.6923076923076923


In [30]:
parser = DrainParser()
parser.transform(log_seqs[test], [])

array([[ 0,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 2,  2,  2,  2,  2],
       [ 3,  4,  4,  5,  2],
       [ 5,  5,  5,  5,  6],
       [ 7,  7,  8,  9, 10],
       [11, 12, 12, 12, 13],
       [ 0, 14, 14, 14, 14],
       [14, 14,  0,  0,  0],
       [ 0,  0,  0,  0, 15],
       [14, 16, 14, 17, 17],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [14,  0, 18,  0,  5],
       [ 0,  0,  0, 17, 17],
       [ 0,  0,  0,  0,  0],
       [ 0, 19, 20, 17, 17],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [17, 17,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0, 17, 17, 17,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0