### Imports & Contants

In [1]:
import os
import os.path as path
import glob
import boto3
from datetime import datetime

from sagemaker import get_execution_role
from csv import (writer, DictWriter)
import pickle

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

from spellpy import spell

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (auc, roc_curve, average_precision_score, precision_recall_curve, f1_score, accuracy_score,
                            recall_score, precision_score)

In [2]:
project_root = os.getcwd()

spell_input_dir = 'input/spell'
spell_output_dir = 'output/spell'

if not path.abspath(path.join(project_root, spell_input_dir)):
    os.makedirs(path.abspath(path.join(project_root, spell_input_dir)))

if not path.abspath(path.join(project_root, spell_output_dir)):
    os.makedirs(path.abspath(path.join(project_root, spell_output_dir)))

config_dir = 'config'
bucket_name = 'sagemaker-studio-326787221562-jycpwz9gs3f'
log_file_key = 'BGL_no200507.csv'
pickle_key = 'Testing-Output-' + datetime.now().strftime("%d-%m-%Y-%H-%M-%S") + '.pkl'
    
WINDOW_SIZE = 5

## Convert BGL.log file to CSV format.

In [3]:
role = get_execution_role()
s3_client = boto3.client('s3')
log_file = s3_client.get_object(Bucket = bucket_name, Key = log_file_key)
log_csv_file = log_file['Body']

## EDA

In [4]:
original_df = pd.read_csv(log_csv_file)
print(original_df.shape)
print(original_df.info())
original_df.head(n=20)

(3043195, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3043195 entries, 0 to 3043194
Data columns (total 13 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   Unnamed: 0         int64 
 1   Anomaly Type       object
 2   Timestamp (ms)     int64 
 3   Date               object
 4   Node               object
 5   Timestamp          object
 6   Node Repeat        object
 7   Message Type       object
 8   Component          object
 9   Level              object
 10  Content            object
 11  Anomaly Label      int64 
 12  Non-Anomaly Label  int64 
dtypes: int64(4), object(9)
memory usage: 301.8+ MB
None


Unnamed: 0.1,Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content,Anomaly Label,Non-Anomaly Label
0,0,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.363779,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
1,1,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.527847,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
2,2,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
3,3,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.823719,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
4,4,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.982731,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
5,5,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.131467,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
6,6,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.293532,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
7,7,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.428563,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
8,8,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.601412,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1
9,9,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.749199,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0,1


In [5]:
df = original_df[['Anomaly Label', 'Content']]
df.head(20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,instruction cache parity error corrected
5,0,instruction cache parity error corrected
6,0,instruction cache parity error corrected
7,0,instruction cache parity error corrected
8,0,instruction cache parity error corrected
9,0,instruction cache parity error corrected


## Create Log Sequences and Labels for Sequences

In [6]:
n = math.floor(df['Content'].index.size/WINDOW_SIZE)
r = math.floor(df['Content'].index.size%WINDOW_SIZE)

if r != 0:
    log_seqs = np.array(np.split(np.array(df['Content'])[:-r], n))
else:
    log_seqs = np.array(np.split(np.array(df['Content']), n))

In [7]:
if r != 0:
    log_seq_idx = np.array(np.split(df.index.to_numpy()[:-r], n))
else:
    log_seq_idx = np.array(np.split(df.index.to_numpy(), n))

In [8]:
log_seq_anomaly_labels = np.empty([n], dtype=int)
i = 0
for seq in log_seq_idx:
    if np.sum(df.loc[seq]['Anomaly Label'].values) > 0:
        log_seq_anomaly_labels[i] = 1
    else:
        log_seq_anomaly_labels[i] = 0
    i += 1

KeyboardInterrupt: 

## Log Parsing & Numericalization

#### Spell Parser

In [None]:
log_content_csv_file_name = 'bgl_2k_content.csv'
log_content_csv_file = log_file = path.abspath(path.join(project_root, spell_input_dir, log_content_csv_file_name))

main_structured_csv_filename = 'BGL_main_structured.csv'
spell_main_structured_csv_file = path.abspath(path.join(project_root, spell_output_dir, main_structured_csv_filename))

templates_csv_filename = 'BGL_main_templates.csv'
spell_templates_csv_file = path.abspath(path.join(project_root, spell_output_dir, templates_csv_filename))

In [None]:
class SpellParser(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        
        log_format = '<Content>'
        tau = 0.5

        self.parser = spell.LogParser(indir=spell_input_dir, outdir=spell_output_dir,
                             log_format=log_format, tau=tau, logmain='BGL')

    def fit(self, X, y = None):
        return self

    def transform(self, log_seqs, y = None):
        log_seqs_list = log_seqs.reshape([-1]).tolist()

        ldf = pd.DataFrame(log_seqs_list, columns=['Content'])
        ldf.to_csv(log_content_csv_file, index=False, header=False)

        self.parser.parse(log_content_csv_file_name)

        nums = self.numericalize()
        n = math.floor(len(nums)/WINDOW_SIZE)
        nums = np.array(np.split(nums, n))
        
        # Comment during debug to view parser output
        #self.cleanup_files()
        
        return nums

    def numericalize(self):
        output_df = pd.read_csv(spell_main_structured_csv_file)

        return output_df['EventId'].to_numpy()

    def cleanup_files(self):
        files = glob.glob(spell_input_dir + '/*')
        for f in files:
            os.remove(f)

        files = glob.glob(spell_output_dir + '/*')
        for f in files:
            os.remove(f)

## Word Embedding

In [None]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, output_dir, template_seqs_filename))
        np.savetxt(template_seqs_file, template_seqs, fmt='%s')
        self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)
        
        # Comment during debug to view embedding input
        os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        template_seqs_ = template_seqs.copy()
        template_seqs_ = np.apply_along_axis(self.average_embeddings, 1, template_seqs)

        return template_seqs_

    def average_embeddings(self, num_lse_vector):
        s2v_vector = self.fasttext_model.get_sentence_vector(' '.join(np.vectorize(str)(num_lse_vector)))
        return s2v_vector

## Evaluation

In [None]:
drain_y_real = []
drain_y_proba = []

drain_precisions = []
drain_recalls = []
drain_avg_precisions = []

drain_tprs = []
drain_tprs2 = []
drain_fprs = []
drain_aucs = []
drain_mean_fpr = np.linspace(0, 1, 100)

drain_f1_scores = []
drain_accuracy_scores = []
drain_precision_scores = []
drain_recall_scores = []

drain_index = 0

spell_y_real = []
spell_y_proba = []

spell_precisions = []
spell_recalls = []
spell_avg_precisions = []

spell_tprs = []
spell_tprs2 = []
spell_fprs = []
spell_aucs = []
spell_mean_fpr = np.linspace(0, 1, 100)

spell_f1_scores = []
spell_accuracy_scores = []
spell_precision_scores = []
spell_recall_scores = []

spell_index = 0

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

spell_pipe = Pipeline(steps=[('parsing', SpellParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
#spell_pipe = pipe = Pipeline(steps=[('parsing', Parsing()), ('word_embedding', WordEmbedding()), ('random_forest', RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))
spell_probas_ = spell_pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict_proba(log_seqs[test])
spell_y_pred = spell_pipe.predict(log_seqs[test])

####### PR #######

spell_precision, spell_recall, _ = precision_recall_curve(log_seq_anomaly_labels[test], spell_probas_[:, 1])
spell_precisions.append(spell_precision)
spell_recalls.append(spell_recall)

spell_avg_precision = average_precision_score(log_seq_anomaly_labels[test], spell_probas_[:, 1])
spell_avg_precisions.append(spell_avg_precision)

spell_y_real.append(log_seq_anomaly_labels[test])
spell_y_proba.append(spell_probas_[:, 1])

####### ROC #######

spell_fpr, spell_tpr, _ = roc_curve(log_seq_anomaly_labels[test], spell_probas_[:, 1])
spell_tprs.append(np.interp(spell_mean_fpr, spell_fpr, spell_tpr))
spell_tprs2.append(spell_tpr)
spell_fprs.append(spell_fpr)

spell_tprs[-1][0] = 0.0
roc_auc = auc(spell_fpr, spell_tpr)
spell_aucs.append(roc_auc)

####### F1-score, Accuracy, Precision and Recall #######

spell_f1_scores.append(f1_score(log_seq_anomaly_labels[test], spell_y_pred))
spell_accuracy_scores.append(accuracy_score(log_seq_anomaly_labels[test], spell_y_pred))
spell_precision_scores.append(precision_score(log_seq_anomaly_labels[test], spell_y_pred))
spell_recall_scores.append(recall_score(log_seq_anomaly_labels[test], spell_y_pred))

spell_index += 1

spell_y_real = np.concatenate(spell_y_real)
spell_y_proba = np.concatenate(spell_y_proba)

#### Print Results

In [None]:
print('F1 Scores')
print('Spell: ', spell_f1_scores)
print('Average Scores - ', 'Spell: ', np.average(spell_f1_scores))
print('\n')

print('Accuracy Scores')
print('Spell: ', spell_accuracy_scores)
print('Average Scores - ', 'Spell: ', np.average(spell_accuracy_scores))
print('\n')

print('Precision Scores')
print('Spell: ', spell_precision_scores)
print('Average Scores - ', 'Spell: ', np.average(spell_precision_scores))
print('\n')

print('Recall Scores')
print('Spell: ', spell_recall_scores)
print('Average Scores - ', 'Spell: ', np.average(spell_recall_scores))

In [None]:
#pickle_byte_obj = pickle.dumps([spell_f1_scores, drain_f1_scores, spell_accuracy_scores, drain_accuracy_scores, spell_precision_scores, drain_precision_scores, spell_recall_scores, drain_recall_scores])

#s3_resource = boto3.resource("s3")
#s3_resource.Object(bucket_name, pickle_key).put(Body=pickle_byte_obj)

### Testing

In [None]:
files = glob.glob(spell_input_dir + '/*')
for f in files:
    os.remove(f)

files = glob.glob(spell_output_dir + '/*')
for f in files:
    os.remove(f)
    
test_log_seqs = log_seqs[train]
test_log_seq_anomaly_labels = log_seq_anomaly_labels[train]

print(test_log_seqs.shape)
print(test_log_seq_anomaly_labels.shape)

spell_parser = SpellParser()
spell_parser.fit(test_log_seqs, test_log_seq_anomaly_labels)
spell_template_seqs_output = spell_parser.transform(test_log_seqs, test_log_seq_anomaly_labels)
for templ_seq in spell_template_seqs_output:
    print(templ_seq)