### Imports & Constants

In [36]:
import os
import os.path as path
import glob

from csv import (writer, DictWriter)

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

from spellpy import spell
from fastai.text.all import Numericalize

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (auc, roc_curve, average_precision_score, precision_recall_curve, f1_score, accuracy_score,
                            recall_score, precision_score)

In [37]:
# All project paths below are resolved against the current working directory.
project_root = os.getcwd()

# Raw log file, read from data_dir.
data_dir = 'data'
log_file_name = 'bgl_2k.log'

config_dir = 'config'

# Generic working folders. The second one used to be assigned to
# `spell_output_dir` by mistake and was silently overwritten a few lines below.
input_dir = 'input'
output_dir = 'output'

# Per-parser working folders.
spell_input_dir = 'input/spell'
spell_output_dir = 'output/spell'

drain_input_dir = 'input/drain'
drain_output_dir = 'output/drain'

# Number of consecutive log lines grouped into one log sequence.
WINDOW_SIZE = 5

## Convert BGL.log file to CSV format.

In [38]:
# Destination CSV (rewritten from scratch on every run) and the raw log it is built from.
log_csv_file_name = 'bgl_2k.csv'
log_csv_file = path.abspath(path.join(project_root, input_dir, log_csv_file_name))
log_file = path.abspath(path.join(project_root, data_dir, log_file_name))

# Make sure the destination folder exists so a fresh checkout can run this cell.
os.makedirs(path.dirname(log_csv_file), exist_ok=True)

# Both handles are managed by `with`, so they are closed even if a row fails.
# Mode 'w' truncates any previous file; newline='' is what the csv module
# requires to avoid spurious blank rows on platforms that translate newlines.
with open(log_file, 'r') as logs, open(log_csv_file, 'w', newline='') as log_csv_file_obj:
    log_csv_writer_obj = writer(log_csv_file_obj)
    for line in logs:
        line = line.rstrip('\n')
        if not line:
            # Blank lines carry no record; previously they raised IndexError.
            continue
        # The first nine space-separated tokens are individual columns; everything
        # after the ninth space is the free-text message and stays in one field.
        log_csv_writer_obj.writerow(line.split(' ', 9))

## EDA

In [39]:
# The converted CSV has no header row, so the column names are supplied here.
bgl_columns = [
    'Anomaly Type', 'Timestamp (ms)', 'Date', 'Node', 'Timestamp',
    'Node Repeat', 'Message Type', 'Component', 'Level', 'Content',
]
original_df = pd.read_csv(log_csv_file, names=bgl_columns)

# Rows whose first column is '-' get label 0; every other row gets label 1.
is_unflagged = original_df['Anomaly Type'] == '-'
original_df['Anomaly Label'] = np.where(is_unflagged, 0, 1)

# Parse the textual timestamp (e.g. 2005-06-03-15.42.50.675872) into datetimes.
timestamp_format = '%Y-%m-%d-%H.%M.%S.%f'
original_df['Timestamp'] = pd.to_datetime(original_df['Timestamp'], format=timestamp_format)

original_df.head(20)

Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content,Anomaly Label
0,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
1,-,1117838573,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:42:53.276129,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
2,-,1117838976,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:49:36.156884,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
3,-,1117838978,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03 15:49:38.026704,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
4,-,1117842440,2005.06.03,R23-M0-NE-C:J05-U01,2005-06-03 16:47:20.730545,R23-M0-NE-C:J05-U01,RAS,KERNEL,INFO,63543 double-hummer alignment exceptions,0
5,-,1117842974,2005.06.03,R24-M0-N1-C:J13-U11,2005-06-03 16:56:14.254137,R24-M0-N1-C:J13-U11,RAS,KERNEL,INFO,162 double-hummer alignment exceptions,0
6,-,1117843015,2005.06.03,R21-M1-N6-C:J08-U11,2005-06-03 16:56:55.309974,R21-M1-N6-C:J08-U11,RAS,KERNEL,INFO,141 double-hummer alignment exceptions,0
7,-,1117848119,2005.06.03,R16-M1-N2-C:J17-U01,2005-06-03 18:21:59.871925,R16-M1-N2-C:J17-U01,RAS,KERNEL,INFO,"CE sym 2, at 0x0b85eee0, mask 0x05",0
8,APPREAD,1117869872,2005.06.04,R04-M1-N4-I:J18-U11,2005-06-04 00:24:32.432192,R04-M1-N4-I:J18-U11,RAS,APP,FATAL,ciod: failed to read message prefix on control stream (CioStream socket to 172.16.96.116:33569,1
9,APPREAD,1117869876,2005.06.04,R27-M1-N4-I:J18-U01,2005-06-04 00:24:36.222560,R27-M1-N4-I:J18-U01,RAS,APP,FATAL,ciod: failed to read message prefix on control stream (CioStream socket to 172.16.96.116:33370,1


In [40]:
# Half-open time window [start, end) used to inspect a slice of the log.
window_start = '2005-06-05 01:00:00.000000'
window_end = '2005-06-05 10:00:00.000000'

timestamps = original_df['Timestamp']
in_window = (timestamps >= window_start) & (timestamps < window_end)

filtered_df = original_df.loc[in_window]
filtered_df['Timestamp']

15   2005-06-05 01:18:21.778604
16   2005-06-05 01:18:33.830595
17   2005-06-05 01:19:23.822135
18   2005-06-05 05:15:59.416717
19   2005-06-05 05:16:26.686603
20   2005-06-05 05:18:39.396608
21   2005-06-05 05:23:26.239153
22   2005-06-05 05:27:43.336565
23   2005-06-05 05:40:51.726735
24   2005-06-05 06:04:18.406158
25   2005-06-05 06:18:17.802159
26   2005-06-05 06:47:07.157021
27   2005-06-05 07:43:29.979844
28   2005-06-05 08:08:44.281729
29   2005-06-05 08:08:50.547117
30   2005-06-05 08:10:16.270131
31   2005-06-05 08:10:46.344235
32   2005-06-05 08:30:01.873693
33   2005-06-05 08:30:13.824307
34   2005-06-05 08:31:04.464776
35   2005-06-05 08:32:13.659715
36   2005-06-05 08:32:27.814949
37   2005-06-05 09:03:40.673488
38   2005-06-05 09:17:46.225683
39   2005-06-05 09:18:06.694851
40   2005-06-05 09:20:16.681318
41   2005-06-05 09:20:43.944594
42   2005-06-05 09:38:03.456120
43   2005-06-05 09:38:28.957918
44   2005-06-05 09:38:50.430385
45   2005-06-05 09:39:37.590924
46   200

In [41]:
# Keep only the two columns the rest of the notebook works with.
model_columns = ['Anomaly Label', 'Content']
df = original_df.loc[:, model_columns]
df.head(n=20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,63543 double-hummer alignment exceptions
5,0,162 double-hummer alignment exceptions
6,0,141 double-hummer alignment exceptions
7,0,"CE sym 2, at 0x0b85eee0, mask 0x05"
8,1,ciod: failed to read message prefix on control stream (CioStream socket to 172.16.96.116:33569
9,1,ciod: failed to read message prefix on control stream (CioStream socket to 172.16.96.116:33370


## Create Log Sequences and Labels for Sequences

In [42]:
# n = number of complete windows, r = leftover rows that do not fill a window.
total_rows = df['Content'].index.size
n, r = divmod(total_rows, WINDOW_SIZE)

# Discard the r trailing rows, then cut the remainder into n equal windows.
windowed_contents = np.array(df['Content'])[:n * WINDOW_SIZE]
log_seqs = np.array(np.split(windowed_contents, n))

In [43]:
# Same windowing as for the message contents, applied to the row index so that
# each window can be mapped back to its rows in `df`.
windowed_index = df.index.to_numpy()[:n * WINDOW_SIZE]
log_seq_idx = np.array(np.split(windowed_index, n))

In [44]:
# A window is labelled 1 as soon as any of its rows carries an anomaly label,
# otherwise 0.
window_flags = [
    int(df.loc[window_rows, 'Anomaly Label'].to_numpy().sum() > 0)
    for window_rows in log_seq_idx
]
log_seq_anomaly_labels = np.array(window_flags, dtype=int)

## Log Parsing & Numericalization

#### Drain Parser

In [45]:
# Empty the Drain working folders (input first, then output) before a new run.
for drain_dir in (drain_input_dir, drain_output_dir):
    for stale_file in glob.glob(drain_dir + '/*'):
        os.remove(stale_file)

# Drain3 configuration file.
drain_config_file = os.path.abspath(os.path.join(project_root, config_dir, 'drain3.ini'))

# Debug outputs written by DrainParser.write_output_to_csv.
main_structured_csv_filename = 'BGL_main_structured.csv'
drain_main_structured_csv_file = os.path.abspath(
    os.path.join(project_root, drain_output_dir, main_structured_csv_filename)
)

templates_csv_filename = 'BGL_main_templates.csv'
drain_templates_csv_file = os.path.abspath(
    os.path.join(project_root, drain_output_dir, templates_csv_filename)
)

In [46]:
class DrainParser(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces every log message by a Drain3 cluster id.

    The template miner is created once per instance from `drain_config_file`
    and is fed by `transform`; `fit` itself does nothing.

    NOTE(review): because `transform` calls `add_log_message`, the miner keeps
    being updated whenever the step is applied, including on held-out data.
    """

    def __init__(self) -> None:
        super().__init__()

        config = TemplateMinerConfig()
        config.load(drain_config_file)

        self.template_miner = TemplateMiner(config=config)

    def fit(self, X, y = None):
        """No-op; present so the class can be used as a pipeline step."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse every message of `log_seqs` and return the ids, re-windowed.

        Parameters
        ----------
        log_seqs : numpy array of log messages; it is flattened before parsing.
        y : ignored.

        Returns
        -------
        numpy array with one row per window of WINDOW_SIZE messages, holding
        `cluster_id - 1` for each message.
        """
        log_seqs_list = log_seqs.reshape([-1]).tolist()

        # Each result is expected to be a dict containing at least 'cluster_id'.
        self.parsed = [self.template_miner.add_log_message(line) for line in log_seqs_list]

        # Uncomment during debug to view the parser output
        # self.write_output_to_csv()

        # Shift ids down by one (presumably Drain3 ids start at 1 - TODO confirm).
        template_seq = [x['cluster_id']-1 for x in self.parsed]
        n = math.floor(len(template_seq)/WINDOW_SIZE)
        template_seqs = np.array(np.split(np.array(template_seq), n))

        return template_seqs

    def cluster_template_to_tuple(self, cluster):
        """Return `(cluster_id, template, size)` for one miner cluster."""
        return (cluster.cluster_id, cluster.get_template(), cluster.size,)

    def write_output_to_csv(self):
        """Dump the last parse results and the current clusters to CSV (debug aid).

        Both files are rewritten on every call. The templates file used to be
        opened in append mode, which stacked a new header and a full cluster
        dump on top of the previous ones each time this method ran.
        """
        # newline='' is what the csv module requires for files it writes to.
        with open(drain_main_structured_csv_file, 'w', newline='') as drain_main_structured_csv_file_obj:
            main_structured_csv_filewriter = DictWriter(drain_main_structured_csv_file_obj, fieldnames=['template_mined', 'cluster_id', 'change_type', 'cluster_size', 'cluster_count'])
            main_structured_csv_filewriter.writeheader()
            main_structured_csv_filewriter.writerows(self.parsed)

        clusters = self.template_miner.drain.clusters

        with open(drain_templates_csv_file, 'w', newline='') as drain_templates_csv_file_obj:
            drain_templates_csv_filewriter = writer(drain_templates_csv_file_obj)
            drain_templates_csv_filewriter.writerow(['cluster_id', 'template', 'size'])
            for cluster in clusters:
                drain_templates_csv_filewriter.writerow(self.cluster_template_to_tuple(cluster))

#### Spell Parser

In [47]:
# File the Spell parser reads its input messages from.
# NOTE(review): `log_file` is re-pointed at this file as well, as in the original cell.
log_content_csv_file_name = 'bgl_2k_content.csv'
log_file = os.path.abspath(os.path.join(project_root, spell_input_dir, log_content_csv_file_name))
log_content_csv_file = log_file

# Files read back from (or cleaned up in) the Spell output folder.
main_structured_csv_filename = 'BGL_main_structured.csv'
spell_main_structured_csv_file = os.path.abspath(
    os.path.join(project_root, spell_output_dir, main_structured_csv_filename)
)

templates_csv_filename = 'BGL_main_templates.csv'
spell_templates_csv_file = os.path.abspath(
    os.path.join(project_root, spell_output_dir, templates_csv_filename)
)

In [48]:
class SpellParser(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces every log message by its Spell `EventId`.

    The messages are written to `log_content_csv_file`, parsed by the Spell
    log parser, and the ids are read back from `spell_main_structured_csv_file`.
    """

    def __init__(self) -> None:
        super().__init__()

        # The whole line is the message: there are no other fields to split off.
        log_format = '<Content>'
        tau = 0.5

        self.parser = spell.LogParser(indir=spell_input_dir, outdir=spell_output_dir,
                             log_format=log_format, tau=tau, logmain='BGL')

    def fit(self, X, y = None):
        """No-op; present so the class can be used as a pipeline step."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse every message of `log_seqs` and return the ids, re-windowed.

        Parameters
        ----------
        log_seqs : numpy array of log messages; it is flattened before parsing.
        y : ignored.

        Returns
        -------
        numpy array with one row per window of WINDOW_SIZE messages, holding
        the `EventId` of each message.
        """
        log_seqs_list = log_seqs.reshape([-1]).tolist()

        # Write one raw message per line. The previous DataFrame.to_csv call
        # CSV-quoted every message containing a comma or a double quote, and with
        # log_format '<Content>' those quote characters became part of the message.
        # NOTE(review): relies on the Spell parser reading its input line by line.
        os.makedirs(path.dirname(log_content_csv_file), exist_ok=True)
        with open(log_content_csv_file, 'w') as log_content_file_obj:
            for message in log_seqs_list:
                # Missing messages become empty lines, as to_csv wrote them.
                text = '' if pd.isna(message) else str(message)
                log_content_file_obj.write(text + '\n')

        self.parser.parse(log_content_csv_file_name)

        nums = self.numericalize()
        n = math.floor(len(nums)/WINDOW_SIZE)
        nums = np.array(np.split(nums, n))

        # Comment during debug to view parser output
        self.cleanup_files()

        return nums

    def numericalize(self):
        """Return the `EventId` column of the structured output file as an array."""
        output_df = pd.read_csv(spell_main_structured_csv_file)

        return output_df['EventId'].to_numpy()

    def cleanup_files(self):
        """Delete every regular file in the Spell input and output folders."""
        for directory in (spell_input_dir, spell_output_dir):
            for f in glob.glob(directory + '/*'):
                # os.remove cannot delete directories; leave them in place.
                if path.isfile(f):
                    os.remove(f)

## Word Embedding

In [49]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    """Pipeline step that turns each window of template ids into one vector.

    `fit` trains an unsupervised fastText CBOW model on the windows (one window
    per line, ids separated by spaces); `transform` returns the fastText
    sentence vector of each window.
    """

    def __init__(self) -> None:
        super().__init__()
        # Dimensionality of the trained embeddings.
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        """Train the fastText model on `template_seqs`.

        Parameters
        ----------
        template_seqs : 2-D array, one row per window of template ids.
        y : ignored.

        Returns
        -------
        self
        """
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, spell_output_dir, template_seqs_filename))

        # fastText trains from a file on disk, so the windows are written out first.
        # The folder is created on demand because this step also runs in the
        # Drain pipeline, where nothing else guarantees it exists.
        os.makedirs(path.dirname(template_seqs_file), exist_ok=True)
        np.savetxt(template_seqs_file, template_seqs, fmt='%s')
        try:
            self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)
        finally:
            # Remove the training file even when training fails.
            # Comment during debug to view embedding input
            os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        """Return one sentence vector per row of `template_seqs`."""
        return np.apply_along_axis(self.average_embeddings, 1, template_seqs)

    def average_embeddings(self, num_lse_vector):
        """Return the fastText sentence vector of one window of template ids."""
        sentence = ' '.join(map(str, num_lse_vector))
        return self.fasttext_model.get_sentence_vector(sentence)

## Evaluation

In [50]:
# Cross-validated comparison of the Drain-based and Spell-based pipelines.
# Every fold appends one entry to each accumulator below; the lists are read by
# later cells, so their names are part of this cell's contract.

# --- Drain accumulators ---
# True labels and positive-class probabilities of each test fold.
drain_y_real = []
drain_y_proba = []

# Precision-recall curve points and average precision per fold.
drain_precisions = []
drain_recalls = []
drain_avg_precisions = []

# ROC data per fold: TPR interpolated on drain_mean_fpr, raw TPR/FPR, and AUC.
drain_tprs = []
drain_tprs2 = []
drain_fprs = []
drain_aucs = []
drain_mean_fpr = np.linspace(0, 1, 100)

# Scores computed from the hard predictions of each fold.
drain_f1_scores = []
drain_accuracy_scores = []
drain_precision_scores = []
drain_recall_scores = []

# Number of folds processed so far.
drain_index = 0

# --- Spell accumulators (same layout as the Drain ones) ---
spell_y_real = []
spell_y_proba = []

spell_precisions = []
spell_recalls = []
spell_avg_precisions = []

spell_tprs = []
spell_tprs2 = []
spell_fprs = []
spell_aucs = []
spell_mean_fpr = np.linspace(0, 1, 100)

spell_f1_scores = []
spell_accuracy_scores = []
spell_precision_scores = []
spell_recall_scores = []

spell_index = 0

# 3 stratified splits repeated 2 times -> 6 train/test folds, seeded for the split only.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

    # Fresh pipeline per fold: parse -> embed -> Gaussian naive Bayes.
    # NOTE(review): predict_proba and predict each push the test fold through the
    # parsing and embedding steps, so the test fold is parsed twice per pipeline.
    drain_pipe = Pipeline(steps=[('parsing', DrainParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
    drain_probas_ = drain_pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict_proba(log_seqs[test])
    drain_y_pred = drain_pipe.predict(log_seqs[test])

    spell_pipe = Pipeline(steps=[('parsing', SpellParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
    spell_probas_ = spell_pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict_proba(log_seqs[test])
    spell_y_pred = spell_pipe.predict(log_seqs[test])

    ####### PR #######

    # Column 1 of the probability matrix is used as the score of the positive class.
    drain_precision, drain_recall, _ = precision_recall_curve(log_seq_anomaly_labels[test], drain_probas_[:, 1])
    drain_precisions.append(drain_precision)
    drain_recalls.append(drain_recall)

    drain_avg_precision = average_precision_score(log_seq_anomaly_labels[test], drain_probas_[:, 1])
    drain_avg_precisions.append(drain_avg_precision)
        
    drain_y_real.append(log_seq_anomaly_labels[test])
    drain_y_proba.append(drain_probas_[:, 1])

    ####### ROC #######

    drain_fpr, drain_tpr, _ = roc_curve(log_seq_anomaly_labels[test], drain_probas_[:, 1])
    # Resample the fold's ROC curve on the shared FPR grid.
    drain_tprs.append(np.interp(drain_mean_fpr, drain_fpr, drain_tpr))
    drain_tprs2.append(drain_tpr)
    drain_fprs.append(drain_fpr)
        
    # Force the interpolated curve to start at TPR 0.
    drain_tprs[-1][0] = 0.0
    # roc_auc is a scratch name reused for the Spell pipeline further down.
    roc_auc = auc(drain_fpr, drain_tpr)
    drain_aucs.append(roc_auc)

    ####### F1-score, Accuracy, Precision and Recall #######

    drain_f1_scores.append(f1_score(log_seq_anomaly_labels[test], drain_y_pred))
    drain_accuracy_scores.append(accuracy_score(log_seq_anomaly_labels[test], drain_y_pred))
    drain_precision_scores.append(precision_score(log_seq_anomaly_labels[test], drain_y_pred))
    drain_recall_scores.append(recall_score(log_seq_anomaly_labels[test], drain_y_pred))

    drain_index += 1

    ####### PR #######

    # Same bookkeeping as above, for the Spell pipeline.
    spell_precision, spell_recall, _ = precision_recall_curve(log_seq_anomaly_labels[test], spell_probas_[:, 1])
    spell_precisions.append(spell_precision)
    spell_recalls.append(spell_recall)

    spell_avg_precision = average_precision_score(log_seq_anomaly_labels[test], spell_probas_[:, 1])
    spell_avg_precisions.append(spell_avg_precision)
        
    spell_y_real.append(log_seq_anomaly_labels[test])
    spell_y_proba.append(spell_probas_[:, 1])

    ####### ROC #######

    spell_fpr, spell_tpr, _ = roc_curve(log_seq_anomaly_labels[test], spell_probas_[:, 1])
    spell_tprs.append(np.interp(spell_mean_fpr, spell_fpr, spell_tpr))
    spell_tprs2.append(spell_tpr)
    spell_fprs.append(spell_fpr)
        
    spell_tprs[-1][0] = 0.0
    roc_auc = auc(spell_fpr, spell_tpr)
    spell_aucs.append(roc_auc)

    ####### F1-score, Accuracy, Precision and Recall #######

    spell_f1_scores.append(f1_score(log_seq_anomaly_labels[test], spell_y_pred))
    spell_accuracy_scores.append(accuracy_score(log_seq_anomaly_labels[test], spell_y_pred))
    spell_precision_scores.append(precision_score(log_seq_anomaly_labels[test], spell_y_pred))
    spell_recall_scores.append(recall_score(log_seq_anomaly_labels[test], spell_y_pred))

    spell_index += 1

# Flatten the per-fold label and probability arrays into one array each.
drain_y_real = np.concatenate(drain_y_real)
drain_y_proba = np.concatenate(drain_y_proba)

spell_y_real = np.concatenate(spell_y_real)
spell_y_proba = np.concatenate(spell_y_proba)

[2022-11-25 15:17:22,130][INFO]: Starting Drain3 template miner


(266,) (134,)


Read 0M words
Number of words:  87
Number of labels: 0
Progress: 100.0% words/sec/thread:    7252 lr:  0.000000 avg.loss:  4.131035 ETA:   0h 0m 0s
[2022-11-25 15:17:22,883][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:22,896][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:22,899][INFO]: load_data() finished!
[2022-11-25 15:17:22,992][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:23,405][INFO]: Output parse file
[2022-11-25 15:17:23,414][INFO]: Output main file for append
[2022-11-25 15:17:23,426][INFO]: lastestLindId: 1330
[2022-11-25 15:17:23,450][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:23,452][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:23,453][INFO]: Store objects done.
[2022-11-25 15:17:23,454][INFO]: Parsing done. [Time taken: 0:00:00.570817]
Read 0M words
Number of words:  69
Number of labels: 0
Progress: 100.0% words/sec/thread:  675231 lr:  0.000000 avg.loss:  4.137098 ETA:   0h 0m 0s
[2022-11

(267,) (133,)


Progress: 100.0% words/sec/thread:    7271 lr:  0.000000 avg.loss:  4.134593 ETA:   0h 0m 0s
[2022-11-25 15:17:25,229][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:25,242][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:25,245][INFO]: load_data() finished!
[2022-11-25 15:17:25,343][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:25,391][INFO]: Output parse file
[2022-11-25 15:17:25,401][INFO]: Output main file for append
[2022-11-25 15:17:25,415][INFO]: lastestLindId: 1335
[2022-11-25 15:17:25,444][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:25,447][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:25,448][INFO]: Store objects done.
[2022-11-25 15:17:25,449][INFO]: Parsing done. [Time taken: 0:00:00.220105]
Read 0M words
Number of words:  73
Number of labels: 0
Progress: 100.0% words/sec/thread:    7419 lr:  0.000000 avg.loss:  4.136079 ETA:   0h 0m 0s
[2022-11-25 15:17:26,034][INFO]: Parsing file: input/spell/bgl_

(267,) (133,)


Progress: 100.0% words/sec/thread:  983113 lr:  0.000000 avg.loss:  4.128119 ETA:   0h 0m 0s
[2022-11-25 15:17:27,159][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:27,177][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:27,180][INFO]: load_data() finished!
[2022-11-25 15:17:27,284][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:27,433][INFO]: Output parse file
[2022-11-25 15:17:27,442][INFO]: Output main file for append
[2022-11-25 15:17:27,458][INFO]: lastestLindId: 1335
[2022-11-25 15:17:27,485][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:27,487][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:27,489][INFO]: Store objects done.
[2022-11-25 15:17:27,490][INFO]: Parsing done. [Time taken: 0:00:00.331063]
Read 0M words
Number of words:  78
Number of labels: 0
Progress: 100.0% words/sec/thread:    7135 lr:  0.000000 avg.loss:  4.141453 ETA:   0h 0m 0s
[2022-11-25 15:17:28,075][INFO]: Parsing file: input/spell/bgl_

(266,) (134,)


Progress: 100.0% words/sec/thread: 1276777 lr:  0.000000 avg.loss:  4.131452 ETA:   0h 0m 0s
[2022-11-25 15:17:28,919][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:28,932][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:28,934][INFO]: load_data() finished!
[2022-11-25 15:17:29,033][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:29,296][INFO]: Output parse file
[2022-11-25 15:17:29,304][INFO]: Output main file for append
[2022-11-25 15:17:29,318][INFO]: lastestLindId: 1330
[2022-11-25 15:17:29,346][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:29,349][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:29,351][INFO]: Store objects done.
[2022-11-25 15:17:29,352][INFO]: Parsing done. [Time taken: 0:00:00.432603]
Read 0M words
Number of words:  77
Number of labels: 0
Progress: 100.0% words/sec/thread:    7516 lr:  0.000000 avg.loss:  4.135534 ETA:   0h 0m 0s
[2022-11-25 15:17:29,941][INFO]: Parsing file: input/spell/bgl_

(267,) (133,)


Progress: 100.0% words/sec/thread:    7369 lr:  0.000000 avg.loss:  4.140278 ETA:   0h 0m 0s
[2022-11-25 15:17:30,872][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:30,897][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:30,900][INFO]: load_data() finished!
[2022-11-25 15:17:31,015][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:31,169][INFO]: Output parse file
[2022-11-25 15:17:31,178][INFO]: Output main file for append
[2022-11-25 15:17:31,190][INFO]: lastestLindId: 1335
[2022-11-25 15:17:31,216][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:31,219][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:31,220][INFO]: Store objects done.
[2022-11-25 15:17:31,220][INFO]: Parsing done. [Time taken: 0:00:00.348666]
Read 0M words
Number of words:  76
Number of labels: 0
Progress: 100.0% words/sec/thread:    7369 lr:  0.000000 avg.loss:  4.133438 ETA:   0h 0m 0s
[2022-11-25 15:17:31,892][INFO]: Parsing file: input/spell/bgl_

(267,) (133,)


Progress: 100.0% words/sec/thread:    7473 lr:  0.000000 avg.loss:  4.136121 ETA:   0h 0m 0s
[2022-11-25 15:17:33,221][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-25 15:17:33,235][INFO]: Loaded 100.0% of log lines.
[2022-11-25 15:17:33,237][INFO]: load_data() finished!
[2022-11-25 15:17:33,332][INFO]: Processed 100.0% of log lines.
[2022-11-25 15:17:33,394][INFO]: Output parse file
[2022-11-25 15:17:33,404][INFO]: Output main file for append
[2022-11-25 15:17:33,417][INFO]: lastestLindId: 1335
[2022-11-25 15:17:33,437][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-25 15:17:33,439][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-25 15:17:33,441][INFO]: Store objects done.
[2022-11-25 15:17:33,442][INFO]: Parsing done. [Time taken: 0:00:00.221012]
Read 0M words
Number of words:  70
Number of labels: 0
Progress: 100.0% words/sec/thread:    7248 lr:  0.000000 avg.loss:  4.135736 ETA:   0h 0m 0s
[2022-11-25 15:17:34,041][INFO]: Parsing file: input/spell/bgl_

### Print Results

In [51]:
def print_metric_report(title, spell_scores, drain_scores):
    """Print the per-fold scores of both pipelines followed by their averages.

    Parameters
    ----------
    title : heading printed above the scores.
    spell_scores : per-fold scores of the Spell pipeline.
    drain_scores : per-fold scores of the Drain pipeline.
    """
    print(title)
    print('\n')
    print('Spell: ', spell_scores)
    print('\n')
    print('Drain: ', drain_scores)
    print('\n')
    # Every section is labelled now; Precision and Recall used to print their
    # averages without this heading.
    print('Average Scores')
    print('Spell: ', np.average(spell_scores), ' Drain: ', np.average(drain_scores))


metric_reports = [
    ('F1 Scores', spell_f1_scores, drain_f1_scores),
    ('Accuracy Scores', spell_accuracy_scores, drain_accuracy_scores),
    ('Precision Scores', spell_precision_scores, drain_precision_scores),
    ('Recall Scores', spell_recall_scores, drain_recall_scores),
]

for report_index, (title, spell_scores, drain_scores) in enumerate(metric_reports):
    if report_index > 0:
        # Blank separator between consecutive sections.
        print('\n')
    print_metric_report(title, spell_scores, drain_scores)

F1 Scores


Spell:  [0.6341463414634146, 0.55, 0.6046511627906976, 0.6666666666666667, 0.6222222222222222, 0.4827586206896552]


Drain:  [0.6666666666666667, 0.5925925925925927, 0.6511627906976745, 0.6206896551724138, 0.6153846153846154, 0.631578947368421]


Average Scores
Spell:  0.5934075023054427  Drain:  0.6296792113137307


Accuracy Scores


Spell:  [0.8880597014925373, 0.8646616541353384, 0.8721804511278195, 0.8805970149253731, 0.8721804511278195, 0.8872180451127819]


Drain:  [0.8805970149253731, 0.8345864661654135, 0.8872180451127819, 0.835820895522388, 0.849624060150376, 0.8947368421052632]


Average Scores
Spell:  0.8774828863202783  Drain:  0.8637638873302659


Precision Scores


Spell:  [0.5652173913043478, 0.4782608695652174, 0.52, 0.5333333333333333, 0.5, 0.6363636363636364]


Drain:  [0.5333333333333333, 0.43243243243243246, 0.56, 0.45, 0.45714285714285713, 0.6]


Spell:  0.5388625384277558  Drain:  0.5054847704847705


Recall Scores


Spell:  [0.7222222222222222, 0.6470