### Imports & Constants

In [18]:
import os
import os.path as path
import glob
import boto3

from sagemaker import get_execution_role
from csv import (writer, DictWriter)

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

from spellpy import spell

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (auc, roc_curve, average_precision_score, precision_recall_curve, f1_score, accuracy_score,
                            recall_score, precision_score)

In [19]:
# Root against which every relative directory below is resolved.
project_root = os.getcwd()

data_dir = 'data'
log_file_name = 'bgl_2k.log'

config_dir = 'config'

input_dir = 'input'
output_dir = 'output'

spell_input_dir = 'input/spell'
spell_output_dir = 'output/spell'

drain_input_dir = 'input/drain'
drain_output_dir = 'output/drain'

# Create the parser working directories up front.
# The previous guard was `if not path.abspath(...)`; abspath() returns a
# non-empty string, so the condition was never true and nothing was created.
# exist_ok=True makes the call safe to re-run on an existing tree.
for _working_dir in (spell_input_dir, spell_output_dir, drain_input_dir, drain_output_dir):
    os.makedirs(path.abspath(path.join(project_root, _working_dir)), exist_ok=True)

# Number of consecutive log lines grouped into one sequence.
WINDOW_SIZE = 5

## Convert BGL.log file to CSV format.

In [20]:
# S3 location of the BGL CSV. The defaults are the values this notebook was
# written against; set BGL_S3_BUCKET / BGL_S3_KEY to point at another copy
# without editing the cell.
S3_BUCKET = os.environ.get('BGL_S3_BUCKET', 'sagemaker-studio-326787221562-jycpwz9gs3f')
S3_KEY = os.environ.get('BGL_S3_KEY', 'BGL_200506.csv')

# NOTE(review): `role` is not read by any cell visible here; kept in case a
# later cell needs it.
role = get_execution_role()
s3 = boto3.client('s3')
log_file = s3.get_object(Bucket=S3_BUCKET, Key=S3_KEY)

# Response body handed to pd.read_csv below.
# NOTE(review): presumably a one-shot stream, so re-running the read cell
# would need this cell re-run first - confirm.
log_csv_file = log_file['Body']

## EDA

In [21]:
original_df = pd.read_csv(log_csv_file)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
original_df.info()
original_df.head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161151 entries, 0 to 1161150
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   Unnamed: 0      1161151 non-null  int64 
 1   Anomaly Type    1161151 non-null  object
 2   Timestamp (ms)  1161151 non-null  int64 
 3   Date            1161151 non-null  object
 4   Node            1161130 non-null  object
 5   Timestamp       1161151 non-null  object
 6   Node Repeat     1161130 non-null  object
 7   Message Type    1153780 non-null  object
 8   Component       1161151 non-null  object
 9   Level           1161151 non-null  object
 10  Content         1161151 non-null  object
 11  Anomaly Label   1161151 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 106.3+ MB
None


Unnamed: 0.1,Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content,Anomaly Label
0,0,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.363779,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
1,1,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.527847,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
2,2,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
3,3,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.823719,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
4,4,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.982731,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
5,5,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.131467,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
6,6,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.293532,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
7,7,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.428563,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
8,8,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.601412,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
9,9,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.749199,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0


In [22]:
# Only the label and the raw message text are used by the cells below.
df = original_df.loc[:, ['Anomaly Label', 'Content']]
df.head(n=20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,instruction cache parity error corrected
5,0,instruction cache parity error corrected
6,0,instruction cache parity error corrected
7,0,instruction cache parity error corrected
8,0,instruction cache parity error corrected
9,0,instruction cache parity error corrected


## Create Log Sequences and Labels for Sequences

In [6]:
# n: number of complete WINDOW_SIZE-line sequences; r: leftover lines at the end.
n, r = divmod(df['Content'].index.size, WINDOW_SIZE)

# Drop the r trailing lines that do not fill a whole window and view the rest
# as an (n, WINDOW_SIZE) array. A single reshape replaces splitting into n
# arrays and stacking them again, and also works when n == 0.
log_seqs = df['Content'].to_numpy()[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

In [7]:
# Index labels of the lines in each sequence, laid out like log_seqs:
# row i holds the df index labels of the WINDOW_SIZE lines in sequence i.
# Slicing to n * WINDOW_SIZE drops the same trailing partial window.
log_seq_idx = df.index.to_numpy()[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

In [8]:
# A sequence is labelled anomalous (1) when the labels of its lines sum to a
# positive value, otherwise normal (0).
# All line labels are fetched in one label-based lookup and reshaped to the
# layout of log_seq_idx, instead of issuing one df.loc query per sequence.
_line_labels = (
    df['Anomaly Label']
    .loc[log_seq_idx.reshape(-1)]
    .to_numpy()
    .reshape(log_seq_idx.shape)
)
log_seq_anomaly_labels = (_line_labels.sum(axis=1) > 0).astype(int)

## Log Parsing & Numericalization

#### Drain Parser

In [9]:
# Start from empty Drain working directories: delete whatever a previous run
# left in the input directory first, then in the output directory.
for _drain_dir in (drain_input_dir, drain_output_dir):
    files = glob.glob(_drain_dir + '/*')
    for f in files:
        os.remove(f)

# Drain3 configuration file read by DrainParser.
drain_config_file = path.abspath(
    path.join(project_root, config_dir, 'drain3.ini')
)

# Per-line parser output (written only by DrainParser.write_output_to_csv).
main_structured_csv_filename = 'BGL_main_structured.csv'
drain_main_structured_csv_file = path.abspath(
    path.join(project_root, drain_output_dir, main_structured_csv_filename)
)

# Cluster/template summary (written only by DrainParser.write_output_to_csv).
templates_csv_filename = 'BGL_main_templates.csv'
drain_templates_csv_file = path.abspath(
    path.join(project_root, drain_output_dir, templates_csv_filename)
)

In [10]:
class DrainParser(BaseEstimator, TransformerMixin):
    """Pipeline step that maps windows of raw log lines to windows of Drain3 cluster ids.

    The template miner is configured from `drain_config_file` and lives on the
    instance, so its clusters accumulate over every `transform` call.
    """

    def __init__(self) -> None:
        super().__init__()

        config = TemplateMinerConfig()
        config.load(drain_config_file)

        self.template_miner = TemplateMiner(config=config)

    def fit(self, X, y = None):
        """Do nothing and return self; all parsing happens in `transform`."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse every log line and return the cluster ids grouped per window.

        Args:
            log_seqs: array of log lines; it is flattened before parsing, and
                its total size must be a multiple of WINDOW_SIZE.
            y: ignored.

        Returns:
            Integer array of shape (number of windows, WINDOW_SIZE) holding
            `cluster_id - 1` for every line.
        """
        log_lines = log_seqs.reshape([-1]).tolist()

        # NOTE(review): add_log_message() is the only miner call in this class
        # and it also runs for test folds, so the miner appears to keep
        # learning at transform time - confirm this is intended.
        self.parsed = [self.template_miner.add_log_message(line) for line in log_lines]

        # Uncomment during debug to view the parser output
        # self.write_output_to_csv()

        # cluster_id - 1: presumably shifts Drain's ids to start at zero - confirm.
        template_ids = np.array([entry['cluster_id'] - 1 for entry in self.parsed], dtype=int)

        # One row per window; unlike np.split this also handles zero lines.
        return template_ids.reshape(-1, WINDOW_SIZE)

    def cluster_template_to_tuple(self, cluster):
        """Return (cluster_id, template string, size) for one Drain cluster."""
        return (cluster.cluster_id, cluster.get_template(), cluster.size,)

    def write_output_to_csv(self):
        """Dump the last `transform` result and the miner's clusters to CSV (debug aid).

        Both files are rewritten from scratch on every call. The templates file
        used to be opened in append mode, which repeated the header and all
        cluster rows each time `transform` ran.
        """
        # newline='' lets the csv module control line endings itself.
        with open(drain_main_structured_csv_file, 'w', newline='') as structured_file:
            structured_writer = DictWriter(
                structured_file,
                fieldnames=['template_mined', 'cluster_id', 'change_type', 'cluster_size', 'cluster_count'],
            )
            structured_writer.writeheader()
            structured_writer.writerows(self.parsed)

        clusters = self.template_miner.drain.clusters

        with open(drain_templates_csv_file, 'w', newline='') as templates_file:
            templates_writer = writer(templates_file)
            templates_writer.writerow(['cluster_id', 'template', 'size'])
            for cluster in clusters:
                templates_writer.writerow(self.cluster_template_to_tuple(cluster))

#### Spell Parser

In [11]:
# File SpellParser writes the flattened log lines to before parsing.
log_content_csv_file_name = 'bgl_2k_content.csv'
# NOTE(review): `log_file` is bound to the same path, exactly as before; this
# rebinds the name that held the S3 response in the loading cell.
log_file = path.abspath(
    path.join(project_root, spell_input_dir, log_content_csv_file_name)
)
log_content_csv_file = log_file

# Structured output file read back by SpellParser.numericalize.
main_structured_csv_filename = 'BGL_main_structured.csv'
spell_main_structured_csv_file = path.abspath(
    path.join(project_root, spell_output_dir, main_structured_csv_filename)
)

# Path of the templates file inside the Spell output directory.
templates_csv_filename = 'BGL_main_templates.csv'
spell_templates_csv_file = path.abspath(
    path.join(project_root, spell_output_dir, templates_csv_filename)
)

In [12]:
class SpellParser(BaseEstimator, TransformerMixin):
    """Pipeline step that maps windows of raw log lines to windows of Spell event ids.

    Each `transform` call writes the lines to `log_content_csv_file`, runs the
    Spell parser over that file and reads the `EventId` column back from
    `spell_main_structured_csv_file`.
    """

    def __init__(self) -> None:
        super().__init__()

        # Every line is treated as pure message content; tau is passed to Spell as 0.5.
        self.parser = spell.LogParser(
            indir=spell_input_dir,
            outdir=spell_output_dir,
            log_format='<Content>',
            tau=0.5,
            logmain='BGL',
        )

    def fit(self, X, y = None):
        """Do nothing and return self; all parsing happens in `transform`."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse the flattened log lines and return their event ids grouped per window.

        Args:
            log_seqs: array of log lines; it is flattened before parsing.
            y: ignored.

        Returns:
            Array with one row of WINDOW_SIZE event ids per window.
        """
        flat_lines = log_seqs.reshape([-1]).tolist()

        # One line per row, no index and no header.
        pd.DataFrame(flat_lines, columns=['Content']).to_csv(
            log_content_csv_file, index=False, header=False
        )

        self.parser.parse(log_content_csv_file_name)

        event_ids = self.numericalize()
        window_count = math.floor(len(event_ids)/WINDOW_SIZE)
        event_id_seqs = np.array(np.split(event_ids, window_count))

        # Comment during debug to view parser output
        self.cleanup_files()

        return event_id_seqs

    def numericalize(self):
        """Return the `EventId` column of Spell's structured output as an array."""
        return pd.read_csv(spell_main_structured_csv_file)['EventId'].to_numpy()

    def cleanup_files(self):
        """Delete everything in the Spell input directory, then in the output directory."""
        for working_dir in (spell_input_dir, spell_output_dir):
            for leftover in glob.glob(working_dir + '/*'):
                os.remove(leftover)

## Word Embedding

In [13]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    """Pipeline step that embeds each window of template ids with a fastText CBOW model.

    `fit` trains the model on the training windows (one window per text line);
    `transform` replaces every window by its fastText sentence vector.
    """

    def __init__(self) -> None:
        super().__init__()
        # Dimensionality of the trained vectors (passed to fastText as `dim`).
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        """Train the fastText model on `template_seqs`.

        Args:
            template_seqs: 2-D array of template ids, one window per row.
            y: ignored.

        Returns:
            self
        """
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, output_dir, template_seqs_filename))

        # fastText trains from a file, so the windows are written out as
        # space-separated ids, one window per line.
        os.makedirs(path.dirname(template_seqs_file), exist_ok=True)
        np.savetxt(template_seqs_file, template_seqs, fmt='%s')

        try:
            self.fasttext_model = fasttext.train_unsupervised(
                template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS
            )
        finally:
            # Remove the training file even when training fails.
            # Comment during debug to view embedding input
            os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        """Return one embedding row per window of `template_seqs`.

        The input array is not modified.
        """
        return np.apply_along_axis(self.average_embeddings, 1, template_seqs)

    def average_embeddings(self, num_lse_vector):
        """Return the fastText sentence vector of one window of template ids.

        The ids are joined with spaces so the window is passed to fastText as
        one sentence whose words are the ids.
        """
        sentence = ' '.join(map(str, num_lse_vector))
        return self.fasttext_model.get_sentence_vector(sentence)

## Evaluation

In [14]:
# ---- Per-fold result containers ---------------------------------------------
# Every name below is kept at module level so that other cells can read it;
# one entry is appended per cross-validation fold.
drain_y_real = []
drain_y_proba = []

drain_precisions = []
drain_recalls = []
drain_avg_precisions = []

drain_tprs = []
drain_tprs2 = []
drain_fprs = []
drain_aucs = []
drain_mean_fpr = np.linspace(0, 1, 100)

drain_f1_scores = []
drain_accuracy_scores = []
drain_precision_scores = []
drain_recall_scores = []

drain_index = 0

spell_y_real = []
spell_y_proba = []

spell_precisions = []
spell_recalls = []
spell_avg_precisions = []

spell_tprs = []
spell_tprs2 = []
spell_fprs = []
spell_aucs = []
spell_mean_fpr = np.linspace(0, 1, 100)

spell_f1_scores = []
spell_accuracy_scores = []
spell_precision_scores = []
spell_recall_scores = []

spell_index = 0

# Views onto the lists above, keyed by metric, so one helper can fill either
# parser's containers. Appending through these dicts updates the module-level
# lists because they are the same objects.
_drain_results = {
    'y_real': drain_y_real, 'y_proba': drain_y_proba,
    'precisions': drain_precisions, 'recalls': drain_recalls,
    'avg_precisions': drain_avg_precisions,
    'tprs': drain_tprs, 'tprs2': drain_tprs2, 'fprs': drain_fprs, 'aucs': drain_aucs,
    'f1_scores': drain_f1_scores, 'accuracy_scores': drain_accuracy_scores,
    'precision_scores': drain_precision_scores, 'recall_scores': drain_recall_scores,
}
_spell_results = {
    'y_real': spell_y_real, 'y_proba': spell_y_proba,
    'precisions': spell_precisions, 'recalls': spell_recalls,
    'avg_precisions': spell_avg_precisions,
    'tprs': spell_tprs, 'tprs2': spell_tprs2, 'fprs': spell_fprs, 'aucs': spell_aucs,
    'f1_scores': spell_f1_scores, 'accuracy_scores': spell_accuracy_scores,
    'precision_scores': spell_precision_scores, 'recall_scores': spell_recall_scores,
}


def _evaluate_fold(parser, train_idx, test_idx, mean_fpr, results):
    """Fit one parser -> embedding -> GaussianNB pipeline on a fold and record its metrics.

    Args:
        parser: fresh parsing step for the pipeline (a DrainParser or SpellParser).
        train_idx: row indices of log_seqs used for fitting.
        test_idx: row indices of log_seqs used for scoring.
        mean_fpr: FPR grid onto which the fold's ROC curve is interpolated.
        results: dict of lists; exactly one entry is appended to each list.

    Returns:
        (pipe, probas, y_pred): the fitted pipeline, its class probabilities on
        the test fold, and the hard predictions derived from them.
    """
    y_true = log_seq_anomaly_labels[test_idx]

    pipe = Pipeline(steps=[('parsing', parser), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
    probas = pipe.fit(log_seqs[train_idx], log_seq_anomaly_labels[train_idx]).predict_proba(log_seqs[test_idx])

    # Take the most probable class from the probabilities already computed
    # rather than calling pipe.predict(): that would push the whole test fold
    # through the parser and the embedding a second time.
    y_pred = pipe.classes_[np.argmax(probas, axis=1)]

    # Column 1 is used as the score of the positive (anomalous) class.
    positive_proba = probas[:, 1]

    ####### PR #######
    precision, recall, _ = precision_recall_curve(y_true, positive_proba)
    results['precisions'].append(precision)
    results['recalls'].append(recall)
    results['avg_precisions'].append(average_precision_score(y_true, positive_proba))

    results['y_real'].append(y_true)
    results['y_proba'].append(positive_proba)

    ####### ROC #######
    fpr, tpr, _ = roc_curve(y_true, positive_proba)
    interpolated_tpr = np.interp(mean_fpr, fpr, tpr)
    interpolated_tpr[0] = 0.0  # pin the interpolated curve to start at TPR 0
    results['tprs'].append(interpolated_tpr)
    results['tprs2'].append(tpr)
    results['fprs'].append(fpr)
    results['aucs'].append(auc(fpr, tpr))

    ####### F1-score, Accuracy, Precision and Recall #######
    results['f1_scores'].append(f1_score(y_true, y_pred))
    results['accuracy_scores'].append(accuracy_score(y_true, y_pred))
    results['precision_scores'].append(precision_score(y_true, y_pred))
    results['recall_scores'].append(recall_score(y_true, y_pred))

    return pipe, probas, y_pred


# 3 stratified folds repeated twice -> 6 fits per parser; fixed seed for repeatable splits.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

    drain_pipe, drain_probas_, drain_y_pred = _evaluate_fold(
        DrainParser(), train, test, drain_mean_fpr, _drain_results
    )
    drain_index += 1

    spell_pipe, spell_probas_, spell_y_pred = _evaluate_fold(
        SpellParser(), train, test, spell_mean_fpr, _spell_results
    )
    spell_index += 1

# Pool the true labels and positive-class scores of all folds into flat arrays.
drain_y_real = np.concatenate(drain_y_real)
drain_y_proba = np.concatenate(drain_y_proba)

spell_y_real = np.concatenate(spell_y_real)
spell_y_proba = np.concatenate(spell_y_proba)

[2022-11-26 04:31:58,480][INFO]: Starting Drain3 template miner


(154820,) (77410,)


Read 0M words
Number of words:  274
Number of labels: 0
Progress: 100.0% words/sec/thread: 8982062 lr:  0.000000 avg.loss:  0.815366 ETA:   0h 0m 0s


total          : took    19.78 s (100.00%),  1,075,417 samples,   18.39 ms / 1000 samples,       54,365.71 hz
drain          : took    16.01 s ( 80.96%),  1,075,417 samples,   14.89 ms / 1000 samples,       67,152.86 hz
tree_search    : took    11.05 s ( 55.85%),  1,075,417 samples,   10.27 ms / 1000 samples,       97,333.89 hz
cluster_exist  : took     1.65 s (  8.33%),  1,075,108 samples,    1.53 ms / 1000 samples,      652,595.30 hz
mask           : took     0.55 s (  2.77%),  1,075,417 samples,    0.51 ms / 1000 samples,    1,961,101.59 hz
create_cluster : took     0.00 s (  0.02%),        309 samples,   11.44 ms / 1000 samples,       87,440.29 hz


[2022-11-26 04:32:49,950][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:32:50,259][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:32:50,358][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:32:50,454][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:32:50,666][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:32:50,761][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:32:50,855][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:32:50,949][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:32:51,044][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:32:51,145][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:32:51,245][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:32:51,342][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:32:51,437][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:32:51,654][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:32:51,749][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:32:51,845][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:32:51,940][INF

(154820,) (77410,)


Read 0M words
Number of words:  263
Number of labels: 0
Progress: 100.0% words/sec/thread: 8893106 lr:  0.000000 avg.loss:  0.821353 ETA:   0h 0m 0s


total          : took    19.65 s (100.00%),  1,078,845 samples,   18.21 ms / 1000 samples,       54,903.89 hz
drain          : took    15.87 s ( 80.76%),  1,078,845 samples,   14.71 ms / 1000 samples,       67,980.65 hz
tree_search    : took    10.89 s ( 55.43%),  1,078,845 samples,   10.10 ms / 1000 samples,       99,045.26 hz
cluster_exist  : took     1.63 s (  8.27%),  1,078,530 samples,    1.51 ms / 1000 samples,      663,298.62 hz
mask           : took     0.55 s (  2.79%),  1,078,845 samples,    0.51 ms / 1000 samples,    1,967,618.85 hz
create_cluster : took     0.00 s (  0.02%),        315 samples,   11.29 ms / 1000 samples,       88,534.86 hz


[2022-11-26 04:37:15,092][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:37:15,410][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:37:15,509][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:37:15,608][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:37:15,701][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:37:15,796][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:37:15,889][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:37:16,094][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:37:16,189][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:37:16,301][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:37:16,466][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:37:16,631][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:37:16,787][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:37:16,951][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:37:17,105][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:37:17,262][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:37:17,577][INF

(154820,) (77410,)


Read 0M words
Number of words:  282
Number of labels: 0
Progress: 100.0% words/sec/thread: 9007407 lr:  0.000000 avg.loss:  0.814617 ETA:   0h 0m 0s


total          : took    19.69 s (100.00%),  1,066,034 samples,   18.47 ms / 1000 samples,       54,134.65 hz
drain          : took    15.95 s ( 81.00%),  1,066,034 samples,   14.96 ms / 1000 samples,       66,834.24 hz
tree_search    : took    11.02 s ( 55.98%),  1,066,034 samples,   10.34 ms / 1000 samples,       96,708.40 hz
cluster_exist  : took     1.61 s (  8.17%),  1,065,720 samples,    1.51 ms / 1000 samples,      662,185.24 hz
mask           : took     0.55 s (  2.79%),  1,066,034 samples,    0.51 ms / 1000 samples,    1,942,731.63 hz
create_cluster : took     0.00 s (  0.02%),        314 samples,   10.86 ms / 1000 samples,       92,118.03 hz


[2022-11-26 04:41:43,728][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:41:44,007][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:41:44,104][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:41:44,198][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:41:44,297][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:41:44,392][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:41:44,486][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:41:44,580][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:41:44,673][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:41:44,773][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:41:44,870][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:41:44,967][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:41:45,060][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:41:45,159][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:41:45,259][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:41:45,473][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:41:45,568][INF

(154820,) (77410,)


Read 0M words
Number of words:  277
Number of labels: 0
Progress: 100.0% words/sec/thread: 8957513 lr:  0.000000 avg.loss:  0.827650 ETA:   0h 0m 0s


total          : took    19.63 s (100.00%),  1,071,283 samples,   18.33 ms / 1000 samples,       54,566.92 hz
drain          : took    15.89 s ( 80.94%),  1,071,283 samples,   14.83 ms / 1000 samples,       67,419.49 hz
tree_search    : took    10.80 s ( 55.02%),  1,071,283 samples,   10.08 ms / 1000 samples,       99,178.93 hz
cluster_exist  : took     1.73 s (  8.81%),  1,070,974 samples,    1.62 ms / 1000 samples,      619,193.49 hz
mask           : took     0.55 s (  2.78%),  1,071,283 samples,    0.51 ms / 1000 samples,    1,960,365.27 hz
create_cluster : took     0.00 s (  0.02%),        309 samples,   11.07 ms / 1000 samples,       90,373.05 hz


[2022-11-26 04:46:04,593][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:46:04,896][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:46:04,994][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:46:05,090][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:46:05,188][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:46:05,283][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:46:05,376][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:46:05,471][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:46:05,565][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:46:05,665][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:46:05,765][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:46:05,860][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:46:05,954][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:46:06,053][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:46:06,151][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:46:06,363][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:46:06,459][INF

(154820,) (77410,)


Read 0M words
Number of words:  255
Number of labels: 0
Progress: 100.0% words/sec/thread: 8914093 lr:  0.000000 avg.loss:  0.816402 ETA:   0h 0m 0s


total          : took    19.62 s (100.00%),  1,080,897 samples,   18.15 ms / 1000 samples,       55,088.41 hz
drain          : took    15.84 s ( 80.74%),  1,080,897 samples,   14.66 ms / 1000 samples,       68,229.36 hz
tree_search    : took    10.86 s ( 55.37%),  1,080,897 samples,   10.05 ms / 1000 samples,       99,500.22 hz
cluster_exist  : took     1.65 s (  8.39%),  1,080,582 samples,    1.52 ms / 1000 samples,      656,331.61 hz
mask           : took     0.56 s (  2.86%),  1,080,897 samples,    0.52 ms / 1000 samples,    1,928,418.07 hz
create_cluster : took     0.00 s (  0.02%),        315 samples,   10.90 ms / 1000 samples,       91,750.40 hz


[2022-11-26 04:50:19,393][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:50:19,678][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:50:19,775][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:50:19,869][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:50:19,967][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:50:20,063][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:50:20,156][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:50:20,250][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:50:20,344][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:50:20,443][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:50:20,542][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:50:20,748][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:50:20,842][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:50:20,941][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:50:21,040][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:50:21,135][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:50:21,230][INF

(154820,) (77410,)


Read 0M words
Number of words:  282
Number of labels: 0
Progress: 100.0% words/sec/thread: 8892149 lr:  0.000000 avg.loss:  0.819730 ETA:   0h 0m 0s


total          : took    19.70 s (100.00%),  1,050,744 samples,   18.75 ms / 1000 samples,       53,324.41 hz
drain          : took    16.04 s ( 81.38%),  1,050,744 samples,   15.26 ms / 1000 samples,       65,526.55 hz
tree_search    : took    11.22 s ( 56.95%),  1,050,744 samples,   10.68 ms / 1000 samples,       93,637.75 hz
cluster_exist  : took     1.58 s (  8.02%),  1,050,449 samples,    1.50 ms / 1000 samples,      664,705.06 hz
mask           : took     0.55 s (  2.78%),  1,050,744 samples,    0.52 ms / 1000 samples,    1,914,873.08 hz
create_cluster : took     0.00 s (  0.02%),        295 samples,   10.20 ms / 1000 samples,       98,067.66 hz


[2022-11-26 04:54:40,507][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-26 04:54:40,800][INFO]: Loaded 1.3% of log lines.
[2022-11-26 04:54:40,901][INFO]: Loaded 2.6% of log lines.
[2022-11-26 04:54:40,994][INFO]: Loaded 3.9% of log lines.
[2022-11-26 04:54:41,088][INFO]: Loaded 5.2% of log lines.
[2022-11-26 04:54:41,182][INFO]: Loaded 6.5% of log lines.
[2022-11-26 04:54:41,275][INFO]: Loaded 7.8% of log lines.
[2022-11-26 04:54:41,474][INFO]: Loaded 9.0% of log lines.
[2022-11-26 04:54:41,566][INFO]: Loaded 10.3% of log lines.
[2022-11-26 04:54:41,665][INFO]: Loaded 11.6% of log lines.
[2022-11-26 04:54:41,761][INFO]: Loaded 12.9% of log lines.
[2022-11-26 04:54:41,859][INFO]: Loaded 14.2% of log lines.
[2022-11-26 04:54:41,951][INFO]: Loaded 15.5% of log lines.
[2022-11-26 04:54:42,048][INFO]: Loaded 16.8% of log lines.
[2022-11-26 04:54:42,140][INFO]: Loaded 18.1% of log lines.
[2022-11-26 04:54:42,234][INFO]: Loaded 19.4% of log lines.
[2022-11-26 04:54:42,445][INF

#### Print Results

In [15]:
# Per-fold scores and their averages for both parsers, one section per metric.
# Every section now prints the 'Average Scores' heading; it was previously
# missing from the Precision and Recall sections.
_metric_sections = [
    ('F1 Scores', spell_f1_scores, drain_f1_scores),
    ('Accuracy Scores', spell_accuracy_scores, drain_accuracy_scores),
    ('Precision Scores', spell_precision_scores, drain_precision_scores),
    ('Recall Scores', spell_recall_scores, drain_recall_scores),
]

for _position, (_title, _spell_scores, _drain_scores) in enumerate(_metric_sections):
    if _position:
        # Blank separator between sections (none before the first).
        print('\n')
    print(_title)
    print('\n')
    print('Spell: ', _spell_scores)
    print('\n')
    print('Drain: ', _drain_scores)
    print('\n')
    print('Average Scores')
    print('Spell: ', np.average(_spell_scores), ' Drain: ', np.average(_drain_scores))

F1 Scores


Spell:  [0.0, 0.00012069277653732425, 0.0, 0.0, 0.8090517729692988, 0.7790087463556852]


Drain:  [0.9316949765088542, 0.9562978736929091, 0.9390963975320108, 0.9555794901922386, 0.9586480854389163, 0.9688547846272891]


Average Scores
Spell:  0.2646968686835869  Drain:  0.9516952679987029


Accuracy Scores


Spell:  [0.7933600310037463, 0.7859578865779615, 0.7862162511303449, 0.7862550058132024, 0.9129053093915515, 0.8922878181113552]


Drain:  [0.9731430047797442, 0.9822374370236403, 0.9762821340912027, 0.9824182922103087, 0.9835938509236533, 0.9875209921198812]


Average Scores
Spell:  0.8261637170046936  Drain:  0.9808659518580717


Precision Scores


Spell:  [0.0, 0.0017241379310344827, 0.0, 0.0, 0.7394005280323032, 0.6760200561203368]


Drain:  [0.9815173750519175, 0.9723371251292657, 1.0, 0.999385581649372, 1.0, 1.0]


Spell:  0.23619078701394572  Drain:  0.9922066803050926


Recall Scores


Spell:  [0.0, 6.253517603652055e-05, 0.0, 0.0, 0.8931899193296229, 0.9190169

In [16]:
# NOTE(review): this cell prints the same report as the preceding cell;
# consider keeping only one of them.
# Per-fold scores and their averages for both parsers, one section per metric.
# Every section now prints the 'Average Scores' heading; it was previously
# missing from the Precision and Recall sections.
_metric_sections = [
    ('F1 Scores', spell_f1_scores, drain_f1_scores),
    ('Accuracy Scores', spell_accuracy_scores, drain_accuracy_scores),
    ('Precision Scores', spell_precision_scores, drain_precision_scores),
    ('Recall Scores', spell_recall_scores, drain_recall_scores),
]

for _position, (_title, _spell_scores, _drain_scores) in enumerate(_metric_sections):
    if _position:
        # Blank separator between sections (none before the first).
        print('\n')
    print(_title)
    print('\n')
    print('Spell: ', _spell_scores)
    print('\n')
    print('Drain: ', _drain_scores)
    print('\n')
    print('Average Scores')
    print('Spell: ', np.average(_spell_scores), ' Drain: ', np.average(_drain_scores))

F1 Scores


Spell:  [0.0, 0.00012069277653732425, 0.0, 0.0, 0.8090517729692988, 0.7790087463556852]


Drain:  [0.9316949765088542, 0.9562978736929091, 0.9390963975320108, 0.9555794901922386, 0.9586480854389163, 0.9688547846272891]


Average Scores
Spell:  0.2646968686835869  Drain:  0.9516952679987029


Accuracy Scores


Spell:  [0.7933600310037463, 0.7859578865779615, 0.7862162511303449, 0.7862550058132024, 0.9129053093915515, 0.8922878181113552]


Drain:  [0.9731430047797442, 0.9822374370236403, 0.9762821340912027, 0.9824182922103087, 0.9835938509236533, 0.9875209921198812]


Average Scores
Spell:  0.8261637170046936  Drain:  0.9808659518580717


Precision Scores


Spell:  [0.0, 0.0017241379310344827, 0.0, 0.0, 0.7394005280323032, 0.6760200561203368]


Drain:  [0.9815173750519175, 0.9723371251292657, 1.0, 0.999385581649372, 1.0, 1.0]


Spell:  0.23619078701394572  Drain:  0.9922066803050926


Recall Scores


Spell:  [0.0, 6.253517603652055e-05, 0.0, 0.0, 0.8931899193296229, 0.9190169