### Imports & Constants

In [1]:
import os
import os.path as path
import glob
import boto3
from datetime import datetime

from sagemaker import get_execution_role
from csv import (writer, DictWriter)
import pickle

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

from spellpy import spell

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (auc, roc_curve, average_precision_score, precision_recall_curve, f1_score, accuracy_score,
                            recall_score, precision_score)

In [2]:
# Root against which every relative directory below is resolved.
project_root = os.getcwd()

input_dir = 'input'
output_dir = 'output'

spell_input_dir = 'input/spell'
spell_output_dir = 'output/spell'

drain_input_dir = 'input/drain'
drain_output_dir = 'output/drain'

# Create the parser working directories up front.
# The previous guard tested `path.abspath(...)`, which returns a non-empty (always truthy)
# string, so `os.makedirs` never ran. `exist_ok=True` makes the call safe to repeat.
for working_dir in (spell_input_dir, spell_output_dir, drain_input_dir, drain_output_dir):
    os.makedirs(path.abspath(path.join(project_root, working_dir)), exist_ok=True)

config_dir = 'config'
bucket_name = 'sagemaker-studio-326787221562-jycpwz9gs3f'
log_file_key = 'BGL.csv'
# Timestamped key so each run uploads its results under a distinct object name.
pickle_key = 'Output-' + datetime.now().strftime("%d-%m-%Y-%H-%M-%S") + '.pkl'

# Number of consecutive log lines grouped into one sequence.
WINDOW_SIZE = 5

## Load the BGL log file (already in CSV format) from S3.

In [3]:
# Resolve the SageMaker execution role (the value is not read again in the visible cells).
role = get_execution_role()
s3_client = boto3.client('s3')
# Fetch the object stored under `log_file_key` from `bucket_name`.
log_file = s3_client.get_object(Bucket = bucket_name, Key = log_file_key)
# The response's 'Body' entry is what the EDA cell passes to pd.read_csv.
log_csv_file = log_file['Body']

## EDA

In [4]:
# Parse the downloaded CSV body into a DataFrame and show its size, schema and first rows.
original_df = pd.read_csv(log_csv_file)
print(original_df.shape)
# DataFrame.info() prints its own report and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
original_df.info()
original_df.head(n=20)

(4713493, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4713493 entries, 0 to 4713492
Data columns (total 10 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   Anomaly Type    object
 1   Timestamp (ms)  int64 
 2   Date            object
 3   Node            object
 4   Timestamp       object
 5   Node Repeat     object
 6   Message Type    object
 7   Component       object
 8   Level           object
 9   Content         object
dtypes: int64(1), object(9)
memory usage: 359.6+ MB
None


Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content
0,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.363779,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
1,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.527847,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
2,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
3,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.823719,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
4,-,1117838570,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.50.982731,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
5,-,1117838571,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.51.131467,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
6,-,1117838571,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.51.293532,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
7,-,1117838571,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.51.428563,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
8,-,1117838571,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.51.601412,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected
9,-,1117838571,2005.06.03,R02-M1-N0-C:J12-U11,2005-06-03-15.42.51.749199,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected


In [5]:
# DataFrame.drop returns a new frame; the result must be assigned or the call is a no-op.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
original_df = original_df.drop(columns=['Timestamp (ms)', 'Node Repeat'], errors='ignore')
# Binary label: '-' in 'Anomaly Type' maps to 0, any other value maps to 1.
original_df['Anomaly Label'] = np.where(original_df['Anomaly Type'] == '-', 0, 1)
original_df['Timestamp'] = pd.to_datetime(original_df['Timestamp'], format='%Y-%m-%d-%H.%M.%S.%f')

# Only the label and the raw message text are used by the rest of the notebook.
df = original_df[['Anomaly Label', 'Content']]
df.head(20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,instruction cache parity error corrected
5,0,instruction cache parity error corrected
6,0,instruction cache parity error corrected
7,0,instruction cache parity error corrected
8,0,instruction cache parity error corrected
9,0,instruction cache parity error corrected


## Create Log Sequences and Labels for Sequences

In [6]:
# n = number of complete windows, r = trailing rows that do not fill a window.
# divmod keeps the arithmetic in integers (no float division / floor round-trip).
n, r = divmod(len(df), WINDOW_SIZE)

# Drop the r leftover rows and view the rest as an (n, WINDOW_SIZE) array of messages.
# A reshape yields the same array as splitting into n pieces and re-stacking them, without
# creating n intermediate arrays, and also works when n == 0.
log_seqs = np.array(df['Content'])[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

In [7]:
# Index labels of the rows in each window, laid out exactly like log_seqs:
# keep the first n * WINDOW_SIZE labels and view them as (n, WINDOW_SIZE).
# A reshape replaces the split-and-restack and also works when n == 0.
log_seq_idx = df.index.to_numpy()[:n * WINDOW_SIZE].reshape(n, WINDOW_SIZE)

In [8]:
# A window is anomalous (1) when at least one of its lines is labelled anomalous, else 0.
# One label-based lookup for all windows replaces a Python loop that did a separate
# pandas .loc lookup per window.
window_line_labels = (
    df['Anomaly Label']
    .loc[log_seq_idx.ravel()]
    .to_numpy()
    .reshape(log_seq_idx.shape)
)
log_seq_anomaly_labels = (window_line_labels.sum(axis=1) > 0).astype(int)

## Log Parsing & Numericalization

#### Drain Parser

In [9]:
# Start from empty Drain working directories: remove every entry left by a previous run,
# input directory first, then output directory.
for drain_dir in (drain_input_dir, drain_output_dir):
    for stale_entry in glob.glob(drain_dir + '/*'):
        os.remove(stale_entry)

# Absolute locations of the Drain3 configuration and of the two CSV dumps
# that DrainParser.write_output_to_csv produces.
drain_config_file = path.abspath(
    path.join(project_root, config_dir, 'drain3.ini')
)

main_structured_csv_filename = 'BGL_main_structured.csv'
drain_main_structured_csv_file = path.abspath(
    path.join(project_root, drain_output_dir, main_structured_csv_filename)
)

templates_csv_filename = 'BGL_main_templates.csv'
drain_templates_csv_file = path.abspath(
    path.join(project_root, drain_output_dir, templates_csv_filename)
)

In [10]:
class DrainParser(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that replaces each log line by its Drain3 cluster id.

    A fresh ``TemplateMiner`` is configured from ``drain_config_file`` on construction.
    ``fit`` is a no-op: the miner is fed through ``add_log_message`` inside ``transform``,
    so every call to ``transform`` (on any data) updates the miner's state.
    """

    def __init__(self) -> None:
        super().__init__()

        config = TemplateMinerConfig()
        config.load(drain_config_file)

        self.template_miner = TemplateMiner(config=config)

    def fit(self, X, y = None):
        """Do nothing and return ``self``; parsing happens in ``transform``."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse every line of ``log_seqs`` and return the cluster ids per window.

        Parameters
        ----------
        log_seqs : numpy array of log messages; it is flattened before parsing.
        y : ignored.

        Returns
        -------
        numpy array with ``WINDOW_SIZE`` columns holding ``cluster_id - 1`` for each line
        (presumably to make the ids zero-based - confirm against Drain3).

        Raises
        ------
        ValueError if the number of lines is not a multiple of ``WINDOW_SIZE``.
        """
        log_lines = log_seqs.reshape([-1]).tolist()
        # Keep the raw miner results so write_output_to_csv can dump them.
        self.parsed = [self.template_miner.add_log_message(line) for line in log_lines]

        # Uncomment during debug to view the parser output
        # self.write_output_to_csv()

        template_ids = np.array([result['cluster_id'] - 1 for result in self.parsed])

        # Regroup the flat id sequence into rows of WINDOW_SIZE ids.
        return template_ids.reshape(-1, WINDOW_SIZE)

    def cluster_template_to_tuple(self, cluster):
        """Return ``(cluster_id, template, size)`` for one miner cluster."""
        return (cluster.cluster_id, cluster.get_template(), cluster.size,)

    def write_output_to_csv(self):
        """Dump the latest parse results and the miner's clusters to the two Drain CSV files.

        Both files are rewritten on each call, so each holds a single header row.
        The ``with`` blocks close the files; no explicit close is needed.
        """
        # newline='' is what the csv module expects from the file object it writes to.
        with open(drain_main_structured_csv_file, 'w', newline='') as structured_file:
            structured_writer = DictWriter(
                structured_file,
                fieldnames=['template_mined', 'cluster_id', 'change_type', 'cluster_size', 'cluster_count'],
            )
            structured_writer.writeheader()
            structured_writer.writerows(self.parsed)

        clusters = self.template_miner.drain.clusters

        # Mode 'w' (not 'a'): appending repeated the header row in the middle of the file
        # on every dump, while the structured file above was already being overwritten.
        with open(drain_templates_csv_file, 'w', newline='') as templates_file:
            templates_writer = writer(templates_file)
            templates_writer.writerow(['cluster_id', 'template', 'size'])
            templates_writer.writerows(self.cluster_template_to_tuple(cluster) for cluster in clusters)

#### Spell Parser

In [11]:
# File that SpellParser.transform writes the raw log lines to before parsing them.
log_content_csv_file_name = 'bgl_2k_content.csv'
# Previously this was a chained assignment that also rebound `log_file`, overwriting the
# S3 response stored under that name by the download cell with a path string.
log_content_csv_file = path.abspath(path.join(project_root, spell_input_dir, log_content_csv_file_name))

# Structured output read back by SpellParser.numericalize.
main_structured_csv_filename = 'BGL_main_structured.csv'
spell_main_structured_csv_file = path.abspath(path.join(project_root, spell_output_dir, main_structured_csv_filename))

templates_csv_filename = 'BGL_main_templates.csv'
spell_templates_csv_file = path.abspath(path.join(project_root, spell_output_dir, templates_csv_filename))

In [12]:
class SpellParser(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that replaces each log line by its Spell ``EventId``.

    The lines are written to ``log_content_csv_file``, parsed by ``spell.LogParser``,
    and the ids are read back from ``spell_main_structured_csv_file``.
    ``fit`` is a no-op; all work happens in ``transform``.
    """

    def __init__(self) -> None:
        super().__init__()

        # Each input line is treated as a single <Content> field.
        self.parser = spell.LogParser(
            indir=spell_input_dir,
            outdir=spell_output_dir,
            log_format='<Content>',
            tau=0.5,
            logmain='BGL',
        )

    def fit(self, X, y = None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, log_seqs, y = None):
        """Parse every line of ``log_seqs`` and return the event ids grouped per window.

        ``log_seqs`` is flattened, dumped to disk, parsed, and the resulting id column is
        split into ``floor(len / WINDOW_SIZE)`` equal rows. The working files are deleted
        before returning.
        """
        flat_lines = log_seqs.reshape([-1]).tolist()

        # NOTE(review): DataFrame.to_csv applies CSV quoting to lines that contain commas
        # or quotes - confirm Spell receives the intended message text.
        pd.DataFrame(flat_lines, columns=['Content']).to_csv(
            log_content_csv_file, index=False, header=False
        )

        self.parser.parse(log_content_csv_file_name)

        event_ids = self.numericalize()
        window_count = math.floor(len(event_ids)/WINDOW_SIZE)
        windows = np.array(np.split(event_ids, window_count))

        # Comment during debug to view parser output
        self.cleanup_files()

        return windows

    def numericalize(self):
        """Return the ``EventId`` column of the structured Spell output as a numpy array."""
        return pd.read_csv(spell_main_structured_csv_file)['EventId'].to_numpy()

    def cleanup_files(self):
        """Delete every entry in the Spell input directory, then in the Spell output directory."""
        for spell_dir in (spell_input_dir, spell_output_dir):
            for leftover in glob.glob(spell_dir + '/*'):
                os.remove(leftover)

## Word Embedding

In [13]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that embeds each template-id window with fastText.

    ``fit`` trains an unsupervised CBOW model on the windows (one window per text line,
    ids separated by spaces); ``transform`` maps each window to its sentence vector.
    """

    def __init__(self) -> None:
        super().__init__()
        # Dimensionality of the fastText vectors.
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        """Train the fastText model on ``template_seqs`` and return ``self``.

        The windows are written to a text file under ``output_dir`` because
        ``fasttext.train_unsupervised`` is given a file path; the file is removed afterwards.
        """
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, output_dir, template_seqs_filename))
        # Do not rely on an earlier cell having created the output directory.
        os.makedirs(path.dirname(template_seqs_file), exist_ok=True)
        np.savetxt(template_seqs_file, template_seqs, fmt='%s')
        self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)

        # Comment during debug to view embedding input
        os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        """Return one sentence vector per row of ``template_seqs``.

        The input is only read, so no defensive copy of it is made.
        """
        return np.apply_along_axis(self.average_embeddings, 1, template_seqs)

    def average_embeddings(self, num_lse_vector):
        """Return the fastText sentence vector of one window.

        The ids are joined with single spaces, the same layout ``fit`` writes to the
        training file.
        """
        return self.fasttext_model.get_sentence_vector(' '.join(map(str, num_lse_vector)))

## Evaluation

In [None]:
def _build_pipeline(parser):
    """Return the parse -> embed -> classify pipeline built around ``parser``.

    The classifier is a 100-tree random forest with a fixed seed. Replace the last step
    with ``('gnb', GaussianNB())`` to run the Gaussian Naive Bayes variant instead.
    """
    return Pipeline(steps=[
        ('parsing', parser),
        ('word_embedding', WordEmbedding()),
        ('random_forest', RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)),
    ])


def _fit_and_predict(pipe, X_train, y_train, X_test):
    """Fit ``pipe`` and return ``(class probabilities, predicted labels)`` for ``X_test``.

    The labels are taken as the most probable class of each row instead of calling
    ``pipe.predict``: that would parse and embed the whole test fold a second time,
    and every extra ``transform`` call also feeds the Drain miner again.
    """
    probas = pipe.fit(X_train, y_train).predict_proba(X_test)
    y_pred = pipe.classes_[np.argmax(probas, axis=1)]
    return probas, y_pred


def _record_fold(store, y_true, probas, y_pred, mean_fpr):
    """Append one fold's PR, ROC and point metrics to the lists held in ``store``.

    ``store`` maps metric names to lists, ``probas`` is the ``predict_proba`` output
    (column 1 is used as the anomaly score) and ``mean_fpr`` is the common FPR grid
    the ROC curve is interpolated onto.
    """
    anomaly_scores = probas[:, 1]

    ####### PR #######
    precision, recall, _ = precision_recall_curve(y_true, anomaly_scores)
    store['precisions'].append(precision)
    store['recalls'].append(recall)
    store['avg_precisions'].append(average_precision_score(y_true, anomaly_scores))

    store['y_real'].append(y_true)
    store['y_proba'].append(anomaly_scores)

    ####### ROC #######
    fpr, tpr, _ = roc_curve(y_true, anomaly_scores)
    interpolated_tpr = np.interp(mean_fpr, fpr, tpr)
    # Force the interpolated curve to start at TPR 0.
    interpolated_tpr[0] = 0.0
    store['tprs'].append(interpolated_tpr)
    store['tprs2'].append(tpr)
    store['fprs'].append(fpr)
    store['aucs'].append(auc(fpr, tpr))

    ####### F1-score, Accuracy, Precision and Recall #######
    store['f1_scores'].append(f1_score(y_true, y_pred))
    store['accuracy_scores'].append(accuracy_score(y_true, y_pred))
    store['precision_scores'].append(precision_score(y_true, y_pred))
    store['recall_scores'].append(recall_score(y_true, y_pred))


# Per-fold accumulators for the Drain pipeline (names are read by the result cells).
drain_y_real = []
drain_y_proba = []

drain_precisions = []
drain_recalls = []
drain_avg_precisions = []

drain_tprs = []
drain_tprs2 = []
drain_fprs = []
drain_aucs = []
drain_mean_fpr = np.linspace(0, 1, 100)

drain_f1_scores = []
drain_accuracy_scores = []
drain_precision_scores = []
drain_recall_scores = []

drain_index = 0

# Per-fold accumulators for the Spell pipeline.
spell_y_real = []
spell_y_proba = []

spell_precisions = []
spell_recalls = []
spell_avg_precisions = []

spell_tprs = []
spell_tprs2 = []
spell_fprs = []
spell_aucs = []
spell_mean_fpr = np.linspace(0, 1, 100)

spell_f1_scores = []
spell_accuracy_scores = []
spell_precision_scores = []
spell_recall_scores = []

spell_index = 0

# The stores hold the very same list objects as the names above, so appending through
# a store fills the module-level lists.
drain_store = {
    'y_real': drain_y_real, 'y_proba': drain_y_proba,
    'precisions': drain_precisions, 'recalls': drain_recalls, 'avg_precisions': drain_avg_precisions,
    'tprs': drain_tprs, 'tprs2': drain_tprs2, 'fprs': drain_fprs, 'aucs': drain_aucs,
    'f1_scores': drain_f1_scores, 'accuracy_scores': drain_accuracy_scores,
    'precision_scores': drain_precision_scores, 'recall_scores': drain_recall_scores,
}
spell_store = {
    'y_real': spell_y_real, 'y_proba': spell_y_proba,
    'precisions': spell_precisions, 'recalls': spell_recalls, 'avg_precisions': spell_avg_precisions,
    'tprs': spell_tprs, 'tprs2': spell_tprs2, 'fprs': spell_fprs, 'aucs': spell_aucs,
    'f1_scores': spell_f1_scores, 'accuracy_scores': spell_accuracy_scores,
    'precision_scores': spell_precision_scores, 'recall_scores': spell_recall_scores,
}

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

    X_train, X_test = log_seqs[train], log_seqs[test]
    y_train, y_test = log_seq_anomaly_labels[train], log_seq_anomaly_labels[test]

    drain_pipe = _build_pipeline(DrainParser())
    drain_probas_, drain_y_pred = _fit_and_predict(drain_pipe, X_train, y_train, X_test)

    spell_pipe = _build_pipeline(SpellParser())
    spell_probas_, spell_y_pred = _fit_and_predict(spell_pipe, X_train, y_train, X_test)

    _record_fold(drain_store, y_test, drain_probas_, drain_y_pred, drain_mean_fpr)
    drain_index += 1

    _record_fold(spell_store, y_test, spell_probas_, spell_y_pred, spell_mean_fpr)
    spell_index += 1

# Pool the held-out labels and scores of all folds into flat arrays.
drain_y_real = np.concatenate(drain_y_real)
drain_y_proba = np.concatenate(drain_y_proba)

spell_y_real = np.concatenate(spell_y_real)
spell_y_proba = np.concatenate(spell_y_proba)

[2022-11-27 06:09:32,610][INFO]: Starting Drain3 template miner


(628465,) (314233,)
total          : took    27.40 s (100.00%),    842,128 samples,   32.53 ms / 1000 samples,       30,739.51 hz
drain          : took    22.20 s ( 81.03%),    842,128 samples,   26.36 ms / 1000 samples,       37,937.21 hz
tree_search    : took    15.06 s ( 54.96%),    842,128 samples,   17.88 ms / 1000 samples,       55,935.46 hz
cluster_exist  : took     2.49 s (  9.08%),    841,866 samples,    2.95 ms / 1000 samples,      338,456.90 hz
mask           : took     0.63 s (  2.29%),    842,128 samples,    0.74 ms / 1000 samples,    1,344,488.04 hz
create_cluster : took     0.00 s (  0.01%),        262 samples,   14.09 ms / 1000 samples,       70,988.87 hz
total          : took    55.48 s (100.00%),  1,392,639 samples,   39.84 ms / 1000 samples,       25,102.44 hz
drain          : took    46.60 s ( 84.00%),  1,392,639 samples,   33.46 ms / 1000 samples,       29,885.14 hz
tree_search    : took    34.58 s ( 62.33%),  1,392,639 samples,   24.83 ms / 1000 samples,       40,

Read 3M words
Number of words:  1339
Number of labels: 0
Progress: 100.0% words/sec/thread: 4381809 lr:  0.000000 avg.loss:  0.446006 ETA:   0h 0m 0s


total          : took   117.43 s (100.00%),  3,142,326 samples,   37.37 ms / 1000 samples,       26,759.73 hz
drain          : took    97.52 s ( 83.04%),  3,142,326 samples,   31.03 ms / 1000 samples,       32,223.18 hz
tree_search    : took    68.85 s ( 58.64%),  3,142,326 samples,   21.91 ms / 1000 samples,       45,637.39 hz
cluster_exist  : took    10.51 s (  8.95%),  3,140,988 samples,    3.35 ms / 1000 samples,      298,768.95 hz
mask           : took     2.37 s (  2.02%),  3,142,326 samples,    0.76 ms / 1000 samples,    1,324,157.49 hz
create_cluster : took     0.09 s (  0.08%),      1,338 samples,   68.37 ms / 1000 samples,       14,626.37 hz
total          : took   146.54 s (100.00%),  3,442,388 samples,   42.57 ms / 1000 samples,       23,491.89 hz
drain          : took   124.81 s ( 85.18%),  3,442,388 samples,   36.26 ms / 1000 samples,       27,580.15 hz
tree_search    : took    93.63 s ( 63.89%),  3,442,388 samples,   27.20 ms / 1000 samples,       36,767.43 hz
cluster_ex

[2022-11-27 06:26:20,655][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-27 06:26:21,761][INFO]: Loaded 0.3% of log lines.
[2022-11-27 06:26:21,856][INFO]: Loaded 0.6% of log lines.
[2022-11-27 06:26:21,948][INFO]: Loaded 1.0% of log lines.
[2022-11-27 06:26:22,041][INFO]: Loaded 1.3% of log lines.
[2022-11-27 06:26:22,395][INFO]: Loaded 1.6% of log lines.
[2022-11-27 06:26:22,487][INFO]: Loaded 1.9% of log lines.
[2022-11-27 06:26:22,578][INFO]: Loaded 2.2% of log lines.
[2022-11-27 06:26:22,670][INFO]: Loaded 2.5% of log lines.
[2022-11-27 06:26:22,769][INFO]: Loaded 2.9% of log lines.
[2022-11-27 06:26:22,865][INFO]: Loaded 3.2% of log lines.
[2022-11-27 06:26:22,959][INFO]: Loaded 3.5% of log lines.
[2022-11-27 06:26:23,052][INFO]: Loaded 3.8% of log lines.
[2022-11-27 06:26:23,155][INFO]: Loaded 4.1% of log lines.
[2022-11-27 06:26:23,516][INFO]: Loaded 4.5% of log lines.
[2022-11-27 06:26:23,609][INFO]: Loaded 4.8% of log lines.
[2022-11-27 06:26:23,702][INFO]: Load

(628465,) (314233,)
total          : took    27.83 s (100.00%),  1,030,521 samples,   27.01 ms / 1000 samples,       37,026.81 hz
drain          : took    24.04 s ( 86.38%),  1,030,521 samples,   23.33 ms / 1000 samples,       42,864.27 hz
tree_search    : took    18.88 s ( 67.83%),  1,030,521 samples,   18.32 ms / 1000 samples,       54,586.19 hz
cluster_exist  : took     1.74 s (  6.23%),  1,030,211 samples,    1.68 ms / 1000 samples,      593,767.48 hz
mask           : took     0.49 s (  1.77%),  1,030,521 samples,    0.48 ms / 1000 samples,    2,087,888.89 hz
create_cluster : took     0.00 s (  0.01%),        310 samples,   10.61 ms / 1000 samples,       94,240.36 hz
total          : took    54.63 s (100.00%),  2,071,725 samples,   26.37 ms / 1000 samples,       37,920.22 hz
drain          : took    44.39 s ( 81.25%),  2,071,725 samples,   21.43 ms / 1000 samples,       46,671.81 hz
tree_search    : took    30.24 s ( 55.35%),  2,071,725 samples,   14.60 ms / 1000 samples,       68,

Read 3M words
Number of words:  1468
Number of labels: 0
Progress: 100.0% words/sec/thread: 4177087 lr:  0.000000 avg.loss:  0.452049 ETA:   0h 0m 0s


total          : took   106.12 s (100.00%),  3,142,326 samples,   33.77 ms / 1000 samples,       29,610.03 hz
drain          : took    88.02 s ( 82.94%),  3,142,326 samples,   28.01 ms / 1000 samples,       35,699.30 hz
tree_search    : took    61.39 s ( 57.85%),  3,142,326 samples,   19.54 ms / 1000 samples,       51,184.00 hz
cluster_exist  : took     9.98 s (  9.40%),  3,140,859 samples,    3.18 ms / 1000 samples,      314,827.14 hz
mask           : took     2.23 s (  2.10%),  3,142,326 samples,    0.71 ms / 1000 samples,    1,410,024.42 hz
create_cluster : took     0.12 s (  0.12%),      1,467 samples,   84.91 ms / 1000 samples,       11,777.31 hz
total          : took   135.56 s (100.00%),  3,457,192 samples,   39.21 ms / 1000 samples,       25,502.59 hz
drain          : took   116.34 s ( 85.82%),  3,457,192 samples,   33.65 ms / 1000 samples,       29,716.98 hz
tree_search    : took    88.15 s ( 65.03%),  3,457,192 samples,   25.50 ms / 1000 samples,       39,218.29 hz
cluster_ex

[2022-11-27 07:13:48,011][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-27 07:13:49,178][INFO]: Loaded 0.3% of log lines.
[2022-11-27 07:13:49,273][INFO]: Loaded 0.6% of log lines.
[2022-11-27 07:13:49,364][INFO]: Loaded 1.0% of log lines.
[2022-11-27 07:13:49,455][INFO]: Loaded 1.3% of log lines.
[2022-11-27 07:13:49,546][INFO]: Loaded 1.6% of log lines.
[2022-11-27 07:13:49,638][INFO]: Loaded 1.9% of log lines.
[2022-11-27 07:13:49,728][INFO]: Loaded 2.2% of log lines.
[2022-11-27 07:13:49,819][INFO]: Loaded 2.5% of log lines.
[2022-11-27 07:13:49,921][INFO]: Loaded 2.9% of log lines.
[2022-11-27 07:13:50,015][INFO]: Loaded 3.2% of log lines.
[2022-11-27 07:13:50,106][INFO]: Loaded 3.5% of log lines.
[2022-11-27 07:13:50,197][INFO]: Loaded 3.8% of log lines.
[2022-11-27 07:13:50,293][INFO]: Loaded 4.1% of log lines.
[2022-11-27 07:13:50,384][INFO]: Loaded 4.5% of log lines.
[2022-11-27 07:13:50,476][INFO]: Loaded 4.8% of log lines.
[2022-11-27 07:13:50,567][INFO]: Load

#### Print Results

In [None]:
# One (title, spell scores, drain scores) entry per metric, printed in this order.
score_sections = [
    ('F1 Scores', spell_f1_scores, drain_f1_scores),
    ('Accuracy Scores', spell_accuracy_scores, drain_accuracy_scores),
    ('Precision Scores', spell_precision_scores, drain_precision_scores),
    ('Recall Scores', spell_recall_scores, drain_recall_scores),
]

for section_no, (section_title, spell_scores, drain_scores) in enumerate(score_sections):
    # Sections are separated by a blank gap; nothing is printed after the last one.
    if section_no > 0:
        print('\n')

    print(section_title)
    print('Spell: ', spell_scores)
    print('Drain: ', drain_scores)
    print('Average Scores - ', 'Spell: ', np.average(spell_scores), ' Drain: ', np.average(drain_scores))

In [None]:
# The payload is a positional list: for each metric (F1, accuracy, precision, recall)
# the Spell scores come first, followed by the Drain scores.
collected_scores = [
    spell_f1_scores, drain_f1_scores,
    spell_accuracy_scores, drain_accuracy_scores,
    spell_precision_scores, drain_precision_scores,
    spell_recall_scores, drain_recall_scores,
]
pickle_byte_obj = pickle.dumps(collected_scores)

# Store the serialized scores in the bucket under this run's timestamped key.
s3_resource = boto3.resource("s3")
s3_resource.Object(bucket_name, pickle_key).put(Body=pickle_byte_obj)