In [5]:
import os
import os.path as path
import glob
import boto3

from csv import (writer, DictWriter)

import pandas as pd
import math
import numpy as np

from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig

from spellpy import spell
#from fastai.text.all import Numericalize

import fasttext

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (confusion_matrix, precision_score, recall_score, f1_score)

In [6]:
project_root = os.getcwd()

data_dir = 'data'
log_file_name = 'bgl_2k.log'

config_dir = 'config'

input_dir = 'input'
spell_output_dir = 'output'

spell_input_dir = 'input/spell'
spell_output_dir = 'output/spell'

drain_input_dir = 'input/drain'
drain_output_dir = 'output/drain'

WINDOW_SIZE = 5

## Convert BGL.log file to CSV format.

In [7]:
s3 = boto3.client('s3')
log_file = s3.get_object(Bucket = 'sagemaker-studio-326787221562-jycpwz9gs3f', Key = 'BGL_200506.csv')
log_csv_file = log_file['Body']

## EDA

In [8]:
original_df = pd.read_csv(log_csv_file)
print(original_df.info())
original_df.head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161151 entries, 0 to 1161150
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   Unnamed: 0      1161151 non-null  int64 
 1   Anomaly Type    1161151 non-null  object
 2   Timestamp (ms)  1161151 non-null  int64 
 3   Date            1161151 non-null  object
 4   Node            1161130 non-null  object
 5   Timestamp       1161151 non-null  object
 6   Node Repeat     1161130 non-null  object
 7   Message Type    1153780 non-null  object
 8   Component       1161151 non-null  object
 9   Level           1161151 non-null  object
 10  Content         1161151 non-null  object
 11  Anomaly Label   1161151 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 106.3+ MB
None


Unnamed: 0.1,Unnamed: 0,Anomaly Type,Timestamp (ms),Date,Node,Timestamp,Node Repeat,Message Type,Component,Level,Content,Anomaly Label
0,0,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.363779,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
1,1,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.527847,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
2,2,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.675872,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
3,3,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.823719,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
4,4,-,1117838570,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:50.982731,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
5,5,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.131467,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
6,6,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.293532,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
7,7,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.428563,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
8,8,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.601412,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0
9,9,-,1117838571,2005-06-03 00:00:00.000000,R02-M1-N0-C:J12-U11,2005-06-03 15:42:51.749199,R02-M1-N0-C:J12-U11,RAS,KERNEL,INFO,instruction cache parity error corrected,0


In [9]:
df = original_df[['Anomaly Label', 'Content']]
df.head(20)

Unnamed: 0,Anomaly Label,Content
0,0,instruction cache parity error corrected
1,0,instruction cache parity error corrected
2,0,instruction cache parity error corrected
3,0,instruction cache parity error corrected
4,0,instruction cache parity error corrected
5,0,instruction cache parity error corrected
6,0,instruction cache parity error corrected
7,0,instruction cache parity error corrected
8,0,instruction cache parity error corrected
9,0,instruction cache parity error corrected


## Create Log Sequences and Labels for Sequences

In [10]:
n = math.floor(df['Content'].index.size/WINDOW_SIZE)
r = math.floor(df['Content'].index.size%WINDOW_SIZE)

if r != 0:
    log_seqs = np.array(np.split(np.array(df['Content'])[:-r], n))
else:
    log_seqs = np.array(np.split(np.array(df['Content']), n))

In [11]:
if r != 0:
    log_seq_idx = np.array(np.split(df.index.to_numpy()[:-r], n))
else:
    log_seq_idx = np.array(np.split(df.index.to_numpy(), n))

In [12]:
log_seq_anomaly_labels = np.empty([n], dtype=int)
i = 0
for seq in log_seq_idx:
    if np.sum(df.loc[seq]['Anomaly Label'].values) > 0:
        log_seq_anomaly_labels[i] = 1
    else:
        log_seq_anomaly_labels[i] = 0
    i += 1

## Log Parsing & Numericalization

#### Drain Parser

In [13]:
files = glob.glob(drain_input_dir + '/*')
for f in files:
    os.remove(f)

files = glob.glob(drain_output_dir + '/*')
for f in files:
    os.remove(f)

drain_config_file = path.abspath(path.join(project_root, '../', config_dir, 'drain3.ini'))

main_structured_csv_filename = 'BGL_main_structured.csv'
drain_main_structured_csv_file = path.abspath(path.join(project_root, drain_output_dir, main_structured_csv_filename))

templates_csv_filename = 'BGL_main_templates.csv'
drain_templates_csv_file = path.abspath(path.join(project_root, drain_output_dir, templates_csv_filename))

In [14]:
class DrainParser(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        
        config = TemplateMinerConfig()
        config.load(drain_config_file)

        self.template_miner = TemplateMiner(config=config)

    def fit(self, X, y = None):
        return self

    def transform(self, log_seqs, y = None):
        log_seqs_list = log_seqs.reshape([-1]).tolist()
        self.parsed = []

        for line in log_seqs_list:
            self.parsed.append(self.template_miner.add_log_message(line))

        # Uncomment during debug to view the parser output
        # self.write_output_to_csv()

        template_seq = [str(x['cluster_id']-1) for x in self.parsed]
        n = math.floor(len(template_seq)/WINDOW_SIZE)
        template_seqs = np.array(np.split(np.array(template_seq), n))

        return template_seqs
    
    def cluster_template_to_tuple(self, cluster):
        return (cluster.cluster_id, cluster.get_template(), cluster.size,)

    def write_output_to_csv(self):
        with open(drain_main_structured_csv_file, 'w') as drain_main_structured_csv_file_obj:
            main_structured_csv_filewriter = DictWriter(drain_main_structured_csv_file_obj, fieldnames=['template_mined', 'cluster_id', 'change_type', 'cluster_size', 'cluster_count'])
            main_structured_csv_filewriter.writeheader()
            for line in self.parsed:
                main_structured_csv_filewriter.writerow(line)
            drain_main_structured_csv_file_obj.close
            
        clusters = self.template_miner.drain.clusters

        with open(drain_templates_csv_file, 'a') as drain_templates_csv_file_obj:
            drain_templates_csv_filewriter = writer(drain_templates_csv_file_obj)
            drain_templates_csv_filewriter.writerow(header for header in ['cluster_id', 'template', 'size'])
            for line in clusters:
                drain_templates_csv_filewriter.writerow(self.cluster_template_to_tuple(line))
            drain_templates_csv_file_obj.close

#### Spell Parser

In [15]:
log_content_csv_file_name = 'bgl_200506_content.csv'
log_content_csv_file = log_file = path.abspath(path.join(project_root, spell_input_dir, log_content_csv_file_name))

main_structured_csv_filename = 'BGL_main_structured.csv'
spell_main_structured_csv_file = path.abspath(path.join(project_root, spell_output_dir, main_structured_csv_filename))

templates_csv_filename = 'BGL_main_templates.csv'
spell_templates_csv_file = path.abspath(path.join(project_root, spell_output_dir, templates_csv_filename))

In [20]:
class SpellParser(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        
        log_format = '<Content>'
        tau = 0.5

        self.parser = spell.LogParser(indir=spell_input_dir, outdir=spell_output_dir,
                             log_format=log_format, tau=tau, logmain='BGL')
        self.batch_size = 100

    def fit(self, X, y = None):
        self.template_seqs = []
        return self

    def transform(self, log_seqs, y = None):
        log_seqs_list = log_seqs.reshape([-1]).tolist()
        log_seqs_list = [log_seqs_list[i * self.batch_size :(i+1) * self.batch_size] for i in range((len(log_seqs_list) + self.batch_size - 1) // self.batch_size)]

        for log_seqs in log_seqs_list:
            ldf = pd.DataFrame(log_seqs, columns=['Content'])
            ldf.to_csv(log_content_csv_file, index=False, header=False)

            self.parser.parse(log_content_csv_file_name)

            nums = self.numericalize()
            n = math.floor(len(nums)/WINDOW_SIZE)
            self.template_seqs.append(np.array(np.split(nums, n)).reshape([-1]).tolist())
            
            # Comment during debug to view parser output
            #self.cleanup_files()
        
        print('template seqs: ', self.template_seqs)

        return self.template_seqs

    def numericalize(self):
        #output_df = pd.read_csv(spell_main_structured_csv_file)
        #template_df = pd.read_csv(spell_templates_csv_file)

        #vocab = template_df['EventId']
        #text = output_df['EventId']

        #num = Numericalize(vocab.to_numpy(), min_freq=1)
        #num.setup()
        #nums = np.array(num(text.to_numpy()))

        #return nums
        output_df = pd.read_csv(spell_main_structured_csv_file)

        return output_df['EventId'].to_numpy()

    def cleanup_files(self):
        files = glob.glob(spell_input_dir + '/*')
        for f in files:
            os.remove(f)

        files = glob.glob(spell_output_dir + '/*')
        for f in files:
            os.remove(f)

## Word Embedding

In [15]:
class WordEmbedding(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, spell_output_dir, template_seqs_filename))
        np.savetxt(template_seqs_file, template_seqs)
        self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)
        
        # Comment during debug to view embedding input
        os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        template_seqs_ = template_seqs.copy()
        template_seqs_ = np.apply_along_axis(self.average_embeddings, 1, template_seqs)

        return template_seqs_

    def average_embeddings(self, num_lse_vector):
        w2v_vector = [self.fasttext_model.get_word_vector(word) for word in np.vectorize(str)(num_lse_vector)]
        return np.average(w2v_vector, axis=0)

In [16]:
class WordEmbedding_2(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.NUMBER_OF_DIMENSIONS = 100

    def fit(self, template_seqs, y = None):
        template_seqs_filename = 'bgl_train_seqs.txt'
        template_seqs_file = path.abspath(path.join(project_root, spell_output_dir, template_seqs_filename))
        np.savetxt(template_seqs_file, template_seqs.astype(int), fmt='%i')
        self.fasttext_model = fasttext.train_unsupervised(template_seqs_file, model='cbow', minCount=1, dim=self.NUMBER_OF_DIMENSIONS)
        
        # Comment during debug to view embedding input
        os.remove(template_seqs_file)

        return self

    def transform(self, template_seqs, y = None):
        template_seqs_ = template_seqs.copy()
        template_seqs_ = np.apply_along_axis(self.average_embeddings, 1, template_seqs)

        return template_seqs_

    def average_embeddings(self, num_lse_vector):
        s2v_vector = self.fasttext_model.get_sentence_vector(' '.join(np.vectorize(str)(num_lse_vector)))
        return s2v_vector

## Evaluation

In [None]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

drain_pipe = Pipeline(steps=[('parsing', DrainParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
drain_pred_log_seq_anomaly_labels = drain_pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict(log_seqs[test])

spell_pipe = Pipeline(steps=[('parsing', SpellParser()), ('word_embedding', WordEmbedding()), ('gnb', GaussianNB())])
spell_pred_log_seq_anomaly_labels = spell_pipe.fit(log_seqs[train], log_seq_anomaly_labels[train]).predict(log_seqs[test])

print(log_seq_anomaly_labels[test])
print(drain_pred_log_seq_anomaly_labels)
print(spell_pred_log_seq_anomaly_labels)

#### Testing

In [None]:
output_df = pd.read_csv(spell_main_structured_csv_file)
nums = output_df['EventId'].to_numpy()
n = math.floor(len(nums)/WINDOW_SIZE)
nums = np.array(np.split(nums, n))

nums

In [17]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
for train, test in cv.split(log_seqs, log_seq_anomaly_labels):
    print(train.shape, test.shape)

(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)
(185784,) (46446,)


In [18]:
files = glob.glob(spell_input_dir + '/*')
for f in files:
    os.remove(f)

files = glob.glob(spell_output_dir + '/*')
for f in files:
    os.remove(f)

test_log_seqs = log_seqs[train]
test_log_seq_anomaly_labels = log_seq_anomaly_labels[train]

In [33]:
drain_parser = DrainParser()
drain_parser.fit(test_log_seqs, test_log_seq_anomaly_labels)
drain_template_seqs_output = drain_parser.transform(test_log_seqs, test_log_seq_anomaly_labels)
for templ_seq in drain_template_seqs_output:
print(type(templ_seq[0]))

[2022-11-24 15:38:21,349][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-24 15:38:21,351][INFO]: Loaded 100.0% of log lines.
[2022-11-24 15:38:21,353][INFO]: load_data() finished!
[2022-11-24 15:38:21,362][INFO]: Processed 100.0% of log lines.
[2022-11-24 15:38:21,368][INFO]: Output parse file
[2022-11-24 15:38:21,372][INFO]: Output main file for append
[2022-11-24 15:38:21,377][INFO]: lastestLindId: 100
[2022-11-24 15:38:21,386][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-24 15:38:21,388][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-24 15:38:21,389][INFO]: Store objects done.
[2022-11-24 15:38:21,390][INFO]: Parsing done. [Time taken: 0:00:00.041576]
[2022-11-24 15:38:21,399][INFO]: Parsing file: input/spell/bgl_2k_content.csv
[2022-11-24 15:38:21,401][INFO]: Loaded 100.0% of log lines.
[2022-11-24 15:38:21,403][INFO]: load_data() finished!
[2022-11-24 15:38:21,414][INFO]: Processed 100.0% of log lines.
[2022-11-24 15:38:21,420][INFO]: Output parse f

template seqs:  [[0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 2, 7, 2, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 8, 8, 8, 8, 8, 8, 8], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 5, 0, 0, 0, 0, 3, 0, 3, 0, 5, 0], [0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 3, 4, 4, 4, 4, 5, 5, 0, 4, 2, 6, 2, 6, 5, 5, 6, 7, 6, 8, 6, 6, 9, 9, 9, 10, 2, 2, 11, 5, 12, 11, 13, 13, 0, 0, 5, 7, 5, 5, 14, 14, 5, 5, 5, 14, 5, 15, 16, 7, 7, 16, 16, 7, 17, 18, 19, 11, 11, 17, 11, 7, 7, 7, 7, 7, 6, 6, 6, 20, 21, 22, 22, 22, 22, 22, 22, 22, 20, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20], [0, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2

In [None]:
spell_parser = SpellParser()
spell_parser.fit(test_log_seqs, test_log_seq_anomaly_labels)
spell_template_seqs_output = spell_parser.transform(test_log_seqs, test_log_seq_anomaly_labels)
for templ_seq in spell_template_seqs_output:
    print(templ_seq)

[2022-11-26 06:58:51,270][INFO]: Parsing file: input/spell/bgl_200506_content.csv
[2022-11-26 06:58:51,275][INFO]: Loaded 100.0% of log lines.
[2022-11-26 06:58:51,279][INFO]: load_data() finished!
[2022-11-26 06:58:51,282][INFO]: Load objects done, lastestLineId: 100
[2022-11-26 06:58:51,293][INFO]: Processed 100.0% of log lines.
[2022-11-26 06:58:51,300][INFO]: Output parse file
[2022-11-26 06:58:51,307][INFO]: lastestLindId: 100
[2022-11-26 06:58:51,316][INFO]: rootNodePath: output/spell/rootNode.pkl
[2022-11-26 06:58:51,318][INFO]: logCluLPath: output/spell/logCluL.pkl
[2022-11-26 06:58:51,321][INFO]: Store objects done.
[2022-11-26 06:58:51,323][INFO]: Parsing done. [Time taken: 0:00:00.052650]
[2022-11-26 06:58:51,329][INFO]: Parsing file: input/spell/bgl_200506_content.csv
[2022-11-26 06:58:51,333][INFO]: Loaded 100.0% of log lines.
[2022-11-26 06:58:51,336][INFO]: load_data() finished!
[2022-11-26 06:58:51,338][INFO]: Load objects done, lastestLineId: 200
[2022-11-26 06:58:51,3

In [None]:
for templ_seq in drain_template_seqs_output:
    print(type(str(templ_seq[0])))

In [None]:
template_seqs_filename = 'bgl_train_seqs.txt'
template_seqs_file = path.abspath(path.join(project_root, spell_output_dir, template_seqs_filename))
np.savetxt(template_seqs_file, drain_template_seqs_output,fmt='%s')

type(drain_template_seqs_output[0][0])

In [None]:
drain_embedding = WordEmbedding()
drain_embedding.fit(drain_template_seqs_output, log_seq_anomaly_labels[train])
drain_word_vector_avg = drain_embedding.transform(drain_template_seqs_output, log_seq_anomaly_labels[train])
for wv_avg in drain_word_vector_avg:
    print(wv_avg)

In [36]:
spell_embedding = WordEmbedding()
spell_embedding.fit(spell_template_seqs_output, log_seq_anomaly_labels[train])
spell_word_vector_avg = spell_embedding.transform(spell_template_seqs_output, log_seq_anomaly_labels[train])

for wv_avg in spell_word_vector_avg:
    print(wv_avg)

# spell_embedding = WordEmbedding_2()
# spell_embedding.fit(spell_template_seqs_output, log_seq_anomaly_labels[train])
# spell_word_vector_avg = spell_embedding.transform(spell_template_seqs_output, log_seq_anomaly_labels[train])

# for wv_avg in spell_word_vector_avg:
#     print(wv_avg)

*****************************************************************************************************


Read 0M words
Number of words:  24
Number of labels: 0


[-0.00062661 -0.00100449  0.00370418  0.00038678  0.00227279 -0.00143606
 -0.00223128 -0.004049    0.00394509  0.00098402  0.00115099  0.00123817
 -0.00401311  0.00186972 -0.00444877 -0.00429534 -0.00176497  0.00443959
 -0.00344338 -0.0059298  -0.00554004  0.00340126  0.006772   -0.00475374
 -0.00313804 -0.00455745  0.00353823 -0.00450204  0.00616771  0.00184145
  0.00056252  0.00199752  0.00200198 -0.00331916  0.00348326 -0.00140754
  0.00313814  0.00035792 -0.00516464  0.00032468  0.00594882  0.00211248
  0.00417276  0.00576983 -0.00227197  0.00149693 -0.00075591  0.00269199
  0.00360714 -0.00021159 -0.00232833 -0.00198005  0.00600783 -0.00513688
  0.00362718 -0.00390191 -0.00388863 -0.00132237 -0.00265183 -0.00426759
 -0.00479631 -0.00279707  0.002307   -0.00020779 -0.00490932  0.00570803
  0.00234922 -0.00472652 -0.0041118   0.00453423  0.00037178  0.00383281
 -0.00389498 -0.00059335  0.00707066 -0.00395696 -0.00088     0.00798378
 -0.0062622   0.00162935 -0.00306117  0.00514472 -0

Progress: 100.0% words/sec/thread:    8174 lr:  0.000000 avg.loss:  3.247241 ETA:   0h 0m 0s


In [None]:
len(spell_word_vector_avg)

In [None]:
def fastai_numericalize():
    output_df = pd.read_csv(spell_main_structured_csv_file)
    template_df = pd.read_csv(spell_templates_csv_file)

    vocab = template_df['EventId']
    text = output_df['EventId']

    num = Numericalize(vocab.to_numpy(), min_freq=1)
    num.setup()
    nums = np.array(num(text.to_numpy()))

    return nums

len(fastai_numericalize())/WINDOW_SIZE

In [None]:
def custom_numericalize():
    output_df = pd.read_csv(spell_main_structured_csv_file)
    template_df = pd.read_csv(spell_templates_csv_file)

    vocab = template_df['EventId']
    text = output_df['EventId']

    print(vocab)
    print(text)
    nums = []

    return nums

len(custom_numericalize())

In [10]:
import pickle
import boto3
from datetime import datetime

bucket_name = 'sagemaker-studio-326787221562-jycpwz9gs3f'
pickle_key = 'Output_' + datetime.now().strftime("%Y%m%d%H%M%S") + '.pkl'
print(pickle_key)

pickle_byte_obj = pickle.dumps([2, 3, 5, 4, 0])

s3_resource = boto3.resource("s3")
s3_resource.Object(bucket_name, pickle_key).put(Body=pickle_byte_obj)

Output_20221126184419.pkl


{'ResponseMetadata': {'RequestId': '3DH36RHS0KPZ2TVD',
  'HostId': 'A1kxeG99gWIuCLKMYjxKz75CslivKy51lbPXgbNlqZYE7Xt5NHXkLhfxkANbyDuOQZ74Z8SspHQ=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'A1kxeG99gWIuCLKMYjxKz75CslivKy51lbPXgbNlqZYE7Xt5NHXkLhfxkANbyDuOQZ74Z8SspHQ=',
   'x-amz-request-id': '3DH36RHS0KPZ2TVD',
   'date': 'Sat, 26 Nov 2022 18:44:20 GMT',
   'etag': '"64b91ce85c480df6c8082e73c06c137e"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"64b91ce85c480df6c8082e73c06c137e"'}