<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/LogADEmpirical2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate_embeddings

In [2]:
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'


--2023-07-22 08:39:40--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.65.229.46, 18.65.229.89, 18.65.229.121, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.65.229.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2023-07-22 08:40:25 (32.6 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]



In [3]:
!unzip "/content/crawl-300d-2M.vec.zip" -d "/content/"

Archive:  /content/crawl-300d-2M.vec.zip
  inflating: /content/crawl-300d-2M.vec  


In [4]:
import sys

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from typing import List
from time import time
import json

In [9]:
# Read the HDFS log-template table and keep the 'EventTemplate' column as a plain list.
template_df = pd.read_csv('/content/HDFS.log_templates.csv')
templates = list(template_df['EventTemplate'])

# Preview the first five templates, then the total number of templates.
print(templates[:5], len(templates), sep='\n')

['Receiving block <*> src: <*> dest: <*>', 'BLOCK* NameSystem.allocateBlock: <*> <*>', 'PacketResponder <*> for block <*> <*>', 'Received block <*> of size <*> from <*>', 'BLOCK* NameSystem.addStoredBlock: blockMap updated: <*> is added to <*> size <*>']
48


In [11]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it
# when the stop-word set is built in the model-loading cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Load the fastText vectors and build the shared helpers used by the cleaning functions:
#   word2vec_model - word -> vector lookup
#   stop_words     - English stop words from NLTK
#   tokenizer      - splits text into runs of word characters (\w+)
print("Loading word2vec model...")
st = time()
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    './crawl-300d-2M.vec',
    binary=False,
)
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
print(f"Loaded word2vec model in {time() - st:.2f} seconds")

Loading word2vec model...
Loaded word2vec model in 0.00 seconds


In [13]:
# EXAMPLE
import re

# r'\w+' matches maximal runs of word characters (letters, digits, underscore).
# Every such run becomes one token; anything else (punctuation, spaces) only
# separates tokens and is dropped.

text = "The quick:brown_fox_jumps dog@19 #x!sd877 "
matches = [found.group() for found in re.finditer(r'\w+', text)]
print(matches)

['The', 'quick', 'brown_fox_jumps', 'dog', '19', 'x', 'sd877']


In [21]:
import re

template = 'Receiving block NameSystem.allocateBlock PacketResponder'


def replace_uppercase_with_space(match):
    """Return the matched text prefixed with a space, printing the match for inspection.

    Used as the replacement callback of re.sub: it receives one re.Match per
    uppercase letter found and returns the string that replaces it.
    """
    matched_text = match.group(0)  # group(0) and group() are the same whole match
    print("x:", match)
    print("x.group():", match.group())
    print("x.group(0):", matched_text)
    return " " + matched_text


# Insert a space in front of every uppercase letter (splits camelCase words).
template = re.sub('[A-Z]', replace_uppercase_with_space, template)
print("Result:", template)


x: <re.Match object; span=(0, 1), match='R'>
x.group(): R
x.group(0): R
x: <re.Match object; span=(16, 17), match='N'>
x.group(): N
x.group(0): N
x: <re.Match object; span=(20, 21), match='S'>
x.group(): S
x.group(0): S
x: <re.Match object; span=(35, 36), match='B'>
x.group(): B
x.group(0): B
x: <re.Match object; span=(41, 42), match='P'>
x.group(): P
x.group(0): P
x: <re.Match object; span=(47, 48), match='R'>
x.group(): R
x.group(0): R
Result:  Receiving block  Name System.allocate Block  Packet Responder


In [22]:
# Same camel-case split as above, without a callback: \g<0> in the replacement
# stands for the whole match, so each uppercase letter is re-emitted after a space.
template = 'Receiving block NameSystem.allocateBlock PacketResponder  '
template = re.sub('[A-Z]', r' \g<0>', template)
print(template)

 Receiving block  Name System.allocate Block  Packet Responder  


In [31]:
# The \w+ tokenizer keeps only runs of word characters, so the hyphen, the dot and
# the repeated/trailing spaces disappear; joining with single spaces normalises the text.
template = tokenizer.tokenize('Receiving-block  Name System.allocate Responder swq ')
template_clean = " ".join(template)
print(template, template_clean, sep='\n')


['Receiving', 'block', 'Name', 'System', 'allocate', 'Responder', 'swq']
Receiving block Name System allocate Responder swq


In [66]:
# remove stop words and punctuation, split by camel case
def clean_template(template: str, remove_stop_words: bool = True):
    """
    Normalise a log template into a space-separated string of lower-case words.

    Steps: lower-case fully upper-case tokens, split camelCase, tokenize on \\w+
    (which drops punctuation such as '<*>' and ':'), drop pure-digit tokens,
    lower-case everything and optionally drop English stop words.

    Parameters
    ----------
    template: raw log template
    remove_stop_words: when True, tokens found in the module-level `stop_words` set are removed

    Returns
    -------
    template_clean: the cleaned template as a single string
    """
    # Fully upper-case tokens (e.g. "BLOCK*") are lower-cased first; otherwise the
    # camel-case rule below would break them into single letters.
    template = " ".join(word.lower() if word.isupper() else word for word in template.split())

    # camel case: insert a space before each uppercase letter ("allocateBlock" -> "allocate Block")
    template = re.sub('[A-Z]', lambda x: " " + x.group(0), template)

    # Tokenize, drop pure-digit tokens, and lower-case BEFORE the stop-word test.
    # The stop-word list is lower-case, so a capitalised token produced by the
    # camel-case split (e.g. "For" from "waitFor") must be lower-cased to be recognised.
    word_tokens = [w.lower() for w in tokenizer.tokenize(template) if not w.isdigit()]

    if remove_stop_words:  # optional: pass remove_stop_words=False to keep stop words
        word_tokens = [w for w in word_tokens if w not in stop_words]

    return " ".join(word_tokens)  # return string


In [33]:
def log_key2vec(log_template: str, weight: List[float] = None):
    """
    Get word vec of words in log key, using weight
    Parameters
    ----------
    log_template: cleaned template; it is split on whitespace into words
    weight: optional per-word weights aligned with the words of log_template;
        None or an empty sequence means a uniform weight of 1

    Returns
    -------
    log_template_vec: list of word vec (one weighted vector per word found in the
        pre-trained model). When no word is found, a list holding a single zero
        vector is returned so that np.mean(..., axis=0) still yields a full-size
        vector instead of collapsing to a scalar.
    """
    # split() without arguments already ignores leading/trailing whitespace
    words = log_template.split()

    # Explicit None/length check: `not weight` is ambiguous for numpy arrays
    if weight is None or len(weight) == 0:
        weight = [1] * len(words)

    log_template_vec = []
    # zip stops at the shorter sequence: extra weights are ignored and words
    # without a weight are skipped.
    for word, word_weight in zip(words, weight):
        try:
            log_template_vec.append(word2vec_model[word] * word_weight)
        except KeyError:
            # word is not in the pre-trained word vector dictionary: skip it
            continue

    if not log_template_vec:
        # No known word at all: fall back to one zero vector of the model's dimension
        # (300 if the model object does not expose vector_size).
        log_template_vec = [np.zeros(getattr(word2vec_model, 'vector_size', 300))]
    return log_template_vec

In [44]:
# Looking up a word that is missing from the pre-trained vectors raises an exception.
# Catching it keeps the cell running; binding it to `e` lets us show its message.
try:
    word2vec_model['swq']
except Exception as e:  # `e` holds the exception so it can be printed
    print(e, 'error', sep='\n')

"Key 'swq' not present"
error


In [45]:
try:
    word2vec_model['swq']
except Exception as _:  # `_` signals that the exception object is deliberately unused
    print('error')

error


In [40]:
# Number of word vectors returned for the template, then the size of one vector.
template_clean = 'Receiving block Name System allocate Responder swq'
word_vectors = log_key2vec(template_clean)
print(len(word_vectors), len(word_vectors[0]), sep='\n')

6
300


In [68]:
# Careful: the loop variable `template` must not be confused with the function clean_template().
# Note that this cell replaces `templates` with a three-item sample.
templates = ['Receiving block <*> src: <*> dest: <*>', 'BLOCK* NameSystem.allocateBlock: <*> <*>', 'PacketResponder <*> for block <*> <*>']
cleaned_templates = list(map(clean_template, templates))

# Embedding of a template = element-wise mean of the vectors of its cleaned words,
# stored under the raw template text.
embeddings = {}
for template, cleaned_template in zip(templates, cleaned_templates):
    print(template)
    word_vectors = log_key2vec(cleaned_template)
    embeddings[template] = np.mean(word_vectors, axis=0).tolist()


Receiving block <*> src: <*> dest: <*>
BLOCK* NameSystem.allocateBlock: <*> <*>
PacketResponder <*> for block <*> <*>
Received block <*> of size <*> from <*>


In [69]:
# Sanity check: length of the embedding stored for the first sample template.
len(embeddings['Receiving block <*> src: <*> dest: <*>'])

300

In [79]:
# CountVectorizer learns the vocabulary of the cleaned templates and builds the
# document-term matrix: one row per cleaned template, one column per vocabulary word,
# each value being how often that word occurs in that template.

templates = ['Receiving block <*> src: <*> dest: <*>', 'BLOCK* NameSystem.allocateBlock: <*> <*>', 'PacketResponder <*> for block <*> <*>','Receiving block PacketResponder']
cleaned_templates = list(map(clean_template, templates))
print(cleaned_templates)

vectorizer = CountVectorizer()
transformer = TfidfTransformer()  # created here, not applied in this cell
X = vectorizer.fit_transform(cleaned_templates)

# word -> column index, then the sparse matrix, then its dense form
print(vectorizer.vocabulary_, X, X.toarray(), sep='\n')

['receiving block src dest', 'block name system allocate block', 'packet responder block', 'receiving block packet responder']
{'receiving': 5, 'block': 1, 'src': 7, 'dest': 2, 'name': 3, 'system': 8, 'allocate': 0, 'packet': 4, 'responder': 6}
  (0, 5)	1
  (0, 1)	1
  (0, 7)	1
  (0, 2)	1
  (1, 1)	2
  (1, 3)	1
  (1, 8)	1
  (1, 0)	1
  (2, 1)	1
  (2, 4)	1
  (2, 6)	1
  (3, 5)	1
  (3, 1)	1
  (3, 4)	1
  (3, 6)	1
[[0 1 1 0 0 1 0 1 0]
 [1 2 0 1 0 0 0 0 1]
 [0 1 0 0 1 0 1 0 0]
 [0 1 0 0 1 1 1 0 0]]


In [None]:
# TF-IDF weighted embeddings for a small sample of templates.
# Results are written into the existing `embeddings` dict, keyed by the raw template.
templates = ['Receiving block <*> src: <*> dest: <*>', 'BLOCK* NameSystem.allocateBlock: <*> <*>', 'PacketResponder <*> for block <*> <*>']
cleaned_templates = [clean_template(template) for template in templates]

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
X = vectorizer.fit_transform(cleaned_templates)
tfidf = transformer.fit_transform(X).toarray()

# vocabulary_ maps word -> column index. It replaces get_feature_names(), which was
# removed in scikit-learn 1.2, and avoids a linear list search per word.
vocabulary = vectorizer.vocabulary_

# Pair each cleaned template with its raw text; the words looked up must be the
# cleaned ones, because the vocabulary was learned from the cleaned templates.
for i, (template, k) in enumerate(zip(cleaned_templates, templates)):
    # Build the weights afresh for every template so they stay aligned with its words.
    # Words the vectorizer did not keep (e.g. single-character tokens) get weight 0.
    single_weights = [
        tfidf[i][vocabulary[word]] if word in vocabulary else 0
        for word in template.split()
    ]
    embeddings[k] = np.mean(log_key2vec(template, single_weights), axis=0).tolist()

In [None]:
def generate_embeddings_fasttext(templates: List[str], strategy: str = 'average') -> dict:
    """
    Generate embeddings for templates using fasttext
    Parameters
    ----------
    templates: list of raw log templates
    strategy: 'average' (plain mean of the word vectors) or
        'tfidf' (word vectors scaled by their TF-IDF weight before averaging)

    Returns
    -------
    embeddings: dict mapping each raw template to np.mean(word vectors, axis=0).tolist()

    Raises
    ------
    ValueError: if strategy is neither 'average' nor 'tfidf'
    """
    clean_templates = [clean_template(template) for template in templates]
    embeddings = {}

    if strategy == 'average':
        for cleaned, raw in zip(clean_templates, templates):
            embeddings[raw] = np.mean(log_key2vec(cleaned), axis=0).tolist()
    elif strategy == 'tfidf':
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        counts = vectorizer.fit_transform(clean_templates)
        tfidf = transformer.fit_transform(counts).toarray()

        # vocabulary_ maps word -> column index. It replaces get_feature_names(), which
        # was removed in scikit-learn 1.2, and avoids a linear list search per word.
        vocabulary = vectorizer.vocabulary_

        for i, (cleaned, raw) in enumerate(zip(clean_templates, templates)):
            # Weights are rebuilt for every template: a list shared across templates
            # would keep growing and misalign weights with words from the second
            # template onwards.
            # Words the vectorizer did not keep (e.g. single-character tokens) get weight 0.
            single_weights = [
                tfidf[i][vocabulary[word]] if word in vocabulary else 0
                for word in cleaned.split()
            ]
            embeddings[raw] = np.mean(log_key2vec(cleaned, single_weights), axis=0).tolist()
    else:
        raise ValueError('Invalid strategy')

    return embeddings
