<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/semantic_vector6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import pandas as pd

In [4]:
# Read log templates file into a DataFrame
df = pd.read_csv('/content/HDFS_templates.csv')
df.head(3)

Unnamed: 0,EventId,EventTemplate
0,E1,<*>Adding an already existing block<*>
1,E2,<*>Verification succeeded for<*>
2,E3,<*>Served block<*>to<*>


In [None]:
# we keep some stop words such as on, over, not, .. which can have significant meaning
stop_words = {
    'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
    "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don',
    "don't", 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me',
    'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'nor', 'now', 'o',
    'of', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're', 's', 'same', 'shan',
    "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
    'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this',
    'those', 'through', 'to', 'too', 'until', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren',
    "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', "won't",
    'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
    'yourselves'
}

# Pre-compiling the regular expression pattern using re.compile() can improve the performance of the regular expression operations
pattern = re.compile(r'\W+|\d')

In [5]:
def tokenized(text):
    """
    Normalize text to extract most salient tokens
    """
    # Replace special characters with space and remove digits
    text = pattern.sub(' ', text)

    # Convert camel case to snake case, then replace _ with space
    text = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', text)
    text = re.sub('([a-z0-9])([A-Z])', r'\1_\2', text).lower().replace('_', ' ')

    normalized_tokens = [w for w in text.split() if w not in stop_words]

    # Return the filtered sentence, our output will be sentences not a list of words
    return ' '.join(normalized_tokens)


In [6]:
tokenized_template = [tokenized(sentence) for sentence in df['EventTemplate'] ]
print(tokenized_template)

['adding already existing block', 'verification succeeded', 'served block', 'got exception serving', 'receiving block src dest', 'received block src dest size', 'write block received exception', 'packet responder block interrupted', 'received block size', 'packet responder exception', 'packet responder block terminating', 'exception writing block mirror', 'receiving empty packet block', 'exception receive block block', 'changing block file offset block meta file offset', 'transmitted block', 'failed transfer got', 'starting thread transfer block', 'reopen block', 'unexpected error trying delete block block info not found volume map', 'deleting block file', 'block name system allocate block', 'block name system delete added invalid set', 'block removing block needed replications not belong file', 'block ask replicate', 'block name system add stored block block map updated added size', 'block name system add stored block redundant add stored block request received on size', 'block name s

# **PART1 : Using light sentence-transformers model**

In [8]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [9]:
templates2vec = [model.encode(sentence) for sentence in tokenized_template]
templates2vec.shape

In [19]:
similarities = cosine_similarity(templates2vec[0].reshape(1, -1), templates2vec[:30]) # similarities.shape = (1, 30)
for i, similarity in enumerate(similarities[0]):
    print("Similarity:", similarity)
    print(tokenized_template[0])
    print(tokenized_template[i])
    print('------------------------------------')


Similarity: 1.0000002
adding already existing block
adding already existing block
------------------------------------
Similarity: 0.021313272
adding already existing block
verification succeeded
------------------------------------
Similarity: 0.309552
adding already existing block
served block
------------------------------------
Similarity: 0.008829558
adding already existing block
got exception serving
------------------------------------
Similarity: 0.34241754
adding already existing block
receiving block src dest
------------------------------------
Similarity: 0.293378
adding already existing block
received block src dest size
------------------------------------
Similarity: 0.38903505
adding already existing block
write block received exception
------------------------------------
Similarity: 0.2293287
adding already existing block
packet responder block interrupted
------------------------------------
Similarity: 0.35335255
adding already existing block
received block size
---

In [26]:

# Compute pairwise similarities
similarities = cosine_similarity(templates2vec)
similarities = np.triu(similarities, k=1)  # Exclude diagonal and lower triangular elements, change them into zero

# Find indices where similarity exceeds threshold
indices = np.where(similarities > 0.75)

# Iterate over similar template pairs
for i, j in zip(indices[0], indices[1]):
    similarity = similarities[i, j]
    print('similarity =', similarity)
    print('template', i, ':', tokenized_template[i])
    print('template', j, ':', tokenized_template[j])
    print('--------------------------------')


similarity = 0.78556585
template 4 : receiving block src dest
template 5 : received block src dest size
--------------------------------
similarity = 0.81611705
template 5 : received block src dest size
template 8 : received block size
--------------------------------
similarity = 0.87190074
template 7 : packet responder block interrupted
template 10 : packet responder block terminating
--------------------------------
similarity = 0.7676006
template 25 : block name system add stored block block map updated added size
template 26 : block name system add stored block redundant add stored block request received on size
--------------------------------
similarity = 0.8484187
template 26 : block name system add stored block redundant add stored block request received on size
template 27 : block name system add stored block add stored block request received on size not belong file
--------------------------------


# **PART2 : using word2vec and tff-idf**

In [33]:
import gensim.downloader

In [34]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [35]:
# Download the 'word2vec-google-news-300' embeddings
word2vec = gensim.downloader.load('word2vec-google-news-300')



In [36]:
word2vec.most_similar('add')

[('Adding', 0.578197181224823),
 ('Add', 0.5681695938110352),
 ('adds', 0.5657491683959961),
 ('bring', 0.5600515604019165),
 ('augment', 0.5254836678504944),
 ('create', 0.521090567111969),
 ('incorporate', 0.504277765750885),
 ('expand', 0.49926939606666565),
 ('combine', 0.49364253878593445),
 ('elevate', 0.4872509241104126)]

In [46]:
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(tokenized_template)

# Access the TF-IDF feature matrix
print(tfidf_features.toarray().shape)

(30, 66)


In [47]:
# Get the vocabulary dictionary and word indices
dic = tfidf_vectorizer.vocabulary_

matrix_weight = tfidf_features.toarray()

embedding_shape = word2vec.get_vector('word').shape
num_templates = len(tokenized_template)
templates2vec = np.zeros((num_templates, embedding_shape[0]))

missing_vectors = [] # Collect words without proper vectors
for i, sentence in enumerate(tokenized_template):
    vector = np.zeros(embedding_shape)
    for word in sentence.split():
        j = dic.get(word)  # If the key is not present, dic.get(word)(or dic.get(word, default_value)) will return None (or any default value you provide), while dic[word] will raise a KeyError if the key is not found.
        if j is not None:
            vector += matrix_weight[i, j] * word2vec.get_vector(word)
        else:
            missing_vectors.append(word)
    templates2vec[i] = vector

In [48]:
tokenized_template[1]

'verification succeeded'

In [49]:
matrix_weight[1]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.70710678, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.70710678, 0.        , 0.        ,
       0.        ])

In [50]:
similarities = cosine_similarity(templates2vec[0].reshape(1, -1), templates2vec[:30]) # similarities.shape = (1, 30)
for i, similarity in enumerate(similarities[0]):
    print("Similarity:", similarity)
    print(tokenized_template[0])
    print(tokenized_template[i])
    print('------------------------------------')

Similarity: 1.0000000000000004
adding already existing block
adding already existing block
------------------------------------
Similarity: 0.21925066637897228
adding already existing block
verification succeeded
------------------------------------
Similarity: 0.1504715362121909
adding already existing block
served block
------------------------------------
Similarity: 0.24609330762729548
adding already existing block
got exception serving
------------------------------------
Similarity: 0.15985740844471213
adding already existing block
receiving block src dest
------------------------------------
Similarity: 0.1781226566225514
adding already existing block
received block src dest size
------------------------------------
Similarity: 0.24656939608111597
adding already existing block
write block received exception
------------------------------------
Similarity: 0.1065335677729374
adding already existing block
packet responder block interrupted
------------------------------------
Simi

In [51]:
# Compute pairwise similarities
similarities = cosine_similarity(templates2vec)
similarities = np.triu(similarities, k=1)  # Exclude diagonal and lower triangular elements, change them into zero

# Find indices where similarity exceeds threshold
indices = np.where(similarities > 0.75)

# Iterate over similar template pairs
for i, j in zip(indices[0], indices[1]):
    similarity = similarities[i, j]
    print('similarity =', similarity)
    print('template', i, ':', tokenized_template[i])
    print('template', j, ':', tokenized_template[j])
    print('--------------------------------')

similarity = 0.8470709133866174
template 4 : receiving block src dest
template 5 : received block src dest size
--------------------------------
similarity = 0.7563605133545468
template 7 : packet responder block interrupted
template 9 : packet responder exception
--------------------------------
similarity = 0.7578512705674404
template 9 : packet responder exception
template 10 : packet responder block terminating
--------------------------------
similarity = 0.7589474554165527
template 21 : block name system allocate block
template 25 : block name system add stored block block map updated added size
--------------------------------
similarity = 0.8244604201949224
template 25 : block name system add stored block block map updated added size
template 26 : block name system add stored block redundant add stored block request received on size
--------------------------------
similarity = 0.830150535591593
template 25 : block name system add stored block block map updated added size
templ