<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/semantic_vector4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Pretrained sentence-transformers models](https://www.sbert.net/docs/pretrained_models.html)

[Our chosen light model in hugging face](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

[ Convert a collection of raw documents to a matrix of TF-IDF features ](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
)

In [7]:
!pip install -U sentence-transformers




In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util  #util for importing cosine similarity
import numpy as np
import re
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Read log templates file into a DataFrame
df = pd.read_csv('/content/HDFS_templates.csv')
df.head(3)

Unnamed: 0,EventId,EventTemplate
0,E1,<*>Adding an already existing block<*>
1,E2,<*>Verification succeeded for<*>
2,E3,<*>Served block<*>to<*>


In [17]:
stop_words = set(stopwords.words('english'))

def normalize_template(text):
  """
  Normalize text to extract most salient tokens
  """
  # replace special characters with space and remove digits
  text = re.sub(r'\W+', ' ', text) # replaces one or more non-alphanumeric characters (\W+) with a single space in the text
  text = re.sub('\d', '', text)    #  replaces any digit (\d) with an empty string in the text


  word_tokens = word_tokenize(text)

  # converts the words in word_tokens to lower case and then checks whether
  #they are present in stop_words or not
  lemmatizer = WordNetLemmatizer()
  normalized_tokens = [lemmatizer.lemmatize(w.lower(), pos='v') for w in word_tokens if w not in stop_words]

  # Reconstruct the sentence
  filtered_sentence = ' '.join(normalized_tokens) # our output will be sentences not a list of words
  return filtered_sentence

In [18]:
example_sent = '<*>BLOCK* NameSystem<*>addStoredBlock: addStoredBlock request received for<*>on<*>size<*>But it does not belong to any file remove  removing removed. '
normalize_template(example_sent)

'block namesystem addstoredblock addstoredblock request receive size but belong file remove remove remove'

In [21]:
normalized_templates = [normalize_template(sentence) for sentence in df['EventTemplate'] ]
print(normalized_templates)

['add already exist block', 'verification succeed', 'serve block', 'get exception serve', 'receive block src dest', 'receive block src dest size', 'writeblock receive exception', 'packetresponder block interrupt', 'receive block size', 'packetresponder exception', 'packetresponder block terminate', 'exception write block mirror', 'receive empty packet block', 'exception receiveblock block', 'change block file offset block meta file offset', 'transmit block', 'fail transfer get', 'start thread transfer block', 'reopen block', 'unexpected error try delete block blockinfo find volumemap', 'delete block file', 'block namesystem allocateblock', 'block namesystem delete add invalidset', 'block remove block neededreplications belong file', 'block ask replicate', 'block namesystem addstoredblock blockmap update add size', 'block namesystem addstoredblock redundant addstoredblock request receive size', 'block namesystem addstoredblock addstoredblock request receive size but belong file', 'pendi

In [22]:
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Compute TF-IDF features
tfidf_features = tfidf_vectorizer.fit_transform(normalized_templates)

# Access the TF-IDF feature matrix
print(tfidf_features.toarray().shape)

(30, 55)


In [23]:
tfidf_vectorizer.vocabulary_

{'add': 0,
 'already': 3,
 'exist': 16,
 'block': 6,
 'verification': 51,
 'succeed': 42,
 'serve': 38,
 'get': 20,
 'exception': 15,
 'receive': 31,
 'src': 40,
 'dest': 12,
 'size': 39,
 'writeblock': 54,
 'packetresponder': 29,
 'interrupt': 21,
 'terminate': 43,
 'write': 53,
 'mirror': 24,
 'empty': 13,
 'packet': 28,
 'receiveblock': 32,
 'change': 10,
 'file': 18,
 'offset': 27,
 'meta': 23,
 'transmit': 47,
 'fail': 17,
 'transfer': 46,
 'start': 41,
 'thread': 44,
 'reopen': 35,
 'unexpected': 49,
 'error': 14,
 'try': 48,
 'delete': 11,
 'blockinfo': 7,
 'find': 19,
 'volumemap': 52,
 'namesystem': 25,
 'allocateblock': 2,
 'invalidset': 22,
 'remove': 34,
 'neededreplications': 26,
 'belong': 5,
 'ask': 4,
 'replicate': 36,
 'addstoredblock': 1,
 'blockmap': 8,
 'update': 50,
 'redundant': 33,
 'request': 37,
 'but': 9,
 'pendingreplicationmonitor': 30,
 'time': 45}

In [25]:
normalized_templates[0]

'add already exist block'

In [32]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [31]:
dic = tfidf_vectorizer.vocabulary_
matrix_weight = tfidf_features.toarray()

dic_templates = {}

for i, sentence in enumerate(normalized_templates):
  vector = np.zeros(384) # same shape of embedding word : model.encode(word).shape
  for word in sentence.split():
    j = dic[word]
    vector += matrix_weight[i,j] * model.encode(word)
  dic_templates[i] = vector

In [33]:
normalized_templates[0]

'add already exist block'

In [34]:
matrix_weight[0]

array([0.49016957, 0.        , 0.        , 0.6016505 , 0.        ,
       0.        , 0.18912202, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.6016505 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [36]:
dic_templates[0].shape

(384,)

In [37]:
for i in range(30):
  print("Similarity:", util.cos_sim(dic_templates[0], dic_templates[i]))
  print(normalized_templates[0])
  print(normalized_templates[i])
  print('------------------------------------')

Similarity: tensor([[1.0000]], dtype=torch.float64)
add already exist block
add already exist block
------------------------------------
Similarity: tensor([[0.4779]], dtype=torch.float64)
add already exist block
verification succeed
------------------------------------
Similarity: tensor([[0.4161]], dtype=torch.float64)
add already exist block
serve block
------------------------------------
Similarity: tensor([[0.5236]], dtype=torch.float64)
add already exist block
get exception serve
------------------------------------
Similarity: tensor([[0.5459]], dtype=torch.float64)
add already exist block
receive block src dest
------------------------------------
Similarity: tensor([[0.5706]], dtype=torch.float64)
add already exist block
receive block src dest size
------------------------------------
Similarity: tensor([[0.5051]], dtype=torch.float64)
add already exist block
writeblock receive exception
------------------------------------
Similarity: tensor([[0.3108]], dtype=torch.float64)


In [39]:
len(normalized_templates)

30

In [47]:
max_similarity = 0

for i in range(len(normalized_templates)):
  for j in range(len(normalized_templates)):
    if i != j :
      similarity = util.cos_sim(dic_templates[i], dic_templates[j])
      if similarity > max_similarity :
        max_similarity = similarity
        index = (i,j)

print(normalized_templates[index[0]])
print(normalized_templates[index[1]])
print("Similarity:", util.cos_sim(dic_templates[index[0]], dic_templates[index[1]]))

receive block src dest
receive block src dest size
Similarity: tensor([[0.9561]], dtype=torch.float64)
