<a href="https://colab.research.google.com/github/mostafa-ja/Anomaly-detection/blob/main/semantic_vector4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Pretrained sentence-transformers models](https://www.sbert.net/docs/pretrained_models.html)

[Our chosen lightweight model on Hugging Face](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

[Convert a collection of raw documents to a matrix of TF-IDF features](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [None]:
# Install the third-party packages into the running kernel's environment.
# %pip targets the active kernel, whereas a bare shell `!pip` may resolve to a
# different Python installation.
%pip install -U sentence-transformers
%pip install stop_words

In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util  #util for importing cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data this notebook relies on: WordNet for the lemmatizer, the
# stop-word corpus, and the Punkt tokenizer models used by word_tokenize.
# 'punkt_tab' is the resource name recent NLTK releases look up; older releases
# use 'punkt', so both are requested (an unknown name is simply reported as a
# failed download and does not raise).
for resource in ("wordnet", "stopwords", "punkt", "punkt_tab"):
  nltk.download(resource, quiet=True)

# English stop words consulted by normalize_template; stored as a set so that
# membership tests are constant-time.
stop_words = set(stopwords.words("english"))

In [None]:
# Load the log-template table from the CSV file into a DataFrame.
df = pd.read_csv('/content/HDFS_templates.csv')
# Preview the first three rows as the cell output.
df.head(n=3)

In [None]:
def normalize_template(text):
  """
  Normalize a log template down to its most salient tokens.

  Runs of non-word characters are collapsed to a single space and digits are
  removed. The text is then tokenized, stop words are dropped
  case-insensitively, and each remaining token is lower-cased and lemmatized
  as a verb.

  Args:
    text: Raw template string (e.g. one entry of the 'EventTemplate' column).

  Returns:
    The surviving tokens joined by single spaces, as one string rather than a
    list of words.

  Note:
    Reads the notebook-level `stop_words` collection, which is presumably made
    of lower-case entries (verify where it is defined).
  """
  # Collapse every run of non-word characters (\W+) into one space.
  text = re.sub(r'\W+', ' ', text)
  # Drop digits. The pattern is a raw string so '\d' is not parsed as an
  # (invalid) string escape, which newer Python versions warn about.
  text = re.sub(r'\d', '', text)

  word_tokens = word_tokenize(text)

  lemmatizer = WordNetLemmatizer()
  normalized_tokens = []
  for w in word_tokens:
    lowered = w.lower()
    # Test the lower-cased form so that capitalised stop words such as "But"
    # are filtered as well; the original compared the raw token.
    if lowered in stop_words:
      continue
    normalized_tokens.append(lemmatizer.lemmatize(lowered, pos='v'))

  # Reconstruct a sentence so the output is a string, not a list of words.
  return ' '.join(normalized_tokens)

In [None]:
# Sanity-check normalize_template on a single hand-written template; the result
# is shown as the cell output.
example_sent = '<*>BLOCK* NameSystem<*>addStoredBlock: addStoredBlock request received for<*>on<*>size<*>But it does not belong to any file remove  removing removed. '
normalize_template(example_sent)

In [None]:
# Apply normalize_template to every entry of the 'EventTemplate' column.
normalized_templates = list(map(normalize_template, df['EventTemplate']))
print(normalized_templates)

In [None]:
# Explicit-loop variant: start from an empty list and add one normalized
# template per row of the 'EventTemplate' column.
normalized_templates = []
for sentence in df['EventTemplate']:
  normalized_templates += [normalize_template(sentence)]

print(normalized_templates)

In [None]:
# Build a TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the normalized templates and encode them in one step.
tfidf_features = tfidf_vectorizer.fit_transform(normalized_templates)

# Print the dimensions of the feature matrix. The matrix returned by
# fit_transform exposes .shape directly, so it is not converted to a dense
# array just to read its size.
print(tfidf_features.shape)

In [None]:
# Display the vocabulary_ attribute of the fitted vectorizer as the cell output.
tfidf_vectorizer.vocabulary_