In [231]:
!pip -q install transformers

In [4]:
# Mount Google Drive into the Colab filesystem and switch the working
# directory to the project folder, so the relative file names used by later
# cells ('event_data.json', 'embedded_event_data2.csv') resolve inside it.
from google.colab import drive
drive.mount("/content/drive/")
%cd /content/drive/MyDrive/event

Mounted at /content/drive/


In [203]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from transformers import AutoConfig, AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel
import torch
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import json
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None

In [216]:
# Load the pretrained tokenizer and encoder from the same checkpoint; they are
# used further down to embed the cleaned event text.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# Load the list of event records (one dict per event) from the JSON file.
# The encoding is stated explicitly so that decoding does not depend on the
# platform's locale default.
with open('event_data.json', 'r', encoding='utf-8') as json_file:
    loaded_event_data = json.load(json_file)


In [141]:
def _count_distinct(values):
    """Return how many distinct items ``values`` contains.

    Hashable items are counted with a set, which is linear in the number of
    items. If any item is unhashable (for example a list or dict decoded from
    JSON) the function falls back to an equality-based scan, which is slower
    but works for any value.
    """
    values = list(values)
    try:
        return len(set(values))
    except TypeError:
        distinct = []
        for value in values:
            if value not in distinct:
                distinct.append(value)
        return len(distinct)


# Find the number of unique values of each feature. Feature names are taken
# from the first record; every record is expected to carry the same keys.
for Key_name in loaded_event_data[0].keys():
    n_unique = _count_distinct(record[Key_name] for record in loaded_event_data)
    print(f" '{Key_name}' unique values : {n_unique}")


 'EventID' unique values : 106
 'Level' unique values : 4
 'EventRecordID' unique values : 34956
 'Task' unique values : 34
 'Keywords' unique values : 21
 'TimeCreated' unique values : 34381
 'ProcessID' unique values : 1743
 'ThreadID' unique values : 3869
 'SecurityUserID' unique values : 5
 'DataValues' unique values : 5160


In [144]:
# Build the feature table: one column per key of the first record, one row per
# event record, in the original record order.
feature_names = list(loaded_event_data[0].keys())
df = pd.DataFrame(
    {
        name: [record[name] for record in loaded_event_data]
        for name in feature_names
    }
)


In [213]:
def get_top_ngrams(record_list, n, top_k=10):
    """
    Rank the n-grams of a corpus by their summed TF-IDF weight.

    A scikit-learn TfidfVectorizer is fitted on ``record_list`` using
    whitespace-delimited, case-sensitive tokens and no IDF smoothing. The
    weight of every n-gram is then summed over all records and the n-grams
    with the largest totals are returned.

    :param list record_list: the list of all log records (one string each)
    :param int n: the number n of n-grams. 3 would rank trigrams.
    :param int top_k: how many n-grams to return (default 10)
    :return: at most ``top_k`` ``(ngram, summed_weight)`` tuples, highest
        weight first
    :rtype: list
    :raises ValueError: if ``top_k`` is negative
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    tfidf_vectorizer = TfidfVectorizer(ngram_range=(n, n), lowercase=False, token_pattern=r'\S+', smooth_idf=False)
    bag_of_words = tfidf_vectorizer.fit_transform(record_list)

    # Column-wise sum: a single row holding the total weight of every n-gram.
    sum_words = bag_of_words.sum(axis=0)

    # vocabulary_ maps each n-gram to its column index in the matrix.
    words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]


In [191]:
def clean_text(input_text):
    """
    Reduce a raw text field to its plain alphanumeric words.

    Steps, in order:
      1. delete every comma and colon (no space is substituted);
      2. delete placeholder tokens of the form ``param<digits>``;
      3. keep only whitespace-separated words that are purely alphanumeric;
      4. delete every occurrence of the literal substring "jahannama";
      5. delete every word that contains a run of ten or more zeros;
      6. collapse the whitespace gaps left behind by steps 4 and 5.

    :param input_text: the raw text; ``None`` is treated as empty text
    :return: the remaining words joined by single spaces, without leading or
        trailing whitespace
    """
    if input_text is None:
        return ""

    # Remove commas and colons
    cleaned_text = input_text.replace(",", "").replace(":", "")

    # Remove placeholder words of the form "param<digits>" (e.g. "param1")
    cleaned_text = re.sub(r'\bparam\d+\b', '', cleaned_text)

    # Remove unintelligible words (anything containing non-alphanumeric characters)
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word.isalnum())
    cleaned_text = cleaned_text.replace("jahannama", "")
    cleaned_text = re.sub(r'\b\w*0{10,}\w*\b', '', cleaned_text)

    # The two removals above run after the words were re-joined, so they can
    # leave doubled, leading or trailing spaces; normalise once more.
    return ' '.join(cleaned_text.split())

# Clean the "DataValues" text of every event record, preserving record order;
# tqdm shows progress while the records are processed.
data_list = [
    clean_text(record["DataValues"])
    for record in tqdm(loaded_event_data)
]


100%|██████████| 34956/34956 [00:00<00:00, 41549.02it/s]


In [200]:
# Fit a TF-IDF vectorizer with default settings on the cleaned texts.
vec = TfidfVectorizer()
X = vec.fit_transform(data_list)
# Expose the weights as a DataFrame whose columns are the vocabulary terms.
# NOTE(review): toarray() materialises the full dense matrix
# (len(data_list) rows x vocabulary size) — confirm this fits in memory.
tfidf = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

In [232]:
# Show the highest-weighted bigrams of the cleaned event texts.
get_top_ngrams(data_list, n=2)

[('Local Activation', 1673.1629965212733),
 ('Activation LocalHost', 1642.9419731605637),
 ('demand start', 1130.770319622504),
 ('auto start', 1129.4434772972743),
 ('Background Intelligent', 1121.616750826648),
 ('Intelligent Transfer', 1121.616750826648),
 ('Transfer Service', 1121.616750826648),
 ('start BITS', 1121.5774484992417),
 ('HiveName KeysUpdated', 981.7839480079349),
 ('DirtyPages 1', 853.9226740065247)]

In [218]:
print('Encoding Data text')
title_encoding = []
model.eval()  # evaluation mode: disables training-only behaviour such as dropout

# Embeddings are only read, never back-propagated through, so gradient
# tracking is switched off; otherwise every forward pass records an autograd
# graph, which costs memory and time for no benefit.
with torch.no_grad():
    for text in tqdm(data_list):
        # truncation=True caps the input at the model's maximum sequence
        # length; a longer text would otherwise make the forward pass fail.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        # One text per call, so row 0 of pooler_output is this text's vector;
        # it is stored as a float64 NumPy array.
        title_encoding.append(output.pooler_output[0].numpy().astype('float64'))

# Attach the embeddings as a new column (one array per row).
df.insert(loc=10, column='data_content_bert_embedding', value=title_encoding)


Encoding Data text


100%|██████████| 34956/34956 [1:13:01<00:00,  7.98it/s]


In [257]:
# Convert string columns to integers and normalise the datetime column to a
# second-resolution 'YYYY-MM-DD HH:MM:SS' string.
df['EventID'] = df['EventID'].astype(int)
df['Level'] = df['Level'].astype(int)
df['EventRecordID'] = df['EventRecordID'].astype(int)
df['Task'] = df['Task'].astype(int)
df['TimeCreated'] = pd.to_datetime(df['TimeCreated'])
df['TimeCreated'] = df['TimeCreated'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Create interaction feature.
# Both operands are coerced to numbers first: if they arrive as strings (like
# the columns converted above) a plain `*` would raise instead of multiplying.
# pd.to_numeric leaves columns that are already numeric unchanged.
df['Interaction'] = pd.to_numeric(df['ProcessID']) * pd.to_numeric(df['ThreadID'])

# Encode categorical variables (if any)
# Example: One-hot encoding for 'SecurityUserID'
df = pd.get_dummies(df, columns=['SecurityUserID'])

# Scale numeric features to zero mean and unit variance
scaler = StandardScaler()
numeric_columns = ['EventID', 'Level', 'EventRecordID', "Interaction", "Task"]
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Drop the raw columns that have been replaced by engineered features.
df = df.drop(columns=["ProcessID", "ThreadID", "Keywords", "DataValues"])
# "Unnamed: 0" only exists when the frame was re-read from a CSV that was
# saved together with its index; a frame built directly from the JSON records
# does not have it, so its absence must not abort the cell.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Give the one-hot SecurityUserID columns short names; labels that are not
# present in the frame are simply left out by rename().
df = df.rename(columns={"SecurityUserID_S-1-5-18":"Security1", "SecurityUserID_S-1-5-19":"Security2", "SecurityUserID_S-1-5-20":"Security3", "SecurityUserID_S-1-5-21-4294070095-3813609185-3657462159-1001":"Security4", })

In [259]:
# Preview the first rows of the engineered feature table.
df.head()

Unnamed: 0,EventID,Level,EventRecordID,Task,TimeCreated,data_content_bert_embedding,Interaction,Security1,Security2,Security3,Security4
0,0.580343,-1.292298,-1.732001,-0.267517,2023-11-03 19:30:15,[-0.81045526 -0.27032033 -0.587677 0.538323...,-0.300998,False,False,False,True
1,0.580343,-1.292298,-1.731902,-0.267517,2023-11-03 20:12:26,[-0.81045526 -0.27032033 -0.587677 0.538323...,-0.101638,False,False,False,True
2,0.580343,-1.292298,-1.731803,-0.267517,2023-11-03 20:12:38,[-0.81045526 -0.27032033 -0.587677 0.538323...,-0.101638,False,False,False,True
3,0.580343,-1.292298,-1.731704,-0.267517,2023-11-03 21:15:34,[-0.81045526 -0.27032033 -0.587677 0.538323...,-0.246294,False,False,False,True
4,0.580343,-1.292298,-1.731605,-0.267517,2023-11-03 21:20:57,[-0.81045526 -0.27032033 -0.587677 0.538323...,-0.110807,False,False,False,True


In [263]:
# Persist the engineered feature table as CSV in the current working directory.
# NOTE(review): to_csv writes the index by default, so reading this file back
# produces an extra "Unnamed: 0" column — confirm downstream readers expect it.
# NOTE(review): 'data_content_bert_embedding' holds NumPy arrays, which CSV
# can only store as their text representation — confirm that is acceptable.
df.to_csv("embedded_event_data2.csv")