In [54]:
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sklearn.neighbors import NearestCentroid
import numpy as np
import pandas as pd
import os
import re
from sentence_transformers import SentenceTransformer, util
import time
from sklearn.cluster import AgglomerativeClustering

embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences
# corpus = ['A man is eating food.',
#           'A man is eating a piece of bread.',
#           'A man is eating pasta.',
#           'The girl is carrying a baby.',
#           'The baby is carried by the woman',
#           'A man is riding a horse.',
#           'A man is riding a white horse on an enclosed ground.',
#           'A monkey is playing drums.',
#           'Someone in a gorilla costume is playing a set of drums.',
#           'A cheetah is running behind its prey.',
#           'A cheetah chases prey on across a field.'
#           ]
root_folder = '../CREME_dataset/modify'
r_state = 42

df = pd.read_csv(os.path.join(root_folder, "original_relabel_syslog.csv"))
# df = df.sample(n=1000, random_state=r_state)
content = list(df['Content'])
print(len(content))
for i in range(len(content)):
    content[i] = re.sub('(?:[0-9]{1,3}\.){3}[0-9]{1,3}', '<*>', content[i])
    content[i] = re.sub('port [0-9]{1,5}', 'port <*>', content[i])
corpus = list(set(content))
print(len(corpus))

#Two parameters to tune:
#min_cluster_size: Only consider cluster that have at least 25 elements
#threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
corpus_embeddings = embedder.encode(corpus)
print(corpus_embeddings.shape)


# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform kmean clustering
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5) #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

print(len(clustered_sentences))

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(len(cluster))
    for sentence_id in cluster[0:3]:
        print("\t", sentence_id)
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", sentence_id)
    print("")

clustering_model2 = AgglomerativeClustering(n_clusters=None, distance_threshold=3) #, affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model2.fit(corpus_embeddings)
cluster_assignment2 = clustering_model2.labels_
print(clustering_model2.n_clusters_)

235952
130


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

torch.Size([130, 384])
65
Cluster  51
4
	 "GET /chat/read_log.php HTTP/1.1" 404 505 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"
	 "GET /chat/read_log.php HTTP/1.1" 404 504 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"
	 "GET /chat/read_log.php HTTP/1.1" 404 503 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"
	 ...
	 "GET /chat/read_log.php HTTP/1.1" 404 504 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"
	 "GET /chat/read_log.php HTTP/1.1" 404 503 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"
	 "GET /chat/read_log.php HTTP/1.1" 404 468 "about:blank" "Node.js (linux; U; rv:v4.9.1) AppleWebKit/537.36 (KHTML, like Gecko)"

Cluster  33
1
	 (root) CMD (   cd / && run-parts --report /etc/cron.hourly)
	 ...
	 (root) CMD (   cd / && run-parts --report /etc/cron.hourly)

Cluster  37
1
	 pam_unix(samba:ses

In [55]:
import random
random.seed(r_state)

center = [0]*len(clustered_sentences)
for i, cluster in clustered_sentences.items():
    center[i] = random.choice(cluster)

d = dict(zip(corpus, list(map(lambda i: center[i],cluster_assignment))))
d2 = dict(zip(corpus, cluster_assignment2))

df['EventTemplate'] = list(map(lambda s:d[s], content))
df['EventTemplate2'] = list(map(lambda s:d2[s], content))
df.to_csv(os.path.join(root_folder, "NLP_label_syslog.csv"), index=False)