In [6]:
# coding=utf-8
#
# This file is part of b-swarm
# Licensed under the GNU General Public License version 3 (GPLv3)
#
# Author: BB, 2024
# __version__ = "code_version"
__license__ = "gplv3"
__author__ = "bb"
__version__ = "0.3"


from binascii import unhexlify
from io import BytesIO
from math import sqrt, prod, log2
from collections import Counter

import numpy as np
import pandas as pd
from ppdeep import compare as ppcompare
from PIL import Image, ImageChops


def ppdeep_diff(ppdeep1, ppdeep2):
    """
    Compare two ppdeep fuzzy hashes.
    Returns the value of ppdeep's compare() for the pair, or None when
    either hash is missing or empty.
    """
    if not (ppdeep1 and ppdeep2):
        return None
    return ppcompare(ppdeep1, ppdeep2)

def sha256_match(hash1, hash2):
    """
    Tell whether two SHA256 digests are equal.
    Returns True/False when both digests are present, None when either
    one is missing or empty.
    """
    bothPresent = bool(hash1 and hash2)
    return (hash1 == hash2) if bothPresent else None

def ssl_match(fprint1, fprint2):
    """
    Tell whether two SSL fingerprints are equal.
    Returns None when either fingerprint is missing or empty, otherwise
    the result of the equality comparison.
    """
    if not fprint1 or not fprint2:
        return None
    return fprint1 == fprint2

def url_match(url1, url2):
    """
    Compare two URLs, ignoring a single trailing slash on each of them.
    Returns None when either URL is missing or empty.
    """
    if not (url1 and url2):
        return None
    # Drop exactly one trailing '/' (if present) before comparing.
    trimmed = [u[:-1] if u.endswith('/') else u for u in (url1, url2)]
    return trimmed[0] == trimmed[1]

def image_load(image, grayscale=True, grayscaleconvert=False, size=(1500, 3000), resize=False, subsample=False, factor=2):
    """
    Decode a hex-encoded image into a PIL image, or build a blank placeholder.

    image            -- hex string of the encoded image bytes; a falsy value
                        produces a placeholder image of the given size
    grayscale        -- placeholder only: mode 'L' if True, white 'RGB' otherwise
    grayscaleconvert -- convert a decoded image to mode 'L'
    size             -- (width, height) used for resizing and for the placeholder
    resize           -- resize a decoded image to size
    subsample        -- keep only every factor-th pixel along both axes
    factor           -- subsampling stride

    Returns a PIL image.
    """
    if image:
        pilImage = Image.open(BytesIO(unhexlify(image)))
        # resize()/convert() return new images; the results have to be kept,
        # otherwise the resize/grayscaleconvert flags have no effect.
        if resize:
            pilImage = pilImage.resize(size)
        if grayscaleconvert:
            pilImage = pilImage.convert("L")
    elif grayscale:
        pilImage = Image.new('L', (size[0], size[1]))
    else:
        pilImage = Image.new('RGB', (size[0], size[1]), (0xff, 0xff, 0xff))
    if subsample:
        # Strided slicing over the first two array axes.
        imageArray = np.array(pilImage)
        pilImage = Image.fromarray(imageArray[::factor, ::factor])
    return pilImage

def image_mse(image1, image2):
    """
    Return the mean squared error between two images (or array-likes).

    Both inputs are converted to float64 before the arithmetic: arrays built
    from 8-bit images are uint8, and subtracting/squaring in uint8 wraps
    around modulo 256, which yields wrong error values.
    """
    npImage1 = np.asarray(image1, dtype=np.float64)
    npImage2 = np.asarray(image2, dtype=np.float64)
    return np.mean(np.square(npImage1 - npImage2))

def image_diff(image1, image2):
    """
    Return the difference image of two PIL images, as computed by
    ImageChops.difference().
    """
    return ImageChops.difference(image1, image2)

def calculate_rmse(mseValues):
    """
    Return the square root of the mean of the squared input values.
    Raises ZeroDivisionError for an empty sequence.
    """
    return sqrt(sum(value ** 2 for value in mseValues) / len(mseValues))

In [12]:
from time import time

import clickhouse_connect as cc


def db_connect(db_host="localhost", db_username="default", db_password=""):
    """
    Create a clickhouse_connect client for the given host and credentials
    and return it.
    """
    return cc.get_client(
        host=db_host,
        username=db_username,
        password=db_password,
    )

def load_db(cc_client, snapshot_db="harvester", snapshot_table="snapshot", diff_table="cluster", snapshot_file="report*.parquet", diff_file="*_cluster.parquet", load_diff=False, clean_init=True):
    """
    Initialize the Clickhouse database and load the Parquet files into it.

    With clean_init the snapshot/diff tables and the database are dropped and
    the database is created again. The snapshot table is then created from
    snapshot_file; with load_diff the diff table is created from diff_file.

    Returns None on success. An exception raised while creating the tables is
    returned to the caller, not raised.
    """
    snapshotTarget = f"{snapshot_db}.{snapshot_table}"
    diffTarget = f"{snapshot_db}.{diff_table}"

    if clean_init:
        resetStatements = (
            f"DROP TABLE IF EXISTS {snapshotTarget}",
            f"DROP TABLE IF EXISTS {diffTarget}",
            f"DROP DATABASE IF EXISTS {snapshot_db}",
            f"CREATE DATABASE {snapshot_db}",
        )
        for statement in resetStatements:
            cc_client.command(statement)

    sources = [(snapshotTarget, snapshot_file)]
    if load_diff:
        sources.append((diffTarget, diff_file))

    try:
        for target, parquetGlob in sources:
            cc_client.command(f"CREATE TABLE {target} ENGINE = MergeTree ORDER BY tuple() AS SELECT * FROM file('{parquetGlob}', Parquet)")
    except Exception as err:
        return err
    return None

def insert_cluster_into_table(cc_client, dataframe=None, snapshot_db="harvester", diff_table="cluster", force_schema=True, init_table=False):
    """
    Optionally (re)create the cluster table and insert a dataframe into it.

    cc_client    -- client object providing command() and insert_df()
    dataframe    -- pandas DataFrame with the cluster rows, or None
    snapshot_db  -- database name
    diff_table   -- cluster table name
    force_schema -- cast the dataframe columns to the table schema before insert
    init_table   -- drop and recreate the cluster table first

    Returns "Init" when the table was initialized and there is nothing to
    insert, False when there is nothing to insert, otherwise the value
    returned by cc_client.insert_df().
    """
    if init_table:
        cc_client.command(f"DROP TABLE IF EXISTS {snapshot_db}.{diff_table}")
        cc_client.command(f"""CREATE TABLE IF NOT EXISTS {snapshot_db}.{diff_table} (
                              meta_taskid String,
                              http_url String,
                              http_document_embedding_type String,
                              http_document_embeddings Array(Float64),
                              http_image_embedding_type String,
                              http_image_embeddings Array(Float64),
                              cluster_type String,
                              cluster_data Array(UInt64)
                          ) ENGINE = MergeTree()
                          ORDER BY meta_taskid;
                          """)
    # "not dataframe" raises ValueError for a pandas DataFrame (ambiguous
    # truth value), so test for None / zero rows explicitly.
    nothingToInsert = dataframe is None or len(dataframe) == 0
    if nothingToInsert:
        return "Init" if init_table else False
    if force_schema:
        schema = {
            "meta_taskid": "string",
            "http_url": "string",
            "http_document_embedding_type": "string",
            "http_document_embeddings": "object",
            "http_image_embedding_type": "string",
            "http_image_embeddings": "object",
            "cluster_type": "string",
            "cluster_data": "object"
        }
        dataframe = dataframe.astype(schema)
    return cc_client.insert_df(database=snapshot_db, table=diff_table, df=dataframe)

def export_cluster_to_parquet(cc_client, diff_file=None, snapshot_db="harvester", diff_table="cluster", method="clickhouse"):
    """
    Export the clustering table to a Parquet file.

    Without diff_file, the file name is derived from the first meta_taskid in
    the table (the part before the first ':' plus "_cluster.parquet").
    method "clickhouse" lets the server write the file and returns the status
    of the INSERT command; method "pandas" reads the table into a dataframe
    and writes it with fastparquet, returning None on success. Any other
    method does nothing. An exception raised while writing is returned, not
    raised.
    """
    if not diff_file:
        firstTaskId = cc_client.command(f"SELECT meta_taskid from {snapshot_db}.{diff_table} LIMIT 1")
        taskPrefix = firstTaskId.split(":")[0]
        diff_file = f"{taskPrefix}_cluster.parquet"

    mode = method.lower()
    if mode == "clickhouse":
        try:
            cc_client.command("SET engine_file_allow_create_multiple_files = 1")
            return cc_client.command(f"INSERT INTO FUNCTION file('{diff_file}') SELECT * FROM {snapshot_db}.{diff_table} FORMAT Parquet SETTINGS compression = 'snappy'")
        except Exception as err:
            return err
    if mode == "pandas":
        clusterFrame = cc_client.query_df(f"SELECT * FROM {snapshot_db}.{diff_table} ORDER BY meta_taskid ASC")
        try:
            clusterFrame.to_parquet(diff_file, engine="fastparquet", index=False, compression="snappy")
        except Exception as err:
            return err
    return None


# Connect with the default host/credentials and rebuild the snapshot database
# from the Parquet report files.
# NOTE(review): load_db() returns (does not raise) an exception on failure and
# the return value is ignored here, so a failed load goes unnoticed.
client = db_connect()
load_db(client)

In [13]:
# Drop and recreate the empty cluster table; without a dataframe the call
# only initializes the table and returns "Init".
insert_cluster_into_table(client, init_table=True)

'Init'

In [91]:
%%time
#import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from html_sanitizer import Sanitizer
from readability import Document
from markdownify import markdownify as md

from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg16_preprocess_input
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mnv2_preprocess_input
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.applications.resnet_v2 import preprocess_input as rn50v2_preprocess_input
from tensorflow.keras.preprocessing import image as kimage
from tensorflow.keras.models import Model

from ipywidgets import HBox
from ipywidgets import Image as ipyImage
from IPython.display import display


def get_html_content(htmlDoc, sanitize=True, content="content", postprocess=True, markdown=False):
    """
    Reduce an HTML document to a content string.

    sanitize    -- run the document through html_sanitizer first
    content     -- "summary" selects readability's summary(), anything else
                   selects content()
    postprocess -- strip the markup with BeautifulSoup and keep the text only
    markdown    -- convert the result with markdownify

    Returns the string "NONE" for an empty document (before or after
    sanitizing).
    """
    emptyMarker = "NONE" # FIXME: No embedding extraction possible on empty set
    if not htmlDoc:
        return emptyMarker
    if sanitize:
        htmlDoc = Sanitizer().sanitize(htmlDoc)
        if not htmlDoc:
            return emptyMarker
    readable = Document(htmlDoc)
    extracted = readable.summary() if content.lower() == "summary" else readable.content()
    if postprocess:
        extracted = bs(extracted, 'html.parser').get_text(separator=' ', strip=True)
    if markdown:
        extracted = md(extracted)
    return str(extracted)


def display_image_cluster(image_list, cluster_list, image_ratio=5):
    """
    Display the images grouped by cluster label.

    image_list   -- hex-encoded images, loaded through image_load()
    cluster_list -- one cluster label per image, in the same order
    image_ratio  -- divisor applied to width and height for the thumbnails

    For every label a header line is printed and an HBox with the thumbnails
    of that cluster is displayed.
    """
    def _thumbnail_widget(hexImage):
        picture = image_load(hexImage)
        scaled = picture.resize((
            int(picture.width // image_ratio),
            int(picture.height // image_ratio)
            ))
        return ipyImage(value=scaled._repr_png_(), format='png')

    thumbnails = [_thumbnail_widget(image) for image in image_list]

    grouped = {}
    for position, label in enumerate(cluster_list):
        member = thumbnails[position]
        grouped.setdefault(label, []).append(member)

    for label, members in grouped.items():
        print(f"[+] Cluster: {label} length {len(members)}")
        display(HBox(members))



# Database/table holding the harvested snapshots.
snapshotDb = "harvester"
snapshotTable = "snapshot"
# Distinct URLs and agent ids present in the snapshot table, sorted ascending.
urlList = client.query(f"SELECT DISTINCT http_url FROM {snapshotDb}.{snapshotTable} ORDER BY http_url ASC").result_columns[0]
agentList = client.query(f"SELECT DISTINCT meta_agentid FROM {snapshotDb}.{snapshotTable} ORDER BY meta_agentid ASC").result_columns[0]

# Work on a single URL; the index is hard-coded and raises IndexError when the
# table holds fewer than 13 distinct URLs.
url = urlList[12]
print(url)
# NOTE(review): the URL is interpolated into the SQL text as-is; a URL
# containing a single quote would break the statement.
frame = client.query_df(f"SELECT * FROM {snapshotDb}.{snapshotTable} WHERE http_url=='{url}' ORDER BY meta_agentid ASC")
# postprocess=False keeps the markup returned by readability (no text-only pass).
frame['http_document_content'] = frame['http_document'].apply(get_html_content, postprocess=False)


# Extract embeddings
### HTML embeddings TFIDF
print("Extracting HTML document embeddings...")

# TF-IDF is fitted on the documents of this URL only; the sparse matrix is
# densified and converted to a list of per-document vectors.
vectorizer = TfidfVectorizer()
html_embeddings = vectorizer.fit_transform(frame['http_document_content'])
html_embeddings = html_embeddings.toarray()
html_embeddings = html_embeddings.tolist()

print("Extracting PNG image embeddings...")

### PNG image embeddings - VGG16
# Headless VGG16 (include_top=False) with global average pooling as output.
vgg16Model = VGG16(weights='imagenet', input_shape=(224, 224, 3), include_top=False, pooling='avg')

def get_image_embedding_vgg16(binImage, IMAGE_SIZE = 224):
    """
    Return the VGG16 embedding of a hex-encoded image as a flat array.

    binImage   -- hex string of the encoded image bytes, or None
    IMAGE_SIZE -- edge length the image is resized to before prediction

    Best effort: for a missing image, or when decoding/prediction fails, the
    error is printed and a zero vector of the model's output width is
    returned.
    """
    if binImage is None:
        return np.zeros((vgg16Model.output_shape[1],))
    try:
        binImage = unhexlify(binImage)
        img = Image.open(BytesIO(binImage))
        # The model needs three channels; convert grayscale, palette and
        # alpha images instead of letting them fail the shape check below.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img = img.resize((IMAGE_SIZE, IMAGE_SIZE))
        imgArray = np.expand_dims(np.array(img), axis=0)
        imgArray = vgg16_preprocess_input(imgArray)
        if imgArray.shape != (1, IMAGE_SIZE, IMAGE_SIZE, 3):
            raise ValueError(f"unexpected input shape {imgArray.shape}")
        if imgArray.dtype != np.float32:
            raise ValueError(f"unexpected input dtype {imgArray.dtype}")
        # The original referenced the undefined names imgArray/vggModel here,
        # so every call ended in the except branch and returned zeros.
        features = vgg16Model.predict(imgArray)
        return features.flatten()
    except Exception as err:
        print(err)
        return np.zeros((vgg16Model.output_shape[1],))

### PNG image embeddings - ResNet50V2
# The model is loaded with its default top and then cut at the 'avg_pool'
# layer, so predictions are that layer's activations.
base_model = ResNet50V2(weights='imagenet', input_shape=(224, 224, 3))
rn50v2Model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

def get_image_embedding_rn50v2(binImage, IMAGE_SIZE = 224, RN_DIMENSIONS = 2048):
    """
    Return the ResNet50V2 embedding of a hex-encoded image as a flat array.

    binImage      -- hex string of the encoded image bytes, or None
    IMAGE_SIZE    -- edge length the image is resized to before prediction
    RN_DIMENSIONS -- length of the zero vector returned as fallback

    Best effort: for a missing image, or when decoding/prediction fails, the
    error is printed and a zero vector is returned.
    """
    if binImage is None:
        return np.zeros((RN_DIMENSIONS,))
    try:
        binImage = unhexlify(binImage)
        img = Image.open(BytesIO(binImage))
        # The model needs three channels; convert grayscale, palette and
        # alpha images instead of letting them fail the shape check below.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img = img.resize((IMAGE_SIZE, IMAGE_SIZE))
        imgArray = np.expand_dims(np.array(img), axis=0)
        imgArray = rn50v2_preprocess_input(imgArray)
        if imgArray.shape != (1, IMAGE_SIZE, IMAGE_SIZE, 3):
            raise ValueError(f"unexpected input shape {imgArray.shape}")
        if imgArray.dtype != np.float32:
            raise ValueError(f"unexpected input dtype {imgArray.dtype}")
        features = rn50v2Model.predict(imgArray)
        return features.flatten()
    except Exception as err:
        print(err)
        return np.zeros((RN_DIMENSIONS,))

### PNG image embeddings - MobileNetV2
# Headless MobileNetV2 (include_top=False) with global average pooling; the
# Model wrapper keeps the base model's input and output unchanged.
# Note: base_model is rebound here, replacing the ResNet50V2 instance.
base_model = MobileNetV2(weights='imagenet', input_shape=(224, 224, 3), include_top=False, pooling='avg')
mnv2Model = Model(inputs=base_model.input, outputs=base_model.output)

def get_image_embedding_mnv2(binImage, IMAGE_SIZE = 224, MN_DIMENSIONS = 1280):
    """
    Return the MobileNetV2 embedding of a hex-encoded image as a flat array.

    binImage      -- hex string of the encoded image bytes, or None
    IMAGE_SIZE    -- edge length the image is resized to before prediction
    MN_DIMENSIONS -- length of the zero vector returned as fallback

    Best effort: for a missing image, or when decoding/prediction fails, the
    error is printed and a zero vector is returned.
    """
    if binImage is None:
        return np.zeros((MN_DIMENSIONS,))
    try:
        binImage = unhexlify(binImage)
        img = Image.open(BytesIO(binImage))
        # The model needs three channels; convert grayscale, palette and
        # alpha images instead of letting them fail the shape check below.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img = img.resize((IMAGE_SIZE, IMAGE_SIZE))
        imgArray = np.expand_dims(np.array(img), axis=0)
        imgArray = mnv2_preprocess_input(imgArray)
        if imgArray.shape != (1, IMAGE_SIZE, IMAGE_SIZE, 3):
            raise ValueError(f"unexpected input shape {imgArray.shape}")
        if imgArray.dtype != np.float32:
            raise ValueError(f"unexpected input dtype {imgArray.dtype}")
        features = mnv2Model.predict(imgArray)
        return features.flatten()
    except Exception as err:
        print(err)
        return np.zeros((MN_DIMENSIONS,))

# One embedding per snapshot row, in frame order. Only the ResNet50V2
# extractor is used here; the VGG16 and MobileNetV2 variants are alternatives.
image_embeddings = [get_image_embedding_rn50v2(image) for image in frame['http_image']]

print("Done")

https://google.com
Extracting HTML document embeddings...
Extracting PNG image embeddings...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [94]:
%%time
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.cluster import HDBSCAN, OPTICS


# Concatenate the text and image embedding of every snapshot into one row.
#combined_embeddings = [np.concatenate((html_emb, img_emb)) for html_emb, img_emb in zip(html_embeddings, image_embeddings)]
combined_embeddings = np.hstack((html_embeddings, image_embeddings))

# Standardize the features, then cluster the rows with HDBSCAN.
scaler = StandardScaler()
scaledEmbeddings = scaler.fit_transform(combined_embeddings)
algorithm = HDBSCAN(min_cluster_size=3, allow_single_cluster=False,
                    cluster_selection_method="eom", leaf_size=20,
                    n_jobs=2)
#algorithm = OPTICS(min_samples=3, leaf_size=10, n_jobs=2)
labels = algorithm.fit_predict(scaledEmbeddings)
print(labels)

# The images are fetched in the same order (meta_agentid ascending) as the
# rows of frame, so labels[i] is assumed to belong to images[i].
images = client.query(f"select http_image from harvester.snapshot where http_url=='{url}' order by meta_agentid asc").result_columns[0]
display_image_cluster(images, labels)

[ 0  1  0  0 -1  0  0  1  0 -1  0  0  0  0  0  1  0 -1]
[+] Cluster: 0 length 12


HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01,\x00\x00\x02X\x08\x00\x00\x00\x00…

[+] Cluster: 1 length 3


HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01,\x00\x00\x02X\x08\x00\x00\x00\x00…

[+] Cluster: -1 length 3


HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01,\x00\x00\x02X\x08\x00\x00\x00\x00…

CPU times: user 704 ms, sys: 9.65 ms, total: 714 ms
Wall time: 767 ms


In [101]:
%%time
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.cluster import DBSCAN, HDBSCAN, OPTICS


def get_dbscan_clusters(dataFrame, eps=0.2, samples=3):
    """
    Scale the data with RobustScaler, cluster it with DBSCAN and return the
    cluster labels as a list.
    """
    scaled = RobustScaler().fit_transform(dataFrame)
    fitted = DBSCAN(eps=eps, min_samples=samples).fit(scaled)
    return list(fitted.labels_)

def get_hdbscan_clusters(dataFrame, samples=3):
    """
    Scale the data with RobustScaler, cluster it with HDBSCAN and return the
    cluster labels as a list.
    """
    scaled = RobustScaler().fit_transform(dataFrame)
    fitted = HDBSCAN(min_cluster_size=samples).fit(scaled)
    return list(fitted.labels_)

def get_optics_clusters(dataFrame, eps=0.2, samples=3):
    """
    Scale the data with RobustScaler, cluster it with OPTICS (max_eps is
    twice eps) and return the cluster labels as a list.
    """
    scaled = RobustScaler().fit_transform(dataFrame)
    fitted = OPTICS(max_eps=eps*2, min_samples=samples).fit(scaled)
    return list(fitted.labels_)

def get_clusters(dataFrame, scaler="robust", algorithm="hdbscan", samples=3, eps=0.5, maxeps=None):
    """
    Cluster the data with a selectable scaler and algorithm.

    scaler    -- "robust", "minmax" or "standard"; any other name disables scaling
    algorithm -- "hdbscan", "dbscan" or "optics"; any other name yields None
    samples   -- min_cluster_size (HDBSCAN) / min_samples (DBSCAN, OPTICS)
    eps       -- DBSCAN eps; also the base of the OPTICS max_eps default
    maxeps    -- OPTICS max_eps, defaults to eps * 2

    Returns the cluster labels as a list, or None for an unknown algorithm.
    """
    # The factories are lambdas so that only the selected class is touched.
    scalerFactories = {
        "robust": lambda: RobustScaler(),
        "minmax": lambda: MinMaxScaler(),
        "standard": lambda: StandardScaler(),
    }
    algorithmFactories = {
        "hdbscan": lambda: HDBSCAN(min_cluster_size=samples),
        "dbscan": lambda: DBSCAN(eps=eps, min_samples=samples),
        "optics": lambda: OPTICS(max_eps=maxeps if maxeps else eps * 2, min_samples=samples),
    }

    makeScaler = scalerFactories.get(scaler.lower())
    scalerObj = makeScaler() if makeScaler else None
    makeEngine = algorithmFactories.get(algorithm.lower())
    engine = makeEngine() if makeEngine else None

    # Scaling happens even when the algorithm turns out to be unknown.
    if scalerObj:
        dataFrame = scalerObj.fit_transform(dataFrame)
    if not engine:
        return None
    return list(engine.fit(dataFrame).labels_)

# NOTE(review): contentFrame and imageFrame are not assigned by any active
# code in this notebook; the only visible definition sits inside the disabled
# "features" string snippet -- confirm where they come from.
print(contentFrame.describe())
print(imageFrame.describe())

# Cluster the numeric content features and the numeric image features
# separately.
txtlabels = get_clusters(contentFrame, scaler="robust", algorithm="hdbscan")
imglabels = get_clusters(imageFrame, scaler="robust", algorithm="hdbscan")

print(txtlabels, '\n', imglabels)

# Only the image clustering is visualized.
labels = imglabels

# NOTE(review): this query uses dotted column names (http.image, url.init,
# meta.agentid) while other cells query http_image / http_url / meta_agentid
# on the same table -- presumably written for an older snapshot schema; verify.
images = client.query(f"select http.image from harvester.snapshot where url.init=='{url}' order by meta.agentid asc").result_columns[0]
display_image_cluster(images, labels)

       http.entropy  http_content_bytesize  http_content_tags  \
count     18.000000              18.000000          18.000000   
mean       5.371936          291614.555556         136.222222   
std        0.003512           10200.699747           6.847546   
min        5.367097          278051.000000         127.000000   
25%        5.368451          280318.750000         130.000000   
50%        5.371776          297984.000000         140.000000   
75%        5.374728          300451.000000         141.000000   
max        5.377534          302640.000000         145.000000   

       http_content_uniquetags  http_content_tagdepth  http_content_attributes  
count                     18.0              18.000000                18.000000  
mean                      10.0               3.036793               446.055556  
std                        0.0               0.001865                25.521553  
min                       10.0               3.034483               412.000000  
25%      

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01,\x00\x00\x02X\x08\x00\x00\x00\x00…

CPU times: user 720 ms, sys: 18.4 ms, total: 739 ms
Wall time: 858 ms


In [None]:
# Select the URLs whose snapshots differ noticeably between agents.
# NOTE(review): clusterList is not defined in the visible code; the indexing
# below assumes entries shaped like ((labels, (_, fuzzy, _, mse, ...)), url)
# -- confirm against the code that builds it.
threshold = 0.9
selectList = []
for cluster in clusterList:
    dbscanVal = sum(cluster[0][0])
    fuzzyVal = cluster[0][1][1]
    mseVal = cluster[0][1][3]
    #print(dbscanVal, fuzzyVal, mseVal, dbscanVal == 0 or (fuzzyVal == 100.0 or mseVal == 0.0))
    # Skip entries whose label sum is 0, whose fuzzy value is at least
    # 100 - threshold * 10, or whose MSE is at most threshold.
    if dbscanVal == 0 or (fuzzyVal >= 100.0 - threshold * 10 or mseVal <= 0.0 + threshold):
        continue
    selectList.append((cluster[1], fuzzyVal, mseVal, cluster[0][0]))
    print(cluster[1], fuzzyVal, mseVal, cluster[0][0])

print(len(selectList))

In [None]:
import ipywidgets as widgets
from IPython.display import display
from IPython.display import display, clear_output


def load_url(button_click):
    """
    Load all snapshots of the URL selected in urlLayer into the global
    snapshotDf, refresh both vantage panels, clear the comparison panel and
    show the cluster values of the URL in metadata4.
    """
    global snapshotDf
    selectedUrl = urlLayer.value
    snapshotDf = client.query_df(f"select * from {snapshotDb}.{snapshotTable} where url.init=='{selectedUrl}' ORDER BY meta.agentid ASC")
    with output1:
        update_output1({'new': agentLayer1.value})
    with output2:
        update_output2({'new': agentLayer2.value})
    with output3:
        output3.clear_output()
        metadata3.value = ""
    # NOTE(review): "urllist" (lower case) and clusterList are not defined in
    # the visible code, which only assigns urlList -- confirm the names.
    clusterInfo = clusterList[urllist.index(selectedUrl)][0]
    dbscanVal, frameDiff, hdbscanVal, opticsVal = (str(clusterInfo[position]) for position in range(4))
    metadata4.value = f"{dbscanVal}\n{hdbscanVal}\n{opticsVal}\n{frameDiff}"
        
def single_snapshot(button_click=None):
    """
    Click handler of the "Analyze snapshot" button; currently disabled.

    button_click -- the button instance passed by on_click() (unused). The
                    parameter is optional so direct calls without arguments
                    keep working.

    Always returns 0.
    """
    # TODO: the single-URL analysis was switched off by an early return; the
    # disabled call was
    #   analyze_snapshot(urllist, snapshotTable, singleUrl=urllist.index(urlLayer.value))
    return 0

def compare_images(button_click):
    """
    Compare the snapshots of the two selected agents for the loaded URL.

    Shows the difference image in output3 and writes the hash, fuzzy-hash,
    image, URL and SSL comparison results to metadata3.
    """
    snapshotEntry1 = snapshotDf[(snapshotDf["meta.agentid"] == agentList[agentLayer1.value])]
    snapshotEntry2 = snapshotDf[(snapshotDf["meta.agentid"] == agentList[agentLayer2.value])]
    # The stored images are hex blobs; decode them with image_load() (as the
    # vantage panels do) before diffing. image_diff() returns a single image,
    # so the mean squared error is computed separately.
    image1 = image_load(snapshotEntry1["http.image"].item())
    image2 = image_load(snapshotEntry2["http.image"].item())
    diffImage = image_diff(image1, image2)
    mseValue = image_mse(image1, image2)
    with output3:
        output3.clear_output()
        display(diffImage.resize((
            int(diffImage.width // IMAGE_RATIO),
            int(diffImage.height // IMAGE_RATIO)
            )))
        ppdeep1 = snapshotEntry1['http.fuzzyhash'].item()
        ppdeep2 = snapshotEntry2['http.fuzzyhash'].item()
        sha256_1 = snapshotEntry1['http.sha256'].item()
        sha256_2 = snapshotEntry2['http.sha256'].item()
        # ssl_match() reports None (not True) when a fingerprint is missing,
        # like the other comparison helpers.
        sslResult = ssl_match(snapshotEntry1['http.ssl'].item(), snapshotEntry2['http.ssl'].item())
        metadata3.value = (f"Content SHA256 match: {sha256_match(sha256_1, sha256_2)}\n"
                           f"Content Ppdeep similarity: {ppdeep_diff(ppdeep1, ppdeep2)}\n"
                           f"Image Mean squared error: {round(mseValue, 4)}\n"
                           f"URL match: {url_match(snapshotEntry1['url.init'].item(), snapshotEntry1['url.end'].item())}"
                           f" | {url_match(snapshotEntry2['url.init'].item(), snapshotEntry2['url.end'].item())}\n"
                           f"SSL match: {sslResult}"
                          )

def _show_agent_snapshot(agentIndex, outputWidget, metadataWidget):
    """
    Render the snapshot taken by agentList[agentIndex] for the loaded URL:
    the downscaled image goes to outputWidget, the metadata text to
    metadataWidget.
    """
    with outputWidget:
        outputWidget.clear_output()
        snapshotEntry = snapshotDf[(snapshotDf["meta.agentid"] == agentList[agentIndex])]
        image = image_load(snapshotEntry["http.image"].item())
        display(image.resize((
            int(image.width // IMAGE_RATIO),
            int(image.height // IMAGE_RATIO)
            )))
        # NOTE(review): microsecond/1000 is the sub-second part in
        # milliseconds while the label says "s" -- confirm the intended unit.
        metadataWidget.value = (f"AgentID: {snapshotEntry['meta.agentid'].item()}\n"
                                f"URL: {snapshotEntry['url.init'].item()}\n"
                                f"Visited URL: {snapshotEntry['url.end'].item()}\n"
                                f"Collection time: {snapshotEntry['meta.interacttime'].item().microsecond/1000}s\n"
                                f"UserAgent: {snapshotEntry['http.useragent'].item()}\n"
                                f"SSL fingerprint: {snapshotEntry['http.ssl'].item()}\n"
                                f"SHA256: {snapshotEntry['http.sha256'].item()}\n"
                                f"Ppdeep: {snapshotEntry['http.fuzzyhash'].item()}"
                               )

def update_output1(change):
    """
    Refresh the first vantage panel; change['new'] is the agent index
    selected with agentLayer1.
    """
    _show_agent_snapshot(change['new'], output1, metadata1)

def update_output2(change):
    """
    Refresh the second vantage panel; change['new'] is the agent index
    selected with agentLayer2.
    """
    _show_agent_snapshot(change['new'], output2, metadata2)
            
def reload_images(button_click):
    """
    Redraw both vantage panels for the currently selected agents and clear
    the comparison panel.
    """
    with output1:
        update_output1({'new': agentLayer1.value})
    with output2:
        update_output2({'new': agentLayer2.value})
    with output3:
        output3.clear_output()
        metadata3.value = ""


# Divisor applied to image width/height when rendering thumbnails.
IMAGE_RATIO = 3.5
vboxWidth = "455px"

# Controls: URL selector, one agent slider per vantage panel, action buttons.
urlLayer = widgets.Dropdown(options=urlList, value=urlList[0], description="url", disabled=False)
agentLayer1 = widgets.IntSlider(min=0, max=len(agentList) - 1, value=0)
agentLayer2 = widgets.IntSlider(min=0, max=len(agentList) - 1, value=0)
snapshotButton = widgets.Button(description="Analyze snapshot")
reloadButton = widgets.Button(description="Reload images")
compareButton = widgets.Button(description="Compare images")

# Wire the controls to their handlers.
urlLayer.observe(load_url, names='value')
agentLayer1.observe(update_output1, names='value')
agentLayer2.observe(update_output2, names='value')
snapshotButton.on_click(single_snapshot)
reloadButton.on_click(reload_images)
compareButton.on_click(compare_images)

# Output areas and text boxes: two vantage panels, the comparison panel and
# the cluster summary.
textboxSize = widgets.Layout(flex='0 1 auto', width='450px')
output1 = widgets.Output()
metadata1 = widgets.Textarea(placeholder='Vantage1 metadata', layout=textboxSize)
output2 = widgets.Output()
metadata2 = widgets.Textarea(placeholder='Vantage2 metadata', layout=textboxSize)
output3 = widgets.Output()
metadata3 = widgets.Textarea(placeholder='Comparison results', layout=textboxSize)
metadata4 = widgets.Textarea(placeholder='Snapshot clusters', layout=widgets.Layout(flex='0 1 auto', width='600px'))

# Layout: URL row on top, the three panels side by side below.
# Note: snapshotButton and reloadButton are created and wired but are not
# added to any box, so only compareButton is displayed.
hbox1 = widgets.HBox([urlLayer, metadata4])
vbox1 = widgets.VBox([agentLayer1, output1, metadata1], layout=widgets.Layout(width=vboxWidth))
vbox2 = widgets.VBox([agentLayer2, output2, metadata2], layout=widgets.Layout(width=vboxWidth))
hbox2 = widgets.HBox([compareButton])
vbox3 = widgets.VBox([hbox2, output3, metadata3], layout=widgets.Layout(width=vboxWidth))
hbox3 = widgets.HBox([vbox1, vbox2, vbox3])
layout = widgets.VBox([hbox1, hbox3])
display(layout)

# Populate the panels for the initially selected URL.
if __name__ == "__main__":
    load_url(None)

In [None]:
# OLDCODE

#def calculate_entropy(data): # replace by sklearn or scipy builtins
#    """
#    Calculate the Shannon entropy for the provided content.
#    """
#    counter = Counter(data)
#    length = float(len(data))
#    ent = -sum(
#        count / length * log2(count / length)
#        for count in counter.values()
#        )
#    return ent

#def calculate_arithmetic_mean(values, handlenone=True, nonevalue=0): # to be obsolete
#    if handlenone:
#        values = [nonevalue if value == None else value for value in values]
#    return sum(values) / len(values)

#def calculate_normalized_diff(ent1, ent2, handleinf=True, inf=1e32): # to be obsolete
#    absDiff = abs(ent1 - ent2)
#    average = (ent1 + ent2) / 2
#    if average != 0:
#        normDiff = absDiff / average
#    else:
#        if handleinf:
#            normDiff = inf
#        else:
#            normDiff = float("inf")
#    return float(normDiff)

#def convert_image_array(image, resize_width=128, resize=True):
#    """
#    Obsolete, if embeddings are used
#    """
#    img = image_load(image)
#    if resize:
#        imgWidth, imgHeight = img.size
#        aspect = imgHeight // imgWidth
#        img = img.resize((resize_width, resize_width * aspect)).convert('L')
#    return np.asarray(img).flatten()

#def convert_sha256_value(sha256_hash):
#    """
#    Obsolete, if embeddings are used
#    """
#    if sha256_hash:
#        return int(sha256_hash, 16)
#    else:
#        return 0

#def calculate_mean_ppdeep(ppdeepNpArray):
#    """
#    Obsolete, if embeddings are used
#    """
#    hashDiffList = []
#    npArrayLen = len(ppdeepNpArray)
#    for i in range(0, npArrayLen):
#        hashList = []
#        fhash1 = ppdeepNpArray[i]
#        for j in range(0, npArrayLen):
#            if i == j:
#                continue
#            fhash2 = ppdeepNpArray[j]
#            hashList.append(ppdeep_diff(fhash1, fhash2))
#        hashDiffList.append(calculate_arithmetic_mean(hashList))
#    return hashDiffList

#def calculate_mean_mse(imgNpArray):
#    """
#    Obsolete, if embeddings are used
#    """
#    mseDiffList = []
#    npArrayLen = len(imgNpArray)
#    for i in range(0, npArrayLen):
#        mseList = []
#        img1 = imgNpArray[i]
#        for j in range(0, npArrayLen):
#            if i == j:
#                continue
#            img2 = imgNpArray[j]
#            squared_diff = np.square(img1 - img2)
#            mse = np.mean(squared_diff)
#            mseList.append(mse)
#        mseDiffList.append(calculate_rmse(mseList))
#    return mseDiffList

#from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#import networkx as nx
#from node2vec import Node2Vec

#def get_html_attributes(htmlDoc):
#    """
#    Obsolete, if embeddings are used
#    """
#    def tag_depth(tag):
#        depth = 0
#        while tag.parent:
#            tag = tag.parent
#            depth += 1
#        return depth
#
#    if not htmlDoc:
#        return pd.Series([0, 0, 0, 0],
#                         index=['http_document_tags', 'http_document_uniquetags', 'http_document_tagdepth', 'http_document_attributes'])
#    soup = bs(htmlDoc, 'html.parser')
#    tags = soup.find_all()
#    htmlTags = len(tags)
#    htmlUniqueTags = len(set(tag.name for tag in tags))
#    depths = [tag_depth(tag) for tag in tags]
#    htmlTagDepth = np.mean(depths)
#    htmlAttributes = sum(len(tag.attrs) for tag in tags)
#    return pd.Series([htmlTags, htmlUniqueTags, htmlTagDepth, htmlAttributes],
#                     index=['http_document_tags', 'http_document_uniquetags', 'http_document_tagdepth', 'http_document_attributes'])

#def get_image_entropy(imageData):
#    """
#    Obsolete, if embeddings are used
#    """
#    image = image_load(imageData)
#    return image_shannon_entropy(image)

#def get_image_edges(imageData, grayscale_convert=True):
#    """
#    Obsolete, if embeddings are used
#    """
#    image = image_load(imageData)
#    if grayscale_convert:
#        image = image.convert('L')
#    npImage = np.array(image)
#    edges = Canny(npImage, 100, 200)
#    return np.sum(edges > 0)

#def get_lbp_variance(imageData, grayscale_convert=True):
#    """
#    Obsolete, if embeddings are used
#    """
#    image = image_load(imageData)
#    if grayscale_convert:
#        image = image.convert('L')
#    npImage = np.array(image)
#    lbp = local_binary_pattern(npImage, P=8, R=1, method="uniform")
#    return np.var(lbp)

### HTML document embeddings - doc2vec
# Disabled alternative kept as a string: the snippet is never executed, it is
# only stored in the variable.
doc2vec = """
html_docs = frame['http.content'].to_list()
tagged_docs = [TaggedDocument(words=bs(html, 'html.parser').get_text().split(), tags=[str(i)]) for i, html in enumerate(html_docs)]
model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)
html_embeddings = [model.dv[str(i)] for i in range(len(html_docs))]
"""

### HTML document embeddings - node2vec
# Disabled alternative kept as a string (never executed).
# NOTE(review): the snippet trains with dimensions=32 but falls back to
# np.zeros(64) -- the lengths look inconsistent; check before re-enabling.
node2vec = """
def html_to_graph(htmlDoc):
    soup = bs(htmlDoc, 'html.parser')
    G = nx.DiGraph()
    def add_edges(parent, soup):
        for child in soup.children:
            if child.name:
                G.add_edge(parent, child.name)
                add_edges(child.name, child)
    for tag in soup.find_all(recursive=False):
        G.add_node(tag.name)
        add_edges(tag.name, tag)
    return G

graphs = [html_to_graph(htmlDoc) for htmlDoc in frame['http.content']]
node2vec_models = []
for G in graphs:
    node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=100, workers=4)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    node2vec_models.append(model)
html_embeddings = [model.wv['html'] if 'html' in model.wv else np.zeros(64) for model in node2vec_models]
"""

# Feature extraction - extract various numeric features.
# Replaced by embeddings

#from cv2 import Canny
#from skimage.measure import shannon_entropy as image_shannon_entropy
#from skimage.feature import local_binary_pattern

# Disabled numeric feature extraction kept as a string (never executed). It
# relies on helpers that are commented out above (get_html_attributes,
# get_image_entropy, calculate_entropy, get_image_edges, get_lbp_variance).
features = """
# HTTP content feature extraction
frame[['http_document_tags', 'http_document_uniquetags', 'http_document_tagdepth', 'http_document_attributes']] = frame['http_document_content'].apply(get_html_attributes)
frame['http_document_bytesize'] = [len(content.encode('utf-8')) for content in frame['http.content']]

# HTTP image feature extraction
frame['http_image_structentropy'] = frame['http.image'].apply(get_image_entropy)
frame['http_image_dataentropy'] = frame['http.image'].apply(calculate_entropy)
frame['http_image_edges'] = frame['http.image'].apply(get_image_edges, grayscale_convert=False)
frame['http_image_lbp'] = frame['http.image'].apply(get_lbp_variance, grayscale_convert=False)

contentFrame = frame[['http.entropy', 'http_document_bytesize', 'http_document_tags', 'http_document_uniquetags', 'http_document_tagdepth', 'http_document_attributes']]
imageFrame = frame[['http_image_structentropy', 'http_image_dataentropy', 'http_image_edges', 'http_image_lbp']]
"""

def create_snapshot_diff_old(cc_client, snapshot_db="harvester", snapshot_table="snapshot", diff_table="cluster", verbose=True):
    """
    Obsolete, if embeddings are used.
    TODO: Instead -- calculate embeddings and clusters, and save them to separate table.

    Recreates the diff table and fills it, URL by URL, with the mean fuzzy-hash
    similarity and the image error of every agent snapshot.

    cc_client      -- client used for ALL database access
    snapshot_db    -- database name
    snapshot_table -- table holding the snapshots
    diff_table     -- table that receives the per-agent diff values
    verbose        -- print a progress line per URL

    NOTE(review): relies on calculate_mean_ppdeep, convert_image_array and
    calculate_mean_mse, which are only present as commented-out code in this
    file.
    """
    cc_client.command(f"DROP TABLE IF EXISTS {snapshot_db}.{diff_table}")
    cc_client.command(f"CREATE TABLE IF NOT EXISTS {snapshot_db}.{diff_table} (url_init String, meta_agentid String, diff_fuzzyhash Float64, diff_mse Float64) ENGINE = MergeTree() ORDER BY url_init")

    urlList = cc_client.query(f"SELECT DISTINCT url.init FROM {snapshot_db}.{snapshot_table} ORDER BY url.init").result_columns[0]
    ucount = len(urlList)
    diffSchema = {
        "url_init": "string",
        "meta_agentid": "string",
        "diff_fuzzyhash": "float64",
        "diff_mse": "float64"
        }
    for u, url in enumerate(urlList, start=1):
        startTime = time()
        # Query through the client passed in; the original used the global
        # "client" here and ignored the cc_client argument.
        diffDf = cc_client.query_df(f"SELECT url.init, meta.agentid, http.fuzzyhash, http.image FROM {snapshot_db}.{snapshot_table} WHERE url.init=='{url}' ORDER BY meta.agentid ASC")
        # Process sha256 - can be queried and processed from the snapshot table directly
        # diffDf['http.sha256'] = diffDf['http.sha256'].apply(convert_sha256_value)
        # Process fuzzy-hash CTPH
        diffDf['http.fuzzyhash'] = calculate_mean_ppdeep(diffDf['http.fuzzyhash'].to_numpy())
        # Process PNG image MSE, SSIM
        diffDf['http.image'] = diffDf['http.image'].apply(convert_image_array, resize=False)
        diffDf['diff_mse'] = calculate_mean_mse(diffDf['http.image'].to_numpy())
        diffDf = diffDf.drop('http.image', axis=1)

        # Fixed in harvester syntax 0.52. While the snapshot file still uses dot syntax, retain this. Modify once the new snapshot format becomes the main one.
        # Modify everywhere in the code!
        diffDf.rename(columns={'url.init': 'url_init', 'meta.agentid': 'meta_agentid', 'http.fuzzyhash': 'diff_fuzzyhash'}, inplace=True)

        diffDf = diffDf.astype(diffSchema)
        cc_client.insert_df(database=snapshot_db, table=diff_table, df=diffDf)
        endTime = time()
        if verbose:
            print(f"{int(u / ucount * 100)}%  \t ({u}/{ucount}) \t {round((endTime - startTime), 2)}s \t {url}")