In [1]:
import os
import base64
import numpy as np
from json import loads
from io import BytesIO
from datetime import datetime

import pandas as pnd
import ipywidgets as widgets
from ppdeep import compare as ppcompare
from PIL import Image, ImageChops, ImageDraw
from IPython.display import display, clear_output


def ppdeep_diff(ppdeep1, ppdeep2):
    """Compare two ppdeep fuzzy hashes, or return None when one is missing.

    Args:
        ppdeep1: First fuzzy hash; any falsy value (None, "") counts as missing.
        ppdeep2: Second fuzzy hash; same convention.

    Returns:
        Whatever ``ppdeep.compare`` (imported as ``ppcompare``) returns for the
        pair, or None if either hash is missing.
    """
    # Guard first: ppcompare is only called with two non-empty values.
    if not ppdeep1 or not ppdeep2:
        return None
    return ppcompare(ppdeep1, ppdeep2)

def sha256_match(hash1, hash2):
    """Tell whether two SHA256 digests are equal.

    Returns:
        True/False for the equality of the two values, or None when either
        value is falsy (None or an empty string), i.e. nothing to compare.
    """
    return hash1 == hash2 if hash1 and hash2 else None

def ssl_match(fprint1, fprint2):
    """Tell whether two SSL certificate fingerprints are equal.

    Returns:
        None when either fingerprint is falsy (None or empty), otherwise the
        boolean result of comparing the two values with ``==``.
    """
    bothPresent = fprint1 and fprint2
    if not bothPresent:
        return None
    return fprint1 == fprint2

def url_match(url1, url2):
    """Compare two URLs, ignoring a single trailing path separator.

    The previous implementation only removed a trailing backslash, so a
    requested URL such as ``https://example.com`` never matched the same URL
    written with a trailing ``/``. One trailing ``/`` is now removed before
    comparing; the trailing-backslash case is kept for backward compatibility.

    Args:
        url1: First URL string; a falsy value (None, "") counts as missing.
        url2: Second URL string; same convention.

    Returns:
        True/False for the normalized comparison, or None if either URL is
        missing.
    """
    if not (url1 and url2):
        return None

    trailingSeparators = ("/", "\\")
    # Only one character is removed, mirroring the original single-strip logic.
    if url1.endswith(trailingSeparators):
        url1 = url1[:-1]
    if url2.endswith(trailingSeparators):
        url2 = url2[:-1]
    return url1 == url2

def pil_load_image(imageb64):
    """Turn a base64-encoded image into a PIL image, with a placeholder fallback.

    Args:
        imageb64: Base64 payload accepted by ``base64.b64decode``.

    Returns:
        The image opened from the decoded bytes, or, when decoding yields no
        bytes, a white 1500x3000 RGB image with "NO IMAGE" drawn in black.
    """
    rawBytes = base64.b64decode(imageb64)
    if rawBytes:
        return Image.open(BytesIO(rawBytes))

    # Empty payload: build a white canvas and label it so the viewer still
    # has something to show for this entry.
    placeholder = Image.new('RGB', (1500, 3000), (0xff, 0xff, 0xff))
    ImageDraw.Draw(placeholder).text(
        (100, 100),
        "NO IMAGE",
        (0x00, 0x00, 0x00),
        font_size=100
    )
    return placeholder
    
def pil_imgdiff(pilimage1, pilimage2):
    """Compute a visual difference image and the mean squared error of two images.

    Args:
        pilimage1: First image (anything ``ImageChops.difference`` and
            ``np.array`` accept).
        pilimage2: Second image, same requirements.

    Returns:
        Tuple ``(diff, mse)``: ``diff`` is the result of
        ``ImageChops.difference`` and ``mse`` is the mean of the squared
        per-element differences as a NumPy float.
    """
    diff = ImageChops.difference(pilimage1, pilimage2)
    arr1 = np.array(pilimage1)
    arr2 = np.array(pilimage2)
    # Subtract in float64. With unsigned 8-bit pixel arrays, `arr1 - arr2`
    # and the following square wrap around modulo 256, which silently
    # corrupted the error (e.g. a pixel difference of 20 yielded 144, not 400).
    delta = np.subtract(arr1, arr2, dtype=np.float64)
    mse = np.mean(np.square(delta))
    return diff, mse

def get_snapshot_files(path, fileStartPattern):
    """List the ``.json`` files in a folder whose names start with a prefix.

    Args:
        path: Folder to scan (non-recursive).
        fileStartPattern: Required file-name prefix, e.g. ``"report_"``.

    Returns:
        List of ``path`` + "/" + file name strings (no extra "/" is added when
        ``path`` already ends with one), in the order ``os.listdir`` yields them.
    """
    matchingNames = [
        name
        for name in map(os.fsdecode, os.listdir(os.fsencode(path)))
        if name.startswith(fileStartPattern) and name.endswith(".json")
    ]
    if not matchingNames:
        return []

    # The folder prefix is only worked out once a match exists.
    folderPrefix = path if path[-1] == "/" else path + "/"
    return [folderPrefix + name for name in matchingNames]

def load_snapshot(snapshotFolderPath):
    """Load every ``report_*.json`` file of a snapshot folder into one DataFrame.

    For each file, ``task_data`` becomes a DataFrame and the file's
    ``task_meta.worker_id`` is inserted as the first column (``worker_id``).

    Args:
        snapshotFolderPath: Folder passed to ``get_snapshot_files``.

    Returns:
        The per-file frames concatenated row-wise (original per-file indexes
        are kept), or an empty DataFrame when the folder has no report files.
    """
    frames = []
    for file in get_snapshot_files(snapshotFolderPath, "report_"):
        # Read explicitly as UTF-8 instead of the platform default encoding.
        with open(file, "r", encoding="utf-8") as jsonFile:
            jsonData = loads(jsonFile.read())
        jsonDf = pnd.DataFrame(jsonData["task_data"])
        jsonDf.insert(0, "worker_id", jsonData["task_meta"]["worker_id"])
        frames.append(jsonDf)

    # pandas.concat rejects an empty list, so keep the empty-frame result.
    if not frames:
        return pnd.DataFrame()
    # One concat at the end avoids re-copying all accumulated rows per file.
    return pnd.concat(frames)

def get_snapshot_urls(snapshotDf, series="url"):
    """Return the distinct values of the URL column as a list.

    Args:
        snapshotDf: DataFrame holding the snapshot entries.
        series: Name of the column to read (defaults to ``"url"``).

    Returns:
        List of the column's unique values, in order of first appearance.
    """
    return list(snapshotDf[series].unique())

def get_snapshot_vantages(snapshotDf, series="worker_id"):
    """Return the distinct collector (vantage) identifiers as a list.

    Args:
        snapshotDf: DataFrame holding the snapshot entries.
        series: Name of the column to read (defaults to ``"worker_id"``).

    Returns:
        List of the column's unique values, in order of first appearance.
    """
    return [vantage for vantage in snapshotDf[series].unique()]

In [2]:
# Load the snapshot shown in the first (left) panel.
snapshotFolderPath1 = "snapshots/topdomains"
snapshotDf1 = load_snapshot(snapshotFolderPath1)

# Remove entries with an empty visited URL, content or image - for visual inspection only!
# For future fully automatic data analysis, retain all the data in the dataframe.
# NOTE(review): the filtered frame is stored as `SnapshotDf1` (capital "S"), so
# `snapshotDf1` below is still the unfiltered frame. Confirm whether this is
# intentional or a typo for `snapshotDf1`.
SnapshotDf1 = snapshotDf1.query('curl != "" & http_content != "" & http_image != ""')

# URL and collector lists are derived from the unfiltered frame.
urlList1 = get_snapshot_urls(snapshotDf1)
vantageList1 = get_snapshot_vantages(snapshotDf1)

# Template for loading a different snapshot into the second panel (disabled).
#snapshotFolderPath2 = "snapshots/topdomains"
#snapshotDf2 = load_snapshot(snapshotFolderPath2)

# remove empty data entries - for visual inspection only!
# For future full automatic data analysis retain all the data in the dataframe
#SnapshotDf2 = snapshotDf2.query('curl != "" & http_content != "" & http_image != ""')

#urlList2 = get_snapshot_urls(snapshotDf2)
#vantageList2 = get_snapshot_vantages(snapshotDf2)


# The second panel currently reuses the first snapshot.
# NOTE(review): panel 2 gets the *filtered* frame but the URL/collector lists of
# the *unfiltered* one, so some (URL, collector) selections may have no row in
# `snapshotDf2` - verify this is the intended pairing.
snapshotDf2 = SnapshotDf1
urlList2 = urlList1
vantageList2 = vantageList1

In [3]:
# pil_load_image(snapshotDf2.query('worker_id == "3f70e31e-4f0a-11ee-8fd5-4200a9fe0102:australia-southeast1:IPNET:001testing:002" & url == "https://jpeg.ly/dPNnq"')["http_image"].item())

In [4]:
def compare_images(button_click):
    """Button callback: diff the two selected screenshots and show match metrics.

    Looks up the entry selected in each panel (URL dropdown + collector
    slider), displays the scaled-down difference image in ``output3`` and
    writes the SHA256 match, ppdeep similarity, image mean squared error and
    requested-vs-visited URL match of both entries to ``metadata3``.

    Args:
        button_click: The clicked button widget (unused).
    """
    snapshotEntry1 = snapshotDf1[(snapshotDf1["url"] == urlLayer1.value) & (snapshotDf1["worker_id"] == vantageList1[agentSlider1.value])]
    snapshotEntry2 = snapshotDf2[(snapshotDf2["url"] == urlLayer2.value) & (snapshotDf2["worker_id"] == vantageList2[agentSlider2.value])]

    # Series.item() raises ValueError unless exactly one row was selected.
    # Report that case in the results box instead of failing inside the
    # click handler.
    if len(snapshotEntry1) != 1 or len(snapshotEntry2) != 1:
        with output3:
            output3.clear_output()
        metadata3.value = (f"Cannot compare: expected exactly one snapshot per panel, "
                           f"found {len(snapshotEntry1)} and {len(snapshotEntry2)}")
        return

    diffImage, mse = pil_imgdiff(
        pil_load_image(snapshotEntry1["http_image"].item()),
        pil_load_image(snapshotEntry2["http_image"].item()),
    )
    with output3:
        output3.clear_output()
        # Scale the difference image down by IMAGE_RATIO for display.
        display(diffImage.resize((
            int(diffImage.width // IMAGE_RATIO),
            int(diffImage.height // IMAGE_RATIO)
            )))
        ppdeep1 = snapshotEntry1['fuzzyhash'].item()
        ppdeep2 = snapshotEntry2['fuzzyhash'].item()
        sha256_1 = snapshotEntry1['sha256'].item()
        sha256_2 = snapshotEntry2['sha256'].item()
        metadata3.value = (f"Content SHA256 match: {sha256_match(sha256_1, sha256_2)}\n"
                           f"Content Ppdeep similarity: {ppdeep_diff(ppdeep1, ppdeep2)}\n"
                           f"Image Mean squared error: {round(mse, 4)}\n"
                           f"URL match: {url_match(snapshotEntry1['url'].item(), snapshotEntry1['curl'].item())}"
                           f" | {url_match(snapshotEntry2['url'].item(), snapshotEntry2['curl'].item())}"
                          )

def _show_snapshot(snapshotDf, url, vantage, output, metadata):
    """Render one panel: the screenshot into ``output``, its details into ``metadata``.

    Args:
        snapshotDf: DataFrame to look the entry up in.
        url: Value matched against the ``url`` column.
        vantage: Value matched against the ``worker_id`` column.
        output: Output widget that receives the scaled-down screenshot.
        metadata: Text widget whose ``value`` receives the entry details.
    """
    with output:
        output.clear_output()
        snapshotEntry = snapshotDf[(snapshotDf["url"] == url) & (snapshotDf["worker_id"] == vantage)]
        # Series.item() raises ValueError unless exactly one row matched;
        # tell the user instead of failing on the first field access.
        if len(snapshotEntry) != 1:
            metadata.value = (f"No unique snapshot entry for this URL / collector "
                              f"({len(snapshotEntry)} found)")
            return
        snapshotImage = pil_load_image(snapshotEntry["http_image"].item())
        # Scale the screenshot down by IMAGE_RATIO so the panels fit side by side.
        display(snapshotImage.resize((
            int(snapshotImage.width // IMAGE_RATIO),
            int(snapshotImage.height // IMAGE_RATIO)
            )))
        # Collection time is the difference of the two time_frame elements.
        timeFrame = snapshotEntry["time_frame"].item()
        snapTime = timeFrame[1] - timeFrame[0]
        metadata.value = (f"CollectorID: {snapshotEntry['worker_id'].item()}\n"
                          f"URL: {snapshotEntry['url'].item()}\n"
                          f"Visited URL: {snapshotEntry['curl'].item()}\n"
                          f"Collection time: {round(snapTime, 2)}s\n"
                          f"UserAgent: {snapshotEntry['useragent'].item()}\n"
                          f"SSL fingerprint: {snapshotEntry['ssl_fingerprint'].item()}\n"
                          f"SHA256: {snapshotEntry['sha256'].item()}\n"
                          f"Ppdeep: {snapshotEntry['fuzzyhash'].item()}"
                         )


def update_output1(change):
    """Refresh the first panel from its current URL dropdown and collector slider.

    Args:
        change: Change dictionary passed by the widget observer (unused; the
            current widget values are read directly).
    """
    _show_snapshot(snapshotDf1, urlLayer1.value, vantageList1[agentSlider1.value], output1, metadata1)


def update_output2(change):
    """Refresh the second panel from its current URL dropdown and collector slider.

    Args:
        change: Change dictionary passed by the widget observer (unused; the
            current widget values are read directly).
    """
    _show_snapshot(snapshotDf2, urlLayer2.value, vantageList2[agentSlider2.value], output2, metadata2)

def update_layer(change):
    """Observer for the URL dropdowns: mirror the new URL to both panels.

    Does nothing unless the "Lock URLs" checkbox (``urlLock``) is ticked.

    Args:
        change: Change dictionary from the widget observer; ``change['new']``
            is the newly selected URL.
    """
    if not urlLock.value:
        return
    for urlLayer in (urlLayer1, urlLayer2):
        urlLayer.value = change['new']
            
def reload_images(button_click):
    """Button callback: redraw both panels and reset the comparison area.

    Each panel's update function is called with a change dictionary carrying
    the panel's current slider value; afterwards ``output3`` is cleared and
    ``metadata3`` is emptied.

    Args:
        button_click: The clicked button widget (unused).
    """
    panels = (
        (output1, agentSlider1, update_output1),
        (output2, agentSlider2, update_output2),
    )
    for panelOutput, slider, refresh in panels:
        with panelOutput:
            refresh({'new': slider.value})

    # Any previous comparison no longer matches what is on screen.
    with output3:
        output3.clear_output()
        metadata3.value = ""


# Divisor applied to image width/height before display in the callbacks above.
IMAGE_RATIO = 3.5

# URL selectors for the two panels, plus the checkbox that keeps them in sync
# (see update_layer). Each dropdown starts on the first URL of its list.
urlLayer1 = widgets.Dropdown(options=urlList1, value=urlList1[0], description="url", disabled=False)
urlLayer2 = widgets.Dropdown(options=urlList2, value=urlList2[0], description="url", disabled=False)
urlLock = widgets.Checkbox(value=True, description="Lock URLs", disabled=False, indent=False)

# Sliders select a collector by its index in the corresponding vantage list.
agentSlider1 = widgets.IntSlider(min=0, max=len(vantageList1) - 1, value=0)
agentSlider2 = widgets.IntSlider(min=0, max=len(vantageList2) - 1, value=0)

reloadButton1 = widgets.Button(description="Reload images")
compareButton2 = widgets.Button(description="Compare images")

# One image area (Output) and one text box per panel; the third pair holds the
# comparison result.
textboxSize = widgets.Layout(flex='0 1 auto', width='450px')
output1 = widgets.Output()
metadata1 = widgets.Textarea(placeholder='Vantage metadata', layout=textboxSize)
output2 = widgets.Output()
metadata2 = widgets.Textarea(placeholder='Vantage metadata', layout=textboxSize)
output3 = widgets.Output()
metadata3 = widgets.Textarea(placeholder='Comparison results', layout=textboxSize)

# Wire the callbacks. Moving a slider redraws its panel; changing a URL
# dropdown only calls update_layer, so the images are redrawn on the next
# slider move or "Reload images" click.
agentSlider1.observe(update_output1, names='value')
agentSlider2.observe(update_output2, names='value')
urlLayer1.observe(update_layer, names='value')
urlLayer2.observe(update_layer, names='value')
reloadButton1.on_click(reload_images)
compareButton2.on_click(compare_images)

# Layout: panel 1 | panel 2 | controls + comparison result, side by side.
vboxWidth = "455px"
vbox1 = widgets.VBox([urlLayer1, agentSlider1, output1, metadata1], layout=widgets.Layout(width=vboxWidth))
vbox2 = widgets.VBox([urlLayer2, agentSlider2, output2, metadata2], layout=widgets.Layout(width=vboxWidth))
vbox31 = widgets.VBox([urlLock])
vbox32 = widgets.VBox([reloadButton1, compareButton2])
vbox3 = widgets.HBox([vbox31, vbox32])
vbox4 = widgets.VBox([vbox3, output3, metadata3], layout=widgets.Layout(width=vboxWidth))
layout = widgets.HBox([vbox1, vbox2, vbox4])
display(layout)

HBox(children=(VBox(children=(Dropdown(description='url', options=('https://1rx.io', 'https://2mdn.net', 'http…