In [1]:
from image_shingle import ImageShingle
import os

In [2]:
# Root directory of the clickstream crawl; the analysis below walks its
# immediate subdirectories, treating each one as a site.
CRAWL_PATH = "crawls/clickstream/"

In [3]:
def get_directories(root: str) -> list[str]:
    """
    Return the immediate subdirectories of a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of paths (``root`` joined with each entry name) for every
        entry of ``root`` that is a directory. Non-directory entries are
        skipped. The list is sorted by entry name so that callers iterate
        in the same order on every run.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``root`` cannot be
            listed (for example, when it does not exist).
    """
    # os.listdir yields entries in arbitrary order; sorting keeps downstream
    # results (and any dict built from them) reproducible across runs.
    candidates = (os.path.join(root, name) for name in sorted(os.listdir(root)))
    return [path for path in candidates if os.path.isdir(path)]

In [4]:
# For every site under CRAWL_PATH, compare the "all cookies" screenshot with
# the "no cookies" screenshot of each action in each clickstream, average the
# comparison scores per clickstream, then average those per site.

# Chunk size handed to ImageShingle for both screenshots of a pair.
CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
# Upper bound on the number of screenshot pairs examined per clickstream.
MAX_ACTIONS = 10

# Maps site directory path -> mean of its per-clickstream mean scores.
similarity_score = {}

for site in get_directories(CRAWL_PATH):
    clickstreams = get_directories(site)

    total_clickstreams = 0
    website_sum = 0

    for clickstream in clickstreams:

        total_actions = 0
        clickstream_sum = 0

        # Screenshot pairs are numbered consecutively from 0.
        for action in range(MAX_ACTIONS):
            all_cookies_path = f"{clickstream}/all_cookies-{action}.png"
            no_cookies_path = f"{clickstream}/no_cookies-{action}.png"

            # Stop at the first action that lacks either screenshot; later
            # indices are not probed.
            if not (os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path)):
                break

            all_cookies_shingle = ImageShingle(all_cookies_path, CHUNK_SIZE)
            no_cookies_shingle = ImageShingle(no_cookies_path, CHUNK_SIZE)

            clickstream_sum += all_cookies_shingle.compare(no_cookies_shingle)
            total_actions += 1

        # Clickstreams without any complete screenshot pair are ignored so
        # they neither divide by zero nor drag down the site average.
        if total_actions != 0:
            clickstream_similarity = clickstream_sum / total_actions
            website_sum += clickstream_similarity

            total_clickstreams += 1

    # Sites with no usable clickstream get no entry at all.
    if total_clickstreams != 0:
        website_similarity = website_sum / total_clickstreams
        similarity_score[site] = website_similarity

print(similarity_score)

{'crawls/clickstream/maxwellmlin.com': 0.9997619047619046, 'crawls/clickstream/mail.ru': 0.35129676870748305, 'crawls/clickstream/zoom.us': 0.6483843537414966, 'crawls/clickstream/europa.eu': 0.8113095238095238}
