In [2]:
from image_shingle import ImageShingle
import os
import pandas as pd

In [3]:
CRAWL_PATH = 'crawls/sep10-clickstream/'

In [4]:
def get_directories(root: str) -> list[str]:
    """
    Return the immediate subdirectories of a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of paths (each entry name joined onto ``root``) for the entries
        directly under ``root`` that are directories, sorted by entry name.
        Non-directory entries are skipped and nested directories are not
        descended into.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``root`` cannot be listed
            (for example, it does not exist or is not a directory).
    """
    # os.listdir yields entries in arbitrary order; sorting keeps the result
    # (and anything built from it) reproducible across runs and machines.
    candidates = (os.path.join(root, name) for name in sorted(os.listdir(root)))
    return [path for path in candidates if os.path.isdir(path)]

In [9]:
# Build one summary row per website directory under CRAWL_PATH.
#
# For every clickstream directory of a website, consecutive screenshot pairs
# (all_cookies-<i>.png / no_cookies-<i>.png, i = 0, 1, ...) are compared with
# ImageShingle. The per-clickstream mean of those comparison scores is then
# averaged over the clickstreams that had at least one pair.

CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
MAX_ACTIONS = 10  # Upper bound on the number of screenshot pairs examined per clickstream.

rows_list = []

for path in get_directories(CRAWL_PATH):
    clickstreams = get_directories(path)

    sample_size = 0  # Total screenshot pairs compared for this website.

    total_clickstreams = 0  # Clickstreams that contributed at least one pair.
    website_sum = 0  # Sum of per-clickstream mean scores.

    for clickstream in clickstreams:

        total_actions = 0  # Pairs compared so far; doubles as the next screenshot index.
        clickstream_sum = 0

        for _ in range(MAX_ACTIONS):
            all_cookies_path = f"{clickstream}/all_cookies-{total_actions}.png"
            no_cookies_path = f"{clickstream}/no_cookies-{total_actions}.png"

            # Screenshots are read in index order; stop at the first index
            # for which either image of the pair is missing.
            if not (os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path)):
                break

            all_cookies_shingle = ImageShingle(all_cookies_path, CHUNK_SIZE)
            no_cookies_shingle = ImageShingle(no_cookies_path, CHUNK_SIZE)

            # NOTE(review): compare() is treated as returning a numeric
            # similarity score; confirm its range in image_shingle.
            clickstream_sum += all_cookies_shingle.compare(no_cookies_shingle)
            total_actions += 1

        sample_size += total_actions

        # Clickstreams without any complete pair are left out of the average.
        if total_actions != 0:
            clickstream_similarity = clickstream_sum / total_actions
            website_sum += clickstream_similarity

            total_clickstreams += 1

    # Websites with no usable clickstream produce no row at all.
    if total_clickstreams != 0:
        website_similarity = website_sum / total_clickstreams

        # normpath drops any trailing separator so basename yields the directory name.
        website = os.path.basename(os.path.normpath(path))
        rows_list.append({
            "website": website,
            "website_similarity": website_similarity,
            "sample_size": sample_size
        })

# Explicit columns keep the frame's schema stable even when rows_list is empty,
# so later column-based operations (e.g. sorting by sample_size) do not raise KeyError.
df = pd.DataFrame(rows_list, columns=["website", "website_similarity", "sample_size"])
print(df)

In [8]:
# Show the per-website summary ordered by number of compared screenshot pairs, largest first.
df.sort_values(by="sample_size", ascending=False)

Unnamed: 0,website,website_similarity,sample_size
25,crawls/sep9-clickstream/doubleverify.com,0.940222,100
10,crawls/sep9-clickstream/springer.com,0.867038,83
22,crawls/sep9-clickstream/docker.com,0.673868,82
24,crawls/sep9-clickstream/plos.org,0.236332,76
2,crawls/sep9-clickstream/europa.eu,0.898231,74
26,crawls/sep9-clickstream/bmj.com,0.559318,62
11,crawls/sep9-clickstream/nature.com,0.751446,62
21,crawls/sep9-clickstream/rt.com,0.672011,58
7,crawls/sep9-clickstream/aliyun.com,0.741929,54
6,crawls/sep9-clickstream/hp.com,0.767481,44
