In [26]:
from utils.image_shingle import ImageShingle
import os
import pandas as pd

In [27]:
# Name of the crawl to analyse; it is also used as the output CSV's file name.
CRAWL_NAME = 'clickstream-experiment'
# Relative path of the directory that is scanned for per-website sub-directories.
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

In [28]:
def get_directories(root: str) -> list[str]:
    """
    List the sub-directories found directly inside a directory.

    Args:
        root: Path of the directory to inspect.

    Returns:
        The paths (``root`` joined with the entry name) of every entry of
        ``root`` that is a directory, in the order ``os.listdir`` yields them.
    """
    # Build the joined path once per entry, then keep only the directories.
    candidates = (os.path.join(root, entry) for entry in os.listdir(root))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

# Sanity check: number of directories found directly under CRAWL_PATH.
print(len(get_directories(CRAWL_PATH)))

1


In [29]:
def compare_clickstreams(
    baseline: str,
    comparison: str,
    name: str,
    chunk_size: int = 40,
    max_actions: int = 10,
) -> pd.DataFrame:
    """
    Compare two screenshot series for every website in the crawl.

    Every directory directly under ``CRAWL_PATH`` is treated as one website and
    every directory inside it as one clickstream. Within a clickstream the
    screenshots ``{baseline}-{i}.png`` and ``{comparison}-{i}.png`` are compared
    pairwise for ``i = 0, 1, ...``, stopping at the first index for which either
    file is missing, or after ``max_actions`` pairs.

    The scores returned by ``ImageShingle.compare`` are averaged per
    clickstream, the clickstream averages are averaged per website, and the
    website's difference is reported as ``1 - average``.
    NOTE(review): this treats the score as a similarity, presumably in
    [0, 1] -- confirm against ``utils.image_shingle``.

    Args:
        baseline: File-name prefix of the reference screenshots.
        comparison: File-name prefix of the screenshots compared against the
            baseline.
        name: Suffix appended to the ``difference_`` and ``sample_size_``
            column names of the result.
        chunk_size: Chunk size passed to ``ImageShingle``. The default of 40 is
            the value recommended by
            https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
        max_actions: Maximum number of screenshot pairs compared per
            clickstream.

    Returns:
        A DataFrame with one row per website that had at least one comparable
        screenshot pair, and the columns ``website``, ``difference_{name}`` and
        ``sample_size_{name}`` (total number of compared pairs for the
        website). The columns are present even when no website qualifies, so
        the result can always be merged on ``website``.
    """
    columns = ["website", f"difference_{name}", f"sample_size_{name}"]
    rows_list = []

    for path in get_directories(CRAWL_PATH):
        sample_size = 0
        total_clickstreams = 0
        website_sum = 0

        for clickstream in get_directories(path):
            total_actions = 0
            clickstream_sum = 0

            for action in range(max_actions):
                baseline_path = f"{clickstream}/{baseline}-{action}.png"
                comparison_path = f"{clickstream}/{comparison}-{action}.png"

                # Screenshots are numbered consecutively; the first missing
                # pair marks the end of the usable part of this clickstream.
                if not (os.path.isfile(baseline_path) and os.path.isfile(comparison_path)):
                    break

                baseline_shingle = ImageShingle(baseline_path, chunk_size)
                comparison_shingle = ImageShingle(comparison_path, chunk_size)

                clickstream_sum += baseline_shingle.compare(comparison_shingle)
                total_actions += 1

            sample_size += total_actions

            # Clickstreams without any comparable pair do not contribute to
            # the website average (and would otherwise divide by zero).
            if total_actions != 0:
                website_sum += clickstream_sum / total_actions
                total_clickstreams += 1

        # Websites without any usable clickstream are left out of the result.
        if total_clickstreams != 0:
            website_similarity = website_sum / total_clickstreams

            rows_list.append({
                "website": os.path.basename(os.path.normpath(path)),
                f"difference_{name}": 1 - website_similarity,
                f"sample_size_{name}": sample_size,
            })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows_list, columns=columns)

In [30]:
# Control group: baseline screenshots vs. the "all_cookies" series.
# NOTE(review): the variable names look swapped relative to the series being
# compared (`no_cookies` holds the all_cookies comparison and vice versa). The
# column suffixes come from the third argument, so the output labels are
# unaffected -- confirm and rename together with the merge that uses them.
no_cookies = compare_clickstreams("baseline", "all_cookies", "control_group")
# Second comparison: baseline screenshots vs. the "no_cookies" series.
all_cookies = compare_clickstreams("baseline", "no_cookies", "remove_cookies")

In [31]:
# Inner join on "website": only sites that appear in both results are kept.
merged_df = no_cookies.merge(all_cookies, on=["website"], how="inner")

In [32]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it already does).
os.makedirs('analysis', exist_ok=True)
merged_df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)

In [33]:
# Display the merged per-website results.
merged_df

Unnamed: 0,website,difference_control_group,sample_size_control_group,difference_remove_cookies,sample_size_remove_cookies
0,mail.ru,0.272619,33,0.558185,9
