In [16]:
from utils.image_shingle import ImageShingle
import os
import pandas as pd
import json

In [17]:
# Name of the crawl to analyse; it selects which folder under crawls/ is read.
CRAWL_NAME = 'clickstream'
# Root folder of that crawl (the trailing slash is part of the value).
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

In [18]:
def get_directories(root: str) -> list[str]:
    """
    List the immediate subdirectories of a directory.

    Args:
        root: Path of the directory to scan.

    Returns:
        The path (``root`` joined with the entry name) of every entry
        directly under ``root`` that is itself a directory, in the order
        reported by ``os.listdir``. Regular files are left out and nested
        directories are not descended into.
    """
    candidates = (os.path.join(root, entry) for entry in os.listdir(root))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

# Sanity check: how many directories sit directly under CRAWL_PATH.
print(len(get_directories(CRAWL_PATH)))

102


In [19]:
def compare_clickstreams(baseline: str, comparison: str, name: str) -> pd.DataFrame:
    """
    Score how much two sets of screenshots differ for every crawled website.

    Every entry of the crawl's ``results.json`` is treated as a website
    directory whose subdirectories are clickstreams. Inside a clickstream,
    screenshots ``{baseline}-{i}.png`` and ``{comparison}-{i}.png`` are
    compared for ``i = 0, 1, ...`` (at most 10) until one file of a pair is
    missing. The ``ImageShingle.compare`` scores are averaged per clickstream,
    the clickstream means are averaged per website, and the reported
    difference is one minus that website mean.

    Args:
        baseline: Filename prefix of the reference screenshots.
        comparison: Filename prefix of the screenshots compared against them.
        name: Suffix appended to the names of the output columns.

    Returns:
        A DataFrame with one row per website that had at least one comparable
        screenshot pair, with columns ``website``, ``difference_{name}`` and
        ``sample_size_{name}`` (the number of screenshot pairs compared).
    """
    shingle_chunk = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf

    with open(f"crawls/{CRAWL_NAME}/results.json") as results_file:
        # Every entry is used; filtering on results[path]["unknown_exception"]
        # is currently disabled.
        site_paths = list(json.load(results_file))

    records = []
    for site_path in site_paths:
        pairs_compared = 0
        streams_scored = 0
        site_total = 0

        for stream_dir in get_directories(site_path):
            steps_done = 0
            stream_total = 0

            for step in range(10):
                reference_png = f"{stream_dir}/{baseline}-{step}.png"
                candidate_png = f"{stream_dir}/{comparison}-{step}.png"

                # Stop at the first step for which either screenshot is missing.
                if not (os.path.isfile(reference_png) and os.path.isfile(candidate_png)):
                    break

                reference = ImageShingle(reference_png, shingle_chunk)
                candidate = ImageShingle(candidate_png, shingle_chunk)
                stream_total += reference.compare(candidate)
                steps_done += 1

            pairs_compared += steps_done

            # Clickstreams without a single comparable pair do not count
            # towards the website average.
            if steps_done == 0:
                continue
            site_total += stream_total / steps_done
            streams_scored += 1

        if streams_scored == 0:
            continue

        mean_similarity = site_total / streams_scored
        records.append({
            "website": os.path.basename(os.path.normpath(site_path)),
            f"difference_{name}": 1 - mean_similarity,
            f"sample_size_{name}": pairs_compared,
        })

    return pd.DataFrame(records)

In [20]:
# Control comparison: "baseline" screenshots vs. "all_cookies" screenshots.
# NOTE(review): the variable names look swapped relative to the crawl each one
# compares against (`no_cookies` holds the all_cookies comparison and vice
# versa) — confirm before renaming, since later cells use these names.
no_cookies = compare_clickstreams("baseline", "all_cookies", "control_group")
# Treatment comparison: "baseline" screenshots vs. "no_cookies" screenshots.
all_cookies = compare_clickstreams("baseline", "no_cookies", "remove_cookies")

In [21]:
# Inner join on "website": only sites that produced a row in both comparisons are kept.
merged_df = no_cookies.merge(all_cookies, on=["website"], how="inner")

In [28]:
def compare_with_control(baseline: str, control: str, experimental: str) -> pd.DataFrame:
    """
    Score the experimental screenshots against the baseline, using a control.

    Every entry of the crawl's ``results.json`` is treated as a website
    directory whose subdirectories are clickstreams. Inside a clickstream,
    the screenshots ``{baseline}-{i}.png``, ``{control}-{i}.png`` and
    ``{experimental}-{i}.png`` are scored together with
    ``ImageShingle.compare_with_control`` for ``i = 0, 1, ...`` (at most 10)
    until one of the three files is missing. Scores are averaged per
    clickstream, the clickstream means are averaged per website, and the
    reported value is one minus that website mean.

    A comparison that raises is reported on stdout and skipped: it is counted
    neither in the averages nor in the sample size, so a failure is no longer
    averaged in as a score of 0.

    Args:
        baseline: Filename prefix of the reference screenshots.
        control: Filename prefix of the control screenshots.
        experimental: Filename prefix of the experimental screenshots.

    Returns:
        A DataFrame with one row per website that had at least one successful
        comparison, with columns ``website``, ``compare_with_control`` and
        ``sample_size`` (the number of successful comparisons).
    """
    CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
    MAX_ACTIONS = 10  # Upper bound on the screenshots examined per clickstream.

    with open(f"crawls/{CRAWL_NAME}/results.json") as log_file:
        # Every entry is used; filtering on results[path]["unknown_exception"]
        # is currently disabled.
        paths = list(json.load(log_file))

    rows_list = []
    for path in paths:
        sample_size = 0
        total_clickstreams = 0
        website_sum = 0

        for clickstream in get_directories(path):
            # The file index (`action`) is tracked separately from the number
            # of successful comparisons so that a failure can be skipped
            # without losing our place in the screenshot sequence.
            compared_actions = 0
            clickstream_sum = 0

            for action in range(MAX_ACTIONS):
                baseline_path = f"{clickstream}/{baseline}-{action}.png"
                control_path = f"{clickstream}/{control}-{action}.png"
                experimental_path = f"{clickstream}/{experimental}-{action}.png"

                # Stop at the first action for which any screenshot is missing.
                if not (
                    os.path.isfile(baseline_path)
                    and os.path.isfile(control_path)
                    and os.path.isfile(experimental_path)
                ):
                    break

                baseline_shingle = ImageShingle(baseline_path, CHUNK_SIZE)
                control_shingle = ImageShingle(control_path, CHUNK_SIZE)
                experimental_shingle = ImageShingle(experimental_path, CHUNK_SIZE)

                try:
                    score = ImageShingle.compare_with_control(
                        baseline_shingle, control_shingle, experimental_shingle
                    )
                except Exception as exc:
                    # Best effort: report the failing screenshot and move on,
                    # but do not let the failure count as a score of 0.
                    print(f"{baseline_path}: comparison failed ({exc!r})")
                    continue

                clickstream_sum += score
                compared_actions += 1

            sample_size += compared_actions

            if compared_actions != 0:
                website_sum += clickstream_sum / compared_actions
                total_clickstreams += 1

        if total_clickstreams != 0:
            website_similarity = website_sum / total_clickstreams
            website_difference = 1 - website_similarity

            website = os.path.basename(os.path.normpath(path))
            rows_list.append({
                "website": website,
                "compare_with_control": website_difference,
                "sample_size": sample_size,
            })

    return pd.DataFrame(rows_list)

In [29]:
# Score the "no_cookies" screenshots against "baseline", with "all_cookies" as the control.
# Each line printed while this runs identifies a baseline screenshot whose comparison raised.
# NOTE(review): `df` is not merged into `merged_df` in the cells visible here — confirm whether that is intended.
df = compare_with_control("baseline", "all_cookies", "no_cookies")

crawls/clickstream/freepik.com/2/baseline-2.png
crawls/clickstream/freepik.com/3/baseline-1.png
crawls/clickstream/springer.com/2/baseline-3.png
crawls/clickstream/abc.net.au/3/baseline-1.png
crawls/clickstream/bmj.com/0/baseline-1.png
crawls/clickstream/bmj.com/5/baseline-1.png
crawls/clickstream/mlb.com/0/baseline-1.png
crawls/clickstream/mlb.com/5/baseline-1.png
crawls/clickstream/mlb.com/6/baseline-1.png
crawls/clickstream/mlb.com/7/baseline-0.png
crawls/clickstream/mlb.com/8/baseline-0.png
crawls/clickstream/mlb.com/8/baseline-1.png
crawls/clickstream/slate.com/4/baseline-1.png
crawls/clickstream/ey.com/7/baseline-0.png
crawls/clickstream/ey.com/8/baseline-0.png
crawls/clickstream/discogs.com/9/baseline-4.png
crawls/clickstream/discogs.com/9/baseline-5.png
crawls/clickstream/seekingalpha.com/0/baseline-0.png
crawls/clickstream/seekingalpha.com/4/baseline-0.png
crawls/clickstream/allrecipes.com/1/baseline-1.png
crawls/clickstream/allrecipes.com/7/baseline-3.png
crawls/clickstream/a

In [None]:
# merged_df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)

In [None]:
# Display the merged per-website comparison table.
merged_df

Unnamed: 0,website,difference_control_group,sample_size_control_group,difference_remove_cookies,sample_size_remove_cookies
0,mail.ru,0.292826,84,0.490741,34
1,zoom.us,0.122629,66,0.139063,63
2,digicert.com,0.047083,92,0.073212,90
3,stackoverflow.com,0.115669,33,0.115963,33
4,teamviewer.com,0.199566,94,0.202908,94
5,amazon.co.uk,0.411267,29,0.214583,14
6,hp.com,0.29273,65,0.252909,54
7,aliyun.com,0.185793,71,0.188948,81
8,rackspace.com,0.312184,69,0.365078,71
9,freepik.com,0.290662,68,0.308839,71
