In [4]:
from utils.image_shingle import ImageShingle
import os
import pandas as pd

In [5]:
# Name of the crawl to analyse; it is also used as the file name of the CSV written later.
CRAWL_NAME = 'clickstream-experiment'
# Root directory of that crawl; its immediate subdirectories are iterated by compare_clickstreams.
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

In [6]:
def get_directories(root: str) -> list[str]:
    """
    Return the immediate subdirectories of a given root directory.

    Only direct children are considered (no recursion), and regular files
    are skipped. The result is sorted by entry name so that callers iterate
    in a reproducible order; ``os.listdir`` on its own returns entries in
    arbitrary, filesystem-dependent order.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directory paths, each formed by joining ``root`` with the
        entry name, sorted by entry name.

    Raises:
        OSError: If ``root`` cannot be listed (e.g. it does not exist).
    """
    candidates = (os.path.join(root, entry) for entry in sorted(os.listdir(root)))
    return [path for path in candidates if os.path.isdir(path)]

# Sanity check: print how many directories sit directly under the crawl root.
print(len(get_directories(CRAWL_PATH)))

102


In [7]:
def compare_clickstreams(
    baseline: str,
    comparison: str,
    name: str,
    chunk_size: int = 40,
    max_actions: int = 10,
) -> pd.DataFrame:
    """
    Compare paired screenshots for every website under ``CRAWL_PATH``.

    Each directory directly under ``CRAWL_PATH`` is treated as one website and
    each of its subdirectories as one clickstream. Within a clickstream the
    screenshots ``{baseline}-{i}.png`` and ``{comparison}-{i}.png`` are
    compared for ``i = 0, 1, ...`` until either file is missing or
    ``max_actions`` pairs have been compared.

    The value returned by ``ImageShingle.compare`` is used as a similarity
    score: scores are averaged per clickstream, the clickstream averages are
    averaged per website, and the reported difference is ``1`` minus that
    website average. Clickstreams without any comparable pair are ignored, and
    websites without any such clickstream produce no row.

    Args:
        baseline: File-name prefix of the reference screenshots.
        comparison: File-name prefix of the screenshots compared against the
            reference.
        name: Suffix used for the output column names.
        chunk_size: Second argument passed to ``ImageShingle``. The default
            of 40 is the value recommended by
            https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
        max_actions: Maximum number of screenshot pairs compared per
            clickstream.

    Returns:
        A DataFrame with one row per website and the columns ``website``,
        ``difference_{name}`` and ``sample_size_{name}`` (the total number of
        screenshot pairs compared for that website).
    """
    rows_list = []

    for website_path in get_directories(CRAWL_PATH):
        sample_size = 0  # screenshot pairs compared across all clickstreams
        clickstream_similarities = []

        for clickstream in get_directories(website_path):
            action_similarities = []

            for action in range(max_actions):
                baseline_path = f"{clickstream}/{baseline}-{action}.png"
                comparison_path = f"{clickstream}/{comparison}-{action}.png"

                # Actions are numbered consecutively, so the first missing
                # pair marks the end of this clickstream.
                if not (os.path.isfile(baseline_path) and os.path.isfile(comparison_path)):
                    break

                baseline_shingle = ImageShingle(baseline_path, chunk_size)
                comparison_shingle = ImageShingle(comparison_path, chunk_size)
                action_similarities.append(baseline_shingle.compare(comparison_shingle))

            sample_size += len(action_similarities)

            # Every clickstream carries equal weight in the website average,
            # regardless of how many actions it contains.
            if action_similarities:
                clickstream_similarities.append(
                    sum(action_similarities) / len(action_similarities)
                )

        if clickstream_similarities:
            website_similarity = sum(clickstream_similarities) / len(clickstream_similarities)

            rows_list.append({
                "website": os.path.basename(os.path.normpath(website_path)),
                f"difference_{name}": 1 - website_similarity,
                f"sample_size_{name}": sample_size
            })

    return pd.DataFrame(rows_list)

In [13]:
# def compare_clickstreams(baseline: str, comparison: str, name: str) -> tuple[pd.DataFrame, dict]:
#     rows_list = []
#     website_differences = {}

#     for path in get_directories(CRAWL_PATH)[:2]:
#         clickstreams = get_directories(path)

#         sample_size = 0

#         total_clickstreams = 0
#         website_sum = 0

#         differences = []
#         for clickstream in clickstreams:

#             total_actions = 0
#             clickstream_sum = 0

#             for _ in range(10):
#                 all_cookies_path = f"{clickstream}/{baseline}-{total_actions}.png"
#                 no_cookies_path = f"{clickstream}/{comparison}-{total_actions}.png"
                
#                 if os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path):
#                     CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
#                     all_cookies_shingle = ImageShingle(all_cookies_path, 40)
#                     no_cookies_shingle = ImageShingle(no_cookies_path, 40)

#                     differences.append(all_cookies_shingle.compare(no_cookies_shingle))
#                     total_actions += 1
#                 else:
#                     break

#             sample_size += total_actions

#             if total_actions != 0:
#                 clickstream_similarity = clickstream_sum / total_actions
#                 website_sum += clickstream_similarity

#                 total_clickstreams += 1
        
#         website = os.path.basename(os.path.normpath(path))
#         website_differences[website] = differences
        
#         if total_clickstreams != 0:
#             website_similarity = website_sum / total_clickstreams
#             website_difference = 1 - website_similarity

#             rows_list.append({
#                 "website": website,
#                 f"difference_{name}": website_difference,
#                 f"sample_size_{name}": sample_size
#             })

#     return pd.DataFrame(rows_list), website_differences

In [14]:
# no_cookies, control_comparisons = compare_clickstreams("baseline", "all_cookies", "control_group")
# all_cookies, no_cookies_comparisons = compare_clickstreams("baseline", "no_cookies", "remove_cookies")

In [5]:
# Run both comparisons against the "baseline" screenshots.
# NOTE(review): the variable names look swapped relative to their contents —
# `no_cookies` holds the baseline vs. "all_cookies" comparison (control group)
# and `all_cookies` holds the baseline vs. "no_cookies" comparison. The column
# suffixes ("control_group" / "remove_cookies") are what identify the data.
no_cookies = compare_clickstreams("baseline", "all_cookies", "control_group")
all_cookies = compare_clickstreams("baseline", "no_cookies", "remove_cookies")

In [6]:
# Inner join on "website": only websites present in both result frames are kept.
merged_df = no_cookies.merge(all_cookies, on=["website"], how="inner")

In [7]:
# Write the merged results to a CSV named after the crawl, without the index column.
# NOTE(review): presumably the analysis/ directory must already exist — confirm.
merged_df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)

In [8]:
# Display the merged per-website results.
merged_df

Unnamed: 0,website,difference_control_group,sample_size_control_group,difference_remove_cookies,sample_size_remove_cookies
0,mail.ru,0.275032,57,0.547173,19
1,zoom.us,0.044787,68,0.418331,35
2,europa.eu,0.203922,63,0.155351,69
3,stackoverflow.com,0.032738,1,0.032738,1
4,teamviewer.com,0.203953,88,0.967262,10
...,...,...,...,...,...
87,segment.io,0.032257,86,0.037567,82
88,mastercard.com,0.587200,80,0.999219,10
89,coe.int,0.022619,14,0.257143,14
90,yumpu.com,0.226356,62,0.925595,10
