In [10]:
from utils.image_shingle import ImageShingle
import os
import pandas as pd
import json
import utils.utils as utils

In [11]:
CRAWL_NAME = 'clickstream-intercept-request'
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

PATHS = []
with open(f"crawls/{CRAWL_NAME}/results.json") as results:
    results = json.load(results)
    for path in results:
        if results[path]["crawl_failure"] is False:
            PATHS.append(path)

print(len(PATHS))

43


In [12]:
def compare_clickstreams(baseline: str, comparison: str, name: str) -> pd.DataFrame:
    rows_list = []

    for i, path in enumerate(PATHS):
        print(f"{i+1}/{len(PATHS)} completed")
        clickstreams = utils.get_directories(path)

        sample_size = 0

        total_clickstreams = 0
        website_sum = 0

        for clickstream in clickstreams:

            total_actions = 0
            clickstream_sum = 0

            for _ in range(10):
                all_cookies_path = f"{clickstream}/{baseline}-{total_actions}.png"
                no_cookies_path = f"{clickstream}/{comparison}-{total_actions}.png"
                
                if os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path):
                    CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
                    all_cookies_shingle = ImageShingle(all_cookies_path, CHUNK_SIZE)
                    no_cookies_shingle = ImageShingle(no_cookies_path, CHUNK_SIZE)

                    clickstream_sum += all_cookies_shingle.compare(no_cookies_shingle)
                    total_actions += 1

            sample_size += total_actions

            if total_actions != 0:
                clickstream_similarity = clickstream_sum / total_actions
                website_sum += clickstream_similarity

                total_clickstreams += 1
        
        if total_clickstreams != 0:
            website_similarity = website_sum / total_clickstreams
            website_difference = 1 - website_similarity

            website = os.path.basename(os.path.normpath(path))
            rows_list.append({
                "website": website,
                f"difference_{name}": website_difference,
                f"sample_size_{name}": sample_size
            })

    return pd.DataFrame(rows_list)

In [13]:
no_cookies = compare_clickstreams("baseline", "control", "control")
all_cookies = compare_clickstreams("baseline", "experimental", "remove_cookies")

1/43 completed
2/43 completed
3/43 completed
4/43 completed
5/43 completed
6/43 completed
7/43 completed
8/43 completed
9/43 completed
10/43 completed
11/43 completed
12/43 completed
13/43 completed
14/43 completed
15/43 completed
16/43 completed
17/43 completed
18/43 completed
19/43 completed
20/43 completed
21/43 completed
22/43 completed
23/43 completed
24/43 completed
25/43 completed
26/43 completed
27/43 completed
28/43 completed
29/43 completed
30/43 completed
31/43 completed
32/43 completed
33/43 completed
34/43 completed
35/43 completed
36/43 completed
37/43 completed
38/43 completed
39/43 completed
40/43 completed
41/43 completed
42/43 completed
43/43 completed
1/43 completed
2/43 completed
3/43 completed
4/43 completed
5/43 completed
6/43 completed
7/43 completed
8/43 completed
9/43 completed
10/43 completed
11/43 completed
12/43 completed
13/43 completed
14/43 completed
15/43 completed
16/43 completed
17/43 completed
18/43 completed
19/43 completed
20/43 completed
21/43 comp

In [14]:
merged_df = no_cookies.merge(all_cookies, on=["website"])

In [15]:
def compare_with_control(baseline: str, control: str, experimental: str) -> pd.DataFrame:
    rows_list = []

    for i, path in enumerate(PATHS):
        print(f"{i+1}/{len(PATHS)} completed")

        clickstreams = utils.get_directories(path)

        sample_size = 0

        total_clickstreams = 0
        website_sum = 0

        for clickstream in clickstreams:

            total_actions = 0
            clickstream_sum = 0

            for _ in range(10):
                baseline_path = f"{clickstream}/{baseline}-{total_actions}.png"
                control_path = f"{clickstream}/{control}-{total_actions}.png"
                experimental_path = f"{clickstream}/{experimental}-{total_actions}.png"
                
                if os.path.isfile(baseline_path) and os.path.isfile(control_path) and os.path.isfile(experimental_path):
                    CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
                    baseline_shingle = ImageShingle(baseline_path, CHUNK_SIZE)
                    control_shingle = ImageShingle(control_path, CHUNK_SIZE)
                    experimental_shingle = ImageShingle(experimental_path, CHUNK_SIZE)

                    try:
                        clickstream_sum += ImageShingle.compare_with_control(baseline_shingle, control_shingle, experimental_shingle)
                    except:
                        print(baseline_path)
                    total_actions += 1

            sample_size += total_actions

            if total_actions != 0:
                clickstream_similarity = clickstream_sum / total_actions
                website_sum += clickstream_similarity

                total_clickstreams += 1
        
        if total_clickstreams != 0:
            website_similarity = website_sum / total_clickstreams
            website_difference = 1 - website_similarity

            website = os.path.basename(os.path.normpath(path))
            rows_list.append({
                "website": website,
                f"difference": website_difference,
                f"sample_size": sample_size
            })

    return pd.DataFrame(rows_list)

In [16]:
control_comparison = compare_with_control("baseline", "control", "experimental")
merged_df = merged_df.merge(control_comparison, on=["website"])

1/43 completed
2/43 completed
3/43 completed
4/43 completed
5/43 completed
6/43 completed
7/43 completed
8/43 completed
9/43 completed
10/43 completed
11/43 completed
12/43 completed
13/43 completed
14/43 completed
15/43 completed
16/43 completed
17/43 completed
18/43 completed
crawls/clickstream-intercept-request/ebay.co.uk/5/baseline-0.png
crawls/clickstream-intercept-request/ebay.co.uk/7/baseline-0.png
19/43 completed
20/43 completed
21/43 completed
22/43 completed
23/43 completed
24/43 completed
crawls/clickstream-intercept-request/mlb.com/3/baseline-0.png
crawls/clickstream-intercept-request/mlb.com/4/baseline-0.png
crawls/clickstream-intercept-request/mlb.com/4/baseline-1.png
crawls/clickstream-intercept-request/mlb.com/5/baseline-0.png
crawls/clickstream-intercept-request/mlb.com/8/baseline-0.png
crawls/clickstream-intercept-request/mlb.com/8/baseline-1.png
crawls/clickstream-intercept-request/mlb.com/9/baseline-0.png
25/43 completed
26/43 completed
27/43 completed
crawls/clicks

In [17]:
merged_df

Unnamed: 0,website,difference_control_comparison,sample_size_control_comparison,difference_remove_cookies_comparison,sample_size_remove_cookies_comparison,difference,sample_size
0,mail.ru,0.124683,79,0.386151,18,0.31874,18
1,zoom.us,0.145888,71,0.10819,76,0.002632,71
2,digicert.com,0.002603,82,0.002603,82,0.0,82
3,stackoverflow.com,0.032585,64,0.046186,52,0.008821,52
4,teamviewer.com,0.028222,86,0.035534,82,0.017215,82
5,amazon.co.uk,0.422106,51,0.303057,36,0.170512,34
6,hp.com,0.10203,87,0.09209,75,0.032672,74
7,aliyun.com,0.04284,51,0.020143,50,0.00968,43
8,rackspace.com,0.023865,56,0.056635,55,0.056302,55
9,freepik.com,0.152726,60,0.138781,75,0.042456,60


In [20]:
merged_df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)