In [1]:
from utils.image_shingle import ImageShingle
import os
import pandas as pd
import json
import utils.utils as utils
import statistics

In [2]:
# Name of the crawl to analyse: selects the `crawls/<name>/` input directory and the
# `analysis/<name>.csv` file the summary is written to.
CRAWL_NAME = 'remove-third-party'

In [3]:
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

# Read the crawl summary; the file handle gets its own name so `results` only ever
# refers to the parsed JSON.
with open(f"{CRAWL_PATH}results.json") as results_file:
    results = json.load(results_file)

# Keep the sites whose entry records `crawl_failure` as exactly False.
PATHS = [
    site_path
    for site_path, site_result in results.items()
    if site_result["crawl_failure"] is False
]

print(len(PATHS))

26


In [4]:
def compare_clickstreams(baseline: str, comparison: str, name: str) -> pd.DataFrame:
    """Measure, per crawled site, how far `comparison` screenshots are from `baseline` ones.

    Every site directory in ``PATHS`` is split into clickstream directories via
    ``utils.get_directories``. Inside a clickstream, actions are visited in order
    (``0, 1, ...``, at most 10) for as long as ``<comparison>-<action>.png`` exists.
    Each action is scored with the highest ``ImageShingle.compare`` value obtained
    against the ``<baseline>-<action>-<k>.png`` screenshots present for ``k`` in 1..10;
    an action with no baseline screenshot scores 0.

    Args:
        baseline: File-name prefix of the repeated baseline screenshots.
        comparison: File-name prefix of the screenshots being evaluated.
        name: Suffix appended to the ``difference_`` and ``sample_size_`` column names.

    Returns:
        One row per site with at least one scored action: ``website`` (last path
        component), ``difference_<name>`` (1 minus the mean of the per-clickstream
        mean scores) and ``sample_size_<name>`` (number of scored actions).
    """
    records = []
    site_total = len(PATHS)

    for site_number, path in enumerate(PATHS, start=1):
        print(f"Analyzing site {site_number}/{site_total}")

        scored_actions = 0
        scored_clickstreams = 0
        similarity_total = 0

        for clickstream in utils.get_directories(path):
            action = 0
            action_score_total = 0

            # Walk the actions in order and stop at the first missing comparison screenshot.
            while action < 10:
                comparison_path = f"{clickstream}/{comparison}-{action}.png"
                if not os.path.isfile(comparison_path):
                    break

                comparison_shingle = ImageShingle(comparison_path)
                baseline_prefix = f"{clickstream}/{baseline}-{action}"

                # Best score over whichever of the repeated baseline screenshots exist.
                best_match = 0
                for sample in range(1, 11):
                    baseline_path = f"{baseline_prefix}-{sample}.png"
                    if not os.path.isfile(baseline_path):
                        continue
                    candidate = ImageShingle(baseline_path).compare(comparison_shingle)
                    best_match = max(best_match, candidate)

                action_score_total += best_match
                action += 1

            scored_actions += action

            if action == 0:
                continue

            similarity_total += action_score_total / action
            scored_clickstreams += 1

        if scored_clickstreams == 0:
            continue

        website_difference = 1 - similarity_total / scored_clickstreams
        records.append({
            "website": os.path.basename(os.path.normpath(path)),
            f"difference_{name}": website_difference,
            f"sample_size_{name}": scored_actions,
        })

    return pd.DataFrame(records)

In [5]:
def compare_with_control() -> pd.DataFrame:
    """Summarise, per crawled site, the score from ``ImageShingle.compare_with_controls``.

    Every site directory in ``PATHS`` is split into clickstream directories via
    ``utils.get_directories``. Inside a clickstream, actions ``0, 1, ...`` (at most 10)
    are visited for as long as both ``control-<action>.png`` and
    ``experimental-<action>.png`` exist. Each action is scored by passing the
    ``baseline-<action>-<k>.png`` screenshots present for ``k`` in 1..10, the control
    screenshot and the experimental screenshot to ``ImageShingle.compare_with_controls``.

    Side effects: prints progress; every action whose score is not 1 has its baseline
    path prefix printed and appended to ``logs.txt``.

    A comparison that raises an ``Exception`` is reported and contributes a score of 0
    for that action (it still counts as a sample). ``KeyboardInterrupt`` and other
    non-``Exception`` errors propagate so the run can be interrupted.

    Returns:
        One row per site with at least one scored action, with columns ``website``
        (last path component), ``mean diff`` (1 minus the mean of the per-clickstream
        mean scores), ``stdev`` (sample standard deviation of the per-clickstream mean
        scores, NaN when only one clickstream was scored) and ``samples`` (number of
        scored actions).
    """
    rows_list = []

    for site_index, path in enumerate(PATHS):
        print(f"Analyzing site {site_index + 1}/{len(PATHS)}")

        sample_size = 0
        clickstream_similarities = []

        for clickstream in utils.get_directories(path):
            total_actions = 0
            clickstream_sum = 0

            for action in range(10):
                baseline_base_path = f"{clickstream}/baseline-{action}"
                control_path = f"{clickstream}/control-{action}.png"
                experimental_path = f"{clickstream}/experimental-{action}.png"

                # Stop at the first action lacking either screenshot; later indices are
                # never examined.
                if not (os.path.isfile(control_path) and os.path.isfile(experimental_path)):
                    break

                baseline_shingles = []
                for sample in range(1, 11):
                    baseline_path = f"{baseline_base_path}-{sample}.png"
                    if os.path.isfile(baseline_path):
                        baseline_shingles.append(ImageShingle(baseline_path))

                control_shingle = ImageShingle(control_path)
                experimental_shingle = ImageShingle(experimental_path)

                try:
                    sim = ImageShingle.compare_with_controls(baseline_shingles, control_shingle, experimental_shingle)
                except Exception as error:
                    # Best effort: keep analysing the remaining actions. Nothing is added
                    # to the sum, so this action counts as a score of 0 in the mean.
                    print(
                        f"Comparison failed for {baseline_base_path} "
                        f"({len(baseline_shingles)} baseline screenshots): {error!r}"
                    )
                else:
                    if sim != 1:
                        print(baseline_base_path)
                        with open("logs.txt", "a") as file:
                            file.write(baseline_base_path + "\n")

                    clickstream_sum += sim

                total_actions += 1

            sample_size += total_actions

            if total_actions != 0:
                clickstream_similarities.append(clickstream_sum / total_actions)

        if clickstream_similarities:
            website_similarity = statistics.mean(clickstream_similarities)
            # statistics.stdev needs at least two data points; with a single scored
            # clickstream the spread is undefined, so report NaN instead of raising.
            if len(clickstream_similarities) > 1:
                stdev = statistics.stdev(clickstream_similarities)
            else:
                stdev = float("nan")
            website_difference = 1 - website_similarity

            website = os.path.basename(os.path.normpath(path))
            rows_list.append({
                "website": website,
                "mean diff": website_difference,
                "stdev": stdev,
                "samples": sample_size,
            })

    return pd.DataFrame(rows_list)

In [6]:
# Two-comparison algorithm (disabled): score control and experimental screenshots
# against the baseline separately, then merge the two result tables on "website".
# control = compare_clickstreams("baseline", "control", "control")
# experimental = compare_clickstreams("baseline", "experimental", "remove_cookies")
# merged_df = control.merge(experimental, on=["website"])

# Chunk comparison algorithm (active): score baseline, control and experimental
# screenshots together via compare_with_control().
control_comparison = compare_with_control()

1/26 completed
crawls/remove-third-party/mail.ru/0/baseline-0
crawls/remove-third-party/mail.ru/0/baseline-1
crawls/remove-third-party/mail.ru/0/baseline-2
crawls/remove-third-party/mail.ru/0/baseline-3
crawls/remove-third-party/mail.ru/1/baseline-0
crawls/remove-third-party/mail.ru/1/baseline-1
crawls/remove-third-party/mail.ru/1/baseline-2
crawls/remove-third-party/mail.ru/1/baseline-3
crawls/remove-third-party/mail.ru/1/baseline-4
crawls/remove-third-party/mail.ru/2/baseline-1
crawls/remove-third-party/mail.ru/2/baseline-2
crawls/remove-third-party/mail.ru/2/baseline-3
crawls/remove-third-party/mail.ru/2/baseline-4
crawls/remove-third-party/mail.ru/2/baseline-5
crawls/remove-third-party/mail.ru/3/baseline-0
crawls/remove-third-party/mail.ru/3/baseline-1
crawls/remove-third-party/mail.ru/3/baseline-2
crawls/remove-third-party/mail.ru/3/baseline-3
crawls/remove-third-party/mail.ru/3/baseline-4
crawls/remove-third-party/mail.ru/4/baseline-0
crawls/remove-third-party/mail.ru/4/baseline-

In [7]:
# Display the per-website summary returned by compare_with_control().
control_comparison

Unnamed: 0,website,mean diff,stdev,samples
0,mail.ru,0.02261,0.023511,46
1,zoom.us,0.0,0.0,50
2,europa.eu,0.096667,0.305687,41
3,digicert.com,0.0,0.0,45
4,stackoverflow.com,0.0,0.0,49
5,teamviewer.com,0.002646,0.008366,55
6,amazon.co.uk,0.03064,0.047969,39
7,aliyun.com,0.007345,0.011379,45
8,rackspace.com,0.0,0.0,53
9,freepik.com,0.054335,0.157129,28


In [8]:
# Persist the per-website summary. Create the output directory first: to_csv raises
# OSError when the target directory does not exist, which would discard the finished
# analysis on a fresh checkout.
os.makedirs('analysis', exist_ok=True)
control_comparison.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)