In [2]:
from image_shingle import ImageShingle
import os
import pandas as pd

In [10]:
CRAWL_NAME = 'sep10-clickstream'
CRAWL_PATH = f'crawls/{CRAWL_NAME}/'

In [4]:
def get_directories(root: str) -> list[str]:
    """
    Return a list of directories in a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directories.
    """
    dirs = []
    for item in os.listdir(root):
        path = os.path.join(root, item)
        if os.path.isdir(path):
            dirs.append(path)

    return dirs

In [9]:
rows_list = []

for path in get_directories(CRAWL_PATH):
    clickstreams = get_directories(path)

    sample_size = 0

    total_clickstreams = 0
    website_sum = 0

    for clickstream in clickstreams:

        total_actions = 0
        clickstream_sum = 0

        for _ in range(10):
            all_cookies_path = f"{clickstream}/all_cookies-{total_actions}.png"
            no_cookies_path = f"{clickstream}/no_cookies-{total_actions}.png"
            
            if os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path):
                CHUNK_SIZE = 40  # Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
                all_cookies_shingle = ImageShingle(all_cookies_path, 40)
                no_cookies_shingle = ImageShingle(no_cookies_path, 40)

                clickstream_sum += all_cookies_shingle.compare(no_cookies_shingle)
                total_actions += 1
            else:
                break

        sample_size += total_actions

        if total_actions != 0:
            clickstream_similarity = clickstream_sum / total_actions
            website_sum += clickstream_similarity

            total_clickstreams += 1
    
    if total_clickstreams != 0:
        website_similarity = website_sum / total_clickstreams

        website = os.path.basename(os.path.normpath(path))
        rows_list.append({
            "website": website,
            "website_similarity": website_similarity,
            "sample_size": sample_size
        })
        
df = pd.DataFrame(rows_list)
print(df)

                 website  website_similarity  sample_size
0                mail.ru            0.425397            6
1                zoom.us            0.583210           32
2              europa.eu            0.898231           74
3           digicert.com            1.000000            2
4      stackoverflow.com            0.754009           31
5           amazon.co.uk            0.975000            1
6                 hp.com            0.767481           44
7             aliyun.com            0.741929           54
8          rackspace.com            0.522734           43
9            freepik.com            0.734762           17
10          springer.com            0.867038           83
11            nature.com            0.751446           62
12       casalemedia.com            0.973354           44
13      surveymonkey.com            0.754694           24
14  paloaltonetworks.com            0.327937            5
15            nvidia.com            0.360317            2
16           i

In [15]:
df.sort_values(by=['sample_size'], ascending=False)
df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)