In [6]:
from image_shingle import ImageShingle
import os
import pandas as pd

In [7]:
# Name of the crawl to analyse; also used to name the CSV written by this notebook.
CRAWL_NAME = "clickstream"
# Directory holding the crawl's data (trailing slash kept intentionally).
CRAWL_PATH = "crawls/" + CRAWL_NAME + "/"

In [8]:
def get_directories(root: str) -> list[str]:
    """
    Return the sub-directories located directly inside a root directory.

    Only immediate children are inspected (no recursion). Entries are returned
    in sorted order so that repeated runs process directories in the same
    sequence; ``os.listdir`` on its own returns names in arbitrary order.

    Args:
        root: Path to the root directory.

    Returns:
        A sorted list of paths (``root`` joined with each child name), one for
        every child of ``root`` that is a directory.

    Raises:
        OSError: If ``root`` cannot be listed (for example, it does not exist).
    """
    # Sorting the names before joining keeps the joined paths sorted as well,
    # because every path shares the same ``root`` prefix.
    candidates = (os.path.join(root, item) for item in sorted(os.listdir(root)))
    return [path for path in candidates if os.path.isdir(path)]

# Sanity check: how many directories sit directly under the crawl root.
crawl_dir_count = len(get_directories(CRAWL_PATH))
print(crawl_dir_count)

114


In [9]:
# Chunk size passed to ImageShingle for every screenshot.
# Recommended by https://www.usenix.org/legacy/events/sec07/tech/full_papers/anderson/anderson.pdf
CHUNK_SIZE = 40
# Upper bound on the number of numbered screenshot pairs inspected per clickstream.
MAX_ACTIONS = 10

# One dict per website; turned into a DataFrame in a single step at the end.
rows_list = []

for path in get_directories(CRAWL_PATH):
    # Each sub-directory of a website directory is treated as one clickstream.
    clickstreams = get_directories(path)

    sample_size = 0         # total screenshot pairs compared for this website
    total_clickstreams = 0  # clickstreams that contributed at least one pair
    website_sum = 0         # running sum of per-clickstream mean similarities

    for clickstream in clickstreams:
        total_actions = 0
        clickstream_sum = 0

        for action in range(MAX_ACTIONS):
            all_cookies_path = f"{clickstream}/all_cookies-{action}.png"
            no_cookies_path = f"{clickstream}/no_cookies-{action}.png"

            # Screenshots are numbered consecutively from 0; stop at the first
            # index for which either image of the pair is missing.
            if not (os.path.isfile(all_cookies_path) and os.path.isfile(no_cookies_path)):
                break

            all_cookies_shingle = ImageShingle(all_cookies_path, CHUNK_SIZE)
            no_cookies_shingle = ImageShingle(no_cookies_path, CHUNK_SIZE)

            # compare() yields the score that is averaged below as "similarity".
            clickstream_sum += all_cookies_shingle.compare(no_cookies_shingle)
            total_actions += 1

        sample_size += total_actions

        # Clickstreams without any complete pair are excluded from the average.
        if total_actions != 0:
            clickstream_similarity = clickstream_sum / total_actions
            website_sum += clickstream_similarity

            total_clickstreams += 1

    # Websites with no usable clickstream produce no row at all.
    if total_clickstreams != 0:
        # Mean of the per-clickstream means, so every clickstream weighs the
        # same regardless of how many actions it contains.
        website_similarity = website_sum / total_clickstreams

        website = os.path.basename(os.path.normpath(path))
        rows_list.append({
            "website": website,
            "website_similarity": website_similarity,
            "sample_size": sample_size
        })

df = pd.DataFrame(rows_list)
print(df)

             website  website_similarity  sample_size
0         abc.net.au            0.841729           35
1         aliyun.com            0.885851            6
2       amazon.co.uk            0.251194            8
3            bmj.com            0.575774           69
4   businesswire.com            0.891667           60
..               ...                 ...          ...
82  verywellmind.com            0.753376           24
83         viber.com            0.399016           12
84       weborama.fr            0.631944           13
85         wufoo.com            0.808808           41
86          zend.com            0.627611           33

[87 rows x 3 columns]


In [11]:
# Order websites by the number of compared screenshot pairs, largest first.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
df = df.sort_values(by=['sample_size'], ascending=False)

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs('analysis', exist_ok=True)
df.to_csv(f'analysis/{CRAWL_NAME}.csv', index=False)