# Other analysis scripts

In [84]:
import pandas as pd
import json

In [85]:
CSV_DIR = './files/csv'

## CSV comparison

In [86]:
import csv

def compute_jaccard_similarity(file1, file2):
    """
    Compute the Jaccard similarity between the keys of two CSV files.

    The key of a row is the value in its first column. The first row of each
    file is treated as a header and skipped; blank rows are ignored.

    Parameters
    ----------
    file1, file2 : str or path-like
        Paths to the CSV files to compare.

    Returns
    -------
    float
        len(keys1 & keys2) / len(keys1 | keys2), or 0.0 when neither file
        contains any key (the ratio is undefined for two empty sets).
    """
    def read_keys(path):
        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(path, 'r', newline='') as handle:
            reader = csv.reader(handle)
            # Skip the header; the default avoids StopIteration on an empty file.
            next(reader, None)
            # csv.reader yields [] for blank lines, which have no first column.
            return {row[0] for row in reader if row}

    keys1 = read_keys(file1)
    keys2 = read_keys(file2)

    union = keys1 | keys2
    if not union:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # Jaccard similarity, i.e., intersection / union
    return len(keys1 & keys2) / len(union)
    
def get_intersection(file1, file2):
    """
    Get the keys that appear in both of two CSV files.

    The key of a row is the value in its first column. The first row of each
    file is treated as a header and skipped; blank rows are ignored.

    Parameters
    ----------
    file1, file2 : str or path-like
        Paths to the CSV files to compare.

    Returns
    -------
    set of str
        Keys present in both files (empty if there is no overlap).
    """
    def read_keys(path):
        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(path, 'r', newline='') as handle:
            reader = csv.reader(handle)
            # Skip the header; the default avoids StopIteration on an empty file.
            next(reader, None)
            # csv.reader yields [] for blank lines, which have no first column.
            return {row[0] for row in reader if row}

    return read_keys(file1) & read_keys(file2)


# Each entry pairs the headless and native CSV exports for one condition with
# the headline printed in front of the similarity score.
comparisons = [
    # Rejected > Accepted
    (
        CSV_DIR + '/headless_more_rejected_than_accepted.csv',
        CSV_DIR + '/native_more_rejected_than_accepted.csv',
        "Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted",
    ),
    # Rejected > No Interaction
    (
        CSV_DIR + '/headless_more_rejected_than_no_interaction.csv',
        CSV_DIR + '/native_more_rejected_than_no_interaction.csv',
        "Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction",
    ),
]

for file1_path, file2_path, headline in comparisons:
    # Report the overlap score first, then the sites found by both crawls.
    similarity_score = compute_jaccard_similarity(file1_path, file2_path)
    print(f"{headline}: {similarity_score}")
    print(get_intersection(file1_path, file2_path))

Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: 0.25925925925925924
{'https://bmj.com', 'https://match.com', 'https://cancer.org', 'https://amplitude.com', 'https://worldcat.org', 'https://avg.com', 'https://rackspacecloud.com'}
Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: 0.6394557823129252
{'https://mastercard.com', 'https://ccleaner.com', 'https://king.com', 'https://adtelligent.com', 'https://agora.io', 'https://pubnub.com', 'https://deltadna.net', 'https://sendinblue.com', 'https://eagleeyenetworks.com', 'https://siteground.us', 'https://mimecast.net', 'https://watchguard.com', 'https://tfl.gov.uk', 'https://snowflake.com', 'https://premierleague.com', 'https://iubenda.com', 'https://pb.com', 'https://doodle.com', 'https://thelancet.com', 'https://pandasecurity.com', 'https://swrve.com', 'https://smartthings.com', 'https://ultradns.com', 'https://gumgum.com', 'https://symantec.com', 

## Discrepancy analysis

In [105]:
def get_cookies_from_har(file):
    """
    Returns a list of cookies from an HAR file.

    The HAR file should be generated using Chrome DevTools.
    `file` is the path to the HAR file.

    Every request entry under `log.entries` is inspected and the items of its
    `cookies` list (if present and non-empty) are collected in order, so the
    same cookie can appear once per request that sent it.

    Raises `KeyError` if the file lacks `log`/`entries` or an entry has no
    `request`, and `json.JSONDecodeError` if the file is not valid JSON.
    """

    # HAR files are UTF-8 and may or may not start with a BOM; 'utf-8-sig'
    # accepts both. The context manager closes the handle deterministically.
    with open(file, 'r', encoding='utf-8-sig') as har_file:
        data = json.load(har_file)

    all_cookies = []
    for entry in data['log']['entries']:
        request = entry['request']

        # Skip requests that carry no cookies (missing key or empty list).
        if cookies := request.get('cookies'):
            all_cookies.extend(cookies)

    return all_cookies

def get_cookies_from_csv(file):
    """
    Load cookies from a CSV export into a list of dictionaries.

    The CSV file should be generated using `get_cookies` in `analysis.ipynb`.
    `file` is the path to the CSV file; it must provide `name` and `host`
    columns.

    Each row becomes `{'name': <name>, 'domain': <host>}`, in file order.
    """

    table = pd.read_csv(file)

    # The CSV calls the column 'host'; the rest of the notebook expects 'domain'.
    return [
        {
            'name': row['name'],
            'domain': row['host'],
        }
        for _, row in table.iterrows()
    ]

def compute_cookie_similarity(cookies1, cookies2):
    """
    Check similarity between two cookie lists.

    Each argument is a list of dictionaries with keys 'domain' and 'name';
    a cookie is identified by its (domain, name) pair, so duplicates within a
    list are collapsed.

    As a side effect, prints the cookies found only in `cookies1` (labelled
    "HAR file") and those found only in `cookies2` (labelled "bannerClick"),
    each group in sorted order so the output is stable across runs.

    Returns the Jaccard similarity (intersection / union) of the two sets of
    pairs, or 0.0 when both lists are empty.
    """

    keys1 = {(cookie['domain'], cookie['name']) for cookie in cookies1}
    keys2 = {(cookie['domain'], cookie['name']) for cookie in cookies2}

    union = keys1 | keys2
    # The ratio is undefined for two empty sets; report 0.0 instead of
    # raising ZeroDivisionError.
    similarity = len(keys1 & keys2) / len(union) if union else 0.0

    def in_print_order(keys):
        # Set iteration order is arbitrary; sort for reproducible output.
        # Comparing string forms keeps this safe if a field is not a str
        # (e.g. a missing value read from CSV).
        return sorted(keys, key=lambda key: (str(key[0]), str(key[1])))

    print("Unique cookies in HAR file:")
    for cookie in in_print_order(keys1 - keys2):
        print(cookie)

    print()
    print("Unique cookies in bannerClick:")
    for cookie in in_print_order(keys2 - keys1):
        print(cookie)

    return similarity

def count_unique_cookies(cookies):
    """
    Count the distinct cookies in a list.

    `cookies` is a list of dictionaries with keys 'domain' and 'name'; two
    entries are the same cookie when both of those values match.
    """
    return len({(entry['domain'], entry['name']) for entry in cookies})

In [111]:
# (HAR capture, BannerClick CSV) pairs for bmj.com, one per interaction mode.
bmj_captures = [
    ('./files/har/www.bmj.com_accept.har', './files/csv/bmj.com_accept.csv'),
    ('./files/har/www.bmj.com_reject.har', './files/csv/bmj.com_reject.csv'),
    ('./files/har/www.bmj.com_no_interaction.har', './files/csv/bmj.com_no_interaction.csv'),
]

for har_path, csv_path in bmj_captures:
    har_cookies = get_cookies_from_har(har_path)
    bannerclick_cookies = get_cookies_from_csv(csv_path)

    # Only the BannerClick count is reported for each mode.
    print(count_unique_cookies(bannerclick_cookies))

# Both variables now hold the last pair (no interaction), which is the one
# compared in detail below.
similarity = compute_cookie_similarity(har_cookies, bannerclick_cookies)
print(similarity)

19
43
41
Unique cookies in HAR file:
('.casalemedia.com', 'CMID')
('.casalemedia.com', 'CMPS')
('www.bmj.com', 'dmd-signal-112-497-382A7D95-d149ee82-cacf-44ad-a398-9df3aa174b2a')
('.doubleclick.net', 'DSID')
('.casalemedia.com', 'CMPRO')
('.www.medtargetsystem.com', 's-dmd-id-x')

Unique cookies in bannerClick:
('.www.linkedin.com', 'bscookie')
('pagesense-collect.zoho.eu', '4662279173')
('www.bmj.com', 'dmd-signal-112-497-382A7D95-05c1dfc8-3792-4017-b58f-c7889f0fa9f1')
0.8085106382978723


In [109]:
# Hypothesis: BannerClick misses some cookies since bmj.com does not load some cookies until refresh

# HAR captured without reloading the page, next to the BannerClick "accept" export.
har_cookies = get_cookies_from_har('./files/har/www.bmj.com_do_not_reload.har')
bannerclick_cookies = get_cookies_from_csv('./files/csv/bmj.com_accept.csv')

# Report how many distinct (domain, name) cookies the no-reload HAR contains.
unique_har_cookie_count = count_unique_cookies(har_cookies)
print(unique_har_cookie_count)

20
