# Other analysis scripts

In [41]:
import pandas as pd
import json

In [42]:
# Directory that holds the CSV files compared in the cells below
CSV_DIR = './files/csv'

## CSV comparison

In [43]:
import csv

def _read_first_column(path):
    """
    Return the first-column values of a CSV file, excluding the header row.

    `path` is the path to the CSV file.
    Blank rows are skipped, and an empty file yields an empty list.
    """
    # newline='' lets the csv module handle line endings itself
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)

        # Skip the header; the default avoids StopIteration on an empty file
        next(reader, None)

        # csv.reader yields [] for blank lines, which have no first column
        return [row[0] for row in reader if row]


def compute_jaccard_similarity(file1, file2):
    """
    Check similarity between keys of two CSV files

    The keys are the first-column values of each file (header excluded).
    Returns the Jaccard similarity, i.e., |intersection| / |union|, as a
    float in [0, 1]. If neither file contains any key, the two (empty) key
    sets are considered identical and 1.0 is returned.
    """
    keys1 = set(_read_first_column(file1))
    keys2 = set(_read_first_column(file2))

    union = keys1 | keys2
    if not union:
        # Avoid dividing by zero when both files have no data rows
        return 1.0

    return len(keys1 & keys2) / len(union)


def get_intersection(file1, file2):
    """
    Get intersection between keys of two CSV files

    The keys are the first-column values of each file (header excluded).
    Returns a set with the keys that appear in both files.
    """
    return set(_read_first_column(file1)) & set(_read_first_column(file2))


# Compare the Headless and Native site lists for each "Rejected > X" category.
# Each entry is (label shown in the printed message, suffix used in the CSV file names).
for label, suffix in [('Accepted', 'accepted'), ('No Interaction', 'no_interaction')]:
    file1_path = f'{CSV_DIR}/headless_more_rejected_than_{suffix}.csv'
    file2_path = f'{CSV_DIR}/native_more_rejected_than_{suffix}.csv'
    similarity_score = compute_jaccard_similarity(file1_path, file2_path)
    print(f"Jaccard Similarity between Headless and Native for sites that have more Rejected > {label}: {similarity_score}")
    print(get_intersection(file1_path, file2_path))

Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: 0.25925925925925924
{'https://bmj.com', 'https://match.com', 'https://cancer.org', 'https://amplitude.com', 'https://worldcat.org', 'https://avg.com', 'https://rackspacecloud.com'}
Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: 0.6394557823129252
{'https://mastercard.com', 'https://ccleaner.com', 'https://king.com', 'https://adtelligent.com', 'https://agora.io', 'https://pubnub.com', 'https://deltadna.net', 'https://sendinblue.com', 'https://eagleeyenetworks.com', 'https://siteground.us', 'https://mimecast.net', 'https://watchguard.com', 'https://tfl.gov.uk', 'https://snowflake.com', 'https://premierleague.com', 'https://iubenda.com', 'https://pb.com', 'https://doodle.com', 'https://thelancet.com', 'https://pandasecurity.com', 'https://swrve.com', 'https://smartthings.com', 'https://ultradns.com', 'https://gumgum.com', 'https://symantec.com', 

## Discrepancy analysis

In [50]:
def get_cookies_from_har(file, url):
    """
    Returns a list of cookies from a HAR file.

    The HAR file should be generated using Chrome DevTools.
    `file` is the path to the HAR file.
    `url` is the URL of the website which the HAR was extracted from.

    The cookies are taken from the first GET request whose URL is exactly
    `url`. Returns None if no such request exists in the file.
    """

    # Use a context manager so the file handle is closed deterministically;
    # HAR files are JSON, so decode as UTF-8 instead of the platform default
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for entry in data['log']['entries']:
        request = entry['request']
        if request['method'] == 'GET' and request['url'] == url:
            return request['cookies']

    # No matching request found
    return None

# Cookies sent with the GET request for https://www.bmj.com/ in the HAR capture
# (the result is None if no GET entry in the file matches that URL exactly)
har_cookies = get_cookies_from_har('./files/har/www.bmj.com.har', 'https://www.bmj.com/')

def get_cookies_from_csv(file):
    """
    Load cookie records from a CSV file.

    `file` is the path to the CSV file, which should be generated using
    `get_cookies` in `analysis.ipynb`.
    Returns one dictionary per CSV row with the keys 'name' and 'domain'
    (the latter is read from the 'host' column).
    """

    frame = pd.read_csv(file)

    # TODO: Add values ('value': row['value'])
    return [
        {'name': row['name'], 'domain': row['host']}
        for _, row in frame.iterrows()
    ]

# Cookies listed in the bmj.com "accept" CSV; the path is built from CSV_DIR
# so the CSV location is configured in a single place
bannerclick_cookies = get_cookies_from_csv(CSV_DIR + '/bmj.com_accept.csv')

In [51]:
def compute_cookie_similarity(cookies1, cookies2):
    """
    Check similarity between two cookie lists.

    Each list must contain dictionaries with keys 'domain' and 'name'.
    Cookies are identified by their (domain, name) pair; duplicates within
    a list are counted once.

    Prints every distinct (domain, name) pair of `cookies1` and returns the
    Jaccard similarity, i.e., |intersection| / |union|, as a float in [0, 1].
    If both lists are empty they are considered identical and 1.0 is returned.
    """

    # Build the sets that the comparison below relies on
    cookies1_set = {(cookie['domain'], cookie['name']) for cookie in cookies1}
    cookies2_set = {(cookie['domain'], cookie['name']) for cookie in cookies2}

    for cookie in cookies1_set:
        print(cookie)

    union = cookies1_set | cookies2_set
    if not union:
        # Avoid dividing by zero when both lists are empty
        return 1.0

    similarity = len(cookies1_set & cookies2_set) / len(union)

    return similarity

# Similarity between the cookies from the HAR capture and the cookies loaded from the CSV
similarity = compute_cookie_similarity(har_cookies, bannerclick_cookies)
print(similarity)

('www.bmj.com', 'dmd-signal-112-497-382A7D95-21fabb8c-b6b7-47c7-aa07-d50b34b9096d')
('.bmj.com', 'zps-tgr-dts')
('www.bmj.com', 'ln_or')
('.bmj.com', 'loggedIn')
('.bmj.com', '_ga_LHT0ZJKRSY')
('www.bmj.com', 'dmd-vid')
('.bmj.com', 'wisepops')
('.bmj.com', '_fbp')
('.bmj.com', 'wisepops_session')
('.bmj.com', '_gat_UA-432960-5')
('.bmj.com', 'zsc080ab08721924c07a116c8a1a078f0f0')
('www.bmj.com', 'dmd-sid')
('.bmj.com', 'OptanonConsent')
('www.bmj.com', '_sess')
('.bmj.com', 'zft-sdc')
('.bmj.com', 'FPLC')
('.bmj.com', '_gid')
('.bmj.com', 'FPID')
('.bmj.com', '_ga')
('.bmj.com', 'RefTrackGroup')
('.bmj.com', 'wisepops_visits')
('www.bmj.com', 'zabUserId')
('.bmj.com', 'RefTrack')
('.bmj.com', '_ga_8P7810XE1M')
('www.bmj.com', 'dmd-ahk')
0.3333333333333333
