# Other analysis scripts

In [2]:
import pandas as pd
import json

In [3]:
CSV_DIR = './files/csv'

## CSV comparison

In [4]:
import csv

def compute_jaccard_similarity(file1, file2):
    """
    Compute the Jaccard similarity between the keys of two CSV files.

    The key of a row is the value in its first column. The first row of each
    file is treated as a header and skipped; blank rows are ignored.

    Args:
        file1: Path to the first CSV file.
        file2: Path to the second CSV file.

    Returns:
        float: ``len(keys1 & keys2) / len(keys1 | keys2)``, a value in [0, 1].
        When neither file contains any key the union is empty; the two
        (empty) key sets are then treated as identical and 1.0 is returned
        instead of dividing by zero.
    """
    def read_keys(path):
        # newline='' is the mode the csv module asks for, so that quoted
        # fields containing line breaks are parsed correctly.
        with open(path, 'r', newline='') as handle:
            reader = csv.reader(handle)
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(reader, None)
            # csv.reader yields [] for a blank line, which has no first column.
            return {row[0] for row in reader if row}

    keys1 = read_keys(file1)
    keys2 = read_keys(file2)

    union = keys1 | keys2
    if not union:
        return 1.0

    # Jaccard similarity, i.e., intersection / union
    return len(keys1 & keys2) / len(union)
    
def get_intersection(file1, file2):
    """
    Get the intersection between the keys of two CSV files.

    The key of a row is the value in its first column. The first row of each
    file is treated as a header and skipped; blank rows are ignored.

    Args:
        file1: Path to the first CSV file.
        file2: Path to the second CSV file.

    Returns:
        set: The keys that appear in both files (empty if either file has no
        data rows).
    """
    key_sets = []
    for path in (file1, file2):
        # newline='' is the mode the csv module asks for, so that quoted
        # fields containing line breaks are parsed correctly.
        with open(path, 'r', newline='') as handle:
            reader = csv.reader(handle)
            # Skip the header without raising StopIteration on an empty file.
            next(reader, None)
            # Blank lines come back as [], which has no first column to read.
            key_sets.append({row[0] for row in reader if row})

    return key_sets[0] & key_sets[1]


# For each condition, compare the Headless and Native site lists: print the
# Jaccard similarity of their keys, then the set of keys they share.
comparisons = (
    # Rejected > Accepted
    (
        CSV_DIR + '/headless_more_rejected_than_accepted.csv',
        CSV_DIR + '/native_more_rejected_than_accepted.csv',
        "Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: {similarity_score}",
    ),
    # Rejected > No Interaction
    (
        CSV_DIR + '/headless_more_rejected_than_no_interaction.csv',
        CSV_DIR + '/native_more_rejected_than_no_interaction.csv',
        "Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: {similarity_score}",
    ),
)

for file1_path, file2_path, report_template in comparisons:
    similarity_score = compute_jaccard_similarity(file1_path, file2_path)
    print(report_template.format(similarity_score=similarity_score))
    print(get_intersection(file1_path, file2_path))

Jaccard Similarity between Headless and Native for sites that have more Rejected > Accepted: 0.25925925925925924
{'https://worldcat.org', 'https://bmj.com', 'https://cancer.org', 'https://match.com', 'https://amplitude.com', 'https://rackspacecloud.com', 'https://avg.com'}
Jaccard Similarity between Headless and Native for sites that have more Rejected > No Interaction: 0.6394557823129252
{'https://plos.org', 'https://mendeley.com', 'https://senderscore.com', 'https://mitre.org', 'https://esri.com', 'https://viber.com', 'https://superuser.com', 'https://tfl.gov.uk', 'https://seagate.com', 'https://imgsmail.ru', 'https://ey.com', 'https://wufoo.com', 'https://gmu.edu', 'https://serverfault.com', 'https://helpshift.com', 'https://bmj.com', 'https://kaltura.com', 'https://siteground.com', 'https://justpremium.com', 'https://truste.com', 'https://hp.com', 'https://king.com', 'https://technologyreview.com', 'https://watchguard.com', 'https://verisign.com', 'https://zend.com', 'https://algol

## Discrepancy analysis

In [5]:
def get_cookies_from_har(file):
    """
    Returns a list of cookies from an HAR file.

    The HAR file should be generated using Chrome DevTools.
    `file` is the path to the HAR file.

    Cookies are gathered from the `cookies` list of the `request` object of
    every entry under `log.entries`, in file order. Entries whose request has
    no (or an empty) `cookies` list contribute nothing. Duplicates are kept.
    """

    # Use a context manager so the file handle is closed once parsing is done.
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
    # json.load would otherwise reject.
    with open(file, 'r', encoding='utf-8-sig') as har_file:
        data = json.load(har_file)

    all_cookies = []
    for entry in data['log']['entries']:
        if cookies := entry['request'].get('cookies'):
            all_cookies.extend(cookies)

    return all_cookies

def get_cookies_from_csv(file):
    """
    Returns a list of cookies from a CSV file.

    The CSV file should be generated using `get_cookies` in `analysis.ipynb`.
    `file` is the path to the CSV file.

    Each row becomes a dictionary with keys 'name' (from the `name` column)
    and 'domain' (from the `host` column), in row order. Both values are
    always returned as strings so that they can be compared with cookie
    names and domains coming from other sources.
    """

    df = pd.read_csv(
        file,
        # Without an explicit dtype, a column of numeric-looking cookie names
        # (e.g. '4662279173') is inferred as integers.
        dtype={'name': str, 'host': str},
        # Without this, names such as 'NA' or 'null' and empty cells are
        # turned into NaN, which never compares equal to anything.
        keep_default_na=False,
    )

    # Column-wise iteration avoids building a Series per row (iterrows).
    return [
        {'name': name, 'domain': host}
        for name, host in zip(df['name'], df['host'])
    ]

def _print_unique_cookies(heading, cookies):
    """Print `heading`, then each (domain, name) pair in `cookies` on its own line."""
    print(heading)
    # Sets have no stable order, so sort to make the output reproducible.
    # Sorting on str() keeps non-string values from breaking the comparison.
    for cookie in sorted(cookies, key=lambda pair: (str(pair[0]), str(pair[1]))):
        print(cookie)


def compute_cookie_similarity(cookies1, cookies2):
    """
    Check similarity between two cookie lists.

    Each list must contain dictionaries with keys 'domain' and 'name'; a
    cookie is identified by its (domain, name) pair, so duplicates within a
    list count once.

    As a side effect, prints the pairs found only in `cookies1` (labelled as
    the HAR file) and those found only in `cookies2` (labelled as
    bannerClick), separated by a blank line.

    Returns:
        float: Jaccard similarity of the two sets of pairs, i.e.
        intersection / union, in [0, 1]. When both lists are empty the union
        is empty; the two (empty) sets are then treated as identical and 1.0
        is returned instead of dividing by zero.
    """

    keys1 = {(cookie['domain'], cookie['name']) for cookie in cookies1}
    keys2 = {(cookie['domain'], cookie['name']) for cookie in cookies2}

    _print_unique_cookies("Unique cookies in HAR file:", keys1 - keys2)
    print()
    _print_unique_cookies("Unique cookies in bannerClick:", keys2 - keys1)

    union = keys1 | keys2
    if not union:
        return 1.0

    return len(keys1 & keys2) / len(union)

def count_unique_cookies(cookies):
    """
    Count the distinct cookies in a cookie list.

    `cookies` must contain dictionaries with keys 'domain' and 'name'; two
    cookies are the same when their (domain, name) pairs are equal.
    """
    seen = set()
    for cookie in cookies:
        seen.add((cookie['domain'], cookie['name']))
    return len(seen)

In [6]:
# For each bmj.com capture, load the HAR file and the matching bannerClick
# CSV, then print how many distinct cookies the bannerClick CSV contains.
capture_pairs = (
    ('./files/har/www.bmj.com_accept.har', './files/csv/bmj.com_accept.csv'),
    ('./files/har/www.bmj.com_reject.har', './files/csv/bmj.com_reject.csv'),
    ('./files/har/www.bmj.com_no_interaction.har', './files/csv/bmj.com_no_interaction.csv'),
)

for har_path, csv_path in capture_pairs:
    har_cookies = get_cookies_from_har(har_path)
    bannerclick_cookies = get_cookies_from_csv(csv_path)

    print(count_unique_cookies(bannerclick_cookies))

# After the loop the names still hold the last pair (no_interaction), so that
# is the pair whose HAR and bannerClick cookies are compared here.
similarity = compute_cookie_similarity(har_cookies, bannerclick_cookies)
print(similarity)

19
43
41
Unique cookies in HAR file:
('.doubleclick.net', 'DSID')
('.www.medtargetsystem.com', 's-dmd-id-x')
('.casalemedia.com', 'CMPRO')
('www.bmj.com', 'dmd-signal-112-497-382A7D95-d149ee82-cacf-44ad-a398-9df3aa174b2a')
('.casalemedia.com', 'CMID')
('.casalemedia.com', 'CMPS')

Unique cookies in bannerClick:
('pagesense-collect.zoho.eu', '4662279173')
('www.bmj.com', 'dmd-signal-112-497-382A7D95-05c1dfc8-3792-4017-b58f-c7889f0fa9f1')
('.www.linkedin.com', 'bscookie')
0.8085106382978723


In [7]:
# Hypothesis: BannerClick misses some cookies since bmj.com does not load some cookies until refresh

# Inputs for this check: the "do_not_reload" HAR capture and the bannerClick
# accept CSV.
no_reload_har_path = './files/har/www.bmj.com_do_not_reload.har'
accept_csv_path = './files/csv/bmj.com_accept.csv'

har_cookies = get_cookies_from_har(no_reload_har_path)
bannerclick_cookies = get_cookies_from_csv(accept_csv_path)

# Number of distinct (domain, name) cookies seen in the HAR capture.
print(count_unique_cookies(har_cookies))

20


## Cookie-classify

In [10]:
# Load map_des_finals.csv and order the rows by `count_necessary`, largest
# first.
df = (
    pd.read_csv(CSV_DIR+'/map_des_finals.csv')
    .sort_values('count_necessary', ascending=False)
)

# Display the ten rows with the highest `count_necessary`.
df.head(10)

Unnamed: 0.1,Unnamed: 0,visit_id,site_url,site_rank,complete,banners,interaction,clicked,count_fp,count_tp,count_tr,count_total,nc_cmp_name,cmp_name,count_necessary,count_performance,count_functional,count_targeting,count_unclassified
130,130,3661935020521489,https://screenconnect.com,130,True,1,No interaction,No interaction,0,110,28,110,0,0,17,12,4,17,60
640,640,4269024996019340,https://screenconnect.com,20000130,True,1,Reject,No interaction,0,106,25,106,0,0,17,12,4,17,56
385,385,5966544606052266,https://screenconnect.com,10000130,True,1,Accept,Accepted,0,106,24,106,0,0,17,12,4,17,56
587,587,950327434686470,https://logitech.com,20000077,True,0,Reject,No interaction,49,58,30,107,0,0,9,1,1,3,93
77,77,1280782962631736,https://logitech.com,77,True,0,No interaction,No interaction,49,60,32,109,0,0,9,1,1,3,95
509,509,2524000499587100,https://workable.com,10000254,True,1,Accept,Accepted,29,14,2,43,0,0,9,6,1,12,15
254,254,7831584087051595,https://workable.com,254,True,1,No interaction,No interaction,27,14,2,41,0,0,9,6,1,12,13
348,348,926607498462804,https://adweek.com,10000093,True,1,Accept,Accepted,40,85,72,125,0,0,8,4,2,14,97
603,603,4309030855650777,https://adweek.com,20000093,False,1,Reject,Rejected,40,82,72,122,onetrust,0,8,4,2,14,94
67,67,2387127390367580,https://helpshift.com,67,True,1,No interaction,No interaction,34,33,17,67,0,0,8,12,1,23,23
