In [2]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math
import matplotlib
import matplotlib.pyplot as plt


### Functions for Analysis and Creating Blocklist

In [3]:
if not os.path.exists("analysis"):
    os.mkdir("analysis")

def get_tracking_domains(list_path: str = "inputs/blocklists/") -> set[str]:
    """
    Get tracking domains from blocklists.

    Args:
        list_path: Path to blocklists. Defaults to "inputs/blocklists/".

    Returns:
        A set of tracking domains.
    """
    lists = []
    for item in os.listdir(list_path):
        path = os.path.join(list_path, item)
        lists.append(path)

    tracking_sites = set()
    for list_path in lists:
        with open(list_path) as file:
            lines = file.readlines()
            for line in lines:
                tracking_sites.add(line.rstrip())

    # print("Tracking sites aggregated from 4 blocklists.")
    return tracking_sites

# Create set of tracking domains from aggregation of 4 blocklists
trackings_domains = get_tracking_domains()
print(trackings_domains)

def get_directories(root: str) -> list[str]:
    """
    Return a list of directories in a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directories.
    """
    dirs = []
    for item in os.listdir(root):
        path = os.path.join(root, item)
        if os.path.isdir(path):
            dirs.append(path)

    return dirs


def detect_tracking(blocklist, cookie_list) -> list[dict[str, str, str]]:
    """
    Check if any URLs from a list appear in a blocklist of known tracking cookies.

    Args:
        blocklist: Set of blocked domains.
        cookie_list: List of cookies, where each cookie is a dict of 3 key-value pairs.

    Returns:
        A filtered list of detected tracking cookies.
    """

    detected_trackers = []
    for cookie in cookie_list:
        cookie_domain = cookie["Cookie Domain"]
        if utils.get_domain_and_tld(cookie_domain) in blocklist:
            detected_trackers.append(cookie)

    return detected_trackers


def get_cookies_from_har(file: str) -> list[dict[str, str, str]]:
    """
    Returns a list of cookies from response entries in an HAR file.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    Args:
        file: Path to the HAR file.
    Returns:
        A list of dictionaries representing all cookies in HTTP responses in that HAR file with domains, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """

    cookies = []
    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        response = entry["response"] # extract response dictionary

        if response.get("cookies"): # response contains cookies
            for cookie in response["cookies"]:
                # print(cookie)
                if cookie.get("domain"): # if cookie has domain
                    cookies.append({"Cookie Name": cookie["name"], "Cookie Value": cookie["value"], "Cookie Domain": cookie["domain"]})

    return cookies

def check_requests(detected_list_from_responses: list[dict[str, str, str]], file: str) -> list[dict[str, str, str]]:
    
    detected_list_from_requests = []
    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        request = entry["request"] # extract request dictionary

        if request.get("cookies"): # response contains cookies
            for cookie in request["cookies"]:
                values_of_cookie_names = [d["Cookie Name"] for d in detected_list_from_responses]
                if cookie.get("name") in values_of_cookie_names: # if cookie name is in list of detected cookies from responses
                    detected_list_from_requests.append({"Cookie Name": cookie["name"], "Cookie Value": cookie["value"]})

    return detected_list_from_requests

def analyze_har(har_path: str) -> list[dict[str, str, str]]:
    """
    Return a list of tracking cookies detected in the requests of a specified HAR file.

    Args:
        har_path: Path to the HAR file.

    Returns:
        A list of dictionaries representing detected tracking cookies from requests, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """
    cookies = get_cookies_from_har(har_path)
    filtered_list = detect_tracking(trackings_domains, cookies)
    return filtered_list

# print(get_cookies_from_har("crawls/depth0/bmj.com/0/normal.json"))
# print(analyze_har("crawls/depth0/bmj.com/0/normal.json"))




### Create Dataframes and Generate CSV Files
Note: Running this cell block will append lines to existing CSV files. Delete existing CSV files or comment out lines before each new run.

In [4]:
success_file_path = "inputs/sites/success.txt"
with open(success_file_path, "r") as success_file:
    success_lines = success_file.readlines()

# domain_paths = get_directories("crawls/depth1") 
domain_paths = get_directories("crawls/depth0") 

# for counting number of inner pages per domain
domains_paths_normal = {}
domains_paths_reject = {}

incomplete_runs = 0
total_inner_pages = 0

detected_trackers_from_responses_normal = []
detected_trackers_from_requests_normal = [] # will be used to create DataFrame

detected_trackers_from_responses_reject = []
detected_trackers_from_requests_reject = [] # will be used to create Dataframe

for site in domain_paths:
    # Skip if site is not in success.txt
    # FIXME: success.txt currently not formatted properly; uncommenting this causes no rows to be written to CSV
    # if not any(site in line for line in success_lines):
    #     continue

    inner_site_paths = get_directories(site)
    total_inner_pages += len(inner_site_paths)

    for inner_site_path in inner_site_paths:
        normal_har_path = f"{inner_site_path}/normal.json"
        reject_har_path = f"{inner_site_path}/after_reject.json"

        if not os.path.isfile(normal_har_path) or not os.path.isfile(reject_har_path):
            # Requires both normal and intercept HAR files to exist
            incomplete_runs += 1
            continue
            
        domain = site.split("/")[2]

        # Append inner site path to the dictionary for normal crawls
        if domain in domains_paths_normal:
            domains_paths_normal[domain].append(inner_site_path)
        else:
            domains_paths_normal[domain] = [inner_site_path]

        # Append inner site path to the dictionary for after_reject crawls
        if domain in domains_paths_reject:
            domains_paths_reject[domain].append(inner_site_path)
        else:
            domains_paths_reject[domain] = [inner_site_path]

        detected_list_from_requests_normal = analyze_har(normal_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_normal:
            detected_trackers_from_responses_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_normal = check_requests(detected_trackers_from_responses_normal, normal_har_path) # FIXME: currently iterating over file twice; can iterate once if # of requests == # of responses
        
        for detected_tracker in trackers_requests_normal:
            detected_trackers_from_requests_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # normal_file = "analysis/depth1_normal.csv"
        # normal_file_exists = os.path.isfile(normal_file)

        # if normal_file_exists:
        #     with open(normal_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(normal_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush()


        # Repeat for files generated after run with rejecting cookies
        detected_list_from_requests_reject = analyze_har(reject_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_reject:
            detected_trackers_from_responses_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_reject = check_requests(detected_trackers_from_responses_reject, reject_har_path) # FIXME: currently iterating over file twice; can iterate once if # of requests == # of responses
        
        for detected_tracker in trackers_requests_reject:
            detected_trackers_from_requests_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # reject_file = "analysis/depth1_after_reject.csv"
        # reject_file_exists = os.path.isfile(reject_file)

        # if reject_file_exists:
        #     with open(reject_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(reject_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush()


# Create DataFrames for detected trackers in normal and after_reject crawls
# Each tracker is in a row, with its domain and inner site path
df_normal = pd.DataFrame(detected_trackers_from_requests_normal)
df_after_reject = pd.DataFrame(detected_trackers_from_requests_reject)

In [52]:
# df_normal.info()
# df_after_reject.info()
# df_normal.head(15)
# df_after_reject.head(15)
# df_normal.to_csv("analysis/depth1_normal_1.csv")
# df_after_reject.to_csv("analysis/depth1_after_reject_1.csv")

### Finding Trackers that Remain After Rejecting

In [5]:
# Drop duplicates in df_normal (if every value in a row is the same, it is considered a duplicate)
df_normal_unique = df_normal.drop_duplicates()

# Perform an inner merge (cookies in df_after_reject are kept if they are in df_normal_unique)
merged_df = df_after_reject.merge(df_normal_unique, on=["Domain", "Inner Site Path", "Cookie Name", "Cookie Value"], how="inner")
merged_df


Unnamed: 0,Domain,Inner Site Path,Cookie Name,Cookie Value
0,mail.ru,crawls/depth0/mail.ru/0,guid,82C96B0E64B1A957X1689364823
1,amazon.co.uk,crawls/depth0/amazon.co.uk/0,ad-privacy,0
2,amazon.co.uk,crawls/depth0/amazon.co.uk/0,ad-privacy,0
3,rackspace.com,crawls/depth0/rackspace.com/0,D41ID,v3|v4|292a3489d82b46d6b4b95a820fd72cbe|https:/...
4,rackspace.com,crawls/depth0/rackspace.com/0,D41ID,v3|v4|292a3489d82b46d6b4b95a820fd72cbe|https:/...
...,...,...,...,...
1175,workable.com,crawls/depth0/workable.com/0,__cf_bm,EnSvwqo7gpt8ZLO201X8X9bQRTzrD0YzJQphcO5TAXE-16...
1176,workable.com,crawls/depth0/workable.com/0,__cf_bm,EnSvwqo7gpt8ZLO201X8X9bQRTzrD0YzJQphcO5TAXE-16...
1177,workable.com,crawls/depth0/workable.com/0,__cf_bm,EnSvwqo7gpt8ZLO201X8X9bQRTzrD0YzJQphcO5TAXE-16...
1178,workable.com,crawls/depth0/workable.com/0,__cf_bm,EnSvwqo7gpt8ZLO201X8X9bQRTzrD0YzJQphcO5TAXE-16...


In [6]:
# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_normal
df_normal_domains = df_normal.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_normal.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_normal dictionary
    Num_Trackers_Per_Domain=('Cookie Name', 'count')  # Count the number of trackers for each domain
)

df_normal_domains["Average Trackers Per Page"] = df_normal_domains["Num_Trackers_Per_Domain"] / df_normal_domains["Num_Inner_Pages"]


# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_reject
df_after_reject_domains = df_after_reject.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_reject.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_reject dictionary
    Num_Trackers_Per_Domain=('Cookie Name', 'count')  # Count the number of trackers for each domain
)

df_after_reject_domains["Average Trackers Per Page"] = df_after_reject_domains["Num_Trackers_Per_Domain"] / df_after_reject_domains["Num_Inner_Pages"]

In [7]:
# df_normal.info()
# df_normal_domains.info()
df_normal_domains

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain,Average Trackers Per Page
0,aap.org,1,48,48.0
1,abc.net.au,1,10,10.0
2,acer.com,1,27,27.0
3,active.com,1,529,529.0
4,adtelligent.com,1,4,4.0
...,...,...,...,...
117,workable.com,1,52,52.0
118,worldcat.org,1,27,27.0
119,wufoo.com,1,23,23.0
120,xoom.com,1,1,1.0


In [8]:
# df_after_reject_domains.info()
df_after_reject_domains
# df_after_reject.info()

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain,Average Trackers Per Page
0,aap.org,1,18,18.0
1,abc.net.au,1,8,8.0
2,acer.com,1,11,11.0
3,active.com,1,26,26.0
4,adweek.com,1,44,44.0
...,...,...,...,...
114,worldcat.org,1,32,32.0
115,wufoo.com,1,23,23.0
116,xoom.com,1,1,1.0
117,zebra.com,1,6,6.0


### Distribution of Trackers Kept After Rejecting, Grouped by Domain

In [11]:
# Group by "Domain" and count the number of trackers for each domain
domain_counts = merged_df.groupby("Domain").size().reset_index(name="Tracker Count")

# Sort the DataFrame by descending "Tracker Count"
domain_counts_sorted = domain_counts.sort_values(by="Tracker Count", ascending=False)
# domain_counts
domain_counts_sorted

Unnamed: 0,Domain,Tracker Count
81,sportingnews.com,89
59,observer.com,65
13,broadcom.com,54
20,comodo.com,50
39,inmobi.com,46
...,...,...
25,esri.com,1
29,flightradar24.com,1
79,spglobal.com,1
49,merchantlink.com,1


### Distribution of Tracking Cookies Across All Inner Pages (Regardless of Domain)
Run this cell to check that number of complete+incomplete pages equals total inner pages.

In [44]:
def compare_trackers(reject_filepath, normal_filepath):
    no_trackers_after_reject = []  # List of inner site paths with trackers in normal crawl, but no trackers after rejection
    increased_trackers = []  # List of inner site paths with more trackers after rejection than in normal crawl
    never_trackers = []  # List of inner site paths with no trackers in either normal or rejection crawl
    violating_sites = []  # List of inner site paths with trackers after we click the reject button

    with open(reject_filepath, 'r') as reject_file, open(normal_filepath, 'r') as normal_file:
        read_reject = csv.reader(reject_file)
        read_normal = csv.reader(normal_file)

        # Skip header
        next(read_reject)
        next(read_normal)

        length = 0

        # Since both csvs are sorted by inner site path, we can just iterate through both at the same time
        for normal, after_reject in zip(read_normal, read_reject):
            inner_site_path, num_trackers_normal = normal
            _, num_trackers_reject = after_reject

            if inner_site_path != _:
                raise RuntimeError("Inner site paths do not match")

            num_trackers_normal = int(num_trackers_normal)
            num_trackers_reject = int(num_trackers_reject)

            if num_trackers_normal > 0 and num_trackers_reject == 0:  # if there are trackers in normal crawl, but not after reject
                no_trackers_after_reject.append(inner_site_path)

            if num_trackers_normal < num_trackers_reject:  # if there are more trackers after reject than in normal crawl
                increased_trackers.append(inner_site_path)

            if num_trackers_normal == 0 and num_trackers_reject == 0:  # if there are no trackers in either normal or reject
                never_trackers.append(inner_site_path)

            if num_trackers_reject != 0:  # if there are trackers in reject
                violating_sites.append(inner_site_path)

            length += 1

    # from previous cell
    print("Total inner pages:", total_inner_pages)
    print("Incomplete inner pages:", incomplete_runs)
    
    print("Complete inner pages:", length)
    print("Inner pages that removed all trackers after rejection:", len(no_trackers_after_reject))
    print("Inner pages with increased trackers after rejection:", len(increased_trackers))
    print("Inner pages that never contained trackers:", len(never_trackers))
    print("Inner pages that sent cookies to 3rd party trackers after rejection:", len(violating_sites))


def get_length_detected_list(csv_reader, inner_site_path):
    for row in csv_reader:
        current_inner_site_path, length_detected_list = row
        if current_inner_site_path == inner_site_path:
            return length_detected_list

    return '0'  # If inner_site_path not found, return '0'


compare_trackers('analysis/depth0_after_reject.csv', 'analysis/depth0_normal.csv')

Total inner pages: 252
Incomplete inner pages: 2
Complete inner pages: 250
Inner pages that removed all trackers after rejection: 13
Inner pages with increased trackers after rejection: 34
Inner pages that never contained trackers: 118
Inner pages that sent cookies to 3rd party trackers after rejection: 119
