In [24]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math


['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'get_domain', 'get_full_domain', 'tldextract']


### Create CSV files for analysis
Note: Running this cell block will append lines to existing CSV files. Delete existing CSV files before each new run.

In [23]:
if not os.path.exists("analysis"):
    os.mkdir("analysis")

def get_tracking_domains(list_path: str = "inputs/blocklists/") -> set[str]:
    """
    Get tracking domains from blocklists.

    Args:
        list_path: Path to blocklists. Defaults to "inputs/blocklists/".

    Returns:
        A set of tracking domains.
    """
    lists = []
    for item in os.listdir(list_path):
        path = os.path.join(list_path, item)
        lists.append(path)

    tracking_sites = set()
    for list_path in lists:
        with open(list_path) as file:
            lines = file.readlines()
            for line in lines:
                tracking_sites.add(line.rstrip())

    # print("Tracking sites aggregated from 4 blocklists.")
    return tracking_sites

# Create set of tracking domains from aggregation of 4 blocklists
trackings_domains = get_tracking_domains()
print(trackings_domains)

def get_directories(root: str) -> list[str]:
    """
    Return a list of directories in a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directories.
    """
    dirs = []
    for item in os.listdir(root):
        path = os.path.join(root, item)
        if os.path.isdir(path):
            dirs.append(path)

    return dirs


def detect_tracking(blocklist, cookie_list) -> list[dict[str, str, str]]:
    """
    Check if any URLs from a list appear in a blocklist of known tracking cookies.

    Args:
        blocklist: Set of blocked domains.
        cookie_list: List of cookies, where each cookie is a dict of 3 key-value pairs.

    Returns:
        A filtered list of detected tracking cookies.
    """

    detected_trackers = []
    for cookie in cookie_list:
        cookie_domain = cookie["Cookie Domain"]
        if utils.get_domain_and_tld(cookie_domain) in blocklist:
            detected_trackers.append(cookie)

    return detected_trackers


def get_cookies_from_har(file: str) -> list[dict[str, str, str]]:
    """
    Returns a list of cookies from an HAR file.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    Args:
        file: Path to the HAR file.
    Returns:
        A list of dictionaries representing all cookies in that HAR file with domains, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """

    cookies = []
    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        request = entry["response"] # extract request dictionary FIXME: change back to request

        if request.get("cookies"): # request contains cookies
            for cookie in request["cookies"]:
                # print(cookie)
                if cookie.get("domain"): # if cookie has domain
                    cookies.append({"Cookie Name": cookie["name"], "Cookie Value": cookie["value"], "Cookie Domain": cookie["domain"]})

    return cookies


def analyze_har(har_path: str) -> list[dict[str, str, str]]:
    """
    Return a list of tracking cookies detected in the specified HAR file.

    Args:
        har_path: Path to the HAR file.

    Returns:
        A list of dictionaries representing detected tracking cookies, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """
    cookies = get_cookies_from_har(har_path)
    detected_list = detect_tracking(trackings_domains, cookies)
    return detected_list

# print(get_cookies_from_har("crawls/depth0/bmj.com/0/normal.json"))
print(analyze_har("crawls/depth0/bmj.com/0/normal.json"))




AttributeError: module 'utils' has no attribute 'get_domain_and_tld'

In [None]:
success_file_path = "inputs/sites/success.txt"
with open(success_file_path, "r") as success_file:
    success_lines = success_file.readlines()


# domain_paths = get_directories("crawls/depth1_noquery") 
domain_paths = get_directories("crawls/depth0") 

# Initialize dictionaries to store inner site paths for normal and after_reject crawls
# domains_paths_normal = {}
# domains_paths_reject = {}

incomplete_runs = 0
total_inner_pages = 0

detected_trackers_normal = []
detected_trackers_after_reject = []

for site in domain_paths:
    # Skip if site is not in success.txt
    # FIXME: success.txt currently not formatted properly; uncommenting this causes no rows to be written to CSV
    # if not any(site in line for line in success_lines):
    #     continue

    inner_site_paths = get_directories(site)
    total_inner_pages += len(inner_site_paths)

    for inner_site_path in inner_site_paths:
        normal_har_path = f"{inner_site_path}/normal.json"
        reject_har_path = f"{inner_site_path}/after_reject.json"

        if not os.path.isfile(normal_har_path) or not os.path.isfile(reject_har_path):
            # Requires both normal and intercept HAR files to exist
            incomplete_runs += 1
            continue
            
        domain = site.split("/")[2]

        detected_list_normal = analyze_har(normal_har_path)

        for detected_tracker in detected_list_normal:
            detected_trackers_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })


        # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        normal_file = "analysis/depth0_normal.csv"
        normal_file_exists = os.path.isfile(normal_file)

        if normal_file_exists:
            with open(normal_file, mode="a", newline="") as file:
                writer = csv.writer(file)
                writer.writerow([inner_site_path, len(detected_list_normal)])
                file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        else:
            with open(normal_file, mode="w", newline="") as file:
                writer = csv.writer(file)
                writer.writerow(["Inner Site Path", "Length of Detected List"])
                writer.writerow([inner_site_path, len(detected_list_normal)])
                file.flush()


        # Repeat for files generated after run with intercept.
        detected_list_reject = analyze_har(reject_har_path)

        for detected_tracker in detected_list_reject:
            detected_trackers_after_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })


        reject_file = "analysis/depth0_after_reject.csv"
        reject_file_exists = os.path.isfile(reject_file)

        if reject_file_exists:
            with open(reject_file, mode="a", newline="") as file:
                writer = csv.writer(file)
                writer.writerow([inner_site_path, len(detected_list_reject)])
                file.flush()
        else:
            with open(reject_file, mode="w", newline="") as file:
                writer = csv.writer(file)
                writer.writerow(["Inner Site Path", "Length of Detected List"])
                writer.writerow([inner_site_path, len(detected_list_reject)])
                file.flush()

# Create DataFrames for detected trackers in normal and after_reject crawls
# Each tracker is in a row, with its domain and inner site path
df_normal = pd.DataFrame(detected_trackers_normal)
df_after_reject = pd.DataFrame(detected_trackers_after_reject)

df_normal.info()
df_after_reject.info()

In [4]:
# df_normal.tail(15)
# df_after_reject.tail(15)

# Check if the entire DataFrame contains null values
if df_normal.isnull().any().any():
    print("df_normal contains null values.")

if df_after_reject.isnull().any().any():
    print("df_after_reject contains null values.")

if df_normal['Tracker'].isnull().any():
    print("Tracker col contains null values.")


# df_normal_stackoverflow = df_normal.loc[df_normal['Domain'] == 'stackoverflow.com']
# print(df_normal_stackoverflow)

In [10]:
# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_normal
df_normal_domains = df_normal.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_normal.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_normal dictionary
    Num_Trackers_Per_Domain=('Tracker', 'count')  # Count the number of trackers for each domain
)

df_normal_domains["Average Trackers Per Page"] = df_normal_domains["Num_Trackers_Per_Domain"] / df_normal_domains["Num_Inner_Pages"]


# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_reject
df_after_reject_domains = df_after_reject.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_reject.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_reject dictionary
    Num_Trackers_Per_Domain=('Tracker', 'count')  # Count the number of trackers for each domain
)

df_after_reject_domains["Average Trackers Per Page"] = df_after_reject_domains["Num_Trackers_Per_Domain"] / df_after_reject_domains["Num_Inner_Pages"]

In [8]:
# df_normal.info()
# df_normal_domains.info()
df_normal_domains

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain,Average Trackers Per Page
0,aap.org,1,1,1.0
1,acer.com,1,8,8.0
2,active.com,1,2,2.0
3,adtelligent.com,1,6,6.0
4,adweek.com,1,4,4.0
...,...,...,...,...
77,wufoo.com,1,2,2.0
78,xoom.com,1,2,2.0
79,yieldmo.com,1,1,1.0
80,zebra.com,1,2,2.0


In [9]:

# df_after_reject_domains.info()
df_after_reject_domains
# df_after_reject.info()

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain,Average Trackers Per Page
0,aap.org,1,1,1.0
1,acer.com,1,8,8.0
2,active.com,1,1,1.0
3,adtelligent.com,1,3,3.0
4,adweek.com,1,1,1.0
...,...,...,...,...
85,wish.com,1,6,6.0
86,wufoo.com,1,3,3.0
87,xoom.com,1,2,2.0
88,yieldmo.com,1,1,1.0


### Distribution of Tracking Cookies Across All Inner Pages (Regardless of Domain)
Run this cell to check that number of complete+incomplete pages equals total inner pages.

In [11]:
def compare_trackers(reject_filepath, normal_filepath):
    no_trackers_after_reject = []  # List of inner site paths with trackers in normal crawl, but no trackers after rejection
    increased_trackers = []  # List of inner site paths with more trackers after rejection than in normal crawl
    never_trackers = []  # List of inner site paths with no trackers in either normal or rejection crawl
    violating_sites = []  # List of inner site paths with trackers after we click the reject button

    with open(reject_filepath, 'r') as reject_file, open(normal_filepath, 'r') as normal_file:
        read_reject = csv.reader(reject_file)
        read_normal = csv.reader(normal_file)

        # Skip header
        next(read_reject)
        next(read_normal)

        length = 0

        # Since both csvs are sorted by inner site path, we can just iterate through both at the same time
        for normal, after_reject in zip(read_normal, read_reject):
            inner_site_path, num_trackers_normal = normal
            _, num_trackers_reject = after_reject

            if inner_site_path != _:
                raise RuntimeError("Inner site paths do not match")

            num_trackers_normal = int(num_trackers_normal)
            num_trackers_reject = int(num_trackers_reject)

            if num_trackers_normal > 0 and num_trackers_reject == 0:  # if there are trackers in normal crawl, but not after reject
                no_trackers_after_reject.append(inner_site_path)

            if num_trackers_normal < num_trackers_reject:  # if there are more trackers after reject than in normal crawl
                increased_trackers.append(inner_site_path)

            if num_trackers_normal == 0 and num_trackers_reject == 0:  # if there are no trackers in either normal or reject
                never_trackers.append(inner_site_path)

            if num_trackers_reject != 0:  # if there are trackers in reject
                violating_sites.append(inner_site_path)

            length += 1

    # from previous cell
    print("Total inner pages:", total_inner_pages)
    print("Incomplete inner pages:", incomplete_runs)
    
    print("Complete inner pages:", length)
    print("Inner pages that removed all trackers after rejection:", len(no_trackers_after_reject))
    print("Inner pages with increased trackers after rejection:", len(increased_trackers))
    print("Inner pages that never contained trackers:", len(never_trackers))
    print("Inner pages that sent cookies to 3rd party trackers after rejection:", len(violating_sites))


def get_length_detected_list(csv_reader, inner_site_path):
    for row in csv_reader:
        current_inner_site_path, length_detected_list = row
        if current_inner_site_path == inner_site_path:
            return length_detected_list

    return '0'  # If inner_site_path not found, return '0'


compare_trackers('analysis/depth0_after_reject.csv', 'analysis/depth0_normal.csv')

Total inner pages: 252
Incomplete inner pages: 2
Complete inner pages: 250
Inner pages that removed all trackers after rejection: 8
Inner pages with increased trackers after rejection: 33
Inner pages that never contained trackers: 152
Inner pages that sent cookies to 3rd party trackers after rejection: 90
