In [41]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math
import matplotlib
import matplotlib.pyplot as plt
from crawler import CMP

### Functions for Analysis and Creating Blocklist

In [42]:
if not os.path.exists("analysis"):
    os.mkdir("analysis")

def get_tracking_domains(list_path: str = "inputs/blocklists/") -> set[str]:
    """
    Get tracking domains from blocklists.

    Args:
        list_path: Path to blocklists. Defaults to "inputs/blocklists/".

    Returns:
        A set of tracking domains.
    """
    lists = []
    for item in os.listdir(list_path):
        path = os.path.join(list_path, item)
        lists.append(path)

    tracking_sites = set()
    for list_path in lists:
        with open(list_path) as file:
            lines = file.readlines()
            for line in lines:
                tracking_sites.add(line.rstrip())

    # print("Tracking sites aggregated from 4 blocklists.")
    return tracking_sites

# Create set of tracking domains from aggregation of 4 blocklists
trackings_domains = get_tracking_domains()
print(trackings_domains)

def get_directories(root: str) -> list[str]:
    """
    Return a list of directories in a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directories.
    """
    dirs = []
    for item in os.listdir(root):
        path = os.path.join(root, item)
        if os.path.isdir(path):
            dirs.append(path)

    return dirs


def detect_tracking(blocklist, cookie_list) -> list[dict[str, str, str]]:
    """
    Check if any URLs from a list appear in a blocklist of known tracking cookies.

    Args:
        blocklist: Set of blocked domains.
        cookie_list: List of cookies, where each cookie is a dict of 3 key-value pairs.

    Returns:
        A filtered list of detected tracking cookies.
    """

    detected_trackers = []
    for cookie in cookie_list:
        cookie_domain = cookie["Cookie Domain"]
        if utils.get_domain(cookie_domain) in blocklist:
            detected_trackers.append(cookie)

    return detected_trackers


def get_cookies_from_har(file: str) -> list[dict[str, str, str]]:
    """
    Returns a list of cookies from response entries in an HAR file.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    Args:
        file: Path to the HAR file.
    Returns:
        A list of dictionaries representing all cookies in HTTP responses in that HAR file with domains, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """

    cookies = []
    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        response = entry["response"] # extract response dictionary

        if response.get("cookies"): # response contains cookies
            for cookie in response["cookies"]:
                # print(cookie)
                if cookie.get("domain"): # if cookie has domain
                    cookies.append({"Cookie Name": cookie["name"], "Cookie Value": cookie["value"], "Cookie Domain": cookie["domain"]})

    return cookies

def check_requests(detected_list_from_responses: list[dict[str, str, str]], file: str) -> list[dict[str, str, str]]:
    
    detected_list_from_requests = []
    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        request = entry["request"] # extract request dictionary

        for cookie in request.get("cookies"):
            values_of_cookie_names = [d["Cookie Name"] for d in detected_list_from_responses]
            if cookie.get("name") in values_of_cookie_names: # if cookie name is in list of detected cookies from responses
                detected_list_from_requests.append({"Cookie Name": cookie["name"], "Cookie Value": cookie["value"]})

    return detected_list_from_requests

def analyze_har(har_path: str) -> list[dict[str, str, str]]:
    """
    Return a list of tracking cookies detected in the requests of a specified HAR file.

    Args:
        har_path: Path to the HAR file.

    Returns:
        A list of dictionaries representing detected tracking cookies from requests, where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
    """
    cookies = get_cookies_from_har(har_path)
    filtered_list = detect_tracking(trackings_domains, cookies)
    return filtered_list

# print(get_cookies_from_har("crawls/depth0/bmj.com/0/normal.json"))
# print(analyze_har("crawls/depth0/bmj.com/0/normal.json"))




### Create Dataframes and Generate CSV Files
Note: Running this cell block will append lines to existing CSV files. Delete existing CSV files or comment out lines before each new run.

In [43]:

CRAWL_NAME = "aug29-onetrust"

domain_paths = get_directories(f"crawls/{CRAWL_NAME}") 
successful_domains = []
with open(f"crawls/{CRAWL_NAME}/results.json") as log_file:
    results = json.load(log_file)
    for path in results:
        if results[path]["interact_success"]:
            site = os.path.basename(os.path.normpath(path))
            successful_domains.append(site)

# for counting number of inner pages per domain
domains_paths_normal = {}
domains_paths_reject = {}

incomplete_runs = 0
total_inner_pages = 0

detected_trackers_from_responses_normal = []
detected_trackers_from_requests_normal = [] # will be used to create DataFrame

detected_trackers_from_responses_reject = []
detected_trackers_from_requests_reject = [] # will be used to create Dataframe

for site in domain_paths:
    # Skip if site is not in success.txt
    # FIXME: success.txt currently not formatted properly; uncommenting this causes no rows to be written to CSV
    # if not any(site in line for line in success_lines):
    #     continue

    inner_site_paths = get_directories(site)
    total_inner_pages += len(inner_site_paths)

    for inner_site_path in inner_site_paths:
        normal_har_path = f"{inner_site_path}/normal.json"
        reject_har_path = f"{inner_site_path}/onetrust_reject_tracking.json"

        if not os.path.isfile(normal_har_path) or not os.path.isfile(reject_har_path):
            # Requires both normal and intercept HAR files to exist
            incomplete_runs += 1
            continue
            
        domain = site.split("/")[2]

        # Append inner site path to the dictionary for normal crawls
        if domain in domains_paths_normal:
            domains_paths_normal[domain].append(inner_site_path)
        else:
            domains_paths_normal[domain] = [inner_site_path]

        # Append inner site path to the dictionary for after_reject crawls
        if domain in domains_paths_reject:
            domains_paths_reject[domain].append(inner_site_path)
        else:
            domains_paths_reject[domain] = [inner_site_path]

        detected_list_from_requests_normal = analyze_har(normal_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_normal:
            detected_trackers_from_responses_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_normal = check_requests(detected_trackers_from_responses_normal, normal_har_path)
        
        for detected_tracker in trackers_requests_normal:
            detected_trackers_from_requests_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                # TODO: maybe we can use reqeust.url to get the domain?
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # normal_file = "analysis/depth1_normal.csv"
        # normal_file_exists = os.path.isfile(normal_file)

        # if normal_file_exists:
        #     with open(normal_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(normal_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush()


        # Repeat for files generated after run with rejecting cookies
        detected_list_from_requests_reject = analyze_har(reject_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_reject:
            detected_trackers_from_responses_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_reject = check_requests(detected_trackers_from_responses_reject, reject_har_path)
        
        for detected_tracker in trackers_requests_reject:
            detected_trackers_from_requests_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # reject_file = "analysis/depth1_after_reject.csv"
        # reject_file_exists = os.path.isfile(reject_file)

        # if reject_file_exists:
        #     with open(reject_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(reject_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush()


# Create DataFrames for detected trackers in normal and after_reject crawls
# Each tracker is in a row, with its domain and inner site path
df_normal = pd.DataFrame(detected_trackers_from_requests_normal)
df_after_reject = pd.DataFrame(detected_trackers_from_requests_reject)

In [44]:
# df_normal.info()
# df_after_reject.info()
# df_normal.head(15)
# df_after_reject.head(15)
# df_normal.to_csv("analysis/depth1_normal_1.csv")
# df_after_reject.to_csv("analysis/depth1_after_reject_1.csv")

### Finding Trackers that Remain After Rejecting

In [45]:
# Drop duplicates in df_normal (if every value in a row is the same, it is considered a duplicate)
df_normal_unique = df_normal.drop_duplicates()

# Perform an inner merge (cookies in df_after_reject are kept if they are in df_normal_unique)
merged_df = df_after_reject.merge(df_normal_unique, on=["Domain", "Inner Site Path", "Cookie Name", "Cookie Value"], how="inner")
merged_df


Unnamed: 0,Domain,Inner Site Path,Cookie Name,Cookie Value
0,zoom.us,crawls/aug29-onetrust/zoom.us/0,__adroll,d3dc1b30610c733dfde89449a040a84b-g_1693361642-...
1,zoom.us,crawls/aug29-onetrust/zoom.us/0,__adroll_shared,d3dc1b30610c733dfde89449a040a84b-g_1693361642-...
2,hp.com,crawls/aug29-onetrust/hp.com/0,demdex,72735675346893061151097407425753442073
3,hp.com,crawls/aug29-onetrust/hp.com/0,demdex,72735675346893061151097407425753442073
4,hp.com,crawls/aug29-onetrust/hp.com/0,demdex,72735675346893061151097407425753442073
...,...,...,...,...
708,sketchup.com,crawls/aug29-onetrust/sketchup.com/0,6suuid,47deda1732353600c9c3ee643501000032b9b602
709,sketchup.com,crawls/aug29-onetrust/sketchup.com/0,6suuid,47deda1732353600c9c3ee643501000032b9b602
710,sketchup.com,crawls/aug29-onetrust/sketchup.com/0,6suuid,47deda1732353600c9c3ee643501000032b9b602
711,sketchup.com,crawls/aug29-onetrust/sketchup.com/0,6suuid,47deda1732353600c9c3ee643501000032b9b602


In [46]:
# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_normal
df_normal_domains = df_normal.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_normal.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_normal dictionary
    Num_Trackers_Per_Domain_No_Interaction=('Cookie Name', 'count')  # Count the number of trackers for each domain
)

df_normal_domains["Average Trackers Per Page"] = df_normal_domains["Num_Trackers_Per_Domain_No_Interaction"] / df_normal_domains["Num_Inner_Pages"]


# Group by domain and set Num_Inner_Pages using the dictionary domains_paths_reject
df_after_reject_domains = df_after_reject.groupby('Domain', as_index=False).agg(
    Num_Inner_Pages=('Domain', lambda x: len(domains_paths_reject.get(x.iloc[0], []))), # Use the length of inner site paths in the domains_paths_reject dictionary
    Num_Trackers_Per_Domain_Disable_Only_Tracking=('Cookie Name', 'count')  # Count the number of trackers for each domain
)

df_after_reject_domains["Average Trackers Per Page"] = df_after_reject_domains["Num_Trackers_Per_Domain_Disable_Only_Tracking"] / df_after_reject_domains["Num_Inner_Pages"]

### Merge dfs and compare different runs

In [53]:
successful_domains = pd.DataFrame(successful_domains, columns =['Domain'])

merged_df = pd.merge(df_normal_domains[["Domain", "Num_Trackers_Per_Domain_No_Interaction"]], df_after_reject_domains[["Domain", "Num_Trackers_Per_Domain_Disable_Only_Tracking"]], on='Domain', how='outer')
merged_df = pd.merge(merged_df, successful_domains, on='Domain', how='outer')
merged_df.fillna(0, inplace=True)
merged_df.to_csv(f"analysis/{CRAWL_NAME}-merged.csv")

In [48]:
count = 0
for (_, row) in merged_df.iterrows():
    if row.Num_Trackers_Per_Domain_Disable_Only_Tracking > 0:
        count += 1

print(count)

41


In [49]:
count = 0
for (_, row) in merged_df.iterrows():
    if row.Num_Trackers_Per_Domain_No_Interaction > 0:
        count += 1

print(count)

47


In [50]:
# df_normal.info()
# df_normal_domains.info()
df_normal_domains

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain_No_Interaction,Average Trackers Per Page
0,agora.io,1,2,2.0
1,app.link,1,7,7.0
2,atlasobscura.com,1,11,11.0
3,avg.com,1,87,87.0
4,blackboard.com,1,7,7.0
5,bmj.com,1,15,15.0
6,branch.io,1,7,7.0
7,breitbart.com,1,132,132.0
8,cancer.org,1,203,203.0
9,databricks.com,1,35,35.0


In [51]:
# df_after_reject_domains.info()
df_after_reject_domains
# df_after_reject.info()

Unnamed: 0,Domain,Num_Inner_Pages,Num_Trackers_Per_Domain_Disable_Only_Tracking,Average Trackers Per Page
0,atlasobscura.com,1,9,9.0
1,blackboard.com,1,6,6.0
2,breitbart.com,1,41,41.0
3,cancer.org,1,151,151.0
4,databricks.com,1,3,3.0
5,deloitte.com,1,95,95.0
6,discogs.com,1,2,2.0
7,docker.com,1,2,2.0
8,docusign.com,1,8,8.0
9,dxc.com,1,2,2.0


### Distribution of Trackers Kept After Rejecting, Grouped by Domain

In [52]:
# Group by "Domain" and count the number of trackers for each domain
domain_counts = merged_df.groupby("Domain").size().reset_index(name="Tracker Count")

# Sort the DataFrame by descending "Tracker Count"
domain_counts_sorted = domain_counts.sort_values(by="Tracker Count", ascending=False)
# domain_counts
domain_counts_sorted

Unnamed: 0,Domain,Tracker Count
0,adtelligent.com,1
55,sketchup.com,1
53,siteground.biz,1
52,sendinblue.com,1
51,senderscore.com,1
...,...,...
24,gamespot.com,1
23,fujitsu.com,1
22,freepik.com,1
21,flightradar24.com,1
