In [12]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timezone


### Functions for Reformatting HAR Files into a Single JSON File for CookieBlock

In [20]:
if not os.path.exists("cookieblock_files"):
    os.mkdir("cookieblock_files")


def get_directories(root: str) -> list[str]:
    """
    Return a list of directories in a given root directory.

    Args:
        root: Path to the root directory.

    Returns:
        A list of directories.
    """
    dirs = []
    for item in os.listdir(root):
        path = os.path.join(root, item)
        if os.path.isdir(path):
            dirs.append(path)

    return dirs


def get_cookies_from_har(file: str, output: dict[dict]) -> None:
    """
    Processes cookies from an HAR file, and appends them to the output list in CookieBlock's specified structure.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    Args:
        file: Path to the HAR file.
    Returns:
        Void.
        
    """

    data = json.load(open(file, "r")) # parses JSON data into Python dictionary
    siteURL = file.split("/")[2]

    for entry in data["log"]["entries"]: # each entry is an HTTP request/response pair
        
        response = entry["response"] # extract response dictionary

        if response.get("cookies"): # response contains cookies
            for cookie in response["cookies"]:
                if cookie.get("domain") and cookie.get("name") in [x.get("name") for x in entry["request"].get("cookies")]:
                    cookie_id = cookie["name"] + ";" + cookie["domain"] + ";" + cookie["path"] + siteURL # cookie ID is cookie_name;cookie_domain;path;siteURL
                    
                    reformatted = reformat_cookie(cookie, cookie_id, siteURL)

                    output.update(reformatted)

    pass

def reformat_cookie(cookie: list[dict], cookie_id: str, siteURL: str) -> dict:
    """
    Reformats a cookie from the HAR format to CookieBlock's format.

    Args:
        cookie: Cookie to be reformatted.
        cookie_id: ID of the cookie.
        siteURL: URL or first party domain of the cookie.

    Returns:
        A dictionary containing the reformatted cookie.

        Output structure:
            [
            {
            "cookie_id": {
            "name": "<name>",
            "domain": "<domain>",
            "path": "/path",
            "first_party_domain": "http://first-party-domain",
            "label": 0-3,
            "cmp_origin": 0-2,
            "variable_data": [
                {
                "value": "<cookie content>",
                "expiry": "<expiration in seconds>",                
                "session": "<true/false>",
                "http_only": true,
                "host_only": true,
                "secure": true,
                "same_site": "<no restriction/lax/strict>"                
                }
            ]
            }
            }
            ]
    """

    expiry = cookie.get("expires") or "0"  # If 'expires' is missing, set it to "0"
    expiry_datetime = datetime.fromisoformat(expiry.replace('Z', '+00:00'))
    # Convert the offset-naive datetime to offset-aware UTC datetime
    epoch_utc = datetime(1970, 1, 1, tzinfo=timezone.utc)

    # Calculate the time difference and get the total seconds
    expiry_seconds = (expiry_datetime - epoch_utc).total_seconds()


    reformatted_cookie = {
        cookie_id: {
            "name": cookie["name"],
            "domain": cookie["domain"],
            "path": cookie["path"],
            "first_party_domain": f"http://{siteURL}",
            "label": 0, # only required if training a classifier, not needed for prediction
            # 0: necessary, 1: functional, 2: analytics, 3: advertising
            "cmp_origin": 1,
            # 0: Cookiebot, 1: OneTrust, 2: Termly
            "variable_data": [
                {
                    "value": cookie["value"],
                    "expiry": int(expiry_seconds),
                    "session": expiry_seconds == 0,
                    "http_only": cookie.get("httpOnly", False), # defaults to False if not present
                    "host_only": True,
                    "secure": cookie.get("secure", False), # defaults to False if not present
                    "same_site": cookie.get("SameSite", "no restriction")  # defaults to "no restriction" if not present
                }
            ]
        }
    }

    return reformatted_cookie

# TODO: what to put in the "label" field for each cookie? (cookiepedia label or cmp label?)


In [21]:
testoutput = {}
testdict = get_cookies_from_har("crawls/depth0/bmj.com/0/normal.json", testoutput)
print(testoutput)

{'FPID;bmj.com;/bmj.com': {'name': 'FPID', 'domain': 'bmj.com', 'path': '/', 'first_party_domain': 'http://bmj.com', 'label': 0, 'cmp_origin': 1, 'variable_data': [{'value': 'FPID2.2.riuIeRPAfgd1tJn%2B%2BxW76atiPXcsoS64Cqa9G4y6rn0%3D.1689360923', 'expiry': 1752432932, 'session': False, 'http_only': True, 'host_only': True, 'secure': True, 'same_site': 'no restriction'}]}, 'test_cookie;.doubleclick.net;/bmj.com': {'name': 'test_cookie', 'domain': '.doubleclick.net', 'path': '/', 'first_party_domain': 'http://bmj.com', 'label': 0, 'cmp_origin': 1, 'variable_data': [{'value': '', 'expiry': 1217630755, 'session': False, 'http_only': True, 'host_only': True, 'secure': True, 'same_site': 'no restriction'}]}, 's-dmd-id-x;www.medtargetsystem.com;/bmj.com': {'name': 's-dmd-id-x', 'domain': 'www.medtargetsystem.com', 'path': '/', 'first_party_domain': 'http://bmj.com', 'label': 0, 'cmp_origin': 1, 'variable_data': [{'value': 'deleted', 'expiry': 0, 'session': True, 'http_only': False, 'host_only

In [None]:
with open('cookieblock_files/output.json', 'w') as json_file:
    json.dump(testoutput, json_file)

### Create Dataframes and Generate CSV Files
Note: Running this cell block will append lines to existing CSV files. Delete existing CSV files or comment out lines before each new run.

In [3]:
success_file_path = "inputs/sites/success.txt"
with open(success_file_path, "r") as success_file:
    success_lines = success_file.readlines()

# domain_paths = get_directories("crawls/depth1") 
domain_paths = get_directories("crawls/depth0") 

# for counting number of inner pages per domain
domains_paths_normal = {}
domains_paths_reject = {}

incomplete_runs = 0
total_inner_pages = 0

detected_trackers_from_responses_normal = []
detected_trackers_from_requests_normal = [] # will be used to create DataFrame

detected_trackers_from_responses_reject = []
detected_trackers_from_requests_reject = [] # will be used to create Dataframe

for site in domain_paths:
    # Skip if site is not in success.txt
    # FIXME: success.txt currently not formatted properly; uncommenting this causes no rows to be written to CSV
    # if not any(site in line for line in success_lines):
    #     continue

    inner_site_paths = get_directories(site)
    total_inner_pages += len(inner_site_paths)

    for inner_site_path in inner_site_paths:
        normal_har_path = f"{inner_site_path}/normal.json"
        reject_har_path = f"{inner_site_path}/after_reject.json"

        if not os.path.isfile(normal_har_path) or not os.path.isfile(reject_har_path):
            # Requires both normal and intercept HAR files to exist
            incomplete_runs += 1
            continue
            
        domain = site.split("/")[2]

        # Append inner site path to the dictionary for normal crawls
        if domain in domains_paths_normal:
            domains_paths_normal[domain].append(inner_site_path)
        else:
            domains_paths_normal[domain] = [inner_site_path]

        # Append inner site path to the dictionary for after_reject crawls
        if domain in domains_paths_reject:
            domains_paths_reject[domain].append(inner_site_path)
        else:
            domains_paths_reject[domain] = [inner_site_path]

        detected_list_from_requests_normal = analyze_har(normal_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_normal:
            detected_trackers_from_responses_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_normal = check_requests(detected_trackers_from_responses_normal, normal_har_path)
        
        for detected_tracker in trackers_requests_normal:
            detected_trackers_from_requests_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                # TODO: maybe we can use request.url to get the domain?
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # normal_file = "analysis/depth1_normal.csv"
        # normal_file_exists = os.path.isfile(normal_file)

        # if normal_file_exists:
        #     with open(normal_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(normal_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_normal)])
        #         file.flush()


        # Repeat for files generated after run with rejecting cookies
        detected_list_from_requests_reject = analyze_har(reject_har_path)

        # saving trackers from responses for easy parsing into dataframe if needed
        for detected_tracker in detected_list_from_requests_reject:
            detected_trackers_from_responses_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
                "Cookie Domain": detected_tracker["Cookie Domain"]
            })

        trackers_requests_reject = check_requests(detected_trackers_from_responses_reject, reject_har_path)
        
        for detected_tracker in trackers_requests_reject:
            detected_trackers_from_requests_reject.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": detected_tracker["Cookie Name"],
                "Cookie Value": detected_tracker["Cookie Value"],
            })

        # # Create file if it doesn't exist; if it exists then write a row for each inner site path with a count of the number of trackers.
        # reject_file = "analysis/depth1_after_reject.csv"
        # reject_file_exists = os.path.isfile(reject_file)

        # if reject_file_exists:
        #     with open(reject_file, mode="a", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush() # bugfix where rows weren't writing: flush() clears internal buffer

        # else:
        #     with open(reject_file, mode="w", newline="") as file:
        #         writer = csv.writer(file)
        #         writer.writerow(["Inner Site Path", "Length of Detected List"])
        #         writer.writerow([inner_site_path, len(trackers_requests_reject)])
        #         file.flush()


# Create DataFrames for detected trackers in normal and after_reject crawls
# Each tracker is in a row, with its domain and inner site path
df_normal = pd.DataFrame(detected_trackers_from_requests_normal)
df_after_reject = pd.DataFrame(detected_trackers_from_requests_reject)

In [16]:
# df_normal.info()
# df_after_reject.info()
# df_normal.head(15)
# df_after_reject.head(15)
# df_normal.to_csv("analysis/depth1_normal_1.csv")
# df_after_reject.to_csv("analysis/depth1_after_reject_1.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4431 entries, 0 to 4430
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Domain           4431 non-null   object
 1   Inner Site Path  4431 non-null   object
 2   Cookie Name      4431 non-null   object
 3   Cookie Value     4431 non-null   object
dtypes: object(4)
memory usage: 138.6+ KB
