In [1]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime, timezone


### Functions for Reformatting HAR Files into a Single JSON File for CookieBlock

In [8]:
# Create the output directory for the CookieBlock JSON files.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of os.path.exists() followed by os.mkdir(); it also fails loudly if a
# non-directory named "cookieblock_files" is in the way instead of skipping silently.
os.makedirs("cookieblock_files", exist_ok=True)


def get_directories(root: str) -> list[str]:
    """
    List the immediate subdirectories of a root directory.

    Args:
        root: Path to the root directory.

    Returns:
        The paths (root joined with the entry name) of every entry of root
        that is a directory, in the order os.listdir yields them.
    """
    candidates = (os.path.join(root, name) for name in os.listdir(root))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]


def get_cookies_from_har(file: str, output: dict[str, dict]) -> None:
    """
    Collects cookies from an HAR file into `output`, keyed by cookie ID, in CookieBlock's specified structure.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    A response cookie is kept only when it has a non-empty "domain" and its
    name also appears among the cookies of the same entry's request.

    Args:
        file: Path to the HAR file. The third path component (index 2 after
            splitting on the path separator) is used as the site URL, e.g.
            "crawls/depth0/<site>/<page>/normal.json" yields "<site>".
        output: Dictionary updated in place. Each kept cookie is stored under
            the ID "cookie_name;cookie_domain;path;siteURL"; an existing entry
            with the same ID is overwritten.
    Returns:
        None. Results are written into `output`.

    Raises:
        IndexError: If `file` has fewer than three path components.
        KeyError: If the HAR data lacks "log"/"entries", or an entry lacks "response".
    """

    # Context manager so the file handle is closed even if parsing fails.
    with open(file, "r", encoding="utf-8") as har_file:
        data = json.load(har_file)  # parses JSON data into Python dictionary

    # Normalise OS-specific separators first so paths built with os.path.join
    # are split the same way as "/"-separated ones.
    siteURL = file.replace(os.sep, "/").split("/")[2]

    for entry in data["log"]["entries"]:  # each entry is an HTTP request/response pair
        response_cookies = entry["response"].get("cookies")
        if not response_cookies:  # response carries no cookies
            continue

        # Built once per entry rather than once per response cookie; a missing
        # or null request cookie list is treated as empty.
        request_cookies = (entry.get("request") or {}).get("cookies") or []
        request_cookie_names = {request_cookie.get("name") for request_cookie in request_cookies}

        for cookie in response_cookies:
            if cookie.get("domain") and cookie.get("name") in request_cookie_names:
                # cookie ID is cookie_name;cookie_domain;path;siteURL
                cookie_id = ";".join(
                    [cookie["name"], cookie["domain"], cookie.get("path", "/"), siteURL]
                )
                output.update(reformat_cookie(cookie, cookie_id, siteURL))

def _expiry_to_epoch_seconds(expires) -> float:
    """Convert an HAR "expires" value to seconds since the Unix epoch (0.0 if missing or unparseable)."""
    if not isinstance(expires, str) or not expires.strip():
        return 0.0

    try:
        # Replace a trailing "Z" with an explicit offset: datetime.fromisoformat()
        # only accepts "Z" from Python 3.11 onwards.
        expiry_datetime = datetime.fromisoformat(expires.strip().replace("Z", "+00:00"))
    except ValueError:
        return 0.0

    # A timestamp without an offset is assumed to be UTC; subtracting an aware
    # datetime from a naive one would otherwise raise TypeError.
    if expiry_datetime.tzinfo is None:
        expiry_datetime = expiry_datetime.replace(tzinfo=timezone.utc)

    epoch_utc = datetime(1970, 1, 1, tzinfo=timezone.utc)
    return (expiry_datetime - epoch_utc).total_seconds()


def _normalize_same_site(cookie: dict) -> str:
    """Map the cookie's SameSite attribute onto "no restriction", "lax" or "strict"."""
    # Look up the camelCase key first (consistent with "httpOnly"); the
    # capitalised "SameSite" key is still honoured for backward compatibility.
    # NOTE(review): which key the crawler's HAR exporter writes is not visible here - confirm.
    raw = cookie.get("sameSite") or cookie.get("SameSite")
    value = str(raw).strip().lower() if raw else ""
    return value if value in ("lax", "strict") else "no restriction"


def reformat_cookie(cookie: dict, cookie_id: str, siteURL: str) -> dict:
    """
    Reformats a cookie from the HAR format to CookieBlock's format.

    Args:
        cookie: HAR cookie object to be reformatted. Must contain "name",
            "domain" and "value"; "path", "expires", "httpOnly", "secure" and
            "sameSite" are optional.
        cookie_id: ID of the cookie; used as the single key of the result.
        siteURL: URL or first party domain of the cookie (without scheme).

    Returns:
        A dictionary containing the reformatted cookie.

        Output structure:
            {
            "cookie_id": {
            "name": "<name>",
            "domain": "<domain>",
            "path": "/path",
            "first_party_domain": "http://first-party-domain",
            "label": 0-3,
            "cmp_origin": 0-2,
            "variable_data": [
                {
                "value": "<cookie content>",
                "expiry": <expiration as integer seconds since the Unix epoch>,
                "session": <true/false>,
                "http_only": true,
                "host_only": true,
                "secure": true,
                "same_site": "<no restriction/lax/strict>"
                }
            ]
            }
            }

    Raises:
        KeyError: If the cookie lacks "name", "domain" or "value".
    """

    # A missing, empty, non-string or unparseable "expires" yields 0, which is
    # reported below as a session cookie.
    expiry_seconds = _expiry_to_epoch_seconds(cookie.get("expires"))

    reformatted_cookie = {
        cookie_id: {
            "name": cookie["name"],
            "domain": cookie["domain"],
            "path": cookie.get("path", "/"),
            "first_party_domain": f"http://{siteURL}",
            "label": 0, # only required if training a classifier, not needed for prediction
            # 0: necessary, 1: functional, 2: analytics, 3: advertising
            "cmp_origin": 1,
            # 0: Cookiebot, 1: OneTrust, 2: Termly
            "variable_data": [
                {
                    "value": cookie["value"],
                    "expiry": int(expiry_seconds),
                    "session": expiry_seconds == 0,
                    "http_only": cookie.get("httpOnly", False), # defaults to False if not present
                    "host_only": True,
                    "secure": cookie.get("secure", False), # defaults to False if not present
                    "same_site": _normalize_same_site(cookie)  # defaults to "no restriction" if not present
                }
            ]
        }
    }

    return reformatted_cookie

# TODO: decide which value goes in the "label" field for each cookie (the Cookiepedia label or the CMP label?).


In [10]:
# Cookie collections keyed by cookie ID: one for the normal crawl, one for the
# crawl performed after rejecting cookies. Re-created on every run of this cell.
normal_output = {}
reject_output = {}

incomplete_runs = 0    # inner pages missing at least one of the two HAR files
total_inner_pages = 0  # inner-page directories seen across all sites

# Sorted traversal: os.listdir returns entries in arbitrary order, and a cookie
# ID seen twice is overwritten by the later occurrence, so a fixed order keeps
# the collected output reproducible between runs.
domain_paths = sorted(get_directories("crawls/depth0"))
for site in domain_paths:

    inner_site_paths = sorted(get_directories(site))
    total_inner_pages += len(inner_site_paths)

    for inner_site_path in inner_site_paths:
        normal_har_path = f"{inner_site_path}/normal.json"
        reject_har_path = f"{inner_site_path}/after_reject.json"

        if not (os.path.isfile(normal_har_path) and os.path.isfile(reject_har_path)):
            # Requires both the normal and the after-reject HAR files to exist
            incomplete_runs += 1
            continue

        # get_cookies_from_har returns None and fills the dictionary in place,
        # so there is nothing to bind to a name here.
        get_cookies_from_har(normal_har_path, normal_output)
        get_cookies_from_har(reject_har_path, reject_output)

print(incomplete_runs)
print(total_inner_pages)

2
252


In [22]:
# Serialise both cookie collections to the CookieBlock input directory.
for target_path, cookie_map in (
    ('cookieblock_files/normal_output.json', normal_output),
    ('cookieblock_files/reject_output.json', reject_output),
):
    with open(target_path, 'w') as json_file:
        json.dump(cookie_map, json_file)