In [3]:
import pandas as pd
import numpy as np
import json
import os
import utils
import csv
import math
import matplotlib
import matplotlib.pyplot as plt

In [22]:
def get_directories(root: str) -> list[str]:
    """
    List the immediate sub-directories of a root directory.

    Args:
        root: Path to the root directory.

    Returns:
        The paths (``root`` joined with the entry name) of every entry of
        ``root`` that is a directory, in the order ``os.listdir`` yields them.
    """
    # Lazily build the full path of every entry, then keep only the directories.
    candidates = (os.path.join(root, entry) for entry in os.listdir(root))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

def get_cookies_from_har(file: str) -> list[dict[str, str]]:
    """
    Returns a list of cookies from response entries in an HAR file.
    [HAR Specification](http://www.softwareishard.com/blog/har-12-spec/).

    Only cookies that carry a non-empty "domain" field are returned; cookies
    without one are skipped.

    Args:
        file: Path to the HAR file.

    Returns:
        A list of dictionaries representing all cookies in HTTP responses in that HAR file
        with domains, where each dictionary holds 3 key-value pairs
        (Cookie Name, Cookie Value, Cookie Domain).

    Raises:
        KeyError: If the file lacks "log"/"entries", an entry lacks "response",
            or a cookie with a domain lacks "name" or "value".
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # HAR files are UTF-8 and may start with a BOM; "utf-8-sig" reads plain
    # UTF-8 and strips a leading BOM that json.load would otherwise reject.
    # The context manager closes the file handle even if parsing fails.
    with open(file, "r", encoding="utf-8-sig") as har_file:
        data = json.load(har_file)  # parses JSON data into Python dictionary

    cookies = []
    for entry in data["log"]["entries"]:  # each entry is an HTTP request/response pair
        response = entry["response"]  # extract response dictionary

        # A missing, null or empty "cookies" field simply contributes nothing.
        for cookie in response.get("cookies") or []:
            if cookie.get("domain"):  # keep only cookies that declare a domain
                cookies.append({
                    "Cookie Name": cookie["name"],
                    "Cookie Value": cookie["value"],
                    "Cookie Domain": cookie["domain"],
                })

    return cookies

def check_requests(detected_list_from_responses: list[dict[str, str]], file: str) -> list[dict[str, str]]:
    """
    Returns a list of cookies from request entries in an HAR file that also appeared in a response entry.

    Cookies are matched by name only. The value is taken from the request,
    while the domain is taken from the FIRST response cookie in
    ``detected_list_from_responses`` that has the same name.

    Args:
        detected_list_from_responses: List of cookies from response entries in an HAR file;
            each item must have the keys "Cookie Name" and "Cookie Domain".
        file: Path to the HAR file.

    Returns:
        A list of dictionaries representing all matching cookies in HTTP requests in that HAR file,
        where each dictionary holds 3 key-value pairs (Cookie Name, Cookie Value, Cookie Domain).
        One item is produced per occurrence in a request, so a name may appear several times.

    Raises:
        KeyError: If the file lacks "log"/"entries", an entry lacks "request",
            or a matching request cookie lacks "value".
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Build the name -> domain lookup once instead of rebuilding a name list
    # for every request cookie. setdefault keeps the first occurrence, which
    # is the one a list `.index()` lookup would select.
    domain_by_name = {}
    for detected in detected_list_from_responses:
        domain_by_name.setdefault(detected["Cookie Name"], detected["Cookie Domain"])

    # "utf-8-sig" reads UTF-8 with or without a leading BOM; the context
    # manager guarantees the handle is closed.
    with open(file, "r", encoding="utf-8-sig") as har_file:
        data = json.load(har_file)  # parses JSON data into Python dictionary

    detected_list_from_requests = []
    for entry in data["log"]["entries"]:  # each entry is an HTTP request/response pair
        request = entry["request"]  # extract request dictionary

        # `or []` tolerates requests whose "cookies" field is missing or null.
        for cookie in request.get("cookies") or []:
            name = cookie.get("name")
            if name in domain_by_name:  # cookie was also set by some response
                detected_list_from_requests.append({
                    "Cookie Name": name,
                    "Cookie Value": cookie["value"],
                    "Cookie Domain": domain_by_name[name],
                })

    return detected_list_from_requests

In [18]:
# Copyright (C) 2021-2022 Dino Bollinger, ETH Zürich, Information Security Group
# Released under the MIT License
"""
Parts of this cell are taken from cookie_stats.py of CookieBlock-Consent-Crawler.

This script takes as input a json file as produced by the script `database_scripts/extract_cookies_from_db.py`,
which is a list of cookies with matching consent label declaration.

This script can extract a number of additional statistics on this data, such as the majority opinion to
the categorisation provided by Cookiepedia, and the ratio of disagreement on the label of a cookie name.

Cookiepedia hereby provides a large database of cookies, where each cookie is assigned one out of 5 categories, those being:
   1. Strictly Necessary
   2. Functional
   3. Performance (Analytics)
   4. Targeting/Advertising
   5. Unknown
The last category is usually given when Cookiepedia does not have enough data to form a categorization.
"""
# from docopt import docopt
import requests
import requests.exceptions as rexcepts

import pickle
import os
import json
import random
import re
import traceback
import logging

from enum import IntEnum
from typing import Tuple, List, Dict, Optional, Any

# Logger shared by the Cookiepedia helpers in this cell
logger = logging.getLogger("stats")

# Cache of Cookiepedia results keyed by cookie name, filled by
# get_cookiepedia_opinion so repeated names do not trigger new requests
cookiepedia_lookup = {}

class CookieCategories(IntEnum):
    """
    Integer identifiers for the category names reported by Cookiepedia.

    get_cookiepedia_opinion translates the category string scraped from
    Cookiepedia into one of these members.
    """
    # Category string did not match any known name
    Unrecognized = -1
    # "Strictly Necessary"
    Necessary = 0
    # "Functionality"
    Functional = 1
    # "Performance"
    Analytical = 2
    # "Targeting/Advertising"
    Advertising = 3
    # "Unknown", and also used for "Not Found" / "Connection Failed"
    Unknown = 4

def simple_get(url) -> Optional[requests.Response]:
    """
    Issue a GET request to the given address, turning every failure into None.
    @param url: URL to send the GET request to
    @return: Response object, or None if any exception was raised by the request
    """
    # Request failures that are only worth a debug-level log entry
    expected_failures = (
        rexcepts.TooManyRedirects, rexcepts.SSLError,
        rexcepts.URLRequired, rexcepts.MissingSchema,
        rexcepts.ConnectionError, rexcepts.Timeout,
    )
    # fake chrome user agent, required or else Cookiepedia will not respond
    browser_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

    try:
        # timeout tuple is (connect, read), 15 seconds each
        return requests.get(url, timeout=(15, 15), headers=browser_headers)
    except expected_failures as ex:
        logger.debug(f"'{type(ex)}' exception occurred while trying to access: '{url}'")
    except Exception as ex:
        # Anything else is unexpected: log it loudly, including the traceback
        logger.error(f"Unexpected '{type(ex)}' exception occurred while trying to access: '{url}'")
        logger.error(traceback.format_exc())

    # Reached only when an exception was handled above
    return None
    
def get_cookiepedia_opinion(cookie_name: str) -> Tuple[CookieCategories, str]:
    """
    Send a request to Cookiepedia to get their category for the given cookie name, if present.

    The category string is scraped from the returned page and translated to a
    CookieCategories member:
      - "Strictly Necessary", "Functionality", "Performance" and
        "Targeting/Advertising" map to Necessary, Functional, Analytical and
        Advertising respectively.
      - "Unknown", a page without a category ("Not Found") and a failed request
        ("Connection Failed") all map to CookieCategories.Unknown (id 4).
      - Any other string maps to CookieCategories.Unrecognized (id -1) and is
        logged as a warning.

    Results are cached in the module-level ``cookiepedia_lookup`` dictionary.
    Failed requests are NOT cached, so a transient network error does not fix
    the cookie's category for the rest of the session.

    @param cookie_name: Cookie name to retrieve category for.
    @return: Tuple of Category ID and Category Name
    """
    # Local import keeps this block self-contained (urllib is not imported at file level).
    from urllib.parse import quote

    if cookie_name in cookiepedia_lookup:
        return cookiepedia_lookup[cookie_name]

    # Cookie names come from crawled sites and may contain URL-reserved
    # characters ("/", "?", "#", "%", ...). Percent-encode the whole name so it
    # is always sent as a single path segment.
    encoded_name = quote(str(cookie_name), safe="")
    result = simple_get(f"https://cookiepedia.co.uk/cookies/{encoded_name}")

    cookiepedia_category: str = "Not Found"
    if result is not None:
        # Non-greedy group: stop at the first closing tag even if more
        # <strong> elements follow on the same line.
        m_obj = re.search("The main purpose of this cookie is:\\s*<strong>(.*?)</strong>", result.text)
        if m_obj is not None:
            cookiepedia_category = m_obj.group(1)
    else:
        cookiepedia_category = "Connection Failed"

    # Translate string category name to an integer for comparison purposes
    known_categories = {
        "Strictly Necessary": CookieCategories.Necessary,
        "Functionality": CookieCategories.Functional,
        "Performance": CookieCategories.Analytical,
        "Targeting/Advertising": CookieCategories.Advertising,
    }
    cp_id: CookieCategories
    if cookiepedia_category in known_categories:
        cp_id = known_categories[cookiepedia_category]
    elif re.match("(Unknown|Not Found|Connection Failed)", cookiepedia_category):
        cp_id = CookieCategories.Unknown
    else:
        logger.warning(f"Unrecognized category name: {cookiepedia_category}")
        cp_id = CookieCategories.Unrecognized

    # Only cache answers that actually came from Cookiepedia.
    if result is not None:
        cookiepedia_lookup[cookie_name] = (cp_id, cookiepedia_category)

    return cp_id, cookiepedia_category

In [23]:
# Walk the crawl output (one directory per domain, one sub-directory per
# inner page) and collect, for every inner page with a complete crawl, the
# request cookies that were previously set by a response of the same page.
domain_paths = get_directories("crawls/depth0")

# for counting number of inner pages per domain
domains_paths_normal = {}
domains_paths_reject = {}

incomplete_runs = 0
total_inner_pages = 0

detected_cookies_normal = []
# NOTE: not filled in this cell; only normal.json is analysed below.
detected_cookies_reject = []

for site in domain_paths:
    inner_site_paths = get_directories(site)
    total_inner_pages += len(inner_site_paths)

    # The domain is the name of the site's directory. Taking the basename
    # instead of a fixed "/"-split index keeps this working if the crawl root
    # changes depth or the platform uses a different path separator.
    domain = os.path.basename(os.path.normpath(site))

    for inner_site_path in inner_site_paths:
        normal_har_path = os.path.join(inner_site_path, "normal.json")
        reject_har_path = os.path.join(inner_site_path, "after_reject.json")

        if not (os.path.isfile(normal_har_path) and os.path.isfile(reject_har_path)):
            # Requires both the normal and the after-reject HAR files to exist
            incomplete_runs += 1
            continue

        # Record the inner page for both crawl types (both files exist here)
        domains_paths_normal.setdefault(domain, []).append(inner_site_path)
        domains_paths_reject.setdefault(domain, []).append(inner_site_path)

        cookies_from_responses_normal = get_cookies_from_har(normal_har_path)
        cookies_from_requests_normal = check_requests(cookies_from_responses_normal, normal_har_path)

        # saving the matched request cookies as flat records for easy parsing into a dataframe
        for cookie in cookies_from_requests_normal:
            detected_cookies_normal.append({
                "Domain": domain,
                "Inner Site Path": inner_site_path,
                "Cookie Name": cookie["Cookie Name"],
                "Cookie Value": cookie["Cookie Value"],
                "Cookie Domain": cookie["Cookie Domain"],
                "Cookiepedia Category": get_cookiepedia_opinion(cookie["Cookie Name"])[1]
            })

# Build the frame once from the collected records
df_detected_cookies_normal = pd.DataFrame(detected_cookies_normal)

[{'Cookie Name': 'o', 'Cookie Value': ':1624:VBg=.m', 'Cookie Domain': 'mail.ru'}, {'Cookie Name': 'mrhc', 'Cookie Value': 'lpmtfX4fi8nZC1c+lnmlJywIc0YE4xRtFlOWuj4BlG4=', 'Cookie Domain': 'mail.ru'}, {'Cookie Name': 'VID', 'Cookie Value': '2Yqr0609RoYI00000t1kP4oI:::0-0-0-9cc0211:CAASEHk3M5uvxTW_06RQ5fNFugYaYC50ln4UErl3bH08TEGosDE6jSwGtc5N_-UQjOBXy-Agdmet2KYsHPMPP2u_IREmnRcoyISRpYdGOdOqhgXyTpIN0d1QN75T3YpaOceMRghUIkM12T5_BAbk6KOBLE32fw', 'Cookie Domain': '.mail.ru'}, {'Cookie Name': 'VID', 'Cookie Value': '2Yqr0609RoYI00000t1kP4oI:::0-0-0-9cc0211:CAASEIhP9dsTCPTqSfp_K0li53MaYAvHtkjaHqklV9KPbj-AIpu3Ky0ES_d6xqhlE62QL-NP4YCJHraF7cNo7vEOX_7mF0zyW_grBAvWIcSxH2pGUPFfN_PMdP9qEgUdmh4Y6AIjQ6VQrHruuX_KsyHUydPreg', 'Cookie Domain': '.mail.ru'}, {'Cookie Name': 'VID', 'Cookie Value': '2Yqr0609RoYI00000t1kP4oI:::0-0-0-9cc0211:CAASEJzy8xr6dWWnI7NuNTCI63EaYDjcXpY2vVshTLYCz11XyYIyNv8ZvowUGQcNAGhcx20iuVAu5uCsHR30TMnQFPL875ZN3nNxDGxnCOloV8FwZPaMM8pmxzdFhf99vVSh4aWGtH1n93Fy9jvdAfRn-mcMtA', 'Cookie Domain

In [27]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a spurious "None" line to the output.
df_detected_cookies_normal.info()

# Number of detected cookies per Cookiepedia category
category_counts = df_detected_cookies_normal['Cookiepedia Category'].value_counts()
print(category_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6788 entries, 0 to 6787
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Domain                6788 non-null   object
 1   Inner Site Path       6788 non-null   object
 2   Cookie Name           6788 non-null   object
 3   Cookie Value          6788 non-null   object
 4   Cookie Domain         6788 non-null   object
 5   Cookiepedia Category  6788 non-null   object
dtypes: object(6)
memory usage: 318.3+ KB
None
Cookiepedia Category
Unknown                  3453
Targeting/Advertising    2127
Not Found                 653
Performance               308
Strictly Necessary        140
Functionality             107
Name: count, dtype: int64
