## **Section 5.3: Build List of Lookup Timestamps from our Large Crawls**

- Parse the timestamps of the `HS_DESC` lookups collected during our large-scale crawl in section 4.2. Restrict this set of lookups to the ones with the optimal injection rate of 3 subresources per second.
- Write all (start, end) timestamp tuples into `hsdesc_lookup_details.json`.
- We use this large list of actual lookup timestamps to "replay" lookups in our attack simulations.

In [None]:
import csv
import json
import statistics
from datetime import datetime
from glob import glob
from os.path import abspath, dirname, join, basename, isfile

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [None]:
# NOTE: Set `DATA_DIR` below to the file system location of a crawl
#       conducted with our instrumentation code, after running
#       `../4_attack-tuning/process_events_log.py` on the crawl folders.
#       The data set we provide in this repository for section 4.2 does
#       not contain the required `lookup_timestamps` key in `results.json`.

# DATA_DIR = abspath("PLEASE SET TO APPROPRIATE TOR CRAWL")

In [None]:
# Name of the per-experiment parameter file inside each crawl folder;
# the processing loop reads its "guard_fp" and "start_time" keys.
EXPERIMENT_PARAMS_JSON = "experiment.json"
# Name of the per-experiment CSV of HS_DESC events inside each crawl
# folder; its "start" column is parsed to find the earliest start time.
HSDESC_EVENTS_CSV = "events_hs_desc.csv"

In [None]:
def load_from_json(json_path):
    """Return the parsed JSON content of the file at `json_path`.

    The file is opened in a `with` block so the handle is closed as soon
    as parsing finishes, rather than whenever the garbage collector gets
    around to it (this function is called twice per experiment folder).

    Raises whatever `open` raises for an unreadable path and
    `json.JSONDecodeError` for malformed content.
    """
    with open(json_path) as json_fp:
        return json.load(json_fp)


def load_from_csv(csv_path):
    """Return the rows of the CSV file at `csv_path` as a list of dicts.

    The first line of the file is treated as the header; every following
    line becomes one dict mapping column name to the (string) cell value.
    A file containing only a header yields an empty list.

    Requires the standard-library `csv` module to be imported at file
    level.
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to a reader.
    with open(csv_path, "r", newline="") as csv_fp:
        return list(csv.DictReader(csv_fp))

In [None]:
def convert_to_dt(time_str):
    """Convert from format used in HSDESC_EVENTS_CSV to datetime.

    Two layouts are accepted: "YYYY-MM-DD HH:MM:SS.ffffff" (with a
    fractional-seconds part) and "YYYY-MM-DD HH:MM:SS" (without one).
    The returned datetime is naive (no timezone attached).

    Raises ValueError if `time_str` matches neither layout.
    """

    try:
        return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # Only a format mismatch should trigger the fallback; anything
        # else (e.g. KeyboardInterrupt) must propagate untouched.
        # The timestamp lacks the fractional part when it is exactly zero.
        return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")


def find_earliest_circ_start(hsdesc_events):
    """Return the earliest 'start' time among `hsdesc_events` as a datetime.

    Each event's 'start' value is parsed with `convert_to_dt`; the
    smallest result is the reference point that all lookup timestamps
    are relative to."""

    start_strings = (event['start'] for event in hsdesc_events)
    return min(map(convert_to_dt, start_strings))

In [None]:
# One record per experiment folder; this list is what later gets
# serialized for the attack simulations.
hsdesc_lookup_details = []
n_experiments = 0
lookup_durations = []  # not populated by this cell

# Select the results of completed experiments run at an injection rate
# of 3.0. glob() returns paths in arbitrary (file-system) order, so sort
# them to make the order of the collected records reproducible.
result_json_paths = sorted(
    glob(join(DATA_DIR, "*MAX_HSDESC_RATE*_3.0*_COMPLETED/results.json"))
)

for result_json in tqdm(result_json_paths):

    exp_dir = dirname(result_json)
    exp_params = load_from_json(join(exp_dir, EXPERIMENT_PARAMS_JSON))

    n_experiments += 1

    results = load_from_json(result_json)
    hsdesc_events = load_from_csv(join(exp_dir, HSDESC_EVENTS_CSV))

    # Get the first circuit's start time.
    # All timestamps will be relative to that.
    min_dt = find_earliest_circ_start(hsdesc_events)
    # NOTE(review): min_dt is a naive datetime, so .timestamp() interprets
    # it in the local timezone of the machine running this notebook --
    # confirm that this matches the clock used for `lookup_timestamps`.
    attack_start_ts = min_dt.timestamp()

    lookup_times_dict = {
        "guard_fp": exp_params["guard_fp"],
        # We don't use exp_start_time in the simulations.
        # Just for redundancy and to help with debugging.
        "exp_start_time": exp_params["start_time"],
        "attack_start_time": attack_start_ts,
        # Lookups ordered by their first element (the start timestamp).
        "lookup_times": sorted(results['lookup_timestamps'], key=lambda x: x[0]),
    }

    hsdesc_lookup_details.append(lookup_times_dict)

In [None]:
# Persist the collected lookup details as a single JSON document,
# terminated by a newline.
with open("hsdesc_lookup_details.json", "w") as out_fp:
    out_fp.write(json.dumps(hsdesc_lookup_details) + "\n")