## **Section 5.1: Estimating Tor's Noise Lookup Rate**

- We count, binned by hour, how often our relay (carrying the `HSDir` flag) answers `HS_DESC` lookups from the Tor network with each specific HTTP code and message.
- Based on 744 of these counters (i.e., collected over 31 days), we extrapolate the Tor network's noise lookup rate for the purposes of our paper.
- This notebook produces Table 3 of our paper.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from os.path import join, abspath
from datetime import datetime

In [2]:
# Absolute path of the directory that holds the input data for this notebook,
# resolved against the current working directory.
DATA_DIR = abspath(join(".", "1_data_noise-lookup-rate"))

In [3]:
# Load the whitespace-separated response-code counter log written by our HSDir.
# Each row holds a timestamp followed by nine "label=count" fields, e.g.
# "v2_404_not_fnd_cnt=3104". While loading we:
#   1) name the columns,
#   2) strip the label prefix from every counter field (rm_eq),
#   3) parse the first column as datetimes and use it as the index.

columns = ["timestamp", "v2_200_ok", "v2_400_inv_len", "v2_400_bad_dec",
           "v2_404_not_enc", "v2_404_not_fnd", "v3_200_ok", "v3_404_bad_dec",
           "v3_404_not_fnd", "v3_503_sgl_hop"]

# Layout of the timestamp field in the log.
TIMESTAMP_FORMAT = "%Y/%m/%d_%H:%M:%S"


def rm_eq(data):
    """Return the integer after the "=" in a "label=count" field."""
    return int(data.split("=")[1])


def parse_timestamps(ts):
    """Parse a single log timestamp string into a `datetime`."""
    return datetime.strptime(ts, TIMESTAMP_FORMAT)


# `delim_whitespace` and `date_parser` are deprecated in recent pandas
# releases, so use the equivalent regex separator and convert the index
# explicitly after loading instead.
df = pd.read_csv(
    join(DATA_DIR, "hsdir_hsdesc_resp_code_cnt_stats.log"),
    sep=r"\s+",
    header=None,
    names=columns,
    # Positions 1..9 are the counter fields; position 0 is the timestamp.
    converters={position: rm_eq for position in range(1, len(columns))},
    index_col=0,
)
df.index = pd.to_datetime(df.index, format=TIMESTAMP_FORMAT)

In [4]:
# Separate the counters by onion-service version and reshape each subset
# into long form: one row per (response code, hourly count) pair.
# With the timestamp used as index, data columns 0-4 belong to v2 and
# data columns 5-8 belong to v3.

v2_counter_names = ["v2_200_ok", "v2_400_inv_len", "v2_400_bad_dec",
                    "v2_404_not_enc", "v2_404_not_fnd"]
v3_counter_names = ["v3_200_ok", "v3_404_bad_dec", "v3_404_not_fnd", "v3_503_sgl_hop"]

v2_wide = df.iloc[:, list(range(0, 5))]
v3_wide = df.iloc[:, list(range(5, 9))]

df_v2 = v2_wide.melt(value_vars=v2_counter_names,
                     var_name="resp_code", value_name="resp_count")

df_v3 = v3_wide.melt(value_vars=v3_counter_names,
                     var_name="resp_code", value_name="resp_count")

In [5]:
# Sanity check: 744 hourly bins x 5 v2 response codes = 3720 long-form rows.
num_v2_rows = len(df_v2)
print(num_v2_rows)
assert num_v2_rows == 3720
df_v2

3720


Unnamed: 0,resp_code,resp_count
0,v2_200_ok,62
1,v2_200_ok,63
2,v2_200_ok,61
3,v2_200_ok,86
4,v2_200_ok,56
...,...,...
3715,v2_404_not_fnd,206
3716,v2_404_not_fnd,215
3717,v2_404_not_fnd,239
3718,v2_404_not_fnd,202


In [6]:
# Sanity check: 744 hourly bins x 4 v3 response codes = 2976 long-form rows.
num_v3_rows = len(df_v3)
print(num_v3_rows)
assert num_v3_rows == 2976
df_v3

2976


Unnamed: 0,resp_code,resp_count
0,v3_200_ok,52
1,v3_200_ok,25
2,v3_200_ok,15
3,v3_200_ok,12
4,v3_200_ok,24
...,...,...
2971,v3_503_sgl_hop,0
2972,v3_503_sgl_hop,0
2973,v3_503_sgl_hop,0
2974,v3_503_sgl_hop,0


In [7]:
# Grand total of all counter values across both onion versions.
# Select the numeric column before summing: calling `.sum()` on the whole
# frame would also reduce the string-valued `resp_code` column (concatenating
# every label) just to throw that result away.
sum_total = df_v2["resp_count"].sum() + df_v3["resp_count"].sum()
print("Total number of v2 and v3 responses: {:,}".format(sum_total))

Total number of v2 and v3 responses: 884,260


In [8]:
# Static description of every v2 counter:
# counter name -> (HTTP code, message, response size in cells).
v2_resp_info = {
    "v2_200_ok":      (200, "Found",              "multiple"),
    "v2_400_bad_dec": (400, "Decoding Failed",    "single"),
    "v2_400_inv_len": (400, "Invalid Descriptor", "single"),
    "v2_404_not_enc": (404, "Not Encrypted",      "single"),
    "v2_404_not_fnd": (404, "Not Found",          "single"),
}

v2_grouped_resp_code = df_v2.groupby(["resp_code"])

# One row per v2 response code with its total count and its share (in
# percent) of all v2 and v3 responses combined.
v2_stats = v2_grouped_resp_code.sum().rename(columns={"resp_count": "Count"})
v2_stats["Share"] = (v2_stats["Count"] / sum_total) * 100.0

v2_stats["Onion version"] = "v2"

# Fill the descriptive columns from the lookup table. An unmapped counter
# raises KeyError here instead of leaving a placeholder in the paper table.
# Object dtype keeps the integer HTTP codes in a non-numeric column.
for position, column in enumerate(["HTTP code", "Message", "Response cells"]):
    v2_stats[column] = pd.Series(
        [v2_resp_info[code][position] for code in v2_stats.index],
        index=v2_stats.index, dtype=object)

# Construct combined count and percentage field.
v2_stats["Count (%)"] = [
    "{:,} ({:.2f})".format(count, share)
    for count, share in zip(v2_stats["Count"], v2_stats["Share"])
]

In [9]:
# Display the assembled v2 statistics table for visual inspection.
v2_stats

Unnamed: 0_level_0,Count,Share,Onion version,HTTP code,Message,Response cells,Count (%)
resp_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
v2_200_ok,47331,5.352611,v2,200,Found,multiple,"47,331 (5.35)"
v2_400_bad_dec,0,0.0,v2,400,Decoding Failed,single,0 (0.00)
v2_400_inv_len,10,0.001131,v2,400,Invalid Descriptor,single,10 (0.00)
v2_404_not_enc,0,0.0,v2,404,Not Encrypted,single,0 (0.00)
v2_404_not_fnd,446826,50.531066,v2,404,Not Found,single,"446,826 (50.53)"


In [10]:
# Emit the raw LaTeX for the first half (all v2 counters) of Table 3 in
# Section 5.1. Only the presentation columns are exported; the index is
# left out.

v2_table_columns = ["Onion version", "HTTP code", "Message", "Response cells", "Count (%)"]

v2_latex = v2_stats.to_latex(
    columns=v2_table_columns,
    index=False,
    multirow=True,
    bold_rows=True,
    float_format="{:0.3f}".format,
)
print(v2_latex)

\begin{tabular}{lllll}
\toprule
Onion version & HTTP code &            Message & Response cells &       Count (\%) \\
           v2 &       200 &              Found &       multiple &   47,331 (5.35) \\
\midrule
           v2 &       400 &    Decoding Failed &         single &        0 (0.00) \\
           v2 &       400 & Invalid Descriptor &         single &       10 (0.00) \\
           v2 &       404 &      Not Encrypted &         single &        0 (0.00) \\
           v2 &       404 &          Not Found &         single & 446,826 (50.53) \\
\bottomrule
\end{tabular}



In [11]:
# Static description of every v3 counter:
# counter name -> (HTTP code, message, response size in cells).
v3_resp_info = {
    "v3_200_ok":      (200, "Found",             "multiple"),
    "v3_404_bad_dec": (404, "Decoding Failed",   "single"),
    "v3_404_not_fnd": (404, "Not Found",         "single"),
    "v3_503_sgl_hop": (503, "Reject Single Hop", "single"),
}

v3_grouped_resp_code = df_v3.groupby(["resp_code"])

# One row per v3 response code with its total count and its share (in
# percent) of all v2 and v3 responses combined.
v3_stats = v3_grouped_resp_code.sum().rename(columns={"resp_count": "Count"})
v3_stats["Share"] = (v3_stats["Count"] / sum_total) * 100.0

v3_stats["Onion version"] = "v3"

# Fill the descriptive columns from the lookup table. An unmapped counter
# raises KeyError here instead of leaving a placeholder in the paper table.
# Object dtype keeps the integer HTTP codes in a non-numeric column.
for position, column in enumerate(["HTTP code", "Message", "Response cells"]):
    v3_stats[column] = pd.Series(
        [v3_resp_info[code][position] for code in v3_stats.index],
        index=v3_stats.index, dtype=object)

# Construct combined count and percentage field.
v3_stats["Count (%)"] = [
    "{:,} ({:.2f})".format(count, share)
    for count, share in zip(v3_stats["Count"], v3_stats["Share"])
]

In [12]:
# Display the assembled v3 statistics table for visual inspection.
v3_stats

Unnamed: 0_level_0,Count,Share,Onion version,HTTP code,Message,Response cells,Count (%)
resp_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
v3_200_ok,31564,3.569538,v3,200,Found,multiple,"31,564 (3.57)"
v3_404_bad_dec,0,0.0,v3,404,Decoding Failed,single,0 (0.00)
v3_404_not_fnd,358421,40.53344,v3,404,Not Found,single,"358,421 (40.53)"
v3_503_sgl_hop,108,0.012214,v3,503,Reject Single Hop,single,108 (0.01)


In [13]:
# Emit the raw LaTeX for the second half (all v3 counters) of Table 3 in
# Section 5.1. Only the presentation columns are exported; the index is
# left out.

v3_table_columns = ["Onion version", "HTTP code", "Message", "Response cells", "Count (%)"]

v3_latex = v3_stats.to_latex(
    columns=v3_table_columns,
    index=False,
    multirow=True,
    bold_rows=True,
    float_format="{:0.3f}".format,
)
print(v3_latex)

\begin{tabular}{lllll}
\toprule
Onion version & HTTP code &           Message & Response cells &       Count (\%) \\
           v3 &       200 &             Found &       multiple &   31,564 (3.57) \\
\midrule
           v3 &       404 &   Decoding Failed &         single &        0 (0.00) \\
           v3 &       404 &         Not Found &         single & 358,421 (40.53) \\
           v3 &       503 & Reject Single Hop &         single &      108 (0.01) \\
\bottomrule
\end{tabular}



In [14]:
# Last part of Table 3 in Section 5.1: total count and percentage of the
# responses, aggregated by how many cells the response occupies.

# Responses spanning multiple cells: the successful (200) lookups.
multiple_cell_counts = [
    v2_stats.at["v2_200_ok", "Count"],
    v3_stats.at["v3_200_ok", "Count"],
]
sum_multiple_cells = sum(multiple_cell_counts)
perc_multiple_cells = (sum_multiple_cells / sum_total) * 100.0

# Responses fitting into a single cell.
# NOTE(review): `v2_404_not_enc` and `v3_503_sgl_hop` are also labelled
# "single" in the stats tables but are not part of this sum; confirm that
# leaving them out is intended.
single_cell_counts = [
    v2_stats.at["v2_400_bad_dec", "Count"],
    v2_stats.at["v2_400_inv_len", "Count"],
    v2_stats.at["v2_404_not_fnd", "Count"],
    v3_stats.at["v3_404_bad_dec", "Count"],
    v3_stats.at["v3_404_not_fnd", "Count"],
]
sum_single_cell = sum(single_cell_counts)
perc_single_cell = (sum_single_cell / sum_total) * 100.0

multiple_cells_line = "Count and percentage of all `multiple cell` responses:  {:,} ({:.2f})".format(
    sum_multiple_cells, perc_multiple_cells)
single_cell_line = "Count and percentage of all `single cell` responses:   {:,} ({:.2f})".format(
    sum_single_cell, perc_single_cell)

print(multiple_cells_line)
print(single_cell_line)

Count and percentage of all `multiple cell` responses:  78,895 (8.92)
Count and percentage of all `single cell` responses:   805,257 (91.07)


In [15]:
# Scale the single-cell responses seen by our one relay up by the number of
# HSDirs and divide by the length of the measurement in seconds, yielding a
# per-second rate.
# NOTE(review): 3500 is presumably the number of relays with the HSDir flag
# in the Tor network; confirm the source of this figure.
NUM_TOR_HSDIRS = 3500

# 744 hourly counters, 60 minutes per hour, 60 seconds per minute.
measurement_seconds = 744 * 60 * 60

scaled_single_cell = sum_single_cell * NUM_TOR_HSDIRS
NOISE_RATE = scaled_single_cell / measurement_seconds

print(NUM_TOR_HSDIRS)
print(NOISE_RATE)

3500
1052.2698252688172
