# Generate Training Data
## Preparation

In [1]:
from scapy.all import *
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import logging

logging.basicConfig(level=logging.DEBUG)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [3]:
# Formatters for displaying pandas data tables with columns of (lists of) scapy packets

def packet_list_fmt(lst):
    return f"{len(lst)} packets"

def packet_fmt(pkt):
    return pkt[DNS].summary()

In [4]:
ALGO_NAME = {
    0: 'unsigned',
    0: 0,
    5: 'rsasha1', 
    7: 'rsasha1nsec3sha1', 
    8: 'rsasha256', 
    10: 'rsasha512',
    13: 'ecdsap256sha256', 
    14: 'ecdsap384sha384', 
    15: 'ed25519', 
    16: 'ed448',
}
ALGO_NUM = {name: num for num, name in ALGO_NAME.items()}

In [5]:
# retrieve information encoded in DNS qnames

def split_qname(row):
    qname = row['qname']
    parts = qname.decode('ascii').split(".")
    try:
        if ".".join(parts[-3:]) == "dnstb.net.":
            if '-' not in parts[-4]:
                # this qname has a label directly under dnstb.net, store it
                # then remove to be compatible with qnames without label
                qlabel = parts[-4]
                del parts[-4]
            else:
                qlabel = ''
            
            resolver_parts = parts[-5].split("-")            
            if parts[-4] == 'unsigned':
                return {
                    'zone_algorithm': 0,
                    'zone_keysize': 0,
                    'zone_nsec': 0,
                    'zone_status': parts[-4],
                    'resolver': resolver_parts[1],
                    'qlabel': qlabel,                    
                }
            else:
                zone_parts = parts[-4].split("-")
                return {
                    'zone_algorithm': zone_parts[0],
                    'zone_keysize': zone_parts[1],
                    'zone_nsec': zone_parts[2],
                    'zone_status': zone_parts[3],
                    'resolver': resolver_parts[1],
                    'qlabel': qlabel,
                }
    except IndexError:
        logging.warning(f"could not split qname {qname} into meaningful information")
        return {
            'zone_algorithm': None,
            'zone_keysize': None,
            'zone_nsec': None,
            'zone_status': None,
            'resolver': None,
            'qlabel': None,
        }
    logging.warning(f"Could not split qname: {qname}")
    return {
        'zone_algorithm': None,
        'zone_keysize': None,
        'zone_nsec': None,
        'zone_status': None,
        'resolver': None,
        'qlabel': None,
    }

## Load Data

In [None]:
%%bash 

mkdir -p ../traffic
rsync -aP root@ns1.adnssec.dedyn.io:/var/log/tcplogger/ ../traffic/

In [6]:
# load the traffic logs
# TODO: loop over available traffic dumps

c_logs = filter(lambda s: s, """
tcpdump_opn_cisco-umbrella_vpn0_20210617014323.pcap
tcpdump_opn_cloudflare_vpn0_20210617014323.pcap
tcpdump_opn_comodo-secure-dns_vpn0_20210617014323.pcap
tcpdump_opn_cznic-odvr_vpn0_20210617014323.pcap
tcpdump_opn_freenom-world_vpn0_20210617014323.pcap
tcpdump_opn_google_vpn0_20210617014323.pcap
tcpdump_opn_neustar-free-recursive_vpn0_20210617014323.pcap
tcpdump_opn_norton-connectsafe_vpn0_20210617014323.pcap
tcpdump_opn_opennic_vpn0_20210617014323.pcap
tcpdump_opn_oracle-dyn_vpn0_20210617014323.pcap
tcpdump_opn_quad9_vpn0_20210617014323.pcap

tcpdump_lab_bind9113_vpn0_20210617024144.pcap
tcpdump_lab_kresd532_vpn0_20210617024144.pcap
tcpdump_lab_powerdns460_vpn0_20210617024144.pcap
tcpdump_lab_unbound167_vpn0_20210617024144.pcap
tcpdump_lab_ws2012r2_vpn0_20210617024144.pcap
tcpdump_lab_ws2012_vpn0_20210617024144.pcap
tcpdump_lab_ws2016_vpn0_20210617024144.pcap
tcpdump_lab_ws2019_vpn0_20210617024144.pcap
""".split("\n"))


c_packets = reduce(lambda a,b: a + b, (rdpcap(f"../traffic/{f}") for f in c_logs))

In [7]:
# using the client logs, get a list of queries

queries = []
queries_by_id = {}
dropped = []

for p in c_packets:
    try:
        p[DNS]
    except IndexError:
        logging.warning(f'Broken DNS packet at time {p.time}?')
        dropped.append(p)
        continue
    if p[DNS].qr == 0:
        # query
        qid = p[DNS].id
        q = {
            'id': qid,
            'qname': p[DNS].qd.qname,
            'tag': p[DNS].qd.qname.split(b'.', 1)[0],
            'started': p.time,
            'client_packets': [p],
            'server_packets': [],
        }
        queries.append(q)
        queries_by_id[qid] = q
    else:
        # response
        qid = p[DNS].id
        try:
            queries_by_id[qid]
        except KeyError:
            logging.warning(f"Response without query at time {p.time}")
            continue
        queries_by_id[qid]['finished'] = p.time
        queries_by_id[qid]['client_packets'].append(p)
    
queries = pd.DataFrame(queries)
queries['qname_parts'] = queries.apply(split_qname, axis=1)
for key in ['zone_algorithm', 'zone_keysize', 'zone_nsec', 'zone_status', 'resolver']:
    queries[key] = queries.apply(lambda row: row['qname_parts'][key], axis=1)
del queries['qname_parts']

first_query_time, last_query_time = queries['started'].min(), queries['finished'].max()





In [8]:
len(queries)

2686

In [9]:
len(dropped)

276

In [None]:
from datetime import datetime
import math

In [None]:
datetime.fromtimestamp(math.floor(first_query_time)), datetime.fromtimestamp(math.ceil(last_query_time))

In [None]:
%%bash

cd ../traffic
editcap -A "2021-06-17 01:43:25" -B "2021-06-17 02:50:06" eth0-20210616183851.pcap eth0-20210616183851.filtered.pcap

In [None]:
%%bash

ls -lh ../traffic/eth0*

In [None]:
s_packets = rdpcap('../traffic/eth0-20210616183851.filtered.pcap')

In [None]:
len(s_packets)

## Match Client and Server Data

In [None]:
# using the server logs, match packets captured at the server to the client's questions

candidates = []

for p in tqdm(s_packets):
    if queries['started'].min() <= p.time <= queries['finished'].max():
        try:
            p_dns = tag = p[DNS]
        except IndexError:
            logging.warning(f'packet with timestamp {p.time} could not be parsed as DNS packet')
            continue
        
        if p_dns.qd is None:
            logging.warning(f'packet with timestamp {p.time} did not contain a query name')
            continue
        
        tag = p_dns.qd.qname.split(b'.', 1)[0]
        candidate_queries = (queries['started'] <= p.time) & (p.time <= queries['finished']) & (queries['tag'] == tag)
        if sum(candidate_queries) == 0:
            continue
        elif sum(candidate_queries) == 1:
            queries[candidate_queries].iloc[0]['server_packets'].append(p)
        else:
            logging.warning(f'Initial DNS query not uniquely identified for packet with time stamp {p.time}')
        

## Clean and Organize Data

In [10]:
# drop rows that do not have exactly two client packets

queries['num_client_packets'] = queries.apply(lambda row: len(row['client_packets']), axis=1)
drop = queries['num_client_packets'] != 2
if sum(drop):
    logging.warning(f'Dropping {sum(drop)} of {len(queries)} queries as they do not have exactly two client packets')
    for count, num in zip(*np.unique(queries['num_client_packets'], return_counts=True)):
        if count == 2: continue
        logging.warning(f'- {num} queries had {count} client packet(s)')
queries = queries.drop(queries[drop].index)

queries['client_query'] = queries.apply(lambda row: row['client_packets'][0], axis=1)
queries['client_response'] = queries.apply(lambda row: row['client_packets'][1], axis=1)
del queries['client_packets']



In [11]:
queries['client_response_rcode'] = queries.apply(lambda row: row['client_response'].rcode, axis=1)
queries['client_response_ad'] = queries.apply(lambda row: row['client_response'].ad, axis=1)

In [None]:
# TODO: select server packet(s) for feature extraction

def take_first_query(packet_list):
    queries = filter(lambda p: p[DNS].qr == 0, packet_list)
    try:
        return next(iter(queries))
    except StopIteration:
        return None

queries['num_server_packets'] = queries.apply(lambda row: len(row['server_packets']), axis=1)
queries['server_query'] = queries.apply(lambda row: take_first_query(row['server_packets']), axis=1)

In [None]:
drop = queries['server_query'].isnull()
if sum(drop):
    logging.warning(f"Dropping {sum(drop)} of {len(queries)} queries as corresponding server packets could not be found!")
    queries = queries.drop(queries[drop].index)

In [12]:
queries['validated_response'] = (queries['client_response_rcode'] == 0) & (queries['client_response_ad'] == 1)
queries['rc0&ad1'] = queries['validated_response']
queries['rc0'] = queries['client_response_rcode'] == 0

In [None]:
queries['num_server_packets'].min(), queries['num_server_packets'].mean(), queries['num_server_packets'].max(), queries['num_server_packets'].unique()

## Extract Features and Label

In [None]:
# set the label

queries['label_rcode'] = queries['client_response_rcode']
queries['label_resolver'] = queries['resolver']
queries['label_rcode0andad1'] = queries['validated_response']

In [None]:
# set some features

queries['feature_ip_ttl'] = queries.apply(lambda row: row['server_query'][IP].ttl, axis=1)
queries['feature_ip_src'] = queries.apply(lambda row: row['server_query'][IP].src, axis=1)
queries['feature_ip_proto'] = queries.apply(lambda row: row['server_query'][IP].proto, axis=1)
queries['feature_ip_df'] = queries.apply(lambda row: row['server_query'][IP].flags == 2, axis=1)
queries['feature_ip_ihl'] = queries.apply(lambda row: row['server_query'][IP].ihl, axis=1)
queries['feature_ip_id_is_zero'] = queries.apply(lambda row: row['server_query'][IP].id == 0, axis=1)
queries['feature_udp_len'] = queries.apply(lambda row: row['server_query'][UDP].len, axis=1)
queries['feature_dns_qr'] = queries.apply(lambda row: row['server_query'][DNS].qr, axis=1)
queries['feature_dns_opcode'] = queries.apply(lambda row: row['server_query'][DNS].opcode, axis=1)
queries['feature_dns_aa'] = queries.apply(lambda row: row['server_query'][DNS].aa, axis=1)
queries['feature_dns_tc'] = queries.apply(lambda row: row['server_query'][DNS].tc, axis=1)
queries['feature_dns_rd'] = queries.apply(lambda row: row['server_query'][DNS].rd, axis=1)
queries['feature_dns_ra'] = queries.apply(lambda row: row['server_query'][DNS].ra, axis=1)
queries['feature_dns_z'] = queries.apply(lambda row: row['server_query'][DNS].z, axis=1)
queries['feature_dns_cd'] = queries.apply(lambda row: row['server_query'][DNS].cd, axis=1)
queries['feature_dns_rcode'] = queries.apply(lambda row: row['server_query'][DNS].rcode, axis=1)
queries['feature_dns_qdcount'] = queries.apply(lambda row: row['server_query'][DNS].qdcount, axis=1)
queries['feature_dns_ancount'] = queries.apply(lambda row: row['server_query'][DNS].ancount, axis=1)
queries['feature_dns_nscount'] = queries.apply(lambda row: row['server_query'][DNS].nscount, axis=1)
queries['feature_dns_arcount'] = queries.apply(lambda row: row['server_query'][DNS].arcount, axis=1)
queries['feature_dns_edns_requestors_udp_payload_size'] = queries.apply(lambda row: row['server_query'][DNS].ar.rclass, axis=1)
queries['feature_dns_0x20'] = queries.apply(lambda row: row['server_query'][DNS].qd.qname.lower() != row['server_query'][DNS].qd.qname and row['server_query'][DNS].qd.qname.upper() != row['server_query'][DNS].qd.qname, axis=1)
queries['feature_dns_edns_num_extra_attributes'] = queries.apply(lambda row: len(row['server_query'][DNS].ar.rdata), axis=1)
assert queries['feature_dns_edns_num_extra_attributes'].max() == 1
queries['feature_dns_edns_optcode'] = queries.apply(lambda row: row['server_query'][DNS].ar.rdata[0].optcode if row['feature_dns_edns_num_extra_attributes'] == 1 else -1, axis=1)
assert set(queries['feature_dns_edns_optcode'].unique()) == {-1, 8, 10}
queries['feature_dns_edns_cookie'] = queries['feature_dns_edns_optcode'] == 10
queries['feature_dns_edns_subnet'] = queries['feature_dns_edns_optcode'] == 8
del queries['feature_dns_edns_num_extra_attributes']
del queries['feature_dns_edns_optcode']
#queries['feature_zone_algorithm'] = queries['zone_algorithm']
#queries['feature_zone_keysize'] = queries['zone_keysize']
#queries['feature_zone_nsec'] = queries['zone_nsec']

In [None]:
for algo in queries['zone_algorithm'].unique():
    print(f'++++{algo}++++')
    for c in filter(lambda c: c.startswith('feature_'), queries.keys()):
        u = np.unique(queries[(queries['zone_algorithm'] == algo)][c], return_counts=True)
        if len(u[0]) > 1:
            print(f"{c}: {len(u[0])} unique values")

## Show Data
### All Columns

In [13]:
queries[queries['zone_status'] == 'signedok'].iloc[-10:].style.format({'server_packets': packet_list_fmt, 'client_query': packet_fmt, 'client_response': packet_fmt, 'server_query': packet_fmt})

Unnamed: 0,id,qname,tag,started,server_packets,finished,zone_algorithm,zone_keysize,zone_nsec,zone_status,resolver,num_client_packets,client_query,client_response,client_response_rcode,client_response_ad,validated_response,rc0&ad1,rc0
2657,41058,b'lab-ws2019-20210617024144.rsasha256-4096-1-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890622.762194,0 packets,1623890623.032772,rsasha256,4096,1,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha256-4096-1-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2660,25626,b'lab-ws2019-20210617024144.rsasha256-4096-3-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890623.383907,0 packets,1623890623.544726,rsasha256,4096,3,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha256-4096-3-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2663,46391,b'lab-ws2019-20210617024144.rsasha512-1024-1-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890624.09258,0 packets,1623890624.431599,rsasha512,1024,1,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-1024-1-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2666,46707,b'lab-ws2019-20210617024144.rsasha512-1024-3-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890624.782034,0 packets,1623890625.0099,rsasha512,1024,3,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-1024-3-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2669,64221,b'lab-ws2019-20210617024144.rsasha512-1871-1-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890625.465641,0 packets,1623890625.580663,rsasha512,1871,1,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-1871-1-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2672,55637,b'lab-ws2019-20210617024144.rsasha512-1871-3-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890625.910329,0 packets,1623890626.005194,rsasha512,1871,3,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-1871-3-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2675,65304,b'lab-ws2019-20210617024144.rsasha512-2048-1-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890626.360359,0 packets,1623890626.51447,rsasha512,2048,1,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-2048-1-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2678,16923,b'lab-ws2019-20210617024144.rsasha512-2048-3-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890627.141417,0 packets,1623890627.336249,rsasha512,2048,3,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-2048-3-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2681,13480,b'lab-ws2019-20210617024144.rsasha512-4096-1-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890627.753746,0 packets,1623890628.050598,rsasha512,4096,1,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-4096-1-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True
2684,4898,b'lab-ws2019-20210617024144.rsasha512-4096-3-signedok.a.dnstb.net.',b'lab-ws2019-20210617024144',1623890628.474268,0 packets,1623890628.574319,rsasha512,4096,3,signedok,ws2019,2,"DNS Qry ""b'lab-ws2019-20210617024144.rsasha512-4096-3-signedok.a.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,1,True,True,True


In [14]:
def validation_info(data):
    n = len(queries)
    for rcode in data['client_response_rcode'].unique():
        for ad in data['client_response_ad'].unique():
            c = sum((queries['client_response_rcode'] == rcode) & (queries['client_response_ad'] == ad))
            print(f"{c}/{n} ({c/n:.1%}) have rcode={rcode} ad={ad}")
            
print('all queries')
validation_info(queries)

print('signedok queries')
validation_info(queries[queries['zone_status'] == 'signedok'])

all queries
712/2102 (33.9%) have rcode=0 ad=0
528/2102 (25.1%) have rcode=0 ad=1
855/2102 (40.7%) have rcode=2 ad=0
7/2102 (0.3%) have rcode=2 ad=1
signedok queries
528/2102 (25.1%) have rcode=0 ad=1
712/2102 (33.9%) have rcode=0 ad=0
7/2102 (0.3%) have rcode=2 ad=1
855/2102 (40.7%) have rcode=2 ad=0


In [15]:
def validation_success_rate(c):
    return sum(c == 1) / len(c)

def sr(c):
    return float(sum(c == 1) / len(c))

In [16]:
queries['zone_keysize'] = queries['zone_keysize'].astype(np.int)

In [17]:
resolvers = set(queries['resolver'].unique())
resolvers_lab = {'bind9113',
 'powerdns460',
 'unbound167',
 'ws2012',
 'ws2012r2',
 'ws2016',
 'ws2019'}
resolvers_named_open = resolvers - resolvers_lab
resolver_order = list(sorted(resolvers_lab)) + list(sorted(resolvers_named_open))
resolver_order

['bind9113',
 'powerdns460',
 'unbound167',
 'ws2012',
 'ws2012r2',
 'ws2016',
 'ws2019',
 'cisco',
 'cloudflare',
 'comodo',
 'cznic',
 'freenom',
 'google',
 'kresd532',
 'neustar',
 'norton',
 'opennic',
 'oracle',
 'quad9']

In [18]:
def validation_status_color(val):
    if np.isnan(val):
        return 'color: grey'
    color = 'red' if val < 1 else 'black'
    return 'color: %s' % color

def best_algo(c):
    cnum = list(map(lambda name: ALGO_NUM[name], c))
    return ALGO_NAME[max(cnum)]

pd.options.display.precision = 0

resolver_support = queries[
    (queries['zone_status'] == 'signedok')  # only valid signatures / secure zones
    & ~((queries['zone_nsec'] == '3') & (queries['zone_algorithm'] == 'rsasha1'))  # ignore nsec3 with rsasha1
    & ~(queries['resolver'].isin(['freenom', 'opennic', 'quad9']))  # ignore resolvers with measurement/analysis errors
].groupby(['resolver', 'zone_algorithm', 'zone_keysize', 'zone_nsec']).agg({
    #'id': ['count'],
    'rc0&ad1': [sr],
}).reset_index()
resolver_best = resolver_support[resolver_support[('rc0&ad1', 'sr')]].groupby(['resolver']).agg({
    'zone_algorithm': [best_algo]
}).sort_values(['resolver'], key=lambda r: list(map(lambda e: resolver_order.index(e), r)), kind='mergesort')
resolver_best.to_pickle('resolver_best.pickle')
resolver_best

Unnamed: 0_level_0,zone_algorithm
Unnamed: 0_level_1,best_algo
resolver,Unnamed: 1_level_2
bind9113,ecdsap384sha384
powerdns460,ed448
unbound167,ed25519
ws2012,ecdsap384sha384
ws2012r2,ecdsap384sha384
ws2016,ecdsap384sha384
ws2019,ecdsap384sha384
cisco,ed448
cloudflare,ed25519
comodo,ed448


In [19]:
def response_status(row):
    rs = {
        (0, 1): 'validated',
        (0, 0): 'insecure', 
        (2, 0): 'failure',
        (2, 1): 'failure ad',
    }
    return rs[(row['client_response_rcode'], row['client_response_ad'])]

def single_value(c):
    if len(c) == 0:
        return None
    if len(c) == 1:
        return next(iter(c))
    raise ValueError()

def security_status_color(val):
    if isinstance(val, float) and math.isnan(val):
        color = 'grey'
    else:
        color = 'green' if val == 'validated' else 'red'
    #print(type(val), val, color)        
    return 'color: %s' % color

def latex_symbol(val):
    if val == 'validated':
        return r'\cmark'
    elif val == 'insecure':
        return r'\xmark'
    else:
        raise ValueError()

queries['resolver response'] = queries.apply(response_status, axis=1)
queries['key size'] = queries['zone_keysize']
queries['algorithm'] = queries['zone_algorithm']
queries['nsec'] = queries['zone_nsec']

resolver_support = queries[
    (queries['zone_status'] == 'signedok')  # only valid signatures / secure zones
    & (queries['zone_nsec'] == '1')  # only NSEC1
    & ~((queries['zone_nsec'] == '3') & (queries['zone_algorithm'] == 'rsasha1'))  # ignore nsec3 with rsasha1
    & ~(queries['zone_keysize'] > 2048)  # ignore large key sizes
    & ~(queries['zone_algorithm'].isin(['rsasha1', 'rsasha1nsec3sha1', 'rsasha256', 'rsasha512']) & (queries['zone_keysize'] < 2048))  # ignore small rsa keys
    & ~(queries['zone_algorithm'] == 'rsasha1nsec3sha1')  # ignore weird algorithms
    & ~(queries['resolver'].isin(['freenom', 'opennic', 'quad9']))  # ignore resolvers with measurement/analysis errors
].groupby(['resolver', 'algorithm', 'key size', 'nsec']).agg({
    #'id': ['count'],
    'resolver response': [single_value],
}).reset_index().sort_values(['key size']).sort_values(by=['algorithm'], key=lambda c: c.apply(
    lambda zone_algorithm: ALGO_NUM[zone_algorithm]), kind='mergesort').pivot(
    ['resolver'], ['algorithm', 'key size'], ['resolver response']).sort_values(
    ['resolver'], key=lambda r: list(map(lambda e: resolver_order.index(e), r)), kind='mergesort')

print(resolver_support.to_latex(
    formatters={
        c: latex_symbol
        for c in resolver_support.keys()
    },
    escape=False,
    column_format='lcccccc',
))
resolver_support.style.applymap(security_status_color)

\begin{tabular}{lcccccc}
\toprule
{} & \multicolumn{7}{l}{resolver response} \\
algorithm &           rsasha1 & rsasha256 & rsasha512 & ecdsap256sha256 & ecdsap384sha384 & ed25519 &   ed448 \\
key size &              2048 &      2048 &      2048 &            256  &            384  &    256  &    456  \\
resolver    &                   &           &           &                 &                 &         &         \\
\midrule
bind9113    &            \cmark &    \cmark &    \cmark &          \cmark &          \cmark &  \xmark &  \xmark \\
powerdns460 &            \cmark &    \cmark &    \cmark &          \cmark &          \cmark &  \cmark &  \cmark \\
unbound167  &            \cmark &    \cmark &    \cmark &          \cmark &          \cmark &  \cmark &  \xmark \\
ws2012      &            \cmark &    \cmark &    \cmark &          \cmark &          \cmark &  \xmark &  \xmark \\
ws2012r2    &            \cmark &    \cmark &    \cmark &          \cmark &          \cmark &  \xmark &  \xmark

Unnamed: 0_level_0,resolver response,resolver response,resolver response,resolver response,resolver response,resolver response,resolver response
algorithm,rsasha1,rsasha256,rsasha512,ecdsap256sha256,ecdsap384sha384,ed25519,ed448
key size,2048,2048,2048,256,384,256,456
resolver,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
bind9113,validated,validated,validated,validated,validated,insecure,insecure
powerdns460,validated,validated,validated,validated,validated,validated,validated
unbound167,validated,validated,validated,validated,validated,validated,insecure
ws2012,validated,validated,validated,validated,validated,insecure,insecure
ws2012r2,validated,validated,validated,validated,validated,insecure,insecure
ws2016,validated,validated,validated,validated,validated,insecure,insecure
ws2019,validated,validated,validated,validated,validated,insecure,insecure
cisco,validated,validated,validated,validated,validated,validated,validated
cloudflare,validated,validated,validated,validated,validated,validated,insecure
comodo,validated,validated,validated,validated,validated,validated,validated


### ML Columns

In [None]:
# collect columns relevant for ML
ML = list(filter(lambda c: c.startswith('label_') or c.startswith('feature_'), queries.keys()))

In [None]:
# collect rows relevant for ML
signedok = queries['zone_status'] == 'signedok'

In [None]:
# save table to disk
for algorithm in queries['zone_algorithm'].unique():
    queries[signedok & (queries['zone_algorithm'] == algorithm)].to_pickle(f'ml_data_{algorithm}.pickle')

In [None]:
%%bash
ls -lh ml_data_*.pickle

In [None]:
# show table
queries[signedok][ML].iloc[-10:].style.format({'server_packets': packet_list_fmt, 'client_query': packet_fmt, 'client_response': packet_fmt})

In [None]:
sum(signedok)

### Some Data Insights

In [None]:
# client query duration distribution

queries['duration'] = queries['finished'] - queries['started']

data = queries['duration'].astype(np.float).dropna()
bins = [b*10**p for p in [-3, -2, -1, 0] for b in [1,1.3,1.6,2,3,4,5,6.6,8.3]]
labels = bins[::3]
hist, bin_edges = np.histogram(data, bins=bins)
cdf = np.cumsum(hist)

plt.figure(figsize=(8, 6))
plt.grid(True)

ax, tax = plt.gca(), plt.twinx()

ax.hist(data, bins=bins, density=True)
ax.set_ylabel('probability density')
ax.set_xlabel('d [s]')

tax.plot(bin_edges[:-1], cdf/cdf[-1], c='r')
tax.set_ylabel('P[client query-response duration < d]')

plt.xscale('log')
plt.xticks(labels, map(str, labels))
plt.title('Distribution of Client Query Round Trip Duration')

None

In [None]:
# show a single query to the server

for p in queries.iloc[200]['server_packets']:
    if p[DNS].qr != 0:
        continue
    print(p[IP].show())