# Generate Training Data
## Preparation

In [3]:
from scapy.all import *
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
import logging

logging.basicConfig(level=logging.DEBUG)

In [5]:
# Formatters for displaying pandas data tables with columns of (lists of) scapy packets

def packet_list_fmt(lst):
    return f"{len(lst)} packets"

def packet_fmt(pkt):
    return pkt[DNS].summary()

In [6]:
ALGO_NAME = {
    0: 'unsigned',
    0: 0,
    5: 'rsasha1', 
    7: 'rsasha1nsec3sha1', 
    8: 'rsasha256', 
    10: 'rsasha512',
    13: 'ecdsap256sha256', 
    14: 'ecdsap384sha384', 
    15: 'ed25519', 
    16: 'ed448',
}
ALGO_NUM = {name: num for num, name in ALGO_NAME.items()}

In [25]:
# retrieve information encoded in DNS qnames

def split_qname(row):
    qname = row['qname']
    parts = qname.decode('ascii').split(".")
    try:
        if ".".join(parts[-3:]) == "dnstb.net.":
            if '-' not in parts[-4]:
                # this qname has a label directly under dnstb.net, store it
                # then remove to be compatible with qnames without label
                qlabel = parts[-4]
                del parts[-4]
            else:
                qlabel = ''
            
            resolver_parts = parts[-5].split("-")            
            if parts[-4] == 'unsigned':
                return {
                    'zone_algorithm': 0,
                    'zone_keysize': 0,
                    'zone_nsec': 0,
                    'zone_status': parts[-4],
                    'resolver': resolver_parts[1],
                    'qlabel': qlabel,                    
                }
            else:
                zone_parts = parts[-4].split("-")
                return {
                    'zone_algorithm': zone_parts[0],
                    'zone_keysize': zone_parts[1],
                    'zone_nsec': zone_parts[2],
                    'zone_status': zone_parts[3],
                    'resolver': resolver_parts[1],
                    'qlabel': qlabel,
                }
    except IndexError:
        logging.warning(f"could not split qname {qname} into meaningful information")
        return {
            'zone_algorithm': None,
            'zone_keysize': None,
            'zone_nsec': None,
            'zone_status': None,
            'resolver': None,
            'qlabel': None,
        }
    logging.warning(f"Could not split qname: {qname}")
    return {
        'zone_algorithm': None,
        'zone_keysize': None,
        'zone_nsec': None,
        'zone_status': None,
        'resolver': None,
        'qlabel': None,
    }

## Load Data

In [1]:
%%bash 

mkdir -p ../traffic
rsync -aP root@ns1.adnssec.dedyn.io:/var/log/tcplogger/ ../traffic/

receiving incremental file list
./
eth0-20210621232559.pcap
              0   0%    0.00kB/s    0:00:00       11,436,032   0%   10.91MB/s    0:19:46       23,199,744   0%   11.06MB/s    0:19:29       33,587,200   0%   10.68MB/s    0:20:10       36,765,696   0%    8.69MB/s    0:24:45       38,862,848   0%    6.48MB/s    0:33:14       40,828,928   0%    4.06MB/s    0:52:58       42,762,240   0%    2.08MB/s    1:43:20       44,957,696   0%    1.87MB/s    1:55:14       48,136,192   0%    2.11MB/s    1:41:55       51,970,048   0%    2.60MB/s    1:22:47       56,623,104   0%    3.28MB/s    1:05:29       61,997,056   0%    4.05MB/s    0:53:03       68,190,208   0%    4.78MB/s    0:44:55       75,038,720   0%    5.50MB/s    0:39:01       82,509,824   0%    6.17MB/s    0:34:46       90,537,984   0%    6.80MB/s    0:31:31       99,221,504   0%    7.39MB/s    0:28:59      107,577,344   0%    7.75MB/s    0:27:37      115,605,504   0%    7.89MB/s    0:27:07      124,420,096   0

In [47]:
# load the traffic logs
# TODO: loop over available traffic dumps

c_logs = filter(lambda s: s, """
tcpdump_opn_cisco-umbrella_vpn0_20210617014323.pcap
tcpdump_opn_cloudflare_vpn0_20210617014323.pcap
tcpdump_opn_comodo-secure-dns_vpn0_20210617014323.pcap
tcpdump_opn_cznic-odvr_vpn0_20210617014323.pcap
tcpdump_opn_freenom-world_vpn0_20210617014323.pcap
tcpdump_opn_google_vpn0_20210617014323.pcap
tcpdump_opn_neustar-free-recursive_vpn0_20210617014323.pcap
tcpdump_opn_norton-connectsafe_vpn0_20210617014323.pcap
tcpdump_opn_opennic_vpn0_20210617014323.pcap
tcpdump_opn_oracle-dyn_vpn0_20210617014323.pcap
tcpdump_opn_quad9_vpn0_20210617014323.pcap

tcpdump_lab_bind9113_vpn0_20210617024144.pcap
tcpdump_lab_kresd532_vpn0_20210617024144.pcap
tcpdump_lab_powerdns460_vpn0_20210617024144.pcap
tcpdump_lab_unbound167_vpn0_20210617024144.pcap
tcpdump_lab_ws2012r2_vpn0_20210617024144.pcap
tcpdump_lab_ws2012_vpn0_20210617024144.pcap
tcpdump_lab_ws2016_vpn0_20210617024144.pcap
tcpdump_lab_ws2019_vpn0_20210617024144.pcap
""".split("\n"))

c_logs = filter(lambda s: s, """
tcpdump_opn-anon_101-50-61-31_0xcb_ens160_20210619010947.pcap
tcpdump_opn-anon_102-22-192-121_0x11_ens160_20210619010947.pcap
tcpdump_opn-anon_102-223-5-1_0xfb_ens160_20210619010947.pcap
tcpdump_opn-anon_103-104-212-98_0x95_ens160_20210619010947.pcap
tcpdump_opn-anon_103-106-58-122_0x47_ens160_20210619010947.pcap
tcpdump_opn-anon_103-108-57-9_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_103-112-238-125_0x1a_ens160_20210619010947.pcap
tcpdump_opn-anon_103-112-238-190_0xfe_ens160_20210619010947.pcap
tcpdump_opn-anon_103-112-238-52_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_103-129-236-221_0x8d_ens160_20210619010947.pcap
tcpdump_opn-anon_103-132-181-244_0xfa_ens160_20210619010947.pcap
tcpdump_opn-anon_103-132-182-32_0xe9_ens160_20210619010947.pcap
tcpdump_opn-anon_103-132-183-239_0xbb_ens160_20210619010947.pcap
tcpdump_opn-anon_103-141-46-251_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_103-147-245-46_0x11_ens160_20210619010947.pcap
tcpdump_opn-anon_103-148-75-126_0xa1_ens160_20210619010947.pcap
tcpdump_opn-anon_103-149-73-123_0xfd_ens160_20210619010947.pcap
tcpdump_opn-anon_103-151-171-128_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_103-152-142-149_0x3e_ens160_20210619010947.pcap
tcpdump_opn-anon_103-153-210-232_0x30_ens160_20210619010947.pcap
tcpdump_opn-anon_103-153-211-155_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_103-154-156-142_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_103-154-156-221_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_103-193-77-74_0xda_ens160_20210619010947.pcap
tcpdump_opn-anon_103-195-2-22_0x11_ens160_20210619010947.pcap
tcpdump_opn-anon_103-202-53-50_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_103-202-54-239_0xcd_ens160_20210619010947.pcap
tcpdump_opn-anon_103-221-254-82_0xfb_ens160_20210619010947.pcap
tcpdump_opn-anon_103-248-210-164_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_103-255-15-203_0x23_ens160_20210619010947.pcap
tcpdump_opn-anon_103-40-148-225_0x8c_ens160_20210619010947.pcap
tcpdump_opn-anon_103-49-157-173_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_103-53-11-51_0xa5_ens160_20210619010947.pcap
tcpdump_opn-anon_103-84-119-182_0x9e_ens160_20210619010947.pcap
tcpdump_opn-anon_103-86-195-234_0x1c_ens160_20210619010947.pcap
tcpdump_opn-anon_103-88-25-182_0x77_ens160_20210619010947.pcap
tcpdump_opn-anon_103-88-25-242_0xf4_ens160_20210619010947.pcap
tcpdump_opn-anon_103-99-150-20_0x16_ens160_20210619010947.pcap
tcpdump_opn-anon_105-243-200-48_0x13_ens160_20210619010947.pcap
tcpdump_opn-anon_105-244-178-241_0x2e_ens160_20210619010947.pcap
tcpdump_opn-anon_106-184-59-37_0xa6_ens160_20210619010947.pcap
tcpdump_opn-anon_109-161-167-178_0x86_ens160_20210619010947.pcap
tcpdump_opn-anon_109-194-30-4_0xab_ens160_20210619010947.pcap
tcpdump_opn-anon_109-86-145-13_0x19_ens160_20210619010947.pcap
tcpdump_opn-anon_109-86-74-14_0x93_ens160_20210619010947.pcap
tcpdump_opn-anon_109-86-88-79_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_109-98-190-95_0x72_ens160_20210619010947.pcap
tcpdump_opn-anon_110-36-208-235_0xf2_ens160_20210619010947.pcap
tcpdump_opn-anon_110-78-81-178_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_113-161-219-209_0xc0_ens160_20210619010947.pcap
tcpdump_opn-anon_114-4-192-164_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_115-43-29-247_0xc9_ens160_20210619010947.pcap
tcpdump_opn-anon_115-92-96-113_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_116-213-171-245_0x88_ens160_20210619010947.pcap
tcpdump_opn-anon_116-92-231-70_0xd8_ens160_20210619010947.pcap
tcpdump_opn-anon_117-2-83-34_0x1a_ens160_20210619010947.pcap
tcpdump_opn-anon_118-69-133-28_0xff_ens160_20210619010947.pcap
tcpdump_opn-anon_119-63-95-24_0xfe_ens160_20210619010947.pcap
tcpdump_opn-anon_119-75-38-146_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_120-50-44-234_0x41_ens160_20210619010947.pcap
tcpdump_opn-anon_120-89-47-201_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_121-7-250-2_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_12-221-171-0_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_12-2-221-248_0x9c_ens160_20210619010947.pcap
tcpdump_opn-anon_12-246-85-22_0x5c_ens160_20210619010947.pcap
tcpdump_opn-anon_12-247-154-142_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_122-53-7-129_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_123-253-34-1_0xe9_ens160_20210619010947.pcap
tcpdump_opn-anon_124-197-42-125_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_125-16-83-73_0xa7_ens160_20210619010947.pcap
tcpdump_opn-anon_125-21-146-238_0x20_ens160_20210619010947.pcap
tcpdump_opn-anon_128-199-230-70_0xfa_ens160_20210619010947.pcap
tcpdump_opn-anon_128-201-76-233_0xf9_ens160_20210619010947.pcap
tcpdump_opn-anon_128-201-92-151_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_12-86-35-174_0xd6_ens160_20210619010947.pcap
tcpdump_opn-anon_12-89-37-230_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_130-93-96-237_0x48_ens160_20210619010947.pcap
tcpdump_opn-anon_131-0-164-70_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_134-90-232-43_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_136-233-84-33_0xed_ens160_20210619010947.pcap
tcpdump_opn-anon_137-59-225-180_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_139-255-93-162_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_14-160-3-78_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_143-202-192-160_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_143-208-172-91_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_145-255-4-150_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_147-135-3-107_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_148-81-127-130_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_149-202-150-142_0xa4_ens160_20210619010947.pcap
tcpdump_opn-anon_149-38-1-186_0xde_ens160_20210619010947.pcap
tcpdump_opn-anon_151-106-55-208_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_154-127-208-221_0xd9_ens160_20210619010947.pcap
tcpdump_opn-anon_154-73-26-51_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_158-174-104-234_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_158-174-110-33_0xff_ens160_20210619010947.pcap
tcpdump_opn-anon_162-159-36-90_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_162-159-46-226_0xf7_ens160_20210619010947.pcap
tcpdump_opn-anon_162-159-50-63_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_162-216-66-62_0xb_ens160_20210619010947.pcap
tcpdump_opn-anon_164-160-126-89_0x1a_ens160_20210619010947.pcap
tcpdump_opn-anon_164-163-134-146_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_167-114-106-233_0xff_ens160_20210619010947.pcap
tcpdump_opn-anon_167-172-177-203_0xe0_ens160_20210619010947.pcap
tcpdump_opn-anon_167-71-228-143_0x24_ens160_20210619010947.pcap
tcpdump_opn-anon_170-233-220-48_0xf7_ens160_20210619010947.pcap
tcpdump_opn-anon_170-244-187-10_0x89_ens160_20210619010947.pcap
tcpdump_opn-anon_171-103-146-75_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_171-103-69-37_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_171-103-88-235_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_172-64-37-173_0x87_ens160_20210619010947.pcap
tcpdump_opn-anon_172-64-37-231_0x3c_ens160_20210619010947.pcap
tcpdump_opn-anon_172-64-37-26_0xfc_ens160_20210619010947.pcap
tcpdump_opn-anon_175-107-201-78_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_175-107-202-235_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_176-102-204-30_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_176-104-9-22_0x43_ens160_20210619010947.pcap
tcpdump_opn-anon_176-215-1-121_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_176-236-89-75_0x74_ens160_20210619010947.pcap
tcpdump_opn-anon_176-62-237-42_0x46_ens160_20210619010947.pcap
tcpdump_opn-anon_178-206-231-71_0x73_ens160_20210619010947.pcap
tcpdump_opn-anon_178-54-190-136_0x16_ens160_20210619010947.pcap
tcpdump_opn-anon_178-54-6-187_0x91_ens160_20210619010947.pcap
tcpdump_opn-anon_179-49-83-109_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_179-49-83-131_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_179-51-134-210_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_181-16-234-7_0xb4_ens160_20210619010947.pcap
tcpdump_opn-anon_181-210-70-157_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_181-224-187-27_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_181-225-42-21_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_182-162-22-86_0xfc_ens160_20210619010947.pcap
tcpdump_opn-anon_182-48-64-136_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_182-48-86-99_0xb_ens160_20210619010947.pcap
tcpdump_opn-anon_182-52-50-160_0xe4_ens160_20210619010947.pcap
tcpdump_opn-anon_183-182-109-203_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_183-3-138-163_0xef_ens160_20210619010947.pcap
tcpdump_opn-anon_184-190-213-9_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_184-190-215-201_0x1c_ens160_20210619010947.pcap
tcpdump_opn-anon_185-124-200-49_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_185-141-236-80_0x18_ens160_20210619010947.pcap
tcpdump_opn-anon_185-157-241-131_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_185-172-35-195_0x59_ens160_20210619010947.pcap
tcpdump_opn-anon_185-179-169-177_0xc5_ens160_20210619010947.pcap
tcpdump_opn-anon_185-185-230-78_0x57_ens160_20210619010947.pcap
tcpdump_opn-anon_185-205-249-46_0x1d_ens160_20210619010947.pcap
tcpdump_opn-anon_185-22-217-69_0xea_ens160_20210619010947.pcap
tcpdump_opn-anon_185-228-169-230_0x7f_ens160_20210619010947.pcap
tcpdump_opn-anon_185-239-104-5_0x1b_ens160_20210619010947.pcap
tcpdump_opn-anon_185-255-47-186_0x19_ens160_20210619010947.pcap
tcpdump_opn-anon_185-39-180-69_0xed_ens160_20210619010947.pcap
tcpdump_opn-anon_185-46-222-205_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_185-68-26-202_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_185-85-36-12_0x1b_ens160_20210619010947.pcap
tcpdump_opn-anon_185-85-38-225_0xf8_ens160_20210619010947.pcap
tcpdump_opn-anon_186-136-58-134_0xea_ens160_20210619010947.pcap
tcpdump_opn-anon_186-179-99-43_0xd5_ens160_20210619010947.pcap
tcpdump_opn-anon_186-194-240-254_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_187-115-3-186_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_187-19-198-134_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_187-6-110-94_0x17_ens160_20210619010947.pcap
tcpdump_opn-anon_188-127-39-78_0x76_ens160_20210619010947.pcap
tcpdump_opn-anon_188-246-4-137_0x66_ens160_20210619010947.pcap
tcpdump_opn-anon_188-32-203-229_0x37_ens160_20210619010947.pcap
tcpdump_opn-anon_189-1-123-2_0x40_ens160_20210619010947.pcap
tcpdump_opn-anon_189-203-172-198_0xdb_ens160_20210619010947.pcap
tcpdump_opn-anon_190-13-217-64_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_190-136-180-95_0x45_ens160_20210619010947.pcap
tcpdump_opn-anon_190-216-69-5_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_190-57-20-153_0xe3_ens160_20210619010947.pcap
tcpdump_opn-anon_192-119-157-131_0x9f_ens160_20210619010947.pcap
tcpdump_opn-anon_192-141-104-153_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_193-34-140-58_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_193-36-117-209_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_193-59-26-249_0x16_ens160_20210619010947.pcap
tcpdump_opn-anon_193-70-16-130_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_193-95-3-51_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_194-145-240-7_0x44_ens160_20210619010947.pcap
tcpdump_opn-anon_194-186-153-58_0xf1_ens160_20210619010947.pcap
tcpdump_opn-anon_194-76-188-8_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_194-9-69-57_0x4f_ens160_20210619010947.pcap
tcpdump_opn-anon_195-136-206-189_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_195-14-114-1_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_195-42-154-133_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_195-77-206-84_0x99_ens160_20210619010947.pcap
tcpdump_opn-anon_196-15-172-194_0xb_ens160_20210619010947.pcap
tcpdump_opn-anon_196-201-108-244_0x31_ens160_20210619010947.pcap
tcpdump_opn-anon_196-213-213-205_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_196-214-163-89_0x13_ens160_20210619010947.pcap
tcpdump_opn-anon_196-214-190-234_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_196-27-105-130_0xd_ens160_20210619010947.pcap
tcpdump_opn-anon_196-27-107-120_0x18_ens160_20210619010947.pcap
tcpdump_opn-anon_199-203-201-80_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_200-71-72-179_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_201-159-160-145_0x9b_ens160_20210619010947.pcap
tcpdump_opn-anon_201-17-28-27_0xf5_ens160_20210619010947.pcap
tcpdump_opn-anon_201-20-89-5_0xff_ens160_20210619010947.pcap
tcpdump_opn-anon_201-218-166-95_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_202-125-83-254_0xe6_ens160_20210619010947.pcap
tcpdump_opn-anon_202-155-210-34_0x6e_ens160_20210619010947.pcap
tcpdump_opn-anon_202-179-144-191_0xaf_ens160_20210619010947.pcap
tcpdump_opn-anon_202-188-31-140_0xb3_ens160_20210619010947.pcap
tcpdump_opn-anon_202-88-42-186_0x60_ens160_20210619010947.pcap
tcpdump_opn-anon_206-189-245-83_0x7b_ens160_20210619010947.pcap
tcpdump_opn-anon_207-164-224-181_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_208-104-81-153_0xde_ens160_20210619010947.pcap
tcpdump_opn-anon_209-6-122-21_0xf6_ens160_20210619010947.pcap
tcpdump_opn-anon_210-55-57-67_0xd9_ens160_20210619010947.pcap
tcpdump_opn-anon_210-61-116-229_0xa9_ens160_20210619010947.pcap
tcpdump_opn-anon_212-160-69-94_0xb_ens160_20210619010947.pcap
tcpdump_opn-anon_212-248-101-11_0xf0_ens160_20210619010947.pcap
tcpdump_opn-anon_212-26-252-192_0xe5_ens160_20210619010947.pcap
tcpdump_opn-anon_212-5-108-30_0xc4_ens160_20210619010947.pcap
tcpdump_opn-anon_212-54-197-168_0xa0_ens160_20210619010947.pcap
tcpdump_opn-anon_212-56-216-102_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_213-178-34-172_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_213-207-152-245_0xeb_ens160_20210619010947.pcap
tcpdump_opn-anon_213-3-19-193_0x15_ens160_20210619010947.pcap
tcpdump_opn-anon_213-39-125-115_0x96_ens160_20210619010947.pcap
tcpdump_opn-anon_213-4-125-122_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_213-59-156-77_0xaa_ens160_20210619010947.pcap
tcpdump_opn-anon_216-194-33-53_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_216-195-244-13_0x13_ens160_20210619010947.pcap
tcpdump_opn-anon_217-150-37-109_0xdd_ens160_20210619010947.pcap
tcpdump_opn-anon_217-181-176-2_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_217-199-77-204_0x3d_ens160_20210619010947.pcap
tcpdump_opn-anon_218-189-102-186_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_218-28-133-54_0xe8_ens160_20210619010947.pcap
tcpdump_opn-anon_220-108-187-68_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_220-247-130-155_0x11_ens160_20210619010947.pcap
tcpdump_opn-anon_23-111-81-84_0x94_ens160_20210619010947.pcap
tcpdump_opn-anon_24-104-140-235_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_24-115-99-183_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_24-172-246-222_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_31-13-188-237_0x55_ens160_20210619010947.pcap
tcpdump_opn-anon_31-206-60-38_0x9a_ens160_20210619010947.pcap
tcpdump_opn-anon_31-210-154-157_0xe7_ens160_20210619010947.pcap
tcpdump_opn-anon_3-213-112-178_0x69_ens160_20210619010947.pcap
tcpdump_opn-anon_37-114-30-123_0x36_ens160_20210619010947.pcap
tcpdump_opn-anon_39-61-56-129_0xe8_ens160_20210619010947.pcap
tcpdump_opn-anon_41-205-48-40_0xd_ens160_20210619010947.pcap
tcpdump_opn-anon_41-219-18-27_0xe1_ens160_20210619010947.pcap
tcpdump_opn-anon_41-223-233-53_0x2f_ens160_20210619010947.pcap
tcpdump_opn-anon_43-248-33-100_0x1c_ens160_20210619010947.pcap
tcpdump_opn-anon_43-249-115-178_0xec_ens160_20210619010947.pcap
tcpdump_opn-anon_45-143-28-185_0x6a_ens160_20210619010947.pcap
tcpdump_opn-anon_45-160-13-123_0x11_ens160_20210619010947.pcap
tcpdump_opn-anon_45-160-221-200_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_45-168-21-137_0xb_ens160_20210619010947.pcap
tcpdump_opn-anon_45-176-203-240_0x5_ens160_20210619010947.pcap
tcpdump_opn-anon_45-178-222-138_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_45-178-222-210_0x17_ens160_20210619010947.pcap
tcpdump_opn-anon_45-179-151-56_0xe0_ens160_20210619010947.pcap
tcpdump_opn-anon_45-180-140-226_0x13_ens160_20210619010947.pcap
tcpdump_opn-anon_45-200-5-49_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_45-224-96-55_0xfd_ens160_20210619010947.pcap
tcpdump_opn-anon_45-227-53-71_0xf1_ens160_20210619010947.pcap
tcpdump_opn-anon_45-227-54-234_0xec_ens160_20210619010947.pcap
tcpdump_opn-anon_45-230-242-167_0x8a_ens160_20210619010947.pcap
tcpdump_opn-anon_45-232-87-66_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_45-5-48-191_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_45-5-49-185_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_45-73-14-69_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_45-76-183-194_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_45-80-220-160_0xf3_ens160_20210619010947.pcap
tcpdump_opn-anon_45-90-30-114_0x17_ens160_20210619010947.pcap
tcpdump_opn-anon_45-90-30-186_0xe4_ens160_20210619010947.pcap
tcpdump_opn-anon_45-90-30-7_0x2a_ens160_20210619010947.pcap
tcpdump_opn-anon_46-149-33-20_0x6d_ens160_20210619010947.pcap
tcpdump_opn-anon_46-18-203-81_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_46-249-85-231_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_46-252-249-162_0xeb_ens160_20210619010947.pcap
tcpdump_opn-anon_46-252-35-124_0xd8_ens160_20210619010947.pcap
tcpdump_opn-anon_46-40-247-34_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_47-214-25-24_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_49-128-185-41_0xc3_ens160_20210619010947.pcap
tcpdump_opn-anon_49-231-174-13_0xa8_ens160_20210619010947.pcap
tcpdump_opn-anon_50-227-177-144_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_50-232-49-22_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_5-175-46-61_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_5-188-115-74_0x97_ens160_20210619010947.pcap
tcpdump_opn-anon_5-2-203-11_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_58-246-94-234_0x19_ens160_20210619010947.pcap
tcpdump_opn-anon_59-152-235-62_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_60-51-178-73_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_61-8-75-44_0xf0_ens160_20210619010947.pcap
tcpdump_opn-anon_62-75-252-43_0x65_ens160_20210619010947.pcap
tcpdump_opn-anon_62-77-247-227_0xff_ens160_20210619010947.pcap
tcpdump_opn-anon_66-211-85-126_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_67-79-70-174_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_69-39-68-78_0x80_ens160_20210619010947.pcap
tcpdump_opn-anon_69-67-169-60_0x9_ens160_20210619010947.pcap
tcpdump_opn-anon_70-171-48-88_0x68_ens160_20210619010947.pcap
tcpdump_opn-anon_70-191-254-190_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_70-33-158-253_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_72-93-90-219_0x2_ens160_20210619010947.pcap
tcpdump_opn-anon_73-77-67-189_0xf_ens160_20210619010947.pcap
tcpdump_opn-anon_74-198-163-80_0xbd_ens160_20210619010947.pcap
tcpdump_opn-anon_75-145-116-113_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_77-222-128-139_0xcc_ens160_20210619010947.pcap
tcpdump_opn-anon_77-226-240-92_0x5e_ens160_20210619010947.pcap
tcpdump_opn-anon_78-25-155-61_0xbc_ens160_20210619010947.pcap
tcpdump_opn-anon_78-8-160-206_0xda_ens160_20210619010947.pcap
tcpdump_opn-anon_78-8-160-222_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_80-91-160-20_0x13_ens160_20210619010947.pcap
tcpdump_opn-anon_81-170-231-13_0x3f_ens160_20210619010947.pcap
tcpdump_opn-anon_81-26-138-211_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_81-30-220-151_0x1a_ens160_20210619010947.pcap
tcpdump_opn-anon_81-62-194-225_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_8-20-247-145_0x10_ens160_20210619010947.pcap
tcpdump_opn-anon_82-152-191-92_0x21_ens160_20210619010947.pcap
tcpdump_opn-anon_8-24-105-218_0x12_ens160_20210619010947.pcap
tcpdump_opn-anon_82-80-57-167_0xfc_ens160_20210619010947.pcap
tcpdump_opn-anon_83-228-116-214_0x22_ens160_20210619010947.pcap
tcpdump_opn-anon_85-132-115-58_0xbe_ens160_20210619010947.pcap
tcpdump_opn-anon_85-187-116-51_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_85-31-161-97_0xee_ens160_20210619010947.pcap
tcpdump_opn-anon_85-96-196-109_0xd_ens160_20210619010947.pcap
tcpdump_opn-anon_86-106-129-223_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_87-234-200-113_0xf6_ens160_20210619010947.pcap
tcpdump_opn-anon_88-193-146-187_0xe_ens160_20210619010947.pcap
tcpdump_opn-anon_88-216-112-77_0x1b_ens160_20210619010947.pcap
tcpdump_opn-anon_88-255-53-149_0xd7_ens160_20210619010947.pcap
tcpdump_opn-anon_88-86-80-151_0x0_ens160_20210619010947.pcap
tcpdump_opn-anon_89-107-142-66_0x4e_ens160_20210619010947.pcap
tcpdump_opn-anon_89-22-35-8_0xe7_ens160_20210619010947.pcap
tcpdump_opn-anon_89-236-106-218_0xfe_ens160_20210619010947.pcap
tcpdump_opn-anon_90-188-37-17_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_91-224-183-2_0x14_ens160_20210619010947.pcap
tcpdump_opn-anon_91-239-249-37_0xc_ens160_20210619010947.pcap
tcpdump_opn-anon_91-25-100-130_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_91-98-102-156_0x3_ens160_20210619010947.pcap
tcpdump_opn-anon_92-242-57-30_0x6_ens160_20210619010947.pcap
tcpdump_opn-anon_92-243-138-23_0xa_ens160_20210619010947.pcap
tcpdump_opn-anon_93-159-155-125_0xad_ens160_20210619010947.pcap
tcpdump_opn-anon_93-78-39-198_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_94-177-30-8_0x1_ens160_20210619010947.pcap
tcpdump_opn-anon_94-254-93-216_0x7_ens160_20210619010947.pcap
tcpdump_opn-anon_94-45-149-133_0x8_ens160_20210619010947.pcap
tcpdump_opn-anon_94-45-81-10_0x3b_ens160_20210619010947.pcap
tcpdump_opn-anon_95-129-56-114_0x50_ens160_20210619010947.pcap
tcpdump_opn-anon_96-103-155-210_0x4_ens160_20210619010947.pcap
tcpdump_opn-anon_96-81-97-67_0xa2_ens160_20210619010947.pcap
tcpdump_opn-anon_96-90-10-169_0xf9_ens160_20210619010947.pcap
""".split("\n"))


c_packets = reduce(lambda a,b: a + b, (rdpcap(f"../traffic/{f}") for f in c_logs))

In [48]:
# using the client logs, get a list of queries

queries = []
queries_by_id = {}
dropped = []

for p in c_packets:
    try:
        p[DNS]
    except IndexError:
        logging.warning(f'Broken DNS packet at time {p.time}?')
        dropped.append(p)
        continue
    if p[DNS].qr == 0:
        # query
        qid = p[DNS].id
        q = {
            'id': qid,
            'qname': p[DNS].qd.qname,
            'tag': p[DNS].qd.qname.split(b'.', 1)[0],
            'started': p.time,
            'client_packets': [p],
            'server_packets': [],
        }
        queries.append(q)
        queries_by_id[qid] = q
    else:
        # response
        qid = p[DNS].id
        try:
            queries_by_id[qid]
        except KeyError:
            logging.warning(f"Response without query at time {p.time}")
            continue
        queries_by_id[qid]['finished'] = p.time
        queries_by_id[qid]['client_packets'].append(p)
    
queries = pd.DataFrame(queries)
queries['qname_parts'] = queries.apply(split_qname, axis=1)
for key in ['zone_algorithm', 'zone_keysize', 'zone_nsec', 'zone_status', 'resolver']:
    queries[key] = queries.apply(lambda row: row['qname_parts'][key], axis=1)
del queries['qname_parts']

first_query_time, last_query_time = queries['started'].min(), queries['finished'].max()



























































































In [49]:
len(queries)

57654

In [50]:
len(dropped)

4789

In [51]:
from datetime import datetime
import math

In [52]:
datetime.fromtimestamp(math.floor(first_query_time)), datetime.fromtimestamp(math.ceil(last_query_time))

(datetime.datetime(2021, 6, 19, 1, 9, 49),
 datetime.datetime(2021, 6, 19, 6, 19, 46))

In [39]:
%%bash

ls -lh ../traffic/eth0*

-rw-r----- 1 nils nils   24 Jun  4 13:48 ../traffic/eth0-20210604114854.pcap
-rw-r----- 1 nils nils  13K Jun  4 13:51 ../traffic/eth0-20210604114855.pcap
-rw-r----- 1 nils nils   24 Jun  4 13:51 ../traffic/eth0-20210604115148.pcap
-rw-r----- 1 nils nils 5.4K Jun  4 13:57 ../traffic/eth0-20210604115149.pcap
-rw-r----- 1 nils nils   24 Jun  4 13:57 ../traffic/eth0-20210604115751.pcap
-rw-r----- 1 nils nils 3.0M Jun  7 10:49 ../traffic/eth0-20210604115752.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084932.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084933.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084940.pcap
-rw-r----- 1 nils nils  13M Jun 11 08:16 ../traffic/eth0-20210607084941.pcap
-rw-r----- 1 nils nils   24 Jun 11 11:29 ../traffic/eth0-20210611092938.pcap
-rw-r----- 1 nils nils 1.6K Jun 11 11:32 ../traffic/eth0-20210611092939.pcap
-rw-r----- 1 nils nils 122K Jun 11 14:03 ../traffic/eth0-20210611093249.pcap

In [34]:
%%bash

cd ../traffic
editcap -A "2021-06-19 01:09:49" -B "2021-06-19 06:19:46" eth0-20210616183851.pcap eth0-20210616183851.filtered.pcap

In [35]:
%%bash

ls -lh ../traffic/eth0*

-rw-r----- 1 nils nils   24 Jun  4 13:48 ../traffic/eth0-20210604114854.pcap
-rw-r----- 1 nils nils  13K Jun  4 13:51 ../traffic/eth0-20210604114855.pcap
-rw-r----- 1 nils nils   24 Jun  4 13:51 ../traffic/eth0-20210604115148.pcap
-rw-r----- 1 nils nils 5.4K Jun  4 13:57 ../traffic/eth0-20210604115149.pcap
-rw-r----- 1 nils nils   24 Jun  4 13:57 ../traffic/eth0-20210604115751.pcap
-rw-r----- 1 nils nils 3.0M Jun  7 10:49 ../traffic/eth0-20210604115752.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084932.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084933.pcap
-rw-r----- 1 nils nils   24 Jun  7 10:49 ../traffic/eth0-20210607084940.pcap
-rw-r----- 1 nils nils  13M Jun 11 08:16 ../traffic/eth0-20210607084941.pcap
-rw-r----- 1 nils nils   24 Jun 11 11:29 ../traffic/eth0-20210611092938.pcap
-rw-r----- 1 nils nils 1.6K Jun 11 11:32 ../traffic/eth0-20210611092939.pcap
-rw-r----- 1 nils nils 122K Jun 11 14:03 ../traffic/eth0-20210611093249.pcap

In [14]:
s_packets = rdpcap('../traffic/eth0-20210616183851.filtered.pcap')

In [15]:
len(s_packets)

157399

## Match Client and Server Data

In [16]:
# using the server logs, match packets captured at the server to the client's questions

candidates = []

for p in tqdm(s_packets):
    if queries['started'].min() <= p.time <= queries['finished'].max():
        try:
            p_dns = tag = p[DNS]
        except IndexError:
            logging.warning(f'packet with timestamp {p.time} could not be parsed as DNS packet')
            continue
        
        if p_dns.qd is None:
            logging.warning(f'packet with timestamp {p.time} did not contain a query name')
            continue
        
        tag = p_dns.qd.qname.split(b'.', 1)[0]
        candidate_queries = (queries['started'] <= p.time) & (p.time <= queries['finished']) & (queries['tag'] == tag)
        if sum(candidate_queries) == 0:
            continue
        elif sum(candidate_queries) == 1:
            queries[candidate_queries].iloc[0]['server_packets'].append(p)
        else:
            logging.warning(f'Initial DNS query not uniquely identified for packet with time stamp {p.time}')
        



100%|██████████| 157399/157399 [06:11<00:00, 423.14it/s]


## Clean and Organize Data

In [53]:
# drop rows that do not have exactly two client packets

queries['num_client_packets'] = queries.apply(lambda row: len(row['client_packets']), axis=1)
drop = queries['num_client_packets'] != 2
if sum(drop):
    logging.warning(f'Dropping {sum(drop)} of {len(queries)} queries as they do not have exactly two client packets')
    for count, num in zip(*np.unique(queries['num_client_packets'], return_counts=True)):
        if count == 2: continue
        logging.warning(f'- {num} queries had {count} client packet(s)')
queries = queries.drop(queries[drop].index)

queries['client_query'] = queries.apply(lambda row: row['client_packets'][0], axis=1)
queries['client_response'] = queries.apply(lambda row: row['client_packets'][1], axis=1)
del queries['client_packets']



In [54]:
queries['client_response_rcode'] = queries.apply(lambda row: row['client_response'].rcode, axis=1)
queries['client_response_ad'] = queries.apply(lambda row: row['client_response'].ad, axis=1)

In [None]:
# TODO: select server packet(s) for feature extraction

def take_first_query(packet_list):
    queries = filter(lambda p: p[DNS].qr == 0, packet_list)
    try:
        return next(iter(queries))
    except StopIteration:
        return None

queries['num_server_packets'] = queries.apply(lambda row: len(row['server_packets']), axis=1)
queries['server_query'] = queries.apply(lambda row: take_first_query(row['server_packets']), axis=1)

In [None]:
drop = queries['server_query'].isnull()
if sum(drop):
    logging.warning(f"Dropping {sum(drop)} of {len(queries)} queries as corresponding server packets could not be found!")
    queries = queries.drop(queries[drop].index)

In [55]:
queries['validated_response'] = (queries['client_response_rcode'] == 0) & (queries['client_response_ad'] == 1)

In [56]:
queries['num_server_packets'].min(), queries['num_server_packets'].mean(), queries['num_server_packets'].max(), queries['num_server_packets'].unique()

KeyError: 'num_server_packets'

## Extract Features and Label

In [None]:
# set the label

queries['label_rcode'] = queries['client_response_rcode']
queries['label_resolver'] = queries['resolver']
queries['label_rcode0andad1'] = queries['validated_response']

In [None]:
# set some features

queries['feature_ip_ttl'] = queries.apply(lambda row: row['server_query'][IP].ttl, axis=1)
queries['feature_ip_src'] = queries.apply(lambda row: row['server_query'][IP].src, axis=1)
queries['feature_ip_proto'] = queries.apply(lambda row: row['server_query'][IP].proto, axis=1)
queries['feature_udp_len'] = queries.apply(lambda row: row['server_query'][UDP].len, axis=1)
queries['feature_dns_qr'] = queries.apply(lambda row: row['server_query'][DNS].qr, axis=1)
queries['feature_dns_opcode'] = queries.apply(lambda row: row['server_query'][DNS].opcode, axis=1)
queries['feature_dns_aa'] = queries.apply(lambda row: row['server_query'][DNS].aa, axis=1)
queries['feature_dns_tc'] = queries.apply(lambda row: row['server_query'][DNS].tc, axis=1)
queries['feature_dns_rd'] = queries.apply(lambda row: row['server_query'][DNS].rd, axis=1)
queries['feature_dns_ra'] = queries.apply(lambda row: row['server_query'][DNS].ra, axis=1)
queries['feature_dns_z'] = queries.apply(lambda row: row['server_query'][DNS].z, axis=1)
queries['feature_dns_cd'] = queries.apply(lambda row: row['server_query'][DNS].cd, axis=1)
queries['feature_dns_rcode'] = queries.apply(lambda row: row['server_query'][DNS].rcode, axis=1)
queries['feature_dns_qdcount'] = queries.apply(lambda row: row['server_query'][DNS].qdcount, axis=1)
queries['feature_dns_ancount'] = queries.apply(lambda row: row['server_query'][DNS].ancount, axis=1)
queries['feature_dns_nscount'] = queries.apply(lambda row: row['server_query'][DNS].nscount, axis=1)
queries['feature_dns_arcount'] = queries.apply(lambda row: row['server_query'][DNS].arcount, axis=1)
#queries['feature_zone_algorithm'] = queries['zone_algorithm']
#queries['feature_zone_keysize'] = queries['zone_keysize']
#queries['feature_zone_nsec'] = queries['zone_nsec']

## Show Data
### All Columns

In [None]:
queries.iloc[-10:].style.format({'server_packets': packet_list_fmt, 'client_query': packet_fmt, 'client_response': packet_fmt, 'server_query': packet_fmt})

In [57]:
def validation_info(data):
    n = len(queries)
    for rcode in data['client_response_rcode'].unique():
        for ad in data['client_response_ad'].unique():
            c = sum((queries['client_response_rcode'] == rcode) & (queries['client_response_ad'] == ad))
            print(f"{c}/{n} ({c/n:.1%}) have rcode={rcode} ad={ad}")
            
print('all queries')
validation_info(queries)

print('signedok queries')
validation_info(queries[queries['zone_status'] == 'signedok'])

all queries
2270/34855 (6.5%) have rcode=0 ad=1
10350/34855 (29.7%) have rcode=0 ad=0
259/34855 (0.7%) have rcode=2 ad=1
21958/34855 (63.0%) have rcode=2 ad=0
0/34855 (0.0%) have rcode=1 ad=1
16/34855 (0.0%) have rcode=1 ad=0
1/34855 (0.0%) have rcode=5 ad=1
1/34855 (0.0%) have rcode=5 ad=0
signedok queries
2270/34855 (6.5%) have rcode=0 ad=1
10350/34855 (29.7%) have rcode=0 ad=0
259/34855 (0.7%) have rcode=2 ad=1
21958/34855 (63.0%) have rcode=2 ad=0
0/34855 (0.0%) have rcode=1 ad=1
16/34855 (0.0%) have rcode=1 ad=0
1/34855 (0.0%) have rcode=5 ad=1
1/34855 (0.0%) have rcode=5 ad=0


In [58]:
def validation_success_rate(c):
    return sum(c == 1) / len(c)

In [67]:
queries = queries[queries['client_response_rcode'].isin([0, 2])]

In [70]:
queries = queries[~queries['zone_keysize'].isna()]

In [71]:
queries['zone_keysize'] = queries['zone_keysize'].astype(np.int)

In [73]:
len(queries)

33492

In [76]:
queries[['resolver', 'qname']]

Unnamed: 0,resolver,qname
4,anon,b'opn-anon-101-50-61-31-20210619010947.ecdsap2...
5,anon,b'opn-anon-101-50-61-31-20210619010947.ecdsap2...
6,anon,b'opn-anon-101-50-61-31-20210619010947.ecdsap2...
7,anon,b'opn-anon-101-50-61-31-20210619010947.ecdsap2...
8,anon,b'opn-anon-101-50-61-31-20210619010947.ecdsap2...
...,...,...
57610,anon,b'opn-anon-96-90-10-169-20210619010947.rsasha5...
57625,anon,b'opn-anon-96-90-10-169-20210619010947.rsasha5...
57627,anon,b'opn-anon-96-90-10-169-20210619010947.rsasha5...
57630,anon,b'opn-anon-96-90-10-169-20210619010947.rsasha5...


In [75]:
def validation_status_color(val):
    if np.isnan(val):
        return 'color: grey'
    color = 'red' if val < 1 else 'black'
    return 'color: %s' % color

pd.options.display.precision = 2

queries

queries[queries['zone_status'] == 'signedok'].groupby(['resolver', 'zone_algorithm', 'zone_keysize', 'zone_nsec']).agg({
    'id': ['count'],
    'validated_response': [validation_success_rate, 'unique'],
}).reset_index().sort_values(['zone_keysize']).sort_values(by=['zone_algorithm'], key=lambda c: c.apply(
    lambda zone_algorithm: ALGO_NUM[zone_algorithm]), kind='mergesort').pivot(
    ['resolver', 'zone_nsec'], ['zone_algorithm', 'zone_keysize'], [('validated_response', 'validation_success_rate')]).style.applymap(validation_status_color)

  return array(a, dtype, copy=False, order=order)


Unnamed: 0_level_0,Unnamed: 1_level_0,"('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')","('validated_response', 'validation_success_rate')"
Unnamed: 0_level_1,zone_algorithm,rsasha1,rsasha1,rsasha1,rsasha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha256,rsasha256,rsasha256,rsasha256,rsasha512,rsasha512,rsasha512,rsasha512,ecdsap256sha256,ecdsap384sha384,ed25519,ed448
Unnamed: 0_level_2,zone_keysize,1024,1871,2048,4096,1024,1871,2048,4096,1024,1871,2048,4096,1024,1871,2048,4096,256,384,256,456
resolver,zone_nsec,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
anon,1,0.23,0.2,0.22,0.19,0.21,0.22,0.22,0.19,0.22,0.22,0.23,0.19,0.23,0.23,0.21,0.2,0.28,0.22,0.21,0.06
anon,3,0.0,0.0,0.0,0.0,0.22,0.22,0.19,0.2,0.21,0.24,0.22,0.19,0.2,0.22,0.2,0.19,0.3,0.23,0.2,0.05


In [78]:
def validation_status_color(val):
    if np.isnan(val):
        return 'color: grey'
    color = 'red' if val < 1 else 'black'
    return 'color: %s' % color

def response_success_rate(s):
    return sum(s == 0) / len(s)

pd.options.display.precision = 2

queries

queries[queries['zone_status'] == 'signedok'].groupby(['resolver', 'zone_algorithm', 'zone_keysize', 'zone_nsec']).agg({
    'id': ['count'],
    'client_response_rcode': [response_success_rate],
}).reset_index().sort_values(['zone_keysize']).sort_values(by=['zone_algorithm'], key=lambda c: c.apply(
    lambda zone_algorithm: ALGO_NUM[zone_algorithm]), kind='mergesort').pivot(
    ['resolver', 'zone_nsec'], ['zone_algorithm', 'zone_keysize'], [('client_response_rcode', 'response_success_rate')]).style.applymap(validation_status_color)

  return array(a, dtype, copy=False, order=order)


Unnamed: 0_level_0,Unnamed: 1_level_0,"('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')","('client_response_rcode', 'response_success_rate')"
Unnamed: 0_level_1,zone_algorithm,rsasha1,rsasha1,rsasha1,rsasha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha1nsec3sha1,rsasha256,rsasha256,rsasha256,rsasha256,rsasha512,rsasha512,rsasha512,rsasha512,ecdsap256sha256,ecdsap384sha384,ed25519,ed448
Unnamed: 0_level_2,zone_keysize,1024,1871,2048,4096,1024,1871,2048,4096,1024,1871,2048,4096,1024,1871,2048,4096,256,384,256,456
resolver,zone_nsec,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
anon,1,0.51,0.49,0.51,0.49,0.49,0.5,0.5,0.45,0.5,0.48,0.5,0.46,0.49,0.5,0.48,0.47,0.55,0.51,0.56,0.52
anon,3,0.05,0.05,0.04,0.06,0.49,0.51,0.46,0.48,0.49,0.51,0.49,0.46,0.45,0.48,0.47,0.46,0.58,0.53,0.54,0.54


### ML Columns

In [32]:
# collect columns relevant for ML
ML = list(filter(lambda c: c.startswith('label_') or c.startswith('feature_'), queries.keys()))

In [38]:
# collect rows relevant for ML
signedok = queries['zone_status'] == 'signedok'

In [40]:
# save table to disk
for algorithm in queries['zone_algorithm'].unique():
    queries[signedok & (queries['zone_algorithm'] == algorithm)][ML].to_pickle(f'ml_data_{algorithm}.pickle')

In [41]:
%%bash
ls -lh ml_data_*.pickle

-rw-rw-r-- 1 nils nils 1.5K Jun 18 17:12 ml_data_0.pickle
-rw-rw-r-- 1 nils nils 6.7K Jun 18 17:12 ml_data_ecdsap256sha256.pickle
-rw-rw-r-- 1 nils nils 6.5K Jun 18 17:12 ml_data_ecdsap384sha384.pickle
-rw-rw-r-- 1 nils nils 6.7K Jun 18 17:12 ml_data_ed25519.pickle
-rw-rw-r-- 1 nils nils 7.0K Jun 18 17:12 ml_data_ed448.pickle
-rw-rw-r-- 1 nils nils  18K Jun 18 17:12 ml_data_rsasha1nsec3sha1.pickle
-rw-rw-r-- 1 nils nils  22K Jun 18 17:12 ml_data_rsasha1.pickle
-rw-rw-r-- 1 nils nils  21K Jun 18 17:12 ml_data_rsasha256.pickle
-rw-rw-r-- 1 nils nils  21K Jun 18 17:12 ml_data_rsasha512.pickle


In [None]:
# show table
queries[signedok][ML].iloc[-10:].style.format({'server_packets': packet_list_fmt, 'client_query': packet_fmt, 'client_response': packet_fmt})

In [None]:
sum(signedok)

### Some Data Insights

In [None]:
# client query duration distribution

queries['duration'] = queries['finished'] - queries['started']

data = queries['duration'].astype(np.float).dropna()
bins = [b*10**p for p in [-3, -2, -1, 0] for b in [1,1.3,1.6,2,3,4,5,6.6,8.3]]
labels = bins[::3]
hist, bin_edges = np.histogram(data, bins=bins)
cdf = np.cumsum(hist)

plt.figure(figsize=(8, 6))
plt.grid(True)

ax, tax = plt.gca(), plt.twinx()

ax.hist(data, bins=bins, density=True)
ax.set_ylabel('probability density')
ax.set_xlabel('d [s]')

tax.plot(bin_edges[:-1], cdf/cdf[-1], c='r')
tax.set_ylabel('P[client query-response duration < d]')

plt.xscale('log')
plt.xticks(labels, map(str, labels))
plt.title('Distribution of Client Query Round Trip Duration')

None

In [None]:
# show a single query to the server

for p in queries.iloc[200]['server_packets']:
    if p[DNS].qr != 0:
        continue
    print(p[IP].show())

In [None]:
# show response code grouped by zone status, resolver, algorithm, key size, nsec

pd.options.display.max_rows = None
queries.groupby(['zone_status', 'resolver', 'zone_algorithm', 'zone_keysize', 'zone_nsec']).agg({
    'client_response_rcode': ['min', 'mean', 'max']
})