# Generate Training Data
## Preparation

In [1]:
from scapy.all import *
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
import logging

# Configure logging once for the whole notebook at DEBUG level, so the
# logging.warning(...) calls in later cells are emitted.
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Formatters for displaying pandas data tables with columns of (lists of) scapy packets

def packet_list_fmt(lst):
    """Render a list-valued table cell as '<n> packets' instead of dumping its items."""
    count = len(lst)
    return "{} packets".format(count)

def packet_fmt(pkt):
    """Render a single packet table cell as the summary line of its DNS layer."""
    dns_layer = pkt[DNS]
    return dns_layer.summary()

In [4]:
# retrieve information encoded in DNS qnames

def split_qname(row):
    """Decode the experiment parameters encoded in a query name.

    Expected layout (bytes, with trailing root dot):
        <x>-<y>-<resolver>-....<algorithm>-<keysize>-<nsec>-<status>.dnstb.net.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'qname' (e.g. a DataFrame row); the value must
        be a bytes object because it is split on b".".

    Returns
    -------
    dict or None
        A dict with the keys 'zone_algorithm', 'zone_keysize', 'zone_nsec',
        'zone_status' and 'resolver' (all bytes), or None when the name is
        not below dnstb.net. or does not carry all encoded fields.
    """
    qname = row['qname']
    parts = qname.split(b".")
    # The last three labels must be b"dnstb", b"net" and the empty label that
    # results from the trailing root dot.
    if b".".join(parts[-3:]) != b"dnstb.net.":
        return None
    # Two more labels (resolver tag and zone description) are needed in front
    # of the suffix; shorter names would otherwise raise IndexError below.
    if len(parts) < 5:
        return None
    zone_parts = parts[-4].split(b"-")
    resolver_parts = parts[-5].split(b"-")
    # Guard the fixed positions read below against labels with too few fields.
    if len(zone_parts) < 4 or len(resolver_parts) < 3:
        return None
    return {
        'zone_algorithm': zone_parts[0],
        'zone_keysize': zone_parts[1],
        'zone_nsec': zone_parts[2],
        'zone_status': zone_parts[3],
        'resolver': resolver_parts[2],
    }

## Load Data

In [5]:
%%bash 

# Create the local capture directory if it does not exist yet, then copy the
# contents of the remote tcplogger directory on ns1.dnstb.net into it.
mkdir -p ../traffic
rsync -aP root@ns1.dnstb.net:/var/log/tcplogger/ ../traffic/

receiving incremental file list
./
br-0922aed171aa-20210607084932.pcap
              0   0%    0.00kB/s    0:00:00               24 100%   23.44kB/s    0:00:00 (xfr#1, to-chk=62/64)
br-0922aed171aa-20210607084933.pcap
              0   0%    0.00kB/s    0:00:00               24 100%   23.44kB/s    0:00:00 (xfr#2, to-chk=61/64)
br-0922aed171aa-20210607084940.pcap
              0   0%    0.00kB/s    0:00:00               24 100%   23.44kB/s    0:00:00 (xfr#3, to-chk=60/64)
br-0922aed171aa-20210607084941.pcap
              0   0%    0.00kB/s    0:00:00          720,896   6%  612.71kB/s    0:00:16        1,605,632  15%  722.91kB/s    0:00:12        2,686,976  25%  824.90kB/s    0:00:09        4,325,376  40% 1008.36kB/s    0:00:06        7,307,264  69%    1.55MB/s    0:00:02       10,190,848  96%    2.04MB/s    0:00:00       10,559,488 100%    1.51MB/s    0:00:06 (xfr#4, to-chk=59/64)
br-189c8eb7515d-20210604115751.pcap
              0   0%    0.00kB/s    0:00:00            

In [6]:
# load the traffic logs
# TODO: loop over available traffic dumps

# Client-side captures: the two dumps are concatenated into one packet list.
c_packets = (
    rdpcap('../traffic/tcpdump_lab_dev_vpn0_powerdns460_20210610190354.pcap')
    + rdpcap('../traffic/tcpdump_lab_dev_vpn0_neustar-free-recursive_20210610192648.pcap')
)

# Server-side capture: only read from disk when no s_packets variable exists
# in the kernel yet, so re-running this cell reuses the already loaded packets.
if 's_packets' not in globals():
    s_packets = rdpcap('../traffic/eth0-20210607084941.pcap')

In [7]:
# using the client logs, get a list of queries

# One record (dict) per DNS query seen in the client capture. The records are
# collected in a list and also indexed by DNS transaction id, so that a
# response can be attached to the query it answers.
query_records = []
queries_by_id = {}

for p in c_packets:
    if not p.haslayer(DNS):
        # skip packets without a DNS layer instead of failing on p[DNS]
        continue
    dns = p[DNS]
    qid = dns.id
    if dns.qr == 0:
        # query
        q = {
            'id': qid,
            'qname': dns.qd.qname,
            # first label of the query name
            'tag': dns.qd.qname.split(b'.', 1)[0],
            'started': p.time,
            'client_packets': [p],
            'server_packets': [],
        }
        query_records.append(q)
        queries_by_id[qid] = q
    else:
        # response: belongs to the most recently recorded query with this id
        q = queries_by_id.get(qid)
        if q is None:
            # e.g. the capture started after the query was sent
            logging.warning(f'Ignoring DNS response with id {qid}: no matching query in the client logs')
            continue
        q['finished'] = p.time
        q['client_packets'].append(p)

queries = pd.DataFrame(query_records)

# Decode the parameters encoded in each query name. split_qname returns None
# for names it cannot decode; those rows get None in every derived column
# instead of aborting the cell with a TypeError.
qname_parts = queries.apply(split_qname, axis=1)
num_undecoded = int(qname_parts.isna().sum())
if num_undecoded:
    logging.warning(f'{num_undecoded} of {len(queries)} query names could not be decoded')
for key in ['zone_algorithm', 'zone_keysize', 'zone_nsec', 'zone_status', 'resolver']:
    # key=key binds the current loop value into the lambda
    queries[key] = qname_parts.map(lambda parts, key=key: parts[key] if isinstance(parts, dict) else None)

# Time window covered by the client capture; used to narrow down server packets.
first_query_time, last_query_time = queries['started'].min(), queries['finished'].max()

## Match Client and Server Data

In [8]:
# filter server packets to relevant times
# NOTE(review): this rebinds s_packets to the filtered list, while the load cell
# only reads the server capture when s_packets is undefined. Re-running the
# notebook with other client captures therefore keeps the previously filtered
# packets unless s_packets is deleted first.

def _in_query_window(pkt):
    """Return True if pkt was captured between first_query_time and last_query_time (inclusive)."""
    return first_query_time <= pkt.time <= last_query_time

s_packets = s_packets.filter(_in_query_window)

In [9]:
# using the server logs, match packets captured at the server to the client's questions

candidates = []

# The overall time window does not change while matching, so compute it once
# instead of once per server packet.
window_start, window_end = queries['started'].min(), queries['finished'].max()

for p in tqdm(s_packets):
    if not (window_start <= p.time <= window_end):
        continue
    # A server packet belongs to a client query when it carries the same
    # first qname label and was captured while that query was outstanding.
    tag = p[DNS].qd.qname.split(b'.', 1)[0]
    candidate_queries = (queries['started'] <= p.time) & (p.time <= queries['finished']) & (queries['tag'] == tag)
    num_candidates = int(candidate_queries.sum())
    if num_candidates == 0:
        continue
    elif num_candidates == 1:
        # The cell holds a list object, so appending mutates the list stored
        # in `queries` even though the selection itself is a copy.
        queries.loc[candidate_queries, 'server_packets'].iloc[0].append(p)
    else:
        # show(dump=True) returns the dump as a string; plain show() would
        # print it to stdout and put "None" into the log message.
        logging.warning(f'Initial DNS query not uniquely identified for packet {p.show(dump=True)}')
        

100%|██████████| 10358/10358 [00:10<00:00, 955.48it/s]


## Clean and Organize Data

In [10]:
# drop rows that do not have exactly two client packets

# Number of client-side packets recorded per query (query plus responses).
queries['num_client_packets'] = queries['client_packets'].map(len)
drop = queries['num_client_packets'] != 2
num_dropped = int(drop.sum())
if num_dropped:
    logging.warning(f'Dropping {num_dropped} of {len(queries)} queries as they do not have exactly two client packets')
    # np.unique(..., return_counts=True) returns two parallel arrays
    # (distinct values, occurrences); zip them to walk value/count pairs.
    packet_counts, query_counts = np.unique(queries['num_client_packets'], return_counts=True)
    for count, num in zip(packet_counts, query_counts):
        if count == 2: continue
        logging.warning(f'- {num} queries had {count} client packet(s)')
queries = queries.drop(queries[drop].index)

# Every remaining row has exactly two client packets: the query and its response.
queries['client_query'] = queries.apply(lambda row: row['client_packets'][0], axis=1)
queries['client_response'] = queries.apply(lambda row: row['client_packets'][1], axis=1)
del queries['client_packets']



In [11]:
def _client_response_rcode(row):
    """Return the rcode attribute of the row's client_response packet."""
    return row['client_response'].rcode

queries['client_response_rcode'] = queries.apply(_client_response_rcode, axis=1)

In [12]:
# TODO: select server packet(s) for feature extraction

## Extract Features and Label

In [13]:
# set the label
# Each label column is a plain copy of an existing column under a label_ name,
# so that it is picked up by the label_/feature_ column selection further down.

for label_column, source_column in (
    ('label_rcode', 'client_response_rcode'),
    ('label_resolver', 'resolver'),
):
    queries[label_column] = queries[source_column]

In [14]:
# set some feature
# NOTE: despite the column name, the value is read from the IP layer of the
# client's own query packet.

def _client_query_ttl(row):
    """Return the ttl field of the IP layer of the row's client_query packet."""
    return row['client_query'][IP].ttl

queries['feature_tcp_ttl'] = queries.apply(_client_query_ttl, axis=1)

## Show Data
### All Columns

In [15]:
# Render packet-valued columns compactly: a count for the packet list and the
# DNS summary line for the single query/response packets.
packet_column_formatters = {
    'server_packets': packet_list_fmt,
    'client_query': packet_fmt,
    'client_response': packet_fmt,
}
queries.tail(10).style.format(packet_column_formatters)

Unnamed: 0,id,qname,tag,started,server_packets,finished,zone_algorithm,zone_keysize,zone_nsec,zone_status,resolver,num_client_packets,client_query,client_response,client_response_rcode,label_rcode,label_resolver,feature_tcp_ttl
224,40177,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-1-signedbrokenwrongds.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.097196,2 packets,1623346497.327115,b'rsasha512',b'2048',b'1',b'signedbrokenwrongds',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-1-signedbrokenwrongds.dnstb.net.'""",DNS Ans,2,2,b'neustar',64
225,52029,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-1-signedok.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.330677,2 packets,1623346497.428865,b'rsasha512',b'2048',b'1',b'signedok',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-1-signedok.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
226,31248,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedbrokennods.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.434702,2 packets,1623346497.510711,b'rsasha512',b'2048',b'3',b'signedbrokennods',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedbrokennods.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
227,17647,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedbrokenwrongds.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.514154,2 packets,1623346497.73602,b'rsasha512',b'2048',b'3',b'signedbrokenwrongds',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedbrokenwrongds.dnstb.net.'""",DNS Ans,2,2,b'neustar',64
228,27316,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedok.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.739234,2 packets,1623346497.834006,b'rsasha512',b'2048',b'3',b'signedok',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-2048-3-signedok.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
229,14447,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedbrokennods.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.83624,2 packets,1623346497.911923,b'rsasha512',b'4096',b'1',b'signedbrokennods',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedbrokennods.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
230,57676,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedbrokenwrongds.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346497.914967,2 packets,1623346498.146381,b'rsasha512',b'4096',b'1',b'signedbrokenwrongds',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedbrokenwrongds.dnstb.net.'""",DNS Ans,2,2,b'neustar',64
231,58653,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedok.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346498.155244,2 packets,1623346498.355878,b'rsasha512',b'4096',b'1',b'signedok',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-1-signedok.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
232,15075,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-3-signedbrokennods.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346498.359319,2 packets,1623346498.443433,b'rsasha512',b'4096',b'3',b'signedbrokennods',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-3-signedbrokennods.dnstb.net.'""","DNS Ans ""8.8.8.8""",0,0,b'neustar',64
233,22424,b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-3-signedbrokenwrongds.dnstb.net.',b'lab-dev-neustar-free-recursive-20210610192648',1623346498.449607,2 packets,1623346498.759942,b'rsasha512',b'4096',b'3',b'signedbrokenwrongds',b'neustar',2,"DNS Qry ""b'lab-dev-neustar-free-recursive-20210610192648.rsasha512-4096-3-signedbrokenwrongds.dnstb.net.'""",DNS Ans,2,2,b'neustar',64


### ML Columns

In [16]:
# collect columns relevant for ML: everything named label_* or feature_*
ML = [column for column in queries.keys() if column.startswith(('label_', 'feature_'))]

In [17]:
# save table to disk
# Only the columns listed in ML (label_* and feature_*) are written; the file
# name is relative to the notebook's working directory.
queries[ML].to_pickle('ml_data.pickle')

In [18]:
# show table
# No custom formatters here: ML only selects label_*/feature_* columns, so the
# packet formatters for server_packets / client_query / client_response could
# never apply to this subset.
queries[ML].iloc[-10:].style

Unnamed: 0,label_rcode,label_resolver,feature_tcp_ttl
224,2,b'neustar',64
225,0,b'neustar',64
226,0,b'neustar',64
227,2,b'neustar',64
228,0,b'neustar',64
229,0,b'neustar',64
230,2,b'neustar',64
231,0,b'neustar',64
232,0,b'neustar',64
233,2,b'neustar',64


### Some Data Insights

In [19]:
# show a single query to the server
# Walks the server-side packets matched to the first query and dumps every
# packet that is itself a DNS query (qr == 0), starting at its IP layer.

for p in queries.iloc[0]['server_packets']:
    if p[DNS].qr == 0:
        # show() prints the dump itself and returns None, so wrapping it in
        # print() would add a stray "None" line to the output.
        p[IP].show()

###[ IP ]### 
  version   = 4
  ihl       = 5
  tos       = 0x0
  len       = 140
  id        = 51876
  flags     = DF
  frag      = 0
  ttl       = 51
  proto     = udp
  chksum    = 0x6bc
  src       = 141.12.128.13
  dst       = 130.149.230.81
  \options   \
###[ UDP ]### 
     sport     = 50249
     dport     = domain
     len       = 120
     chksum    = 0x4f4d
###[ DNS ]### 
        id        = 38930
        qr        = 0
        opcode    = QUERY
        aa        = 0
        tc        = 0
        rd        = 0
        ra        = 0
        z         = 0
        ad        = 0
        cd        = 0
        rcode     = ok
        qdcount   = 1
        ancount   = 0
        nscount   = 0
        arcount   = 1
        \qd        \
         |###[ DNS Question Record ]### 
         |  qname     = 'lab-dev-powerdns460-20210610190354.ecdsap256sha256-256-1-signedbrokennods.dnstb.net.'
         |  qtype     = A
         |  qclass    = IN
        an        = None
        ns        = None
 

In [20]:
# show response code grouped by zone status, resolver, algorithm, key size, nsec

# Lift the row limit so the complete summary table is displayed.
pd.set_option('display.max_rows', None)

group_keys = ['zone_status', 'resolver', 'zone_algorithm', 'zone_keysize', 'zone_nsec']
rcode_statistics = {'client_response_rcode': ['min', 'mean', 'max']}
queries.groupby(group_keys).agg(rcode_statistics)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,client_response_rcode,client_response_rcode,client_response_rcode
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,min,mean,max
zone_status,resolver,zone_algorithm,zone_keysize,zone_nsec,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
b'signedbrokennods',b'neustar',b'ecdsap256sha256',b'256',b'1',0,0,0
b'signedbrokennods',b'neustar',b'ecdsap256sha256',b'256',b'3',0,0,0
b'signedbrokennods',b'neustar',b'ecdsap384sha384',b'384',b'1',0,0,0
b'signedbrokennods',b'neustar',b'ecdsap384sha384',b'384',b'3',0,0,0
b'signedbrokennods',b'neustar',b'ed25519',b'256',b'1',0,0,0
b'signedbrokennods',b'neustar',b'ed25519',b'256',b'3',0,0,0
b'signedbrokennods',b'neustar',b'ed448',b'456',b'1',0,0,0
b'signedbrokennods',b'neustar',b'ed448',b'456',b'3',0,0,0
b'signedbrokennods',b'neustar',b'rsasha1',b'1024',b'1',0,0,0
b'signedbrokennods',b'neustar',b'rsasha1',b'1024',b'3',0,0,0
