In [1]:
import os
import os.path
import sys
import pandas as pd
import numpy as np

In [2]:
# Directory holding every local data file. The trailing slash matters: the
# notebook builds paths by plain string concatenation (LOCAL_PATH + file name).
LOCAL_PATH = 'data/'
# File name of the raw packet capture inside LOCAL_PATH (downloaded from S3 when absent).
RAW_TRACE = 'CAP_NIC1_00931_20130727230801.dms'
# File name of the per-packet CSV that tshark extracts from RAW_TRACE.
TRACE_CSV_FILE = 'trace.csv'

In [3]:
# Download the raw capture from S3 only when it is not already cached locally.
if not os.path.exists(LOCAL_PATH + RAW_TRACE):
    # makedirs(exist_ok=True) creates nested directories too and does not fail
    # when the directory already exists (no separate exists-check needed).
    if LOCAL_PATH:
        os.makedirs(LOCAL_PATH, exist_ok=True)

    # Imported here so boto3 is only required when a download actually happens.
    import boto3
    import botocore

    BUCKET_NAME = 'edu.nyu.hsn.ddos-data' # replace with your bucket name
    KEY = 'CAP_NIC1_00931_20130727230801.dms' # replace with your object key

    s3 = boto3.resource('s3')

    try:
        s3.Bucket(BUCKET_NAME).download_file(KEY, LOCAL_PATH + RAW_TRACE)
    except botocore.exceptions.ClientError as e:
        # A missing object is reported and tolerated; every other client
        # error (permissions, throttling, ...) is propagated unchanged.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

In [4]:
import subprocess
# Convert the raw capture into a CSV of per-packet TCP fields, once.
if not os.path.exists(LOCAL_PATH + TRACE_CSV_FILE):
    trace_csv_path = LOCAL_PATH + TRACE_CSV_FILE
    # tshark writes to a scratch file that is renamed into place only when the
    # run produced data. A failed run therefore never leaves an empty
    # trace.csv behind that would make this guard skip the conversion forever.
    partial_csv_path = trace_csv_path + '.partial'

    tshark_fields = [
        'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
        'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.syn', 'tcp.flags.fin',
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    tshark_args = ['tshark', '-r', LOCAL_PATH + RAW_TRACE, '-Y', 'tcp', '-T', 'fields']
    for field_name in tshark_fields:
        tshark_args += ['-e', field_name]
    tshark_args += ['-Eheader=y', '-Eseparator=,']

    try:
        with open(partial_csv_path, 'wb') as csv_out:
            tshark_run = subprocess.run(tshark_args, stdout=csv_out, stderr=subprocess.PIPE)
    except OSError:
        # Raised e.g. when the tshark executable cannot be started; do not
        # leave the scratch file lying around.
        if os.path.exists(partial_csv_path):
            os.remove(partial_csv_path)
        raise

    err_data = tshark_run.stderr.decode('utf-8', errors='replace')
    if err_data != '':
        print(err_data)

    # A non-zero exit status that still wrote rows is kept (the original cell
    # kept such output as well); a failure with no output at all is fatal.
    if tshark_run.returncode != 0 and os.path.getsize(partial_csv_path) == 0:
        os.remove(partial_csv_path)
        raise RuntimeError(
            'tshark exited with status {} and produced no output'.format(tshark_run.returncode))
    os.replace(partial_csv_path, trace_csv_path)

In [5]:
# Load the per-packet CSV produced by tshark: one row per TCP packet, one
# column per extracted field (the header row comes from -Eheader=y).
trace_df = pd.read_csv(LOCAL_PATH + TRACE_CSV_FILE)

In [6]:
# Sanity check: (number of packets, number of extracted fields).
trace_df.shape

(3275767, 10)

In [7]:
# Column names are the tshark field names requested during extraction.
trace_df.columns

Index(['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
       'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.syn',
       'tcp.flags.fin'],
      dtype='object')

In [8]:
# Summary statistics of the numeric columns (ports, payload length, timing, seq/ack, flags).
trace_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.syn,tcp.flags.fin
count,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0
mean,19590.4,11897.82,1001.266,27.70661,389656500.0,55832030.0,0.006052323,0.005241521
std,21304.04,17097.13,645.4179,11.89198,462089200.0,158493700.0,0.07756091,0.07220837
min,21.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,445.0,2770.0,118.0,17.34808,803314.0,1.0,0.0,0.0
50%,2770.0,2901.0,1460.0,27.84759,236458800.0,303409.0,0.0,0.0
75%,40920.0,9876.0,1460.0,37.92557,642160200.0,1348768.0,0.0,0.0
max,65534.0,65516.0,1460.0,48.28968,4294967000.0,4294967000.0,1.0,1.0


In [9]:
def extract_tcp_tuple(record):
    """Build the flow key of a packet record.

    ``record`` must be indexable by the tshark column names. The result is
    the 4-tuple ``(source IP, source port, destination IP, destination port)``.
    """
    endpoint_fields = ('ip.src', 'tcp.srcport', 'ip.dst', 'tcp.dstport')
    return tuple(record[field] for field in endpoint_fields)

def reverse_tcp_tuple(tcp_tuple):
    """Return the flow key of the opposite direction.

    The source pair (positions 0, 1) and the destination pair (positions 2, 3)
    of ``tcp_tuple`` trade places.
    """
    swapped_order = (2, 3, 0, 1)
    return tuple(tcp_tuple[position] for position in swapped_order)

def is_in(record, pcap_statistics, time_delta_threshold=60):
    """Tell whether ``record`` continues a flow already tracked in ``pcap_statistics``.

    Parameters
    ----------
    record : mapping indexable by the tshark column names
        The packet to classify.
    pcap_statistics : dict
        Maps a flow key (see ``extract_tcp_tuple``) to the list of flows
        recorded under that key; only the last flow of the list is consulted.
    time_delta_threshold : number, optional
        Largest allowed gap between the packet's ``frame.time_relative`` and
        the last recorded time of the flow (``rel_start + duration``), in the
        units of ``frame.time_relative``. Defaults to 60, the value that was
        previously hard-coded.

    Returns
    -------
    bool
        ``False`` when neither the packet's key nor its reversed key is
        tracked, otherwise whether the gap is within the threshold.

    Notes
    -----
    When both orientations of the key are present, the packet's own
    orientation takes precedence.
    """
    tcp_tuple = extract_tcp_tuple(record)
    if tcp_tuple in pcap_statistics:
        tcp_flow_list = pcap_statistics[tcp_tuple]
    else:
        # The reversed key is only needed when the forward key is unknown.
        reversed_tcp_tuple = reverse_tcp_tuple(tcp_tuple)
        if reversed_tcp_tuple not in pcap_statistics:
            return False
        tcp_flow_list = pcap_statistics[reversed_tcp_tuple]

    last_tcp_flow = tcp_flow_list[-1]
    last_seen = last_tcp_flow['rel_start'] + last_tcp_flow['duration']
    return record['frame.time_relative'] - last_seen <= time_delta_threshold

def extract_useful_info(record):
    """Create the statistics entry for a flow whose first packet is ``record``.

    The packet is counted as the first forward packet: total and forward
    counters start at one packet / ``tcp.len`` bytes, backward counters,
    duration and summed inter-arrival time start at zero. Key order is kept
    stable because it determines column order once the entries are turned
    into a DataFrame.
    """
    flow = {
        "src_ip": record['ip.src'],
        "src_port": record['tcp.srcport'],
        "dst_ip": record['ip.dst'],
        "dst_port": record['tcp.dstport'],
        "rel_start": record['frame.time_relative'],
        "duration": 0,
        "packet_count": 1,
    }
    payload_len = record['tcp.len']
    flow["byte_count"] = payload_len
    flow["forward_packet_count"] = 1
    flow["forward_byte_count"] = payload_len
    flow["backward_packet_count"] = 0
    flow["backward_byte_count"] = 0
    flow["inter_arrival_time_summed"] = 0
    return flow

def add_in_statistics(pcap_statistics, tcp_tuple, record):
    """Start a new flow for ``record`` under ``tcp_tuple``.

    The flow list for the key is created on first use; the new entry built
    from ``record`` is appended to it. ``pcap_statistics`` is modified in
    place and also returned.
    """
    pcap_statistics.setdefault(tcp_tuple, []).append(extract_useful_info(record))
    return pcap_statistics

def update_statistics_info(pcap_statistics,tcp_tuple,record,is_forward_stream):
    """Fold one more packet into the latest flow stored under ``tcp_tuple``.

    The gap between the packet and the flow's last recorded time
    (``rel_start + duration``) is added to ``inter_arrival_time_summed``
    before ``duration`` is stretched to reach the packet. Total counters are
    always bumped; ``is_forward_stream`` selects whether the forward or the
    backward counters receive the packet. ``pcap_statistics`` is modified in
    place and returned.
    """
    flow = pcap_statistics[tcp_tuple][-1]
    arrival = record['frame.time_relative']
    payload_len = record['tcp.len']

    # Uses the duration from *before* this packet, so it must come first.
    flow['inter_arrival_time_summed'] += arrival - (flow['rel_start'] + flow['duration'])
    flow['duration'] = max(flow['duration'], arrival - flow['rel_start'])

    flow['packet_count'] += 1
    flow['byte_count'] += payload_len
    if is_forward_stream:
        flow['forward_packet_count'] += 1
        flow['forward_byte_count'] += payload_len
    else:
        flow['backward_packet_count'] += 1
        flow['backward_byte_count'] += payload_len
    return pcap_statistics

def update_statistics(pcap_statistics, tcp_tuple, record):
    """Add ``record`` to the flow it belongs to, picking the direction.

    A packet whose own key is tracked counts as forward traffic; otherwise it
    is booked as backward traffic on the flow stored under the reversed key.
    """
    is_forward = tcp_tuple in pcap_statistics
    flow_key = tcp_tuple if is_forward else reverse_tcp_tuple(tcp_tuple)
    return update_statistics_info(pcap_statistics, flow_key, record, is_forward_stream=is_forward)

def flatten_dict(pcap_tcp_statistics):
    """Concatenate the per-key flow lists into one flat list.

    Keys are discarded; flows appear in the dictionary's iteration order and,
    within one key, in their stored order.
    """
    return [
        tcp_flow
        for tcp_flow_list in pcap_tcp_statistics.values()
        for tcp_flow in tcp_flow_list
    ]

In [17]:
import time
from tqdm import tqdm
def to_feature_df(pcap_df, drop_rate=0.0, seed=None):
    """Aggregate per-packet rows into per-flow features, with random packet dropping.

    Parameters
    ----------
    pcap_df : pandas.DataFrame
        One row per packet, with the tshark column names used by the helper
        functions (``ip.src``, ``tcp.srcport``, ``ip.dst``, ``tcp.dstport``,
        ``tcp.len``, ``frame.time_relative``).
    drop_rate : float, optional
        A packet is kept when a uniform draw from [0, 1) is ``>= drop_rate``,
        so 0.0 keeps every packet.
    seed : int, optional
        Seed for the sampling generator. Pass a fixed value to make the
        sampling reproducible. When omitted, the current time in whole
        seconds is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per flow, with the counter columns renamed to ``tot_packet``,
        ``tot_byte``, ``forw_pkt``, ``forw_byte``, ``back_pkt`` and
        ``back_byte``.
    """
    if seed is None:
        seed = int(time.time())
    # A private generator: sampling no longer reseeds NumPy's global RNG as a
    # side effect, while producing the same stream for the same seed.
    rng = np.random.RandomState(seed)

    pcap_tcp_statistics = {}
    # total= lets tqdm show a percentage and ETA instead of a bare counter.
    for index, row in tqdm(pcap_df.iterrows(), total=len(pcap_df)):
        rand = rng.rand()
        if rand >= drop_rate:
            tcp_tuple = extract_tcp_tuple(row)
            if not is_in(row, pcap_tcp_statistics):
                add_in_statistics(pcap_tcp_statistics, tcp_tuple, row)
            else:
                update_statistics(pcap_tcp_statistics, tcp_tuple, row)

    flow_records = flatten_dict(pcap_tcp_statistics)
    short_column_names = {
        'backward_byte_count': 'back_byte',
        'backward_packet_count': 'back_pkt',
        'byte_count': 'tot_byte',
        'forward_byte_count': 'forw_byte',
        'forward_packet_count': 'forw_pkt',
        'packet_count': 'tot_packet',
    }
    return pd.DataFrame(flow_records).rename(columns=short_column_names)

In [18]:
# packet-based random sampling
# Writes one per-flow feature file for each sampling level: 20%, 40%, ..., 100%
# of the packets kept (i.e. 80% ... 0% dropped).
for sampling_percent in range(20,101,20):
    drop_percent = 100 - sampling_percent
    drop_rate = drop_percent / 100.0
    sampled_features = to_feature_df(trace_df, drop_rate)
    sampled_csv_path = '{PATH}packet_rand_{PERCENT}%.csv'.format(PATH=LOCAL_PATH,PERCENT=sampling_percent)
    sampled_features.to_csv(sampled_csv_path, index=False)

3275767it [07:06, 7684.62it/s]
3275767it [08:19, 6553.66it/s]
3275767it [10:47, 5061.41it/s]
3275767it [12:48, 4259.94it/s]
3275767it [14:28, 3771.52it/s]
