In [1]:
import os
import os.path
import sys
import pandas as pd
import numpy as np

In [2]:
# Directory holding the raw capture and the derived CSV. Paths below are built
# by plain string concatenation (LOCAL_PATH + name), so keep the trailing slash.
LOCAL_PATH = 'data/'
# File name of the raw packet capture inside LOCAL_PATH.
RAW_TRACE = 'CAP_NIC1_00931_20130727230801.dms'
# File name of the CSV that tshark writes into LOCAL_PATH.
TRACE_CSV_FILE = 'trace.csv'

In [3]:
# Fetch the raw capture from S3 only when it is not already cached locally.
if not os.path.exists(LOCAL_PATH + RAW_TRACE):
    # makedirs(..., exist_ok=True) also creates missing parent directories and
    # does not fail if the directory already exists (or appears concurrently).
    os.makedirs(LOCAL_PATH, exist_ok=True)

    # Imported lazily so boto3 is only required when a download is needed.
    import boto3
    import botocore

    BUCKET_NAME = 'edu.nyu.hsn.ddos-data' # replace with your bucket name
    KEY = 'CAP_NIC1_00931_20130727230801.dms' # replace with your object key

    s3 = boto3.resource('s3')

    try:
        s3.Bucket(BUCKET_NAME).download_file(KEY, LOCAL_PATH + RAW_TRACE)
    except botocore.exceptions.ClientError as e:
        # A missing object is reported and execution continues; any other
        # client error is re-raised unchanged.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

In [4]:
import subprocess

# Convert the raw capture to CSV with tshark once; later runs reuse the CSV.
trace_csv_path = LOCAL_PATH + TRACE_CSV_FILE
if not os.path.exists(trace_csv_path):
    tshark_fields = [
        'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
        'frame.time_relative', 'tcp.seq', 'tcp.ack',
        'tcp.flags.syn', 'tcp.flags.fin',
    ]
    # An argument list (no shell) passes the paths through verbatim, so spaces
    # or shell metacharacters in them cannot change the command.
    tshark_args = ['tshark', '-r', LOCAL_PATH + RAW_TRACE, '-Y', 'tcp', '-T', 'fields']
    for field_name in tshark_fields:
        tshark_args.extend(['-e', field_name])
    tshark_args.extend(['-Eheader=y', '-Eseparator=,'])

    # Write to a side file first: the existence check above must never be
    # satisfied by an empty CSV left behind by a tshark run that failed.
    partial_csv_path = trace_csv_path + '.part'
    try:
        with open(partial_csv_path, 'wb') as csv_out:
            tshark_command = subprocess.Popen(tshark_args, stdout=csv_out, stderr=subprocess.PIPE)
            _, err_data = tshark_command.communicate()
        err_data = err_data.decode('utf-8', errors='replace')
        if err_data != '':
            # tshark diagnostics are shown but are not fatal on their own.
            print(err_data)
        if os.path.getsize(partial_csv_path) == 0:
            raise RuntimeError(
                'tshark produced no output (exit code {}); {} was not created'.format(
                    tshark_command.returncode, trace_csv_path))
        # Publish the finished CSV under its final name in one step.
        os.replace(partial_csv_path, trace_csv_path)
    finally:
        # Only still present when the export did not complete.
        if os.path.exists(partial_csv_path):
            os.remove(partial_csv_path)

In [5]:
# Load the tshark-exported CSV (written with a header row via -Eheader=y).
trace_df = pd.read_csv(LOCAL_PATH + TRACE_CSV_FILE)

In [6]:
# (rows, columns) of the loaded trace.
trace_df.shape

(3275767, 10)

In [7]:
# Column labels of the loaded trace.
trace_df.columns

Index(['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
       'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.syn',
       'tcp.flags.fin'],
      dtype='object')

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) of the trace.
trace_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative
count,3275767.0,3275767.0,3275767.0,3275767.0
mean,19590.4,11897.82,1001.266,27.70661
std,21304.04,17097.13,645.4179,11.89198
min,21.0,21.0,0.0,0.0
25%,445.0,2770.0,118.0,17.34808
50%,2770.0,2901.0,1460.0,27.84759
75%,40920.0,9876.0,1460.0,37.92557
max,65534.0,65516.0,1460.0,48.28968


In [8]:
def extract_tcp_tuple(record):
    """Build the directional connection key for one packet record.

    Args:
        record: Packet row indexable by 'ip.src', 'tcp.srcport', 'ip.dst'
            and 'tcp.dstport'.

    Returns:
        The 4-tuple (source ip, source port, destination ip, destination port).
    """
    key_fields = ('ip.src', 'tcp.srcport', 'ip.dst', 'tcp.dstport')
    return tuple(record[field] for field in key_fields)

def reverse_tcp_tuple(tcp_tuple):
    """Return the connection key for the opposite direction.

    Args:
        tcp_tuple: Sequence laid out as (src ip, src port, dst ip, dst port).

    Returns:
        A tuple with the source and destination endpoint pairs swapped.
    """
    swapped_positions = (2, 3, 0, 1)
    return tuple(tcp_tuple[position] for position in swapped_positions)

def is_in(record, pcap_statistics, time_delta_threshold=60):
    """Tell whether a packet belongs to the latest known flow of its connection.

    The connection is looked up under the record's own tuple first and under
    the reversed tuple otherwise. The packet belongs to the flow when the gap
    between its timestamp and the flow's last recorded activity
    (``rel_start + duration``) does not exceed ``time_delta_threshold``.

    Args:
        record: Packet row providing the tuple fields and 'frame.time_relative'.
        pcap_statistics: Dict mapping a tcp tuple to its list of flow dicts;
            only the last flow in the list is considered.
        time_delta_threshold: Largest accepted gap, in the units of
            'frame.time_relative'. Defaults to 60, the previously
            hard-coded value.

    Returns:
        False when neither orientation is a key of ``pcap_statistics`` or the
        gap is larger than the threshold, otherwise True.

    NOTE(review): when both orientations are keys, only the record's own
    orientation is consulted, even if the reversed key holds the more recent
    flow. Both keys can exist once add_in_statistics has stored a new flow
    under the opposite orientation of an expired one.
    """
    tcp_tuple = extract_tcp_tuple(record)
    if tcp_tuple in pcap_statistics:
        tcp_flow_list = pcap_statistics[tcp_tuple]
    else:
        # Only build the reversed key when the direct lookup missed.
        reversed_tcp_tuple = reverse_tcp_tuple(tcp_tuple)
        if reversed_tcp_tuple not in pcap_statistics:
            return False
        tcp_flow_list = pcap_statistics[reversed_tcp_tuple]

    last_tcp_flow = tcp_flow_list[-1]
    last_activity = last_tcp_flow['rel_start'] + last_tcp_flow['duration']
    return record['frame.time_relative'] - last_activity <= time_delta_threshold

def extract_useful_info(record):
    """Create the statistics entry for a flow from its first packet.

    Args:
        record: Packet row providing 'ip.src', 'tcp.srcport', 'ip.dst',
            'tcp.dstport', 'frame.time_relative' and 'tcp.len'.

    Returns:
        A new dict describing a one-packet flow: the endpoints, the start
        time, zero duration, and packet/byte counters where the first packet
        is counted as forward traffic.
    """
    flow = dict()
    # Endpoints as seen on the first packet define the forward direction.
    flow["src_ip"] = record['ip.src']
    flow["src_port"] = record['tcp.srcport']
    flow["dst_ip"] = record['ip.dst']
    flow["dst_port"] = record['tcp.dstport']
    # Timing.
    flow["rel_start"] = record['frame.time_relative']
    flow["duration"] = 0
    # Totals, then the per-direction split.
    flow["packet_count"] = 1
    flow["byte_count"] = record['tcp.len']
    flow["forward_packet_count"] = 1
    flow["forward_byte_count"] = record['tcp.len']
    flow["backward_packet_count"] = 0
    flow["backward_byte_count"] = 0
    flow["inter_arrival_time_summed"] = 0
    return flow

def add_in_statistics(pcap_statistics, tcp_tuple, record):
    """Start a new flow for ``tcp_tuple``, seeded from ``record``.

    Args:
        pcap_statistics: Dict mapping a tcp tuple to its list of flow dicts;
            modified in place.
        tcp_tuple: Key under which the new flow is stored.
        record: First packet of the new flow.

    Returns:
        The same ``pcap_statistics`` object that was passed in.
    """
    # setdefault creates the per-tuple list on first use, then the fresh flow
    # entry is appended after any earlier flows of the same tuple.
    pcap_statistics.setdefault(tcp_tuple, []).append(extract_useful_info(record))
    return pcap_statistics

def update_statistics_info(pcap_statistics,tcp_tuple,record,is_forward_stream):
    """Fold one more packet into the latest flow stored under ``tcp_tuple``.

    Args:
        pcap_statistics: Dict mapping a tcp tuple to its list of flow dicts;
            the last flow of ``tcp_tuple`` is modified in place.
        tcp_tuple: Key of the flow list to update; must already be present.
        record: Packet row providing 'frame.time_relative' and 'tcp.len'.
        is_forward_stream: True to count the packet in the forward counters,
            False to count it in the backward counters.

    Returns:
        The same ``pcap_statistics`` object that was passed in.
    """
    flow = pcap_statistics[tcp_tuple][-1]
    arrival = record['frame.time_relative']

    # Gap since the flow's last recorded activity, measured before the
    # duration is extended by this packet.
    flow['inter_arrival_time_summed'] += arrival - (flow['rel_start'] + flow['duration'])
    # The duration never shrinks, even if this packet is timestamped earlier
    # than the current end of the flow.
    flow['duration'] = max(flow['duration'], arrival - flow['rel_start'])

    flow['packet_count'] += 1
    flow['byte_count'] += record['tcp.len']

    if not is_forward_stream:
        flow['backward_packet_count'] += 1
        flow['backward_byte_count'] += record['tcp.len']
    else:
        flow['forward_packet_count'] += 1
        flow['forward_byte_count'] += record['tcp.len']
    return pcap_statistics

def update_statistics(pcap_statistics, tcp_tuple, record):
    """Update the flow a packet belongs to, choosing its direction.

    A packet whose own tuple is a key of ``pcap_statistics`` is counted as
    forward traffic of that flow; otherwise it is counted as backward traffic
    of the flow stored under the reversed tuple.

    Args:
        pcap_statistics: Dict mapping a tcp tuple to its list of flow dicts.
        tcp_tuple: Tuple extracted from ``record``.
        record: Packet row to account for.

    Returns:
        The same ``pcap_statistics`` object, as returned by
        ``update_statistics_info``.
    """
    travels_forward = tcp_tuple in pcap_statistics
    flow_key = tcp_tuple if travels_forward else reverse_tcp_tuple(tcp_tuple)
    return update_statistics_info(pcap_statistics,flow_key,record,is_forward_stream=travels_forward)

def flatten_dict(pcap_tcp_statistics):
    """Collect every flow from the per-tuple lists into a single list.

    Args:
        pcap_tcp_statistics: Dict mapping a tcp tuple to its list of flow dicts.

    Returns:
        A new list holding the same flow objects, in the dict's iteration
        order and, within one tuple, in list order.
    """
    return [
        tcp_flow
        for tcp_flow_list in pcap_tcp_statistics.values()
        for tcp_flow in tcp_flow_list
    ]

In [13]:
def to_feature_df(pcap_df):
    """Aggregate a packet-level frame into a frame with one row per flow.

    Rows are processed in frame order. A packet that ``is_in`` accepts updates
    the existing flow; any other packet starts a new flow.

    Args:
        pcap_df: DataFrame of packets with the columns read by the helper
            functions (tuple fields, 'frame.time_relative', 'tcp.len').

    Returns:
        A DataFrame built from the flattened list of flow dicts.
    """
    flows_by_tuple = {}
    for _, packet in pcap_df.iterrows():
        packet_tuple = extract_tcp_tuple(packet)
        if is_in(packet, flows_by_tuple):
            update_statistics(flows_by_tuple, packet_tuple, packet)
        else:
            add_in_statistics(flows_by_tuple, packet_tuple, packet)
    flow_records = flatten_dict(flows_by_tuple)
    return pd.DataFrame(flow_records)

In [14]:
# Aggregate the packet trace into per-flow rows and display the result
# (the returned frame is not bound to a name here).
to_feature_df(trace_df)

Unnamed: 0,backward_byte_count,backward_packet_count,byte_count,dst_ip,dst_port,duration,forward_byte_count,forward_packet_count,inter_arrival_time_summed,packet_count,rel_start,src_ip,src_port
0,0,0,29515252,10.2.4.172,63633,48.289258,29515252,30351,48.289258,30351,0.000000,10.2.21.28,389
1,0,0,1402727437,10.2.4.146,9876,48.289677,1402727437,960992,48.289677,960992,0.000001,10.2.4.179,40920
2,9460,11,10300,10.2.25.108,60788,0.004498,840,8,0.004498,19,0.000154,10.2.253.247,51127
3,3378036,7853,8554000,10.2.4.179,47892,48.289389,5175964,4106,48.289389,11959,0.000155,10.2.4.168,50334
4,51192,79,67624,10.2.27.29,1571,48.000220,16432,165,48.000220,244,0.000309,10.2.27.33,55476
5,51192,79,67624,10.2.27.29,1571,48.000223,16432,165,48.000223,244,0.000310,10.2.27.33,55467
6,50544,78,66768,10.2.27.29,1571,48.000223,16224,164,48.000223,242,0.000311,10.2.27.33,55480
7,42364,996,273678,10.2.27.29,1571,48.268538,231314,1322,48.268538,2318,0.000312,10.2.27.33,65016
8,8948,271,1353990,10.2.1.37,43634,37.644050,1345042,998,37.644050,1269,0.000313,10.2.1.32,1433
9,427507,718,1209346,10.2.4.166,26080,45.090801,781839,756,45.090801,1474,0.000313,10.2.21.29,389
