In [9]:
import os
import os.path
import sys
import pandas as pd
import numpy as np

In [10]:
# Local directory that holds the downloaded trace and the CSV files derived from it.
LOCAL_PATH = 'data'
# File name (inside LOCAL_PATH) under which the downloaded S3 object is stored.
RAW_TRACE = 'ashakan_raw.pcapng'
# File name (inside LOCAL_PATH) of the per-packet CSV exported by tshark.
TRACE_FEATURE_FILE = 'tcp_flow_features.csv'
BUCKET_NAME = 'edu.nyu.hsn.ddos-data' # replace with your bucket name
KEY = 'CAP_NIC1_00931_20130727230801.dms' # replace with your object key

In [6]:
# Fetch the raw capture from S3 only when it is not already cached locally.
raw_trace_path = os.path.join(LOCAL_PATH, RAW_TRACE)
if not os.path.exists(raw_trace_path):
    # makedirs(exist_ok=True) also copes with a nested LOCAL_PATH and with the
    # directory appearing between the check and the creation.
    os.makedirs(LOCAL_PATH, exist_ok=True)

    # Imported lazily so boto3 is only required when a download is needed.
    import boto3
    import botocore

    s3 = boto3.resource('s3')

    try:
        s3.Bucket(BUCKET_NAME).download_file(KEY, raw_trace_path)
    except botocore.exceptions.ClientError as e:
        # A missing object is reported but not fatal; any other client
        # error is propagated unchanged.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

In [7]:
import subprocess

# Export one CSV row per TCP packet with tshark, unless the CSV is already cached.
raw_trace_path = os.path.join(LOCAL_PATH, RAW_TRACE)
feature_path = os.path.join(LOCAL_PATH, TRACE_FEATURE_FILE)
if not os.path.exists(feature_path):
    tshark_fields = [
        'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
        'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.ack',
        'tcp.flags.syn', 'tcp.flags.fin', 'tcp.stream',
    ]
    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through verbatim.
    tshark_args = ['tshark', '-r', raw_trace_path, '-Y', 'tcp', '-T', 'fields']
    for field in tshark_fields:
        tshark_args += ['-e', field]
    tshark_args += ['-Eheader=y', '-Eseparator=,']

    # Write to a side file first so an interrupted or failed run never leaves
    # a file that satisfies the cache check above.
    partial_path = feature_path + '.partial'
    with open(partial_path, 'wb') as feature_file:
        tshark_command = subprocess.Popen(tshark_args, stdout=feature_file, stderr=subprocess.PIPE)
        out_data, err_data = tshark_command.communicate()
    err_data = err_data.decode('utf-8', errors='replace')

    if os.path.getsize(partial_path) == 0:
        # Nothing was exported; an empty CSV cannot be loaded downstream.
        os.remove(partial_path)
        raise RuntimeError('tshark produced no output (exit status {}): {}'.format(
            tshark_command.returncode, err_data.strip()))
    if tshark_command.returncode != 0:
        # Keep whatever was exported, but surface the problem instead of
        # silently discarding it.
        print('tshark exited with status {}: {}'.format(
            tshark_command.returncode, err_data.strip()), file=sys.stderr)
    os.replace(partial_path, feature_path)

In [8]:
# Load the per-packet CSV exported by tshark into memory.
trace_df = pd.read_csv(LOCAL_PATH + "/" + TRACE_FEATURE_FILE)

In [9]:
# Number of rows and columns in the loaded packet table.
trace_df.shape

(3275767, 12)

In [10]:
# Column names; these match the tshark fields requested during export.
trace_df.columns

Index(['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
       'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.ack',
       'tcp.flags.syn', 'tcp.flags.fin', 'tcp.stream'],
      dtype='object')

In [11]:
# Summary statistics of the numeric columns.
trace_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream
count,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0
mean,19590.4,11897.82,1001.266,27.70661,389656500.0,55832030.0,0.9967156,0.006052323,0.005241521,1130.049
std,21304.04,17097.13,645.4179,11.89198,462089200.0,158493700.0,0.05721569,0.07756091,0.07220837,3101.892
min,21.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,445.0,2770.0,118.0,17.34808,803314.0,1.0,1.0,0.0,0.0,1.0
50%,2770.0,2901.0,1460.0,27.84759,236458800.0,303409.0,1.0,0.0,0.0,14.0
75%,40920.0,9876.0,1460.0,37.92557,642160200.0,1348768.0,1.0,0.0,0.0,38.0
max,65534.0,65516.0,1460.0,48.28968,4294967000.0,4294967000.0,1.0,1.0,1.0,17983.0


In [19]:
def is_in(record, pcap_statistics):
    """Report whether the record's TCP stream already has a statistics entry.

    record: mapping-like packet row exposing a 'tcp.stream' value.
    pcap_statistics: container keyed by stream id.
    """
    stream_id = record['tcp.stream']
    return stream_id in pcap_statistics

def obtain_tcp_tuple(record):
    """Return the packet's endpoints as a nested dict.

    record: mapping-like packet row with 'ip.src', 'ip.dst', 'tcp.srcport'
        and 'tcp.dstport' values.

    Returns {'src': {'ip', 'port'}, 'dst': {'ip', 'port'}}.
    """
    return {
        'src': {'ip': record['ip.src'], 'port': record['tcp.srcport']},
        # The destination port comes from tcp.dstport (previously the source
        # port was reported for both endpoints).
        'dst': {'ip': record['ip.dst'], 'port': record['tcp.dstport']},
    }

def extract_useful_info(record):
    """Build the initial statistics entry for a stream from its first packet.

    The first packet defines the forward direction, so it is counted once in
    the totals and once in the forward counters; the backward counters,
    duration and inter-arrival sum start at zero.
    """
    payload_len = record['tcp.len']
    return {
        "src_ip": record['ip.src'],
        "src_port": record['tcp.srcport'],
        "dst_ip": record['ip.dst'],
        "dst_port": record['tcp.dstport'],
        "rel_start": record['frame.time_relative'],
        "duration": 0,
        "packet_count": 1,
        "byte_count": payload_len,
        "forward_packet_count": 1,
        "forward_byte_count": payload_len,
        "backward_packet_count": 0,
        "backward_byte_count": 0,
        "inter_arrival_time_summed": 0,
    }

def add_in_statistics(pcap_statistics, record):
    """Register a new stream in pcap_statistics from its first packet.

    Stores the entry built by extract_useful_info under the record's
    'tcp.stream' id and returns (stream id, stored entry).
    """
    stream_id = record['tcp.stream']
    pcap_statistics[stream_id] = flow_entry = extract_useful_info(record)
    return stream_id, flow_entry

def update_statistics_info(pcap_statistics,record,is_forward_stream):
    """Fold one more packet into the existing entry of its stream.

    pcap_statistics: dict of per-stream entries, keyed by stream id.
    record: mapping-like packet row ('tcp.stream', 'frame.time_relative', 'tcp.len').
    is_forward_stream: truthy when the packet travels in the direction of the
        stream's first packet; selects which directional counters grow.

    Returns (stream id, updated entry); the entry is modified in place.
    """
    stream_id = record['tcp.stream']
    flow = pcap_statistics[stream_id]
    arrival = record['frame.time_relative']

    # rel_start + duration is the latest timestamp recorded for this stream.
    latest_seen = flow['rel_start'] + flow['duration']
    flow['inter_arrival_time_summed'] += arrival - latest_seen
    flow['duration'] = max(flow['duration'], arrival - flow['rel_start'])

    flow['packet_count'] += 1
    flow['byte_count'] += record['tcp.len']

    if is_forward_stream:
        packet_key, byte_key = 'forward_packet_count', 'forward_byte_count'
    else:
        packet_key, byte_key = 'backward_packet_count', 'backward_byte_count'
    flow[packet_key] += 1
    flow[byte_key] += record['tcp.len']

    return stream_id, flow

def update_statistics(pcap_statistics, record):
    """Update the entry of an already-known stream with one more packet.

    The packet counts as forward when its source IP and source port equal the
    ones stored for the stream, and as backward otherwise. Returns whatever
    update_statistics_info returns.
    """
    flow = pcap_statistics[record['tcp.stream']]
    sender = obtain_tcp_tuple(record)['src']
    travels_forward = bool(
        sender['ip'] == flow['src_ip'] and sender['port'] == flow['src_port']
    )
    return update_statistics_info(pcap_statistics, record, is_forward_stream=travels_forward)

In [20]:
import time
from tqdm import tqdm
def to_feature_df(pcap_df,drop_rate = 0.0, seed = None):
    """Aggregate a per-packet table into one row of statistics per TCP stream.

    pcap_df: DataFrame of packets with the columns read by the helper
        functions ('tcp.stream', 'ip.src', 'ip.dst', 'tcp.srcport',
        'tcp.dstport', 'tcp.len', 'frame.time_relative').
    drop_rate: each packet is skipped when a uniform draw from [0, 1) is
        below this value; 0.0 keeps every packet.
    seed: optional integer seed for the packet sampling. When None (the
        default) the current time is used, as before; pass a fixed value to
        make a sampled run repeatable.

    Returns a DataFrame with one row per stream, with the count columns
    renamed to their short forms (back_byte, back_pkt, tot_byte, forw_byte,
    forw_pkt, tot_packet).
    """
    pcap_tcp_statistics = {}
    # A private generator yields the same draws as seeding the global one,
    # without disturbing random state used elsewhere in the notebook.
    rng = np.random.RandomState(int(time.time()) if seed is None else seed)
    for index, row in tqdm(pcap_df.iterrows()):
        rand = rng.rand()
        if rand >= drop_rate:
            if not is_in(row, pcap_tcp_statistics):
                add_in_statistics(pcap_tcp_statistics, row)
            else:
                update_statistics(pcap_tcp_statistics, row)
    pcap_tcp_statistics = list(pcap_tcp_statistics.values())
    return pd.DataFrame(pcap_tcp_statistics).rename(columns={
        'backward_byte_count': 'back_byte',
        'backward_packet_count': 'back_pkt',
        'byte_count': 'tot_byte',
        'forward_byte_count': 'forw_byte',
        'forward_packet_count': 'forw_pkt',
        'packet_count': 'tot_packet',
    })

In [21]:
# Packet-based random sampling: write one flow-feature CSV per sampling percentage.
for sampling_percent in [100]:
    drop_percent = 100 - sampling_percent
    drop_rate = drop_percent / 100.0
    flow_features = to_feature_df(trace_df, drop_rate)
    output_csv = '{PATH}/packet_rand_{PERCENT}%.csv'.format(PATH=LOCAL_PATH,PERCENT=sampling_percent)
    flow_features.to_csv(output_csv, index=False)

3275767it [12:34, 4342.81it/s]


# Test

In [11]:
# Reload the flow features written for the 100% sample. The path is built from
# LOCAL_PATH so it stays in step with the cell that writes the file.
df = pd.read_csv('{PATH}/packet_rand_{PERCENT}%.csv'.format(PATH=LOCAL_PATH, PERCENT=100))

In [15]:
# Order the flows by the relative timestamp of their first packet.
df = df.sort_values(by=['rel_start'])

In [16]:
# Boolean masks for one specific flow (source 10.2.27.52:2770 -> destination 10.2.27.131:445).
src_ip_constr = df['src_ip'] == '10.2.27.52'
src_port_constr = df['src_port'] == 2770
dst_ip_constr = df['dst_ip'] == '10.2.27.131'
dst_port_constr = df['dst_port'] == 445
# NOTE(review): only the source-IP mask is applied here; the other three masks
# are defined but unused. Confirm whether they were meant to be combined.
df[src_ip_constr]

Unnamed: 0,back_byte,back_pkt,tot_byte,dst_ip,dst_port,duration,forw_byte,forw_pkt,inter_arrival_time_summed,tot_packet,rel_start,src_ip,src_port
14,1249217498,875204,1251649613,10.2.27.131,445,48.289033,2432115,417610,48.289033,1292814,0.000645,10.2.27.52,2770


In [17]:
# First rows of the sorted table, i.e. the flows that start earliest.
df.head()

Unnamed: 0,back_byte,back_pkt,tot_byte,dst_ip,dst_port,duration,forw_byte,forw_pkt,inter_arrival_time_summed,tot_packet,rel_start,src_ip,src_port
0,0,0,29515252,10.2.4.172,63633,48.289258,29515252,30351,48.289258,30351,0.0,10.2.21.28,389
1,0,0,1402727437,10.2.4.146,9876,48.289677,1402727437,960992,48.289677,960992,1e-06,10.2.4.179,40920
2,9460,11,10300,10.2.25.108,60788,0.004498,840,8,0.004498,19,0.000154,10.2.253.247,51127
3,3378036,7853,8554000,10.2.4.179,47892,48.289389,5175964,4106,48.289389,11959,0.000155,10.2.4.168,50334
4,51192,79,67624,10.2.27.29,1571,48.00022,16432,165,48.00022,244,0.000309,10.2.27.33,55476
