In [1]:
import os
import os.path
import sys
import pandas as pd
import numpy as np

In [2]:
# Local directory that holds the downloaded capture and every CSV derived from it.
LOCAL_PATH = 'data'
# File name (inside LOCAL_PATH) under which the S3 object is stored locally.
RAW_TRACE = 'ashakan_raw.pcapng'
# File name (inside LOCAL_PATH) of the per-packet CSV that tshark extracts from RAW_TRACE.
TRACE_FEATURE_FILE = 'tcp_flow_features.csv'
# S3 location of the raw capture; only used when RAW_TRACE is not already present locally.
BUCKET_NAME = 'edu.nyu.hsn.ddos-data' # replace with your bucket name
KEY = 'CAP_NIC1_00931_20130727230801.dms' # replace with your object key

In [3]:
# Download the raw capture from S3 once; later runs reuse the local copy.
raw_trace_path = LOCAL_PATH + "/" + RAW_TRACE
if not os.path.exists(raw_trace_path):
    # makedirs(exist_ok=True) also creates missing parent directories and does not
    # fail if the directory appears between an existence check and the creation.
    os.makedirs(LOCAL_PATH, exist_ok=True)

    # Imported lazily so the AWS SDK is only required when a download is needed.
    import boto3
    import botocore

    s3 = boto3.resource('s3')

    try:
        s3.Bucket(BUCKET_NAME).download_file(KEY, raw_trace_path)
    except botocore.exceptions.ClientError as e:
        # A missing object is reported but not fatal; any other client error is re-raised.
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

In [4]:
import subprocess

# Extract one CSV row per TCP packet from the raw capture with tshark.
# The extraction is skipped when the CSV already exists.
raw_trace_path = LOCAL_PATH + "/" + RAW_TRACE
feature_path = LOCAL_PATH + "/" + TRACE_FEATURE_FILE
if not os.path.exists(feature_path):
    tshark_fields = [
        'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'tcp.len',
        'frame.time_relative', 'tcp.seq', 'tcp.ack', 'tcp.flags.ack',
        'tcp.flags.syn', 'tcp.flags.fin', 'tcp.stream',
    ]
    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
    tshark_args = ['tshark', '-r', raw_trace_path, '-Y', 'tcp', '-T', 'fields']
    for field in tshark_fields:
        tshark_args += ['-e', field]
    tshark_args += ['-Eheader=y', '-Eseparator=,']

    # Write to a temporary file first: a failed run must not leave an empty or
    # truncated CSV behind, because the existence guard above would then skip
    # the extraction on every later run.
    partial_path = feature_path + '.part'
    try:
        with open(partial_path, 'wb') as feature_file:
            tshark_result = subprocess.run(tshark_args, stdout=feature_file, stderr=subprocess.PIPE)
        err_data = tshark_result.stderr.decode('utf-8', errors='replace')
        if tshark_result.returncode != 0:
            # Reported but not fatal: output that was produced is still kept.
            print('tshark exited with status {}: {}'.format(tshark_result.returncode, err_data.strip()), file=sys.stderr)
        if os.path.getsize(partial_path) == 0:
            raise RuntimeError('tshark produced no output for {}: {}'.format(raw_trace_path, err_data.strip()))
        os.replace(partial_path, feature_path)
    finally:
        # Only present here if the extraction did not complete successfully.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [5]:
# Load the per-packet CSV and label both endpoints of every packet as "ip:port".
packet_csv_path = LOCAL_PATH + "/" + TRACE_FEATURE_FILE
trace_df = pd.read_csv(packet_csv_path).assign(
    src_addr=lambda pkts: pkts['ip.src'] + ":" + pkts['tcp.srcport'].map(str),
    dst_addr=lambda pkts: pkts['ip.dst'] + ":" + pkts['tcp.dstport'].map(str),
)

In [6]:
# Size of the packet table as (rows, columns).
trace_df.shape

(3275767, 14)

In [7]:
# Per-column dtypes of the packet table.
trace_df.dtypes

ip.src                  object
ip.dst                  object
tcp.srcport              int64
tcp.dstport              int64
tcp.len                  int64
frame.time_relative    float64
tcp.seq                  int64
tcp.ack                  int64
tcp.flags.ack            int64
tcp.flags.syn            int64
tcp.flags.fin            int64
tcp.stream               int64
src_addr                object
dst_addr                object
dtype: object

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
trace_df.describe()

Unnamed: 0,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream
count,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0,3275767.0
mean,19590.4,11897.82,1001.266,27.70661,389656500.0,55832030.0,0.9967156,0.006052323,0.005241521,1130.049
std,21304.04,17097.13,645.4179,11.89198,462089200.0,158493700.0,0.05721569,0.07756091,0.07220837,3101.892
min,21.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,445.0,2770.0,118.0,17.34808,803314.0,1.0,1.0,0.0,0.0,1.0
50%,2770.0,2901.0,1460.0,27.84759,236458800.0,303409.0,1.0,0.0,0.0,14.0
75%,40920.0,9876.0,1460.0,37.92557,642160200.0,1348768.0,1.0,0.0,0.0,38.0
max,65534.0,65516.0,1460.0,48.28968,4294967000.0,4294967000.0,1.0,1.0,1.0,17983.0


In [9]:
# Preview the first five packet rows.
trace_df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,tcp.len,frame.time_relative,tcp.seq,tcp.ack,tcp.flags.ack,tcp.flags.syn,tcp.flags.fin,tcp.stream,src_addr,dst_addr
0,10.2.21.28,10.2.4.172,389,63633,293,0.0,1,1,1,0,0,0,10.2.21.28:389,10.2.4.172:63633
1,10.2.4.179,10.2.4.146,40920,9876,1460,1e-06,1,1,1,0,0,1,10.2.4.179:40920,10.2.4.146:9876
2,10.2.4.179,10.2.4.146,40920,9876,1460,2e-06,1461,1,1,0,0,1,10.2.4.179:40920,10.2.4.146:9876
3,10.2.4.179,10.2.4.146,40920,9876,1460,2e-06,2921,1,1,0,0,1,10.2.4.179:40920,10.2.4.146:9876
4,10.2.4.179,10.2.4.146,40920,9876,1460,2e-06,4381,1,1,0,0,1,10.2.4.179:40920,10.2.4.146:9876


In [10]:
# run 'jupyter nbextension enable --py --sys-prefix widgetsnbextension' first
from tqdm import tqdm
def to_feature_df(raw_trace_df,sampling_rate=1.0,random_state=None):
    """Aggregate per-packet TCP records into one feature row per TCP stream.

    Parameters
    ----------
    raw_trace_df : pandas.DataFrame
        One row per packet. The columns read here are 'tcp.stream', 'tcp.len',
        'frame.time_relative', 'src_addr' and 'dst_addr'. The index is assumed
        to reflect capture order (as it does when the frame comes straight
        from ``pd.read_csv`` on the tshark output) -- verify for other callers.
    sampling_rate : float, default 1.0
        Fraction of packets kept by random sampling before aggregation.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a run can be reproduced. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'tcp.stream'. Holds whole-flow packet-length statistics,
        'tot_pkt', 'tot_byte', 'rel_start', 'duration', and avg/std/min/max/
        count/sum of 'tcp.len' separately for the forward and backward
        direction. Directional statistics that are undefined (for example no
        packet in that direction) are reported as -1.

    Raises
    ------
    RuntimeError
        If a stream does not involve exactly two distinct endpoints.

    Notes
    -----
    "Forward" is the direction of the first sampled packet of the stream.
    Previously the direction came from the iteration order of a ``set`` of
    address strings, so the forw_*/back_* columns could swap between runs.
    """
    def summarise_lengths(lengths, name_pred):
        """Return avg/std/min/max/count/sum of `lengths`, with -1 for undefined values."""
        def or_missing(value):
            return -1 if pd.isnull(value) else value

        return {
            'avg('+name_pred+')': or_missing(lengths.mean()),
            'std('+name_pred+')': or_missing(lengths.std()),
            'min('+name_pred+')': or_missing(lengths.min()),
            'max('+name_pred+')': or_missing(lengths.max()),
            # The count key intentionally uses the shortened prefix (e.g. 'forw_pkt').
            'count('+name_pred[0:8]+')': lengths.count(),
            'sum('+name_pred+')': or_missing(lengths.sum()),
        }

    def calculate_two_way_tcp(df):
        """Compute forward and backward packet-length statistics for one stream."""
        addrs = set(np.append(df['src_addr'].unique(), df['dst_addr'].unique()))
        if len(addrs) != 2:
            raise RuntimeError(
                'expected exactly 2 endpoints in a TCP stream, found {}: {}'.format(len(addrs), sorted(map(str, addrs)))
            )
        # Rows arrive in capture order (see sort_index below), so the sender of
        # the first row defines the forward direction deterministically.
        is_forward = df['src_addr'] == df['src_addr'].iloc[0]
        stat = summarise_lengths(df.loc[is_forward, 'tcp.len'], 'forw_pkt_len')
        # With exactly two endpoints, every non-forward packet was sent by the other endpoint.
        stat.update(summarise_lengths(df.loc[~is_forward, 'tcp.len'], 'back_pkt_len'))
        return pd.Series(stat)

    # sample() does not preserve row order; sort_index() restores the original order.
    trace_df = raw_trace_df.sample(frac=sampling_rate, random_state=random_state).sort_index()
    by_stream = trace_df.groupby('tcp.stream')
    pkt_len = by_stream['tcp.len']
    pkt_time = by_stream['frame.time_relative']

    tcp_flow_df = pd.DataFrame()
    tcp_flow_df['avg(tcp_pkt_len)'] = pkt_len.mean()
    # The standard deviation is undefined for single-packet streams; mark those with -1.
    tcp_flow_df['stddev(tcp_pkt_len)'] = pkt_len.std().fillna(-1)
    tcp_flow_df['min(tcp_pkt_len)'] = pkt_len.min()
    tcp_flow_df['max(tcp_pkt_len)'] = pkt_len.max()
    tcp_flow_df['tot_pkt'] = pkt_len.count()
    tcp_flow_df['tot_byte'] = pkt_len.sum()
    tcp_flow_df['rel_start'] = pkt_time.min()
    tcp_flow_df['duration'] = pkt_time.max() - tcp_flow_df['rel_start']

    # Registers progress_apply on pandas objects so the per-stream pass shows a progress bar.
    tqdm.pandas(desc='{} samp rate'.format(sampling_rate))
    two_way_flow_df = by_stream[['tcp.len','src_addr','dst_addr']].progress_apply(calculate_two_way_tcp)
    tcp_flow_df = pd.concat([tcp_flow_df,two_way_flow_df],axis=1)
    return tcp_flow_df

In [11]:
# run 'jupyter nbextension enable --py --sys-prefix widgetsnbextension' first
from tqdm import tqdm_notebook
# Packet-level random sampling: for each sampling percentage, build the
# per-stream feature table and write it to its own CSV under LOCAL_PATH.
for sampling_percent in tqdm_notebook([20,40,60,80,100],desc='Sampling'):
    sampling_rate = sampling_percent / 100.0
    sampled_feature_df = to_feature_df(trace_df, sampling_rate)
    output_csv_path = '{PATH}/packet_rand_{PERCENT}%.csv'.format(PATH=LOCAL_PATH,PERCENT=sampling_percent)
    sampled_feature_df.to_csv(output_csv_path)

0.2 samp rate: 100%|██████████| 14487/14487 [02:11<00:00, 110.02it/s]
0.4 samp rate: 100%|██████████| 16768/16768 [02:18<00:00, 120.96it/s]
0.6 samp rate: 100%|██████████| 17507/17507 [02:22<00:00, 135.43it/s]
0.8 samp rate: 100%|██████████| 17833/17833 [02:23<00:00, 123.99it/s]
1.0 samp rate: 100%|██████████| 17985/17985 [02:35<00:00, 115.87it/s]



